Thomas 1 månad sedan
förälder
incheckning
955bd9fc0e
4 ändrade filer med 61 tillägg och 35 borttagningar
  1. 5 3
      src/callers/deep_variant.rs
  2. 12 1
      src/helpers.rs
  3. 1 1
      src/pipes/somatic.rs
  4. 43 30
      src/scan/scan.rs

+ 5 - 3
src/callers/deep_variant.rs

@@ -87,12 +87,12 @@ use log::{debug, info};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use regex::Regex;
 use rust_htslib::bam::{self, Read};
-use uuid::Uuid;
 use std::{
     fmt, fs,
     path::{Path, PathBuf},
     process::{Command as ProcessCommand, Stdio},
 };
+use uuid::Uuid;
 
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
@@ -101,7 +101,8 @@ use crate::{
         vcf::Vcf,
     },
     commands::{
-        LocalBatchRunner, SlurmRunner, bcftools::{BcftoolsConcat, BcftoolsIndex, BcftoolsKeepPass}
+        bcftools::{BcftoolsConcat, BcftoolsKeepPass},
+        LocalBatchRunner, SlurmRunner,
     },
     config::Config,
     helpers::{
@@ -113,7 +114,8 @@ use crate::{
     run, run_many,
     runners::Run,
     variant::{
-        variant_collection::VariantCollection, vcf_variant::{Label, Variants}
+        variant_collection::VariantCollection,
+        vcf_variant::{Label, Variants},
     },
 };
 

+ 12 - 1
src/helpers.rs

@@ -2,6 +2,7 @@ use anyhow::Context;
 use bitcode::{Decode, Encode};
 use glob::glob;
 use log::{debug, error, warn};
+use rust_htslib::bam::Read;
 use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 use std::{
@@ -910,6 +911,16 @@ pub fn get_genome_sizes(
     Ok(sizes)
 }
 
+pub fn bam_contigs<P: AsRef<std::path::Path>>(bam: P) -> anyhow::Result<Vec<String>> {
+    let reader = rust_htslib::bam::Reader::from_path(&bam)?;
+    let header = rust_htslib::bam::Header::from_template(reader.header());
+
+    Ok(get_genome_sizes(&header)?
+        .into_keys()
+        .collect())
+}
+
+
 /// Split genome into ~n_parts equal-sized chunks (by number of bases),
 /// returning for each chunk a list of regions in `ctg:start-end` form.
 /// Split genome into ~n_parts equal-sized chunks (by bases).
@@ -990,7 +1001,7 @@ pub fn split_genome_into_n_regions_exact(
     let base_per_part = total_bases / parts as u64;
     let remainder = total_bases % parts as u64;
 
-    let mut target_sizes: Vec<u64> = (0..parts)
+    let target_sizes: Vec<u64> = (0..parts)
         .map(|i| {
             if i < remainder as usize {
                 base_per_part + 1

+ 1 - 1
src/pipes/somatic.rs

@@ -234,7 +234,7 @@ impl Run for SomaticPipe {
         let mut to_run_if_req = create_should_run!(
             &id,
             &config,
-            // SomaticScan,
+            SomaticScan,
             ClairS,
             NanomonSV,
             Severus,

+ 43 - 30
src/scan/scan.rs

@@ -8,16 +8,16 @@ use rayon::{
     iter::{IntoParallelIterator, ParallelIterator},
     slice::ParallelSliceMut,
 };
-use rust_htslib::bam::IndexedReader;
+use rust_htslib::bam::{self, IndexedReader, Read};
 
-use crate::helpers::is_file_older;
+use crate::helpers::{bam_contigs, get_genome_sizes, is_file_older};
 use crate::io::writers::get_gz_writer;
 use crate::math::filter_outliers_modified_z_score_with_indices;
 
 use crate::pipes::{Initialize, ShouldRun};
 use crate::runners::Run;
 use crate::variant::vcf_variant::Label;
-use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
+use crate::{config::Config, scan::bin::Bin};
 
 /// Represents a count of reads in a genomic bin, including various metrics and outlier information.
 #[derive(Debug)]
@@ -287,15 +287,27 @@ impl fmt::Display for BinOutlier {
 /// - A `Bin` object cannot be created for a specific region.
 /// - Any I/O operation (e.g., writing results) fails.
 pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Result<()> {
-    let bin_size = config.count_bin_size;
-    let chunk_n_bin = config.count_n_chunks;
+    let bin_size = config.count_bin_size ;
+    let chunk_n_bin = config.count_n_chunks ;
     let bam_path = &config.solo_bam(id, time_point);
     let out_dir = config.somatic_scan_solo_output_dir(id, time_point);
 
     info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
     fs::create_dir_all(&out_dir)?;
 
-    for (contig, length) in read_dict(&config.dict_file)? {
+    let reader = bam::Reader::from_path(bam_path)
+        .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
+    let header = bam::Header::from_template(reader.header());
+    let contigs: Vec<(String, u32)> = get_genome_sizes(&header)?
+        .into_iter()
+        .map(|(ctg, len)| {
+            u32::try_from(len)
+                .map(|l| (ctg.clone(), l))
+                .with_context(|| format!("Contig {ctg} length {len} exceeds u32::MAX"))
+        })
+        .collect::<anyhow::Result<_>>()?;
+
+    for (contig, length) in contigs {
         let out_file = config.somatic_scan_solo_count_file(id, time_point, &contig);
         // let out_file = format!("{out_dir}/{contig}_count.tsv.gz");
 
@@ -544,33 +556,34 @@ impl ShouldRun for SomaticScan {
     /// Determines whether SomaticScan should re-run by checking whether
     /// any of the count output files are outdated or missing relative to the BAMs.
     fn should_run(&self) -> bool {
-        let mrd_bam_path = &self.config.normal_bam(&self.id);
-        let diag_bam_path = &self.config.tumoral_bam(&self.id);
-
-        match read_dict(&self.config.dict_file) {
-            Ok(dict) => {
-                for (contig, _) in dict {
-                    let diag_count_file = self
-                        .config
-                        .somatic_scan_tumoral_count_file(&self.id, &contig);
-                    if is_file_older(&diag_count_file, diag_bam_path, true).unwrap_or(true) {
-                        return true;
-                    }
-                    let mrd_count_file = self
-                        .config
-                        .somatic_scan_normal_count_file(&self.id, &contig);
-                    if is_file_older(&mrd_count_file, mrd_bam_path, true).unwrap_or(true) {
-                        return true;
-                    }
-                }
-                false
-            }
+        let mrd_bam_path = self.config.normal_bam(&self.id);
+        let diag_bam_path = self.config.tumoral_bam(&self.id);
+
+        let contigs = match bam_contigs(&diag_bam_path) {
+            Ok(c) => c,
             Err(e) => {
-                error!("Failed to read dict file: {}\n{e}", self.config.dict_file);
-                // Don't run if dict is unreadable
-                false
+                error!("Failed to read BAM header: {e}");
+                return false;
+            }
+        };
+
+        for contig in contigs {
+            let diag_count_file = self
+                .config
+                .somatic_scan_tumoral_count_file(&self.id, &contig);
+            if is_file_older(&diag_count_file, &diag_bam_path, true).unwrap_or(true) {
+                return true;
+            }
+
+            let mrd_count_file = self
+                .config
+                .somatic_scan_normal_count_file(&self.id, &contig);
+            if is_file_older(&mrd_count_file, &mrd_bam_path, true).unwrap_or(true) {
+                return true;
             }
         }
+
+        false
     }
 }