Thomas před 4 dny
rodič
revize
37eef933cf
2 změnil soubory, kde provedl 46 přidání a 24 odebrání
  1. 10 3
      src/scan/bin.rs
  2. 36 21
      src/scan/scan.rs

+ 10 - 3
src/scan/bin.rs

@@ -61,6 +61,7 @@ impl Bin {
         length: u32,
         min_mapq: u8,
     ) -> anyhow::Result<Self> {
+        anyhow::ensure!(length > 0, "bin length must be > 0");
         let end = start + length - 1;
 
         bam_reader
@@ -92,6 +93,9 @@ impl Bin {
                     continue; // Not overlapping this bin
                 }
 
+                let has_fbinv =
+                    fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)).is_some();
+
                 // Clone the underlying Record
                 reads_store.insert(record.qname().to_vec(), record.clone());
 
@@ -105,7 +109,7 @@ impl Bin {
                         if record.mapq() < min_mapq {
                             low_qualities[i] += 1;
                         }
-                        if let Some(_fbinv) = fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)) {
+                        if has_fbinv {
                             fb_invs[i] += 1;
                         }
                     }
@@ -357,6 +361,9 @@ impl Bin {
     }
 
     pub fn mean_coverage_from_depths(&self) -> f64 {
+        if self.depths.is_empty() {
+            return 0.0;
+        }
         self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
     }
 }
@@ -399,7 +406,8 @@ pub fn parse_bin_record_into<'a>(
     anyhow::ensure!(contig == contig_expected, "unexpected contig");
 
     let start = parse_u32(rec.get(i_start).context("missing start")?)?;
-    let end   = parse_u32(rec.get(i_end).context("missing end")?)?;
+    let end = parse_u32(rec.get(i_end).context("missing end")?)?;
+    anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");
 
     parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
         .context("parse depths")?;
@@ -548,4 +556,3 @@ impl BinStats {
         })
     }
 }
-

+ 36 - 21
src/scan/scan.rs

@@ -106,8 +106,17 @@ impl From<&Bin> for BinCount {
     /// A new `BinCount` instance populated with data from the `Bin`.
     fn from(bin: &Bin) -> Self {
         let n_reads = bin.n_reads();
-        let n_reads_float = n_reads as f64;
         let (n_sa, n_start, n_end) = bin.count_reads_sa_start_end();
+        let (ratio_sa, ratio_start, ratio_end) = if n_reads == 0 {
+            (f64::NAN, f64::NAN, f64::NAN)
+        } else {
+            let n_reads_float = n_reads as f64;
+            (
+                n_sa as f64 / n_reads_float,
+                n_start as f64 / n_reads_float,
+                n_end as f64 / n_reads_float,
+            )
+        };
 
         Self {
             contig: bin.contig.clone(),
@@ -115,9 +124,9 @@ impl From<&Bin> for BinCount {
             end: bin.end,
             n_reads: n_reads as u32,
             coverage: bin.mean_coverage_from_depths(),
-            ratio_sa: n_sa as f64 / n_reads_float,
-            ratio_start: n_start as f64 / n_reads_float,
-            ratio_end: n_end as f64 / n_reads_float,
+            ratio_sa,
+            ratio_start,
+            ratio_end,
             outlier: None,
             depths: bin.depths.clone(),
             low_qualities: bin.low_qualities.clone(),
@@ -345,6 +354,10 @@ fn build_bin_defs_grid(
     bin_size: u32,
     chunk_n_bin: u32,
 ) -> (Vec<BinDef>, Vec<Option<usize>>) {
+    if bin_size == 0 || chunk_n_bin == 0 {
+        return (Vec::new(), Vec::new());
+    }
+
     let n_bin = contig_len / bin_size; // keep your bitwise logic
     let n_chunks = n_bin.div_ceil(chunk_n_bin); // keep your bitwise logic
 
@@ -477,6 +490,9 @@ pub fn scan_contig(
     chunk_n_bin: u32,
     min_mapq: u8,
 ) -> anyhow::Result<Vec<BinCount>> {
+    anyhow::ensure!(bin_size > 0, "bin_size must be > 0");
+    anyhow::ensure!(chunk_n_bin > 0, "chunk_n_bin must be > 0");
+
     // Build bin definitions EXACTLY like the old code (bitwise-identical ranges),
     // plus a safe mapping from grid-bin index -> defs index.
     let (defs, grid_to_def) = build_bin_defs_grid(contig_len, bin_size, chunk_n_bin);
@@ -742,7 +758,7 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
                 .with_context(|| format!("failed to create output dir: {out_parent:?}"))?;
 
             let tmp_path = out_parent.join(format!(
-                ".{}.{}.{}.tmp",
+                ".{}.{}.{}.tmp.gz",
                 contig,
                 std::process::id(),
                 std::thread::current().name().unwrap_or("rayon")
@@ -792,20 +808,13 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
 /// method to identify outliers and marks them in the `outlier` field of each `BinCount`.
 ///
 /// ## Workflow:
-/// 1. Wraps the input slice in a `Mutex` for thread-safe access.
-/// 2. Defines helper functions to extract specific ratios from `BinCount` objects.
-/// 3. Iterates over three outlier types: SA, Start, and End.
-/// 4. For each type:
-///    a. Filters and collects non-NaN ratios along with their indices.
-///    b. Identifies outliers using the `filter_outliers_modified_z_score_with_indices` function.
-///    c. Marks identified outliers in the original `BinCount` objects.
+/// 1. Materializes `(index, [ratio_sa, ratio_start, ratio_end])` for each bin.
+/// 2. For each outlier type, filters non-NaN ratios and runs modified Z-score detection.
+/// 3. Marks outlier labels back into `bin_counts` in place.
 ///
 /// ## Parallelization:
-/// - Uses `par_iter()` for parallel processing of `BinCount` objects.
-/// - Applies outlier marking in parallel using `par_iter().for_each()`.
-///
-/// ## Thread Safety:
-/// - Uses `Mutex` to ensure thread-safe access to the shared `bin_counts` slice.
+/// - Uses Rayon `par_iter()` for ratio extraction/filtering.
+/// - Final outlier labels are applied sequentially to the mutable slice.
 ///
 /// ## Outlier Types:
 /// - `BinOutlier::SA`: Outliers in supplementary alignment ratio.
@@ -902,7 +911,7 @@ pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
 /// This struct encapsulates:
 /// - Initialization and conditional cleanup of prior outputs
 /// - Logic for checking whether a re-run is necessary (based on BAM and output timestamps)
-/// - Coordinated parallel scanning of both normal and tumoral inputs
+/// - Coordinated scanning of both normal and tumoral inputs
 #[derive(Debug)]
 pub struct SomaticScan {
     id: String,
@@ -934,8 +943,14 @@ impl Initialize for SomaticScan {
 
         // Force re-run: clean up previous output directories for both normal and tumoral scans
         if somatic_scan.config.somatic_scan_force {
-            fs::remove_dir_all(somatic_scan.config.somatic_scan_normal_output_dir(id))?;
-            fs::remove_dir_all(somatic_scan.config.somatic_scan_tumoral_output_dir(id))?;
+            let normal_out = somatic_scan.config.somatic_scan_normal_output_dir(id);
+            let tumoral_out = somatic_scan.config.somatic_scan_tumoral_output_dir(id);
+            if Path::new(&normal_out).exists() {
+                fs::remove_dir_all(&normal_out)?;
+            }
+            if Path::new(&tumoral_out).exists() {
+                fs::remove_dir_all(&tumoral_out)?;
+            }
         }
 
         Ok(somatic_scan)
@@ -978,7 +993,7 @@ impl ShouldRun for SomaticScan {
 }
 
 impl Run for SomaticScan {
-    /// Executes the full scan pipeline in parallel for first normal then tumoral samples.
+    /// Executes the full scan pipeline sequentially: normal then tumoral.
     ///
     /// # Returns
     /// An error if the underlying scan function (`par_whole_scan`) fails for either sample.