Pārlūkot izejas kodu

gz_writer + variants_stats + high depth ranges + BinCount depths + Bin depths

Thomas 11 mēneši atpakaļ
vecāks
revīzija
9d84177b64

+ 3 - 0
src/config.rs

@@ -67,6 +67,7 @@ pub struct Config {
     pub nanomonsv_solo_output_dir: String,
     pub nanomonsv_solo_passed_vcf: String,
     pub somatic_pipe_force: bool,
+    pub min_high_quality_depth: u32,
 }
 
 // Here comes names that can't be changed from output of tools
@@ -187,6 +188,8 @@ impl Default for Config {
             somatic_min_constit_depth: 5,
             somatic_max_alt_constit: 1,
             min_shannon_entropy: 1.0,
+
+            min_high_quality_depth: 14,
         }
     }
 }

+ 30 - 29
src/helpers.rs

@@ -319,35 +319,36 @@ where
     bins
 }
 
-fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
-    if data.is_empty() || num_bins == 0 {
-        return vec![];
-    }
-
-    let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
-
-    (0..num_bins)
-        .map(|i| {
-            let start = i * bin_size;
-            let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
-
-            // If `start` is out of bounds, return (0, 0)
-            if start >= data.len() {
-                return (0, 0);
-            }
-
-            let bin = &data[start..end];
-
-            let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
-            let count = bin.len() as u128;
-            let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
-
-            let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
-
-            (mean_x, sum_n)
-        })
-        .collect()
-}
+// fn aggregate_data(data: &[(u128, u32)], num_bins: usize) -> Vec<(u32, u32)> {
+//     if data.is_empty() || num_bins == 0 {
+//         return vec![];
+//     }
+//
+//     let bin_size = ((data.len() as f64) / (num_bins as f64)).ceil() as usize;
+//
+//     (0..num_bins)
+//         .map(|i| {
+//             let start = i * bin_size;
+//             let end = (start + bin_size).min(data.len()); // Ensure `end` does not exceed `data.len()`
+//
+//             // If `start` is out of bounds, return (0, 0)
+//             if start >= data.len() {
+//                 return (0, 0);
+//             }
+//
+//             let bin = &data[start..end];
+//
+//             let sum_x: u128 = bin.iter().map(|&(x, _)| x).sum();
+//             let count = bin.len() as u128;
+//             let mean_x = (sum_x / count) as u32; // Rounded down to nearest u32
+//
+//             let sum_n: u32 = bin.iter().map(|&(_, n)| n).sum();
+//
+//             (mean_x, sum_n)
+//         })
+//         .collect()
+// }
+//
 
 pub fn app_storage_dir() -> anyhow::Result<PathBuf> {
     let app_name = env!("CARGO_PKG_NAME");

+ 13 - 1
src/io/gff.rs

@@ -5,6 +5,15 @@ use crate::{config::Config, positions::GenomeRange};
 
 use super::readers::get_gz_reader;
 
+/// Collects the genome ranges of all GFF records matching a feature type.
+///
+/// # Arguments
+/// * `feature_type` - GFF feature type to filter (e.g., "gene", "exon", "CDS", "transcript")
+/// * `config` - Application configuration containing refseq_gff path
+///
+/// # Returns
+/// Vec of genomic ranges matching specified feature type
+///
+/// # Errors
+/// Returns anyhow::Error for I/O failures and parsing errors
 pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
     let path = &config.refseq_gff;
     let mut reader = get_gz_reader(path)
@@ -13,7 +22,10 @@ pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Ve
 
     let mut res = Vec::new();
     for result in reader.record_bufs() {
-        let record = result?;
+        let record = result.with_context(|| {
+            format!("Error reading GFF file {path}")
+        })?;
+
         let ty = record.ty();
         if ty != feature_type {
             continue;

+ 1 - 0
src/io/mod.rs

@@ -7,3 +7,4 @@ pub mod fasta;
 pub mod pod5_footer_generated;
 pub mod gff;
 pub mod bam;
+pub mod writers;

+ 26 - 0
src/io/writers.rs

@@ -0,0 +1,26 @@
+use std::{
+    fs::{self, File, OpenOptions},
+    path::Path,
+};
+
+use anyhow::Context;
+use bgzip::{BGZFWriter, Compression};
+
+pub fn get_gz_writer(path: &str, force: bool) -> anyhow::Result<BGZFWriter<File>> {
+    if !path.ends_with(".gz") {
+        anyhow::bail!("The file should end with gz");
+    }
+
+    if force && Path::new(path).exists() {
+        fs::remove_file(path).with_context(|| anyhow::anyhow!("Failed to remove file: {path}"))?;
+    }
+
+    let file = OpenOptions::new()
+        .write(true) // Open the file for writing
+        .create_new(true)
+        .truncate(true)
+        .open(path)
+        .with_context(|| anyhow::anyhow!("failed to open the file: {path}"))?;
+
+    Ok(BGZFWriter::new(file, Compression::default()))
+}

+ 17 - 7
src/lib.rs

@@ -27,7 +27,7 @@ lazy_static! {
 
 #[cfg(test)]
 mod tests {
-    use std::{collections::HashMap, fs, ops::Index, path::Path};
+    use std::{collections::HashMap, fs, path::Path};
 
     use annotation::{vep::{VepLine, VEP}, Annotations};
     use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
@@ -45,7 +45,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::{vep::VepImpact, Annotation}, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -822,17 +822,17 @@ Ok(())
         let merged = merge_overlapping_genome_ranges(&r);
         
         let variants = variant_collection::Variants::load_from_file(&path)?;
-        let full = variant_collection::somatic_rates(&variants.data, &merged, &config);
+        let full = variants_stats::somatic_rates(&variants.data, &merged, &config);
         info!("{full:#?}");
 
         let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
             .cloned().collect();
-        let min_2 = variant_collection::somatic_rates(&restrained, &merged, &config);
+        let min_2 = variants_stats::somatic_rates(&restrained, &merged, &config);
         info!("{min_2:#?}");
 
         let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
             .cloned().collect();
-        let min_3 = variant_collection::somatic_rates(&restrained, &merged, &config);
+        let min_3 = variants_stats::somatic_rates(&restrained, &merged, &config);
         info!("{min_3:#?}");
 
 
@@ -869,6 +869,16 @@ Ok(())
         Ok(())
     }
 
-
-
+    #[test]
+    fn quality_ranges() -> anyhow::Result<()> {
+        init();
+        let r = variants_stats::high_depth_somatic("ADJAGBA", &Config::default())?;
+        r.iter().for_each(|e| {
+            let contig = e.key();
+            let ranges = e.value();
+            let su = ranges.iter().map(|range| range.range.len()).sum::<usize>();
+            info!("{contig}: {su} high depth bases");
+        });
+        Ok(())
+    }
 }

+ 1 - 0
src/math.rs

@@ -63,6 +63,7 @@ pub fn filter_outliers_modified_z_score_with_indices(
     indices
         .into_iter()
         .zip(data.iter())
+        .par_bridge()
         .filter_map(|(index, &x)| {
             // Calculate the Modified Z-Score
             let modified_z_score = 0.6745 * (x - median).abs() / mad;

+ 2 - 1
src/pipes/somatic.rs

@@ -20,7 +20,8 @@ use crate::{
     runners::Run,
     variant::{
         variant::{load_variants, run_variants, CallerBox},
-        variant_collection::{ExternalAnnotation, VariantCollection, Variants, VariantsStats},
+        variant_collection::{ExternalAnnotation, VariantCollection, Variants},
+        variants_stats::VariantsStats,
     },
 };
 

+ 76 - 40
src/scan/bin.rs

@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 
 use anyhow::Context;
+use log::debug;
 use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
 
 use crate::io::bam::{primary_record, primary_records};
@@ -21,11 +22,13 @@ pub struct Bin {
     /// Map of read query names to their BAM records
     pub reads_store: HashMap<Vec<u8>, Record>,
     // Indexed BAM reader for fetching additional records (field currently commented out)
-    pub bam_reader: IndexedReader,
+    // pub bam_reader: IndexedReader,
     // Average read length within this bin (field currently commented out)
-    pub reads_mean_len: u32,
+    // pub reads_mean_len: u32,
     /// Number of reads filtered due to low mapping quality
     pub n_low_mapq: u32,
+    /// Per-position read depths recorded for this bin (one entry per pileup position)
+    pub depths: Vec<u32>,
 }
 
 impl Bin {
@@ -49,10 +52,15 @@ impl Bin {
     /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
     /// println!("Loaded {} reads", bin.n_reads());
     /// ```
-    pub fn new(bam_path: &str, contig: &str, start: u32, length: u32) -> anyhow::Result<Self> {
-        const DEFAULT_MAPQ: u8 = 50;
-        let mut bam_reader = IndexedReader::from_path(bam_path)
-            .with_context(|| format!("Can't open BAM file: {}", bam_path))?;
+    pub fn new(
+        bam_reader: &mut IndexedReader,
+        contig: &str,
+        start: u32,
+        length: u32,
+        min_mapq: u8,
+    ) -> anyhow::Result<Self> {
+        // let mut bam_reader = IndexedReader::from_path(bam_path)
+        //     .with_context(|| format!("Can't open BAM file: {}", bam_path))?;
 
         // Calculate inclusive end position
         let end = start + length - 1;
@@ -66,44 +74,69 @@ impl Bin {
 
         let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
         let mut n_low_mapq = 0;
-        let mut lengths = Vec::new();
+        // let mut lengths = Vec::new();
 
         // Process all records in the region
-        for read_result in bam_reader.records() {
-            let record = read_result.context("Error while parsing BAM record")?;
-
-            // Skip reads with low mapping quality
-            if record.mapq() < DEFAULT_MAPQ {
-                n_low_mapq += 1;
+        // for read_result in bam_reader.records() {
+        //     let record = read_result.context("Error while parsing BAM record")?;
+        //
+        //     // Skip reads with low mapping quality
+        //     if record.mapq() < min_mapq {
+        //         n_low_mapq += 1;
+        //         continue;
+        //     }
+        //
+        //     // Store the read length
+        //     lengths.push(
+        //         record
+        //             .reference_end()
+        //             .saturating_sub(record.reference_start()),
+        //     );
+        //
+        //     // Store the record by query name
+        //     reads_store.insert(record.qname().to_vec(), record);
+        // }
+        // Calculate mean read length, handling the empty case
+        // let reads_mean_len = if lengths.is_empty() {
+        //     0
+        // } else {
+        //     (lengths.iter().sum::<i64>() as f64 / lengths.len() as f64) as u32
+        // };
+
+        let mut depths = Vec::new();
+        for pileup in bam_reader.pileup() {
+            let pileup = pileup?;
+            let pos = pileup.pos();
+
+            if pos < start {
                 continue;
+            } else if pos > end {
+                break;
             }
 
-            // Store the read length
-            lengths.push(
-                record
-                    .reference_end()
-                    .saturating_sub(record.reference_start()),
-            );
-
-            // Store the record by query name
-            reads_store.insert(record.qname().to_vec(), record);
+            let depth = pileup
+                .alignments()
+                .filter(|a| {
+                    let record = a.record();
+                    if record.mapq() < min_mapq {
+                        n_low_mapq += 1;
+                        return false;
+                    }
+                    reads_store.insert(record.qname().to_vec(), record);
+
+                    true
+                })
+                .count();
+            depths.push(depth as u32);
         }
 
-        // Calculate mean read length, handling the empty case
-        let reads_mean_len = if lengths.is_empty() {
-            0
-        } else {
-            (lengths.iter().sum::<i64>() as f64 / lengths.len() as f64) as u32
-        };
-
         Ok(Bin {
             contig: contig.to_string(),
             start,
             end,
             reads_store,
-            bam_reader,
             n_low_mapq,
-            reads_mean_len,
+            depths,
         })
     }
 
@@ -125,11 +158,11 @@ impl Bin {
     /// # Returns
     ///
     /// A vector of primary alignment records for reads with supplementary alignments.
-    pub fn sa_primary(&mut self) -> Vec<Record> {
+    pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec<Record> {
         self.reads_store
             .values()
             .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
-            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .map(|record| primary_record(bam_reader, record.clone()))
             .collect()
     }
 
@@ -255,7 +288,7 @@ impl Bin {
     /// # Returns
     ///
     /// A vector of primary alignment records.
-    pub fn se_primary(&mut self, pos: u32) -> Vec<Record> {
+    pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
         let records: Vec<Record> = self
             .reads_store
             .values()
@@ -265,7 +298,7 @@ impl Bin {
             .cloned()
             .collect();
 
-        primary_records(&mut self.bam_reader, records)
+        primary_records(bam_reader, records)
     }
 
     /// Gets primary alignments for reads that start at a specific position.
@@ -277,11 +310,11 @@ impl Bin {
     /// # Returns
     ///
     /// A vector of primary alignment records.
-    pub fn s_primary(&mut self, pos: u32) -> Vec<Record> {
+    pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
         self.reads_store
             .values()
             .filter(|record| record.reference_start() as u32 == pos)
-            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .map(|record| primary_record(bam_reader, record.clone()))
             .collect()
     }
 
@@ -294,11 +327,11 @@ impl Bin {
     /// # Returns
     ///
     /// A vector of primary alignment records.
-    pub fn e_primary(&mut self, pos: u32) -> Vec<Record> {
+    pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
         self.reads_store
             .values()
             .filter(|record| record.reference_end() as u32 == pos)
-            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .map(|record| primary_record(bam_reader, record.clone()))
             .collect()
     }
 
@@ -339,5 +372,8 @@ impl Bin {
         // Divide by the bin length to get average coverage
         total_bases_covered / bin_length
     }
-}
 
+    pub fn mean_coverage_from_depths(&self) -> f64 {
+        self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
+    }
+}

+ 0 - 2
src/scan/mod.rs

@@ -1,4 +1,2 @@
-
 pub mod bin;
 pub mod scan;
-

+ 69 - 56
src/scan/scan.rs

@@ -1,14 +1,16 @@
-use std::{fmt, fs, fs::File, io::Write, sync::Mutex};
+use std::{fmt, fs, io::Write, sync::Mutex};
 
+use anyhow::Context;
 use log::{debug, error, info};
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
 use rayon::{
     iter::{IntoParallelIterator, ParallelIterator},
     slice::ParallelSliceMut,
 };
+use rust_htslib::bam::IndexedReader;
 
+use crate::io::writers::get_gz_writer;
 use crate::math::filter_outliers_modified_z_score_with_indices;
-use crate::positions::{contig_to_num, GenomeRange, GetGenomeRange};
 use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
 
 /// Represents a count of reads in a genomic bin, including various metrics and outlier information.
@@ -32,6 +34,7 @@ pub struct BinCount {
     pub ratio_end: f64,
     /// Optional vector of outlier types for this bin.
     pub outlier: Option<Vec<BinOutlier>>,
+    pub depths: Vec<u32>,
 }
 
 impl From<&Bin> for BinCount {
@@ -52,11 +55,12 @@ impl From<&Bin> for BinCount {
             start: bin.start,
             end: bin.end,
             n_reads: n_reads as u32,
-            coverage: bin.mean_coverage(),
+            coverage: bin.mean_coverage_from_depths(),
             ratio_sa: n_sa as f64 / n_reads_float,
             ratio_start: n_start as f64 / n_reads_float,
             ratio_end: n_end as f64 / n_reads_float,
             outlier: None,
+            depths: bin.depths.clone(),
         }
     }
 }
@@ -68,7 +72,7 @@ impl BinCount {
     /// A `String` representing the `BinCount` as a TSV row.
     pub fn to_tsv_row(&self) -> String {
         format!(
-            "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}",
+            "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}\t{}",
             self.contig,
             self.start,
             self.end,
@@ -84,7 +88,12 @@ impl BinCount {
                     .map(|e| e.to_string())
                     .collect::<Vec<String>>()
                     .join(", "))
-                .unwrap_or_default()
+                .unwrap_or_default(),
+            self.depths
+                .iter()
+                .map(|e| e.to_string())
+                .collect::<Vec<String>>()
+                .join(",")
         )
     }
 
@@ -104,7 +113,7 @@ impl BinCount {
     /// - The outlier field is not in the expected format.
     pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
         let fields: Vec<&str> = row.split('\t').collect();
-        if fields.len() != 9 {
+        if fields.len() != 10 {
             anyhow::bail!("Invalid number of fields in TSV row");
         }
 
@@ -124,6 +133,15 @@ impl BinCount {
             None
         };
 
+        let depths = fields[9]
+            .split(',')
+            .map(|s| {
+                s.trim() // Remove any whitespace around numbers
+                    .parse::<u32>()
+                    .with_context(|| format!("Failed to parse '{}' as u32", s))
+            })
+            .collect::<anyhow::Result<Vec<u32>>>()?;
+
         Ok(BinCount {
             contig: fields[0].to_string(),
             start: fields[1].parse()?,
@@ -134,6 +152,7 @@ impl BinCount {
             ratio_start: fields[6].parse()?,
             ratio_end: fields[7].parse()?,
             outlier,
+            depths,
         })
     }
 }
@@ -144,7 +163,6 @@ impl BinCount {
 //     }
 // }
 
-
 /// Represents types of outliers that can be detected in a genomic bin.
 #[derive(Debug, Clone)]
 pub enum BinOutlier {
@@ -231,13 +249,9 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
     let bin_size = config.count_bin_size;
     let chunk_n_bin = config.count_n_chunks;
     info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
-    fs::create_dir(out_dir)?;
+    fs::create_dir_all(out_dir)?;
 
     for (contig, length) in read_dict(&config.dict_file)? {
-        if contig.as_str() == "chrM" {
-            continue;
-        }
-
         let n_bin = length / bin_size;
         // Calculate number of chunks using ceiling division
         let n_chunks = n_bin.div_ceil(chunk_n_bin);
@@ -259,6 +273,9 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
                 // Calculate number of bins in this chunk with ceiling division
                 let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
 
+                let mut bam_reader = IndexedReader::from_path(bam_path)
+                    .with_context(|| format!("Can't open BAM file: {}", bam_path)).unwrap();
+
                 // Process each bin in the chunk
                 (0..n_bins_in_chunk)
                     // .into_iter()
@@ -267,7 +284,13 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
                         // Ensure we don't exceed remaining length
                         let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
                         // debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
-                        match Bin::new(bam_path, &contig, bin_start, bin_length) {
+                        match Bin::new(
+                            &mut bam_reader,
+                            &contig,
+                            bin_start,
+                            bin_length,
+                            config.bam_min_mapq,
+                        ) {
                             Ok(bin) => Some(BinCount::from(&bin)),
                             Err(e) => {
                                 error!("Failed to get Bin: {e}");
@@ -285,9 +308,11 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
         debug!("Scan {contig}, computing outliers");
         fill_outliers(&mut bins);
 
-        let out_file = format!("{out_dir}/{contig}_count.tsv");
+        let out_file = format!("{out_dir}/{contig}_count.tsv.gz");
         debug!("Scan {contig}, writing file");
-        let mut file = File::create(out_file)?;
+
+        let mut file = get_gz_writer(&out_file, true)
+            .with_context(|| anyhow::anyhow!("failed to open the file: {out_file}"))?;
         for bin in bins {
             writeln!(file, "{}", bin.to_tsv_row())?;
         }
@@ -342,51 +367,38 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
 /// // After this call, some BinCount objects may have their outlier field populated
 /// ```
 pub fn fill_outliers(bin_counts: &mut [BinCount]) {
-    let bin_counts = Mutex::new(bin_counts);
-
-    fn get_ratio_sa(c: &BinCount) -> f64 {
-        c.ratio_sa
-    }
-    fn get_ratio_start(c: &BinCount) -> f64 {
-        c.ratio_start
-    }
-    fn get_ratio_end(c: &BinCount) -> f64 {
-        c.ratio_end
-    }
-    type OutlierTypeInfo = (fn(&BinCount) -> f64, BinOutlier);
+    let intermediate: Vec<_> = bin_counts
+        .par_iter()
+        .enumerate()
+        .map(|(i, c)| (i, [c.ratio_sa, c.ratio_start, c.ratio_end]))
+        .collect();
 
-    let outlier_types: [OutlierTypeInfo; 3] = [
-        (get_ratio_sa, BinOutlier::SA),
-        (get_ratio_start, BinOutlier::Start),
-        (get_ratio_end, BinOutlier::End),
-    ];
+    let outlier_types = [BinOutlier::SA, BinOutlier::Start, BinOutlier::End];
 
-    for (get_ratio, outlier_type) in outlier_types.iter() {
-        let (indices, ratios): (Vec<usize>, Vec<f64>) = bin_counts
-            .lock()
-            .unwrap()
-            .par_iter()
-            .enumerate()
-            .filter_map(|(i, c)| {
-                let ratio = get_ratio(c);
-                if !ratio.is_nan() {
-                    Some((i, ratio))
-                } else {
-                    None
-                }
-            })
-            .unzip();
+    let outliers: Vec<(usize, BinOutlier)> = outlier_types
+        .iter()
+        .enumerate()
+        .flat_map(|(ratio_idx, outlier_type)| {
+            let (indices, ratios): (Vec<_>, Vec<_>) = intermediate
+                .par_iter()
+                .filter_map(|(i, ratio_array)| {
+                    let ratio = ratio_array[ratio_idx];
+                    (!ratio.is_nan()).then_some((*i, ratio))
+                })
+                .unzip();
 
-        let outlier_indices = filter_outliers_modified_z_score_with_indices(&ratios, indices);
+            filter_outliers_modified_z_score_with_indices(&ratios, indices)
+                .into_iter()
+                .map(move |i| (i, outlier_type.clone()))
+        })
+        .collect();
 
-        outlier_indices.par_iter().for_each(|&i| {
-            let mut bin_counts = bin_counts.lock().unwrap();
-            bin_counts[i]
-                .outlier
-                .get_or_insert_with(Vec::new)
-                .push(outlier_type.clone());
-        });
-    }
+    outliers.iter().for_each(|(i, outlier_type)| {
+        bin_counts[*i]
+            .outlier
+            .get_or_insert_with(Vec::new)
+            .push(outlier_type.clone());
+    });
 }
 
 /// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
@@ -427,3 +439,4 @@ pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
         config,
     )
 }
+

+ 1 - 0
src/variant/mod.rs

@@ -1,3 +1,4 @@
 pub mod variant;
 pub mod variant_collection;
+pub mod variants_stats;
 

+ 0 - 208
src/variant/variant_collection.rs

@@ -794,214 +794,6 @@ impl Variants {
     }
 }
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct VariantsStats {
-    pub n: u32,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub alteration_categories: DashMap<String, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub cosmic: DashMap<u64, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub n_alts: DashMap<u32, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub depths: DashMap<u32, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub vafs: DashMap<OrderedFloat<f32>, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub vep_impact: DashMap<String, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub consequences: DashMap<String, u32>,
-    #[serde(serialize_with = "serialize_dashmap_sort")]
-    pub genes: DashMap<String, u32>,
-    pub n_gnomad: usize,
-    pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
-}
-
-use serde::Serializer;
-
-pub fn serialize_dashmap_sort<S, T>(
-    data: &DashMap<T, u32>,
-    serializer: S,
-) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-    T: Serialize + Ord + std::hash::Hash + Clone,
-{
-    let ordered: BTreeMap<_, _> = data
-        .iter()
-        .map(|entry| (entry.key().clone(), *entry.value()))
-        .collect();
-
-    ordered.serialize(serializer)
-}
-
-impl VariantsStats {
-    pub fn new(variants: &Variants) -> Self {
-        let n = variants.data.len() as u32;
-        let alteration_categories: DashMap<String, u32> = DashMap::new();
-        let vep_impact: DashMap<String, u32> = DashMap::new();
-        let genes: DashMap<String, u32> = DashMap::new();
-        let consequences: DashMap<String, u32> = DashMap::new();
-        let n_alts: DashMap<u32, u32> = DashMap::new();
-        let depths: DashMap<u32, u32> = DashMap::new();
-        let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
-        let cosmic: DashMap<u64, u32> = DashMap::new();
-        let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
-
-        variants.data.par_iter().for_each(|v| {
-            if let Ok(best_vep) = v.best_vep() {
-                if let Some(impact) = best_vep.extra.impact {
-                    *vep_impact.entry(impact.to_string()).or_default() += 1;
-                }
-                if let Some(gene) = best_vep.extra.symbol {
-                    *genes.entry(gene).or_default() += 1;
-                }
-                if let Some(csqs) = best_vep.consequence {
-                    let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
-                    csq.sort();
-                    csq.dedup();
-                    *consequences.entry(csq.join(", ")).or_default() += 1;
-                }
-            }
-
-            let (n_alt, depth) = v.n_alt_depth();
-            *n_alts.entry(n_alt as u32).or_default() += 1;
-            *depths.entry(depth as u32).or_default() += 1;
-
-            let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
-            if !vaf.is_nan() {
-                *vafs.entry(vaf).or_default() += 1;
-            }
-
-            v.annotations.iter().for_each(|annotation| {
-                match annotation {
-                    Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
-                    Annotation::GnomAD(v) => {
-                        v.to_vec()
-                            .iter()
-                            .map(|e| e.to_string_value_pair())
-                            .for_each(|(key, value)| {
-                                gnomads.entry(key).or_default().push(value);
-                            });
-                    }
-                    _ => (),
-                };
-            });
-            let mut alteration_category_str = v
-                .alteration_category()
-                .iter()
-                .map(|c| c.to_string())
-                .collect::<Vec<String>>();
-            alteration_category_str.sort();
-            alteration_category_str.dedup();
-
-            *alteration_categories
-                .entry(alteration_category_str.join(", "))
-                .or_default() += 1;
-        });
-
-        let mut n_gnomad = 0;
-        let gnomad = gnomads
-            .iter()
-            .map(|e| {
-                let data = e.value().to_vec();
-                n_gnomad = data.len();
-                (e.key().to_string(), bin_data(data, 0.0001))
-            })
-            .collect();
-
-        Self {
-            n,
-            alteration_categories,
-            cosmic,
-            gnomad,
-            vafs,
-            n_alts,
-            depths,
-            vep_impact,
-            consequences,
-            genes,
-            n_gnomad,
-        }
-    }
-
-    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
-        let file = File::create(filename)
-            .map_err(|e| anyhow::anyhow!("Failed to create file: {}\n{e}", filename))?;
-        let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
-
-        serde_json::to_writer(&mut writer, self)
-            .map_err(|e| anyhow::anyhow!("Failed to serialize JSON to file: {}\n{e}", filename))?;
-
-        writer.close().map_err(|e| {
-            anyhow::anyhow!("Failed to close BGZF writer for file: {}\n{e}", filename)
-        })?;
-
-        debug!("Successfully saved variants to {}", filename);
-        Ok(())
-    }
-    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
-        let file =
-            File::open(filename).with_context(|| format!("Failed to open file: {}", filename))?;
-        let mut reader = BGZFReader::new(file)
-            .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
-
-        let variants: Self = serde_json::from_reader(&mut reader)
-            .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
-
-        debug!("Successfully loaded variants from {}", filename);
-        Ok(variants)
-    }
-}
-
-#[derive(Debug)]
-pub struct SomaticVariantRates {
-    pub wgs_length: u32,
-    pub total_variants: usize,
-    pub somatic_mutation_rate_wgs: f64,
-    pub exon_count: usize,
-    pub variants_in_coding: usize,
-    pub coding_variants: usize,
-    pub total_exon_bases: u32,
-    pub somatic_mutation_rate_coding: f64,
-    pub somatic_nonsynonymous_rate_coding: f64,
-}
-
-pub fn somatic_rates(
-    variants: &[Variant],
-    feature_ranges: &Vec<GenomeRange>,
-    config: &Config,
-) -> anyhow::Result<SomaticVariantRates> {
-    let ol = par_overlaps(variants, feature_ranges);
-
-    let n_coding = ol
-        .iter()
-        .filter_map(|i| variants[*i].best_vep().ok())
-        .filter_map(|bv| bv.impact())
-        .filter(|impact| *impact <= VepImpact::MODERATE)
-        .count();
-
-    let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
-    let mega_base_m = n_bases_m as f64 / 10.0e6;
-
-    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
-    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 10.0e6);
-
-    let n_exons_mb = ol.len() as f64 / mega_base_m;
-    let n_coding_mb = n_coding as f64 / mega_base_m;
-
-    Ok(SomaticVariantRates {
-        total_variants: variants.len(),
-        exon_count: feature_ranges.len(),
-        variants_in_coding: ol.len(),
-        coding_variants: n_coding,
-        total_exon_bases: n_bases_m,
-        wgs_length: wgs_len,
-        somatic_mutation_rate_wgs: rate_wgs,
-        somatic_mutation_rate_coding: n_exons_mb,
-        somatic_nonsynonymous_rate_coding: n_coding_mb,
-    })
-}
 
 /// Creates a new Variant instance from a collection of VcfVariants and annotations.
 ///

+ 306 - 0
src/variant/variants_stats.rs

@@ -0,0 +1,306 @@
+use std::{collections::{BTreeMap, HashMap}, io::BufRead};
+
+use anyhow::Context;
+use dashmap::DashMap;
+use log::debug;
+use ordered_float::OrderedFloat;
+use rayon::prelude::*;
+use serde::{Deserialize, Serialize, Serializer};
+
+use crate::{
+    annotation::{vep::VepImpact, Annotation},
+    config::Config,
+    helpers::bin_data,
+    io::{dict::read_dict, readers::get_gz_reader, writers::get_gz_writer},
+    positions::{contig_to_num, par_overlaps, GenomeRange},
+    scan::scan::BinCount,
+};
+
+use super::variant_collection::{Variant, Variants};
+
/// Aggregated, serializable summary statistics over a `Variants` collection.
///
/// Every `DashMap` field is a histogram (key -> occurrence count) filled
/// concurrently by `VariantsStats::new`; the `serialize_with` hooks emit the
/// entries in ascending key order for stable JSON output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VariantsStats {
    /// Total number of variants summarized.
    pub n: u32,
    /// Count per alteration-category label (sorted, comma-joined set).
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub alteration_categories: DashMap<String, u32>,
    /// Count per COSMIC occurrence count (`cosmic_cnt`).
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub cosmic: DashMap<u64, u32>,
    /// Count per number of alt-supporting reads.
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub n_alts: DashMap<u32, u32>,
    /// Count per total read depth.
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub depths: DashMap<u32, u32>,
    /// Count per VAF bucket (percent rounded to one decimal, see `new`).
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vafs: DashMap<OrderedFloat<f32>, u32>,
    /// Count per VEP impact label (HIGH, MODERATE, ...).
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub vep_impact: DashMap<String, u32>,
    /// Count per deduplicated, sorted consequence-set string.
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub consequences: DashMap<String, u32>,
    /// Count per gene symbol (best VEP annotation).
    #[serde(serialize_with = "serialize_dashmap_sort")]
    pub genes: DashMap<String, u32>,
    // Number of gnomAD values for one of the keys — NOTE(review): populated
    // nondeterministically in `new` (last-iterated key wins); confirm intent.
    pub n_gnomad: usize,
    /// Per gnomAD field name, binned (value, count) pairs from `bin_data`.
    pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
}
+pub fn serialize_dashmap_sort<S, T>(
+    data: &DashMap<T, u32>,
+    serializer: S,
+) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+    T: Serialize + Ord + std::hash::Hash + Clone,
+{
+    let ordered: BTreeMap<_, _> = data
+        .iter()
+        .map(|entry| (entry.key().clone(), *entry.value()))
+        .collect();
+
+    ordered.serialize(serializer)
+}
+
impl VariantsStats {
    /// Builds summary statistics over every variant in `variants`.
    ///
    /// Iterates the collection with rayon and tallies counts into
    /// concurrent `DashMap` histograms: VEP impact, gene symbol,
    /// consequence sets, alt-read counts, depths, VAF buckets, COSMIC
    /// counts, alteration categories, and raw gnomAD values.
    pub fn new(variants: &Variants) -> Self {
        let n = variants.data.len() as u32;
        // One histogram per statistic; DashMap allows lock-free-ish
        // concurrent updates from the parallel iterator below.
        let alteration_categories: DashMap<String, u32> = DashMap::new();
        let vep_impact: DashMap<String, u32> = DashMap::new();
        let genes: DashMap<String, u32> = DashMap::new();
        let consequences: DashMap<String, u32> = DashMap::new();
        let n_alts: DashMap<u32, u32> = DashMap::new();
        let depths: DashMap<u32, u32> = DashMap::new();
        let vafs: DashMap<OrderedFloat<f32>, u32> = DashMap::new();
        let cosmic: DashMap<u64, u32> = DashMap::new();
        // gnomAD field name -> all observed values across variants.
        let gnomads: DashMap<String, Vec<f64>> = DashMap::new();

        variants.data.par_iter().for_each(|v| {
            // Tally VEP-derived stats from the best transcript annotation;
            // variants without a usable VEP record are simply skipped here.
            if let Ok(best_vep) = v.best_vep() {
                if let Some(impact) = best_vep.extra.impact {
                    *vep_impact.entry(impact.to_string()).or_default() += 1;
                }
                if let Some(gene) = best_vep.extra.symbol {
                    *genes.entry(gene).or_default() += 1;
                }
                if let Some(csqs) = best_vep.consequence {
                    // Sort + dedup so equivalent consequence sets map to
                    // the same histogram key.
                    let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
                    csq.sort();
                    csq.dedup();
                    *consequences.entry(csq.join(", ")).or_default() += 1;
                }
            }

            let (n_alt, depth) = v.n_alt_depth();
            *n_alts.entry(n_alt as u32).or_default() += 1;
            *depths.entry(depth as u32).or_default() += 1;

            // VAF as a percentage rounded to one decimal place
            // (e.g. n_alt/depth = 0.123 -> 12.3). A zero depth yields
            // NaN and is filtered out below.
            let vaf = OrderedFloat::from(((n_alt * 1_000.0 / depth).round() / 10.0) as f32);
            if !vaf.is_nan() {
                *vafs.entry(vaf).or_default() += 1;
            }

            // Collect COSMIC counts and raw gnomAD values from the
            // variant's annotations; other annotation kinds are ignored.
            v.annotations.iter().for_each(|annotation| {
                match annotation {
                    Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
                    Annotation::GnomAD(v) => {
                        v.to_vec()
                            .iter()
                            .map(|e| e.to_string_value_pair())
                            .for_each(|(key, value)| {
                                gnomads.entry(key).or_default().push(value);
                            });
                    }
                    _ => (),
                };
            });
            // Sorted, deduplicated category set as one histogram key.
            let mut alteration_category_str = v
                .alteration_category()
                .iter()
                .map(|c| c.to_string())
                .collect::<Vec<String>>();
            alteration_category_str.sort();
            alteration_category_str.dedup();

            *alteration_categories
                .entry(alteration_category_str.join(", "))
                .or_default() += 1;
        });

        // NOTE(review): `n_gnomad` is overwritten on every iteration, so it
        // ends up as the value count of whichever gnomAD key is iterated
        // last (DashMap order is unspecified) — confirm this is intended.
        let mut n_gnomad = 0;
        let gnomad = gnomads
            .iter()
            .map(|e| {
                let data = e.value().to_vec();
                n_gnomad = data.len();
                // Aggregate the raw values into (bin, count) pairs.
                (e.key().to_string(), bin_data(data, 0.0001))
            })
            .collect();

        Self {
            n,
            alteration_categories,
            cosmic,
            gnomad,
            vafs,
            n_alts,
            depths,
            vep_impact,
            consequences,
            genes,
            n_gnomad,
        }
    }

    /// Serializes `self` as JSON into a gzip/BGZF-compressed file.
    ///
    /// # Errors
    /// Fails if the file cannot be created, serialization fails, or the
    /// compressed writer cannot be closed.
    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
        let mut writer = get_gz_writer(filename, true)
            .with_context(|| anyhow::anyhow!("Failed to create file: {}", filename))?;

        serde_json::to_writer(&mut writer, self)
            .with_context(|| anyhow::anyhow!("Failed to serialize JSON to file: {}", filename))?;

        // Explicit close so compression trailer errors surface here
        // rather than being swallowed on drop.
        writer.close().with_context(|| {
            anyhow::anyhow!("Failed to close BGZF writer for file: {}", filename)
        })?;

        debug!("Successfully saved variants to {}", filename);
        Ok(())
    }
    /// Deserializes a `VariantsStats` from a compressed JSON file written
    /// by [`Self::save_to_json`].
    ///
    /// # Errors
    /// Fails if the file cannot be opened or does not contain valid JSON.
    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
        let mut reader = get_gz_reader(filename)
            .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;

        let variants: Self = serde_json::from_reader(&mut reader)
            .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;

        debug!("Successfully loaded variants from {}", filename);
        Ok(variants)
    }
}
+
/// Somatic mutation rates over the whole genome and over coding regions,
/// as computed by `somatic_rates`.
#[derive(Debug)]
pub struct SomaticVariantRates {
    /// Total genome length in bases (sum of contig lengths from the dict).
    pub wgs_length: u32,
    /// Total number of somatic variants considered.
    pub total_variants: usize,
    /// Variants per megabase over the whole genome.
    pub somatic_mutation_rate_wgs: f64,
    /// Number of feature (exon) ranges used.
    pub exon_count: usize,
    /// Number of variants overlapping any feature range.
    pub variants_in_coding: usize,
    /// Overlapping variants with VEP impact MODERATE or stronger.
    pub coding_variants: usize,
    /// Total bases covered by the feature ranges.
    pub total_exon_bases: u32,
    /// Variants per megabase of feature (coding) sequence.
    pub somatic_mutation_rate_coding: f64,
    /// Impactful (non-synonymous) variants per megabase of coding sequence.
    pub somatic_nonsynonymous_rate_coding: f64,
}
+
+pub fn somatic_rates(
+    variants: &[Variant],
+    feature_ranges: &Vec<GenomeRange>,
+    config: &Config,
+) -> anyhow::Result<SomaticVariantRates> {
+    let ol = par_overlaps(variants, feature_ranges);
+
+    let n_coding = ol
+        .iter()
+        .filter_map(|i| variants[*i].best_vep().ok())
+        .filter_map(|bv| bv.impact())
+        .filter(|impact| *impact <= VepImpact::MODERATE)
+        .count();
+
+    let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
+    let mega_base_m = n_bases_m as f64 / 10.0e6;
+
+    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
+    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 10.0e6);
+
+    let n_exons_mb = ol.len() as f64 / mega_base_m;
+    let n_coding_mb = n_coding as f64 / mega_base_m;
+
+    Ok(SomaticVariantRates {
+        total_variants: variants.len(),
+        exon_count: feature_ranges.len(),
+        variants_in_coding: ol.len(),
+        coding_variants: n_coding,
+        total_exon_bases: n_bases_m,
+        wgs_length: wgs_len,
+        somatic_mutation_rate_wgs: rate_wgs,
+        somatic_mutation_rate_coding: n_exons_mb,
+        somatic_nonsynonymous_rate_coding: n_coding_mb,
+    })
+}
+
+pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<DashMap<String, Vec<GenomeRange>>> {
+    let mut contigs: Vec<String> = (1..22).map(|i| format!("chr{i}")).collect();
+    contigs.extend(["chrX", "chrY", "chrM"].into_iter().map(String::from));
+
+    let results: DashMap<String, Vec<GenomeRange>> = DashMap::new();
+    contigs.into_par_iter().for_each(|contig| {
+        let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
+        let mrd_reader =
+            get_gz_reader(&mrd_path).with_context(|| format!("Failed to open: {mrd_path}"));
+        let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
+        let diag_reader =
+            get_gz_reader(&diag_path).with_context(|| format!("Failed to open: {diag_path}"));
+
+        if let (Ok(mrd_reader), Ok(diag_reader)) = (mrd_reader, diag_reader) {
+            let ranges: Vec<GenomeRange> = mrd_reader
+                .lines()
+                .zip(diag_reader.lines())
+                .filter_map(|(mrd, diag)| {
+                    if let (Ok(mrd), Ok(diag)) = (mrd, diag) {
+                        if let (Ok(mrd), Ok(diag)) =
+                        (BinCount::from_tsv_row(&mrd), BinCount::from_tsv_row(&diag))
+                        {
+                            assert_eq!(mrd.contig, diag.contig);
+                            assert_eq!(mrd.start, diag.start);
+                            let bools: Vec<bool> = mrd
+                                .depths
+                                .into_iter()
+                                .zip(diag.depths)
+                                .map(|(depth_mrd, depth_diag)| {
+                                    depth_mrd >= config.min_high_quality_depth
+                                    && depth_diag >= config.min_high_quality_depth
+                                })
+                                .collect();
+                            let ranges = ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig);
+                            return Some(ranges);
+                        }
+                    }
+                    None
+                })
+                .flatten()
+                .collect();
+            results.insert(contig, ranges);
+        }
+    });
+
+    Ok(results)
+}
+
+fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<GenomeRange> {
+    let contig = contig_to_num(contig);
+    let mut ranges = Vec::new();
+    let mut current_start: Option<u32> = None;
+
+    // Iterate through elements starting from specified position
+    for (i, &value) in vec.iter().enumerate() {
+        let i = i as u32 + start;
+        match (value, current_start) {
+            // Begin new range
+            (true, None) => current_start = Some(i),
+            // Finalize current range
+            (false, Some(start_idx)) => {
+                // ranges.push(start_idx..i);
+                ranges.push(GenomeRange {
+                    contig,
+                    range: start_idx..i,
+                });
+                current_start = None;
+            }
+            _ => {}
+        }
+    }
+
+    // Add any remaining active range
+    if let Some(start_idx) = current_start {
+        // ranges.push(start_idx..vec.len() as u32);
+        ranges.push(GenomeRange {
+            contig,
+            range: start_idx..vec.len() as u32 + start,
+        });
+    }
+
+    ranges
+}