瀏覽代碼

variants rates + ncbi versions

Thomas 8 月之前
父節點
當前提交
980c173f72
共有 9 個文件被更改,包括 615 次插入147 次删除
  1. 4 3
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 74 46
      src/annotation/ncbi.rs
  4. 4 1
      src/annotation/vep.rs
  5. 164 20
      src/lib.rs
  6. 3 3
      src/pipes/somatic.rs
  7. 161 1
      src/positions.rs
  8. 30 15
      src/scan/scan.rs
  9. 174 58
      src/variant/variants_stats.rs

+ 4 - 3
Cargo.lock

@@ -2939,6 +2939,7 @@ dependencies = [
  "rayon",
  "rusqlite",
  "rust-htslib 0.49.0",
+ "semver 1.0.26",
  "serde",
  "serde_json",
  "tar",
@@ -3699,7 +3700,7 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
 dependencies = [
- "semver 1.0.25",
+ "semver 1.0.26",
 ]
 
 [[package]]
@@ -3846,9 +3847,9 @@ dependencies = [
 
 [[package]]
 name = "semver"
-version = "1.0.25"
+version = "1.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03"
+checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
 
 [[package]]
 name = "semver-parser"

+ 1 - 0
Cargo.toml

@@ -44,6 +44,7 @@ tar = "0.4.43"
 flatbuffers = "25.2.10"
 ordered-float = { version = "5.0.0", features = ["serde"] }
 bitcode = "0.6.5"
+semver = "1.0.26"
 
 [profile.dev]
 opt-level = 0

+ 74 - 46
src/annotation/ncbi.rs

@@ -1,4 +1,5 @@
 use anyhow::{Context, Ok, Result};
+use semver::Version;
 use serde::{Deserialize, Serialize};
 use std::str::FromStr;
 
@@ -72,79 +73,106 @@ pub struct NCBIAcc {
     /// The numeric part of the accession
     pub number: u64,
     /// The version number of the accession
-    pub version: f32,
+    pub version: Version,
 }
 
 impl FromStr for NCBIAcc {
     type Err = anyhow::Error;
-    /// Parses a string into an NCBIAcc struct.
+    /// Parses a string into an `NCBIAcc` struct.
     ///
     /// This method handles various formats of NCBI accessions, including:
-    /// - Standard format: "PREFIX_NUMBER.VERSION" (e.g., "NM_001234.5")
-    /// - Unassigned transcripts: "unassigned_transcript_NUMBER_VERSION"
-    /// - Accessions without versions
-    /// - Accessions without numbers (treated as having max u64 number and version 0.0)
+    ///
+    /// - **Standard format**: `"PREFIX_NUMBER.VERSION"` (e.g., `"NM_001234.5"` or `"NM_001234.1_2_3"`).
+    ///   Trailing underscores after the version are converted to dots and parsed as a semantic version (e.g., `"1_2_3"` → `"1.2.3"`).
+    ///
+    /// - **Unassigned transcripts**: `"unassigned_transcript_NUMBER_VERSION"` (e.g., `"unassigned_transcript_56789_1"`).
+    ///
+    /// - **Accessions without versions**: Parsed with default version `"0.0.0"`.
+    ///
+    /// - **Accessions without numbers**: Treated as having `u64::MAX` as the number and version `"0.0.0"`.
+    ///
+    /// Internally uses the [`semver::Version`](https://docs.rs/semver/latest/semver/struct.Version.html) type for rich multi-part versioning support.
     ///
     /// # Arguments
-    /// * `s` - A string slice representing the NCBI accession
+    /// * `s` - A string slice representing the NCBI accession.
     ///
     /// # Returns
-    /// * `Ok(NCBIAcc)` if parsing is successful
-    /// * `Err(anyhow::Error)` if parsing fails
+    /// * `Ok(NCBIAcc)` if parsing is successful.
+    /// * `Err(anyhow::Error)` if parsing fails.
     ///
     /// # Examples
     /// ```
+    /// use std::str::FromStr;
+    /// use your_crate::NCBIAcc;
+    ///
     /// let acc1 = NCBIAcc::from_str("NM_001234.5").unwrap();
     /// assert_eq!(acc1.prefix, "NM");
     /// assert_eq!(acc1.number, 1234);
-    /// assert_eq!(acc1.version, 5.0);
+    /// assert_eq!(acc1.version.to_string(), "5.0.0");
     ///
     /// let acc2 = NCBIAcc::from_str("unassigned_transcript_56789_1").unwrap();
     /// assert_eq!(acc2.prefix, "unassigned_transcript");
     /// assert_eq!(acc2.number, 56789);
-    /// assert_eq!(acc2.version, 1.0);
+    /// assert_eq!(acc2.version.to_string(), "1.0.0");
     ///
     /// let acc3 = NCBIAcc::from_str("XR_123456").unwrap();
     /// assert_eq!(acc3.prefix, "XR_123456");
     /// assert_eq!(acc3.number, u64::MAX);
-    /// assert_eq!(acc3.version, 0.0);
+    /// assert_eq!(acc3.version.to_string(), "0.0.0");
+    ///
+    /// let acc4 = NCBIAcc::from_str("NM_001234.1_2_3").unwrap();
+    /// assert_eq!(acc4.version.to_string(), "1.2.3");
     /// ```
     fn from_str(s: &str) -> Result<Self> {
-        if s.contains("unassigned_transcript_") {
-            let s = s.replace("unassigned_transcript_", "");
-            let (num, v) = if s.contains("_") {
-                s.split_once("_").context("first split error")?
-            } else {
-                (s.as_str(), "0")
-            };
-            Ok(NCBIAcc {
+        if s.starts_with("unassigned_transcript_") {
+            let s = s.trim_start_matches("unassigned_transcript_");
+
+            let (num_str, version_str_raw) = s.split_once('_').unwrap_or((s, "0")); // fallback: version = 0
+
+            let number = num_str
+                .parse::<u64>()
+                .context("Error parsing number in unassigned_transcript")?;
+
+            let version = Version::parse(&normalize_to_semver(version_str_raw))
+                .context("Error parsing version in unassigned_transcript")?;
+
+            return Ok(NCBIAcc {
                 prefix: "unassigned_transcript".to_string(),
-                number: num.parse().context("Error parsing NCBI accession number")?,
-                version: v.parse().context("Error parsing NCBI accession version")?,
-            })
-        } else if s.contains("_") {
-            if s.contains(".") {
-                let (rest, v) = s.split_once(".").context("first split error")?;
-                let (pref, num) = rest.split_once("_").context("second split error")?;
-                let v = v.replace("_", ".");
-                Ok(NCBIAcc {
-                    prefix: pref.to_string(),
-                    number: num.parse().context("Error parsing NCBI accession number")?,
-                    version: v.parse().context("Error parsing NCBI accession version")?,
-                })
-            } else {
-                Ok(NCBIAcc {
-                    prefix: s.to_string(),
-                    number: u64::MAX,
-                    version: 0.0,
-                })
-            }
-        } else {
-            Ok(NCBIAcc {
-                prefix: s.to_string(),
-                number: u64::MAX,
-                version: 0.0,
-            })
+                number,
+                version,
+            });
+        }
+
+        if s.contains('.') {
+            let (rest, raw_ver) = s.split_once('.').context("dot split failed")?;
+            let (prefix, num) = rest.split_once('_').context("underscore split failed")?;
+
+            let version_str = normalize_to_semver(&raw_ver.replace('.', "_"));
+            let version = Version::parse(&version_str)
+                .with_context(|| format!("error parsing version: '{}'", version_str))?;
+
+            return Ok(NCBIAcc {
+                prefix: prefix.to_string(),
+                number: num.parse().context("error parsing number")?,
+                version,
+            });
         }
+
+        Ok(NCBIAcc {
+            prefix: s.to_string(),
+            number: u64::MAX,
+            version: Version::parse("0.0.0").unwrap(),
+        })
+    }
+}
+
+fn normalize_to_semver(v: &str) -> String {
+    let parts: Vec<&str> = v.split('_').collect();
+
+    match parts.len() {
+        1 => format!("{}.0.0", parts[0]),            // "1" -> "1.0.0"
+        2 => format!("{}.{}.0", parts[0], parts[1]), // "1_2" -> "1.2.0"
+        3 => format!("{}.{}.{}", parts[0], parts[1], parts[2]), // "1_2_3"
+        _ => parts.join("."),                        // fallback
     }
 }

+ 4 - 1
src/annotation/vep.rs

@@ -730,7 +730,10 @@ pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
 /// # Note
 /// This function assumes that the VEP annotations are well-formed and contain
 /// the necessary fields for comparison (consequence, feature). It will log warnings
-/// for any parsing errors encountered during the selection process.
+/// for any parsing errors encountered during the selection process.        
+
+
+
 pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
     let best_impact_veps = d
         .iter()

+ 164 - 20
src/lib.rs

@@ -45,7 +45,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps, range_intersection_par, sort_ranges}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -765,7 +765,7 @@ mod tests {
         let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
         let variants = variant_collection::Variants::load_from_json(&path)?;
 
-        VariantsStats::new(&variants).save_to_json(&format!("{}/{id}/diag/somatic_variants_stats.json.gz", config.result_dir));
+        VariantsStats::new(&variants, id, &config)?.save_to_json(&format!("{}/{id}/diag/somatic_variants_stats.json.gz", config.result_dir));
 Ok(())
     }
 
@@ -818,22 +818,34 @@ Ok(())
         let id = "ADJAGBA";
         let config = Config::default();
         let path = format!("{}/{id}/diag/somatic_variants.bit", config.result_dir);
-        let r = features_ranges("exon", &config)?;
-        let merged = merge_overlapping_genome_ranges(&r);
+        let exon_ranges = features_ranges("exon", &config)?;
+        let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
         
         let variants = variant_collection::Variants::load_from_file(&path)?;
-        let full = variants_stats::somatic_rates(&variants.data, &merged, &config);
+        let full = variants_stats::somatic_rates(&variants.data, &exon_ranges, &config);
+        info!("{full:#?}");
+
+        // let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
+        //     .cloned().collect();
+        // let min_2 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config);
+        // info!("{min_2:#?}");
+        //
+        // let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
+        //     .cloned().collect();
+        // let min_3 = variants_stats::somatic_rates(&restrained, &exon_ranges, &config);
+        // info!("{min_3:#?}");
+
+        let mut high_depth_ranges = variants_stats::high_depth_somatic(id, &config)?;
+        high_depth_ranges.par_sort_by_key(|r| ( r.contig, r.range.start ));
+
+        let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
+        let exons_high_depth = range_intersection_par(&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(), &exon_ranges_ref);
+
+        let full = variants_stats::somatic_rates(&variants.data, &exons_high_depth, &config);
         info!("{full:#?}");
 
-        let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
-            .cloned().collect();
-        let min_2 = variants_stats::somatic_rates(&restrained, &merged, &config);
-        info!("{min_2:#?}");
 
-        let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
-            .cloned().collect();
-        let min_3 = variants_stats::somatic_rates(&restrained, &merged, &config);
-        info!("{min_3:#?}");
+
 
 
         // info!("n variants loaded: {}", variants.data.len());
@@ -872,13 +884,145 @@ Ok(())
     #[test]
     fn quality_ranges() -> anyhow::Result<()> {
         init();
-        let r = variants_stats::high_depth_somatic("ADJAGBA", &Config::default())?;
-        r.iter().for_each(|e| {
-            let contig = e.key();
-            let ranges = e.value();
-            let su = ranges.iter().map(|range| range.range.len()).sum::<usize>();
-            info!("{contig}: {su} high depth bases");
-        });
+        let r = variants_stats::high_depth_somatic("ADJAGBA", &Config::default())?.iter().map(|range| range.length()).sum::<u32>();
+        info!("{r} high depth bases");
         Ok(())
     }
+
+    fn gr(contig: u8, start: u32, end: u32) -> GenomeRange {
+        GenomeRange {
+            contig,
+            range: start..end,
+        }
+    }
+
+    #[test]
+    fn test_both_empty() {
+        let a: Vec<&GenomeRange> = vec![];
+        let b: Vec<&GenomeRange> = vec![];
+        let result = range_intersection_par(&a, &b);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_one_empty() {
+        let a = [gr(1, 0, 100)];
+        let b: Vec<&GenomeRange> = vec![];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b);
+        sort_ranges(&mut result);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_single_range_no_overlap() {
+        let a = [gr(1, 0, 100)];
+        let b = [gr(1, 100, 200)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        sort_ranges(&mut result);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_single_range_full_overlap() {
+        let a = [gr(1, 0, 100)];
+        let b = [gr(1, 0, 100)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 0, 100)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_different_contigs() {
+        let a = [gr(1, 0, 100)];
+        let b = [gr(2, 0, 100)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        sort_ranges(&mut result);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_touching_ranges() {
+        let a = [gr(1, 0, 100)];
+        let b = [gr(1, 100, 200)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        sort_ranges(&mut result);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn test_complete_subrange() {
+        let a = [gr(1, 0, 200)];
+        let b = [gr(1, 50, 150)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 50, 150)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_multiple_overlaps_same_contig() {
+        let a = [gr(1, 0, 50), gr(1, 75, 125)];
+        let b = [gr(1, 25, 100), gr(1, 150, 200)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 25, 50), gr(1, 75, 100)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_multiple_contigs() {
+        let a = [gr(1, 0, 100), gr(2, 50, 150), gr(3, 200, 300)];
+        let b = [gr(1, 50, 150), gr(2, 0, 100), gr(4, 0, 100)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 50, 100), gr(2, 50, 100)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_adjacent_ranges() {
+        let a = [gr(1, 0, 50), gr(1, 50, 100)];
+        let b = [gr(1, 25, 75)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 25, 50), gr(1, 50, 75)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_minimal_overlap() {
+        let a = [gr(1, 0, 100)];
+        let b = [gr(1, 99, 200)];
+        let a_refs: Vec<&GenomeRange> = a.iter().collect();
+        let b_refs: Vec<&GenomeRange> = b.iter().collect();
+        let mut result = range_intersection_par(&a_refs, &b_refs);
+        let mut expected = [gr(1, 99, 100)];
+        sort_ranges(&mut result);
+        sort_ranges(&mut expected);
+        assert_eq!(result, expected);
+    }
+
 }

+ 3 - 3
src/pipes/somatic.rs

@@ -509,7 +509,7 @@ impl Run for Somatic {
         let vep_stats = annotations.vep_stats()?;
         vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
 
-        VariantsStats::new(&variants)
+        VariantsStats::new(&variants, &id, &config)?
             .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json"))?;
 
         info!("Final unique variants: {}", variants.data.len());
@@ -523,7 +523,7 @@ impl Run for Somatic {
 pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
     info!("Loading Germline");
     let annotations = Annotations::default();
-    let clairs_germline = ClairS::initialize(&id, config)?.germline(&annotations)?;
+    let clairs_germline = ClairS::initialize(&id, config.clone())?.germline(&annotations)?;
 
     info!("Annotation with Cosmic and GnomAD.");
     let ext_annot = ExternalAnnotation::init()?;
@@ -533,7 +533,7 @@ pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
     variants.merge(clairs_germline, &annotations);
     info!("-- {}", variants.data.len());
 
-    let u = VariantsStats::new(&variants);
+    let u = VariantsStats::new(&variants, &id, &config)?;
     info!("++ {}", u.n);
 
     Ok(())

+ 161 - 1
src/positions.rs

@@ -450,6 +450,12 @@ impl TryFrom<(&str, &str, &str)> for GenomeRange {
 }
 
 impl GenomeRange {
+    pub fn new(contig: &str, start: u32, end: u32) -> Self {
+        Self {
+            contig: contig_to_num(contig),
+            range: start..end,
+        }
+    }
     /// Creates a `GenomeRange` from 1-based inclusive start and end positions.
     ///
     /// This method is useful when working with data sources that use 1-based coordinates
@@ -522,7 +528,7 @@ pub fn merge_overlapping_genome_ranges(ranges: &[GenomeRange]) -> Vec<GenomeRang
         return vec![];
     }
 
-    // ranges.sort_by_key(|r| (r.contig, r.range.start));
+    // ranges.par_sort_by_key(|r| (r.contig, r.range.start));
 
     let chunk_size = (ranges.len() / rayon::current_num_threads()).max(1);
 
@@ -620,6 +626,156 @@ pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> V
         })
         .collect()
 }
+//
+// pub fn range_intersection_par(a: &[GenomeRange], b: &[GenomeRange]) -> Vec<GenomeRange> {
+//     // Group ranges by contig for both inputs
+//     let mut a_map = std::collections::HashMap::new();
+//     for range in a {
+//         a_map.entry(&range.contig).or_insert(Vec::new()).push(range);
+//     }
+//
+//     let mut b_map = std::collections::HashMap::new();
+//     for range in b {
+//         b_map.entry(&range.contig).or_insert(Vec::new()).push(range);
+//     }
+//
+//     // Process common contigs in parallel
+//     a_map
+//         .into_par_iter()
+//         .filter_map(|(contig, mut a_ranges)| {
+//             // Sort ranges by start position (end is only used for tie-breaking)
+//             a_ranges.sort_by_key(|r| (r.range.start, r.range.end));
+//
+//             // Get corresponding ranges from b
+//             let mut b_ranges = match b_map.get(contig) {
+//                 Some(ranges) => ranges.clone(),
+//                 None => return None,
+//             };
+//             b_ranges.sort_by_key(|r| (r.range.start, r.range.end));
+//
+//             let mut intersections = Vec::new();
+//             let mut i = 0;
+//             let mut j = 0;
+//
+//             // Two-pointer intersection algorithm for 0-based, end-exclusive ranges
+//             while i < a_ranges.len() && j < b_ranges.len() {
+//                 let a_range = &a_ranges[i];
+//                 let b_range = &b_ranges[j];
+//
+//                 // Check if ranges are completely non-overlapping
+//                 if a_range.range.end <= b_range.range.start {
+//                     // |a_range| ends before |b_range| starts
+//                     i += 1;
+//                 } else if b_range.range.end <= a_range.range.start {
+//                     // |b_range| ends before |a_range| starts
+//                     j += 1;
+//                 } else {
+//                     // Calculate intersection of overlapping ranges
+//                     let start = a_range.range.start.max(b_range.range.start);
+//                     let end = a_range.range.end.min(b_range.range.end);
+//
+//                     // Only create a range if start < end (non-zero length)
+//                     if start < end {
+//                         intersections.push(GenomeRange {
+//                             contig: *contig,
+//                             range: start..end,
+//                         });
+//                     }
+//
+//                     // Move the pointer with the earlier end coordinate
+//                     if a_range.range.end < b_range.range.end {
+//                         i += 1;
+//                     } else {
+//                         j += 1;
+//                     }
+//                 }
+//             }
+//
+//             Some(intersections)
+//         })
+//         .flatten()
+//         .collect()
+// }
+
+pub fn range_intersection_par(a: &[&GenomeRange], b: &[&GenomeRange]) -> Vec<GenomeRange> {
+    let (a_contigs, b_contigs) = rayon::join(
+        || extract_contig_indices(a),
+        || extract_contig_indices(b),
+    );
+
+    a_contigs.into_par_iter()
+        .filter_map(|(contig, a_start, a_end)| {
+            let Some((b_start, b_end)) = find_contig_indices(&b_contigs, contig) else {
+                return None;
+            };
+
+            let a_ranges = &a[a_start..a_end];
+            let b_ranges = &b[b_start..b_end];
+            
+            let mut intersections = Vec::new();
+            let (mut i, mut j) = (0, 0);
+
+            while i < a_ranges.len() && j < b_ranges.len() {
+                let a_range = &a_ranges[i].range;
+                let b_range = &b_ranges[j].range;
+
+                match (a_range.end <= b_range.start, b_range.end <= a_range.start) {
+                    (true, _) => i += 1,
+                    (_, true) => j += 1,
+                    _ => {
+                        let start = a_range.start.max(b_range.start);
+                        let end = a_range.end.min(b_range.end);
+                        if start < end {
+                            intersections.push(GenomeRange {
+                                contig,
+                                range: start..end,
+                            });
+                        }
+
+                        if a_range.end < b_range.end {
+                            i += 1;
+                        } else {
+                            j += 1;
+                        }
+                    }
+                }
+            }
+
+            Some(intersections)
+        })
+        .flatten()
+        .collect()
+}
+
+// Helper to create (contig, start index, end index) tuples
+fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
+    let mut indices = Vec::new();
+    let mut current_contig = None;
+    let mut start_idx = 0;
+
+    for (i, range) in ranges.iter().enumerate() {
+        if current_contig != Some(range.contig) {
+            if let Some(contig) = current_contig {
+                indices.push((contig, start_idx, i));
+            }
+            current_contig = Some(range.contig);
+            start_idx = i;
+        }
+    }
+
+    if let Some(contig) = current_contig {
+        indices.push((contig, start_idx, ranges.len()));
+    }
+
+    indices
+}
+
+// Binary search to find contig indices in precomputed list
+fn find_contig_indices(contigs: &[(u8, usize, usize)], target: u8) -> Option<(usize, usize)> {
+    contigs.binary_search_by(|(c, _, _)| c.cmp(&target))
+        .ok()
+        .map(|idx| (contigs[idx].1, contigs[idx].2))
+}
 
 /// A parallel version of the overlap finding function that works with any types
 /// implementing `GetGenomePosition` and `GetGenomeRange` traits.
@@ -677,3 +833,7 @@ pub trait GetGenomeRange {
     /// Returns a reference to the `GenomeRange` for this item.
     fn range(&self) -> &GenomeRange;
 }
+
+pub fn sort_ranges(ranges: &mut [GenomeRange]) {
+    ranges.par_sort_by_key(|r| (r.contig, r.range.start, r.range.end));
+}

+ 30 - 15
src/scan/scan.rs

@@ -1,4 +1,4 @@
-use std::{fmt, fs, io::Write, sync::Mutex};
+use std::{fmt, fs, io::Write};
 
 use anyhow::Context;
 use log::{debug, error, info};
@@ -11,6 +11,7 @@ use rust_htslib::bam::IndexedReader;
 
 use crate::io::writers::get_gz_writer;
 use crate::math::filter_outliers_modified_z_score_with_indices;
+
 use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
 
 /// Represents a count of reads in a genomic bin, including various metrics and outlier information.
@@ -133,30 +134,44 @@ impl BinCount {
             None
         };
 
-        let depths = fields[9]
-            .split(',')
-            .map(|s| {
-                s.trim() // Remove any whitespace around numbers
-                    .parse::<u32>()
-                    .with_context(|| format!("Failed to parse '{}' as u32", s))
-            })
-            .collect::<anyhow::Result<Vec<u32>>>()?;
+        let depths = if fields[9].trim().is_empty() {
+            Vec::new()
+        } else {
+            fields[9]
+                .split(',')
+                .map(|s| {
+                    s.trim()
+                        .parse::<u32>()
+                        .with_context(|| format!("Failed to parse '{}' as u32", s))
+                })
+                .collect::<anyhow::Result<Vec<u32>>>()?
+        };
 
         Ok(BinCount {
             contig: fields[0].to_string(),
             start: fields[1].parse()?,
             end: fields[2].parse()?,
             n_reads: fields[3].parse()?,
-            coverage: fields[4].parse()?,
-            ratio_sa: fields[5].parse()?,
-            ratio_start: fields[6].parse()?,
-            ratio_end: fields[7].parse()?,
+            coverage: parse_f64_allow_nan(fields[4])?,
+            ratio_sa: parse_f64_allow_nan(fields[5])?,
+            ratio_start: parse_f64_allow_nan(fields[6])?,
+            ratio_end: parse_f64_allow_nan(fields[7])?,
             outlier,
             depths,
         })
     }
 }
 
+fn parse_f64_allow_nan(s: &str) -> anyhow::Result<f64> {
+    let s = s.trim(); // Remove whitespace
+    if s.eq_ignore_ascii_case("NaN") {
+        Ok(f64::NAN)
+    } else {
+        s.parse::<f64>()
+            .with_context(|| format!("Failed to parse '{}' as f64", s))
+    }
+}
+
 // impl GetGenomeRange for BinCount {
 //     fn range(&self) -> GenomeRange {
 //         GenomeRange { contig: contig_to_num(&self.contig), range: self.start..(self.end + 1) }
@@ -274,7 +289,8 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
                 let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
 
                 let mut bam_reader = IndexedReader::from_path(bam_path)
-                    .with_context(|| format!("Can't open BAM file: {}", bam_path)).unwrap();
+                    .with_context(|| format!("Failed to open BAM file: {}", bam_path))
+                    .unwrap();
 
                 // Process each bin in the chunk
                 (0..n_bins_in_chunk)
@@ -439,4 +455,3 @@ pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
         config,
     )
 }
-

+ 174 - 58
src/variant/variants_stats.rs

@@ -1,4 +1,4 @@
-use std::{collections::{BTreeMap, HashMap}, io::BufRead};
+use std::{collections::BTreeMap, io::BufRead, sync::Arc};
 
 use anyhow::Context;
 use dashmap::DashMap;
@@ -11,8 +11,8 @@ use crate::{
     annotation::{vep::VepImpact, Annotation},
     config::Config,
     helpers::bin_data,
-    io::{dict::read_dict, readers::get_gz_reader, writers::get_gz_writer},
-    positions::{contig_to_num, par_overlaps, GenomeRange},
+    io::{dict::read_dict, gff::features_ranges, readers::get_gz_reader, writers::get_gz_writer},
+    positions::{contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par, GenomeRange},
     scan::scan::BinCount,
 };
 
@@ -37,9 +37,16 @@ pub struct VariantsStats {
     pub consequences: DashMap<String, u32>,
     #[serde(serialize_with = "serialize_dashmap_sort")]
     pub genes: DashMap<String, u32>,
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub genes_nonsynonymus: DashMap<String, u32>,
+
     pub n_gnomad: usize,
     pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
+
+    pub somatic_rates: SomaticVariantRates,
+    pub high_depth_somatic_rates: SomaticVariantRates,
 }
+
 pub fn serialize_dashmap_sort<S, T>(
     data: &DashMap<T, u32>,
     serializer: S,
@@ -57,11 +64,12 @@ where
 }
 
 impl VariantsStats {
-    pub fn new(variants: &Variants) -> Self {
+    pub fn new(variants: &Variants, id:&str, config: &Config) -> anyhow::Result<Self> {
         let n = variants.data.len() as u32;
         let alteration_categories: DashMap<String, u32> = DashMap::new();
         let vep_impact: DashMap<String, u32> = DashMap::new();
         let genes: DashMap<String, u32> = DashMap::new();
+        let genes_nonsynonymus: DashMap<String, u32> = DashMap::new();
         let consequences: DashMap<String, u32> = DashMap::new();
         let n_alts: DashMap<u32, u32> = DashMap::new();
         let depths: DashMap<u32, u32> = DashMap::new();
@@ -71,12 +79,18 @@ impl VariantsStats {
 
         variants.data.par_iter().for_each(|v| {
             if let Ok(best_vep) = v.best_vep() {
-                if let Some(impact) = best_vep.extra.impact {
+                if let Some(ref impact) = best_vep.extra.impact {
                     *vep_impact.entry(impact.to_string()).or_default() += 1;
                 }
-                if let Some(gene) = best_vep.extra.symbol {
-                    *genes.entry(gene).or_default() += 1;
+                if let Some(ref gene) = best_vep.extra.symbol {
+                    *genes.entry(gene.to_string()).or_default() += 1;
+                }
+                if let (Some(impact), Some(gene)) = (best_vep.extra.impact, best_vep.extra.symbol) {
+                    if impact <= VepImpact::MODERATE {
+                        *genes_nonsynonymus.entry(gene).or_default() += 1;
+                    }
                 }
+
                 if let Some(csqs) = best_vep.consequence {
                     let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
                     csq.sort();
@@ -131,7 +145,21 @@ impl VariantsStats {
             })
             .collect();
 
-        Self {
+        let exon_ranges = features_ranges("exon", &config)?;
+        let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
+
+        let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
+
+        let mut high_depth_ranges = high_depth_somatic(id, &config)?;
+        high_depth_ranges.par_sort_by_key(|r| ( r.contig, r.range.start ));
+
+        let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
+        let exons_high_depth = range_intersection_par(&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(), &exon_ranges_ref);
+
+        let high_depth = somatic_rates(&variants.data, &exons_high_depth, &config)?;
+
+
+        Ok(Self {
             n,
             alteration_categories,
             cosmic,
@@ -142,8 +170,11 @@ impl VariantsStats {
             vep_impact,
             consequences,
             genes,
+            genes_nonsynonymus,
             n_gnomad,
-        }
+            somatic_rates: all_somatic_rates,
+            high_depth_somatic_rates: high_depth
+        })
     }
 
     pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
@@ -160,6 +191,7 @@ impl VariantsStats {
         debug!("Successfully saved variants to {}", filename);
         Ok(())
     }
+
     pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
         let mut reader = get_gz_reader(filename)
             .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
@@ -172,25 +204,25 @@ impl VariantsStats {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct SomaticVariantRates {
     pub wgs_length: u32,
     pub total_variants: usize,
     pub somatic_mutation_rate_wgs: f64,
     pub exon_count: usize,
-    pub variants_in_coding: usize,
-    pub coding_variants: usize,
     pub total_exon_bases: u32,
+    pub variants_in_coding: usize,
     pub somatic_mutation_rate_coding: f64,
+    pub nonsynonymous_variants: usize,
     pub somatic_nonsynonymous_rate_coding: f64,
 }
 
 pub fn somatic_rates(
     variants: &[Variant],
-    feature_ranges: &Vec<GenomeRange>,
+    exon_ranges: &Vec<GenomeRange>,
     config: &Config,
 ) -> anyhow::Result<SomaticVariantRates> {
-    let ol = par_overlaps(variants, feature_ranges);
+    let ol = par_overlaps(variants, exon_ranges);
 
     let n_coding = ol
         .iter()
@@ -199,7 +231,7 @@ pub fn somatic_rates(
         .filter(|impact| *impact <= VepImpact::MODERATE)
         .count();
 
-    let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
+    let n_bases_m: u32 = exon_ranges.par_iter().map(|gr| gr.length()).sum();
     let mega_base_m = n_bases_m as f64 / 10.0e6;
 
     let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
@@ -210,9 +242,9 @@ pub fn somatic_rates(
 
     Ok(SomaticVariantRates {
         total_variants: variants.len(),
-        exon_count: feature_ranges.len(),
+        exon_count: exon_ranges.len(),
         variants_in_coding: ol.len(),
-        coding_variants: n_coding,
+        nonsynonymous_variants: n_coding,
         total_exon_bases: n_bases_m,
         wgs_length: wgs_len,
         somatic_mutation_rate_wgs: rate_wgs,
@@ -221,52 +253,136 @@ pub fn somatic_rates(
     })
 }
 
-pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<DashMap<String, Vec<GenomeRange>>> {
-    let mut contigs: Vec<String> = (1..22).map(|i| format!("chr{i}")).collect();
-    contigs.extend(["chrX", "chrY", "chrM"].into_iter().map(String::from));
-
-    let results: DashMap<String, Vec<GenomeRange>> = DashMap::new();
-    contigs.into_par_iter().for_each(|contig| {
-        let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
-        let mrd_reader =
-            get_gz_reader(&mrd_path).with_context(|| format!("Failed to open: {mrd_path}"));
-        let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
-        let diag_reader =
-            get_gz_reader(&diag_path).with_context(|| format!("Failed to open: {diag_path}"));
-
-        if let (Ok(mrd_reader), Ok(diag_reader)) = (mrd_reader, diag_reader) {
-            let ranges: Vec<GenomeRange> = mrd_reader
+/// Computes high-depth somatic regions across all chromosomes for a given sample.
+///
+/// This function reads count files (compressed TSVs) for both normal and tumoral samples
+/// for each contig (`chr1` to `chr22`, `chrX`, `chrY`, and `chrM`). It identifies genomic regions
+/// where both normal and tumoral depths exceed a configured quality threshold, and extracts
+/// consecutive high-quality bins as genomic ranges.
+///
+/// The function performs the computation in parallel across contigs for better performance,
+/// and includes contextual error handling to help trace issues related to I/O or data parsing.
+///
+/// # Arguments
+///
+/// * `id` - The identifier of the sample.
+/// * `config` - A reference to a `Config` struct containing paths and thresholds.
+///
+/// # Returns
+///
+/// A `Result` containing a vector of `GenomeRange` objects representing high-depth somatic regions,
+/// or an error if any file reading, parsing, or logical check fails.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - Any of the input files (normal or tumoral) can't be opened
+/// - Any line in the files fails to read or parse
+/// - The `BinCount` objects from corresponding lines don't match in position
+///
+/// # Parallelism
+///
+/// This function leverages `rayon` to parallelize processing of contigs.
+///
+/// # Example
+///
+/// ```rust
+/// let config = Config::load_from("config_path.toml")?;
+/// let regions = high_depth_somatic("sample_001", &config)?;
+/// for region in regions {
+///     println!("{:?}", region);
+/// }
+/// ```
+///
+/// # Requirements
+///
+/// - The file names must follow the pattern `{contig}_count.tsv.gz`
+/// - The structure of lines must match what `BinCount::from_tsv_row` expects
+pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
+    // Generate contigs from chr1 to chr22, then chrX, Y, M
+    let contigs = (1..=22)
+        .map(|i| format!("chr{i}"))
+        .chain(["chrX", "chrY", "chrM"].iter().map(|s| s.to_string()))
+        .collect::<Vec<_>>();
+
+    let config = Arc::new(config); // Wrap the config in an Arc for shared ownership
+
+    // Process contigs in parallel with proper error propagation
+    let results: Vec<Vec<GenomeRange>> = contigs
+        .into_par_iter()
+        .map(|contig| {
+            let config = Arc::clone(&config);
+            // Build file paths
+            let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
+            let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
+
+            // Open readers with proper error context
+            let mrd_reader = get_gz_reader(&mrd_path)
+                .with_context(|| format!("Failed to open MRD file: {mrd_path}"))?;
+            let diag_reader = get_gz_reader(&diag_path)
+                .with_context(|| format!("Failed to open Diag file: {diag_path}"))?;
+
+            // Process lines in pairs
+            let ranges = mrd_reader
                 .lines()
                 .zip(diag_reader.lines())
-                .filter_map(|(mrd, diag)| {
-                    if let (Ok(mrd), Ok(diag)) = (mrd, diag) {
-                        if let (Ok(mrd), Ok(diag)) =
-                        (BinCount::from_tsv_row(&mrd), BinCount::from_tsv_row(&diag))
-                        {
-                            assert_eq!(mrd.contig, diag.contig);
-                            assert_eq!(mrd.start, diag.start);
-                            let bools: Vec<bool> = mrd
-                                .depths
-                                .into_iter()
-                                .zip(diag.depths)
-                                .map(|(depth_mrd, depth_diag)| {
-                                    depth_mrd >= config.min_high_quality_depth
-                                    && depth_diag >= config.min_high_quality_depth
-                                })
-                                .collect();
-                            let ranges = ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig);
-                            return Some(ranges);
-                        }
+                .enumerate()
+                .map(|(line_num, (mrd_line, diag_line))| {
+                    let line_num = line_num + 1; // Convert to 1-based indexing
+
+                    // Read lines with context
+                    let mrd_line = mrd_line.with_context(|| {
+                        format!("MRD file {mrd_path} line {line_num} read error")
+                    })?;
+                    let diag_line = diag_line.with_context(|| {
+                        format!("Diag file {diag_path} line {line_num} read error")
+                    })?;
+
+                    // Parse both lines
+                    let mrd = BinCount::from_tsv_row(&mrd_line).with_context(|| {
+                        format!("Failed to parse MRD line {line_num}: {mrd_line}")
+                    })?;
+                    let diag = BinCount::from_tsv_row(&diag_line).with_context(|| {
+                        format!("Failed to parse Diag line {line_num}: {diag_line}")
+                    })?;
+
+                    // Validate matching positions
+                    if mrd.contig != diag.contig {
+                        anyhow::bail!(
+                            "Contig mismatch at line {line_num}: {} vs {}",
+                            mrd.contig,
+                            diag.contig
+                        );
+                    }
+                    if mrd.start != diag.start {
+                        anyhow::bail!(
+                            "Start position mismatch at line {line_num}: {} vs {}",
+                            mrd.start,
+                            diag.start
+                        );
                     }
-                    None
+
+                    // Calculate high-depth regions
+                    let bools: Vec<bool> = mrd
+                        .depths
+                        .iter()
+                        .zip(diag.depths.iter())
+                        .map(|(&m, &d)| {
+                            m >= config.min_high_quality_depth && d >= config.min_high_quality_depth
+                        })
+                        .collect();
+
+                    Ok(ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig))
                 })
-                .flatten()
-                .collect();
-            results.insert(contig, ranges);
-        }
-    });
+                .collect::<anyhow::Result<Vec<_>>>()?;
+
+            // Flatten nested ranges and return contig's ranges
+            Ok(ranges.into_iter().flatten().collect::<Vec<GenomeRange>>())
+        })
+        .collect::<anyhow::Result<Vec<_>>>()?;
 
-    Ok(results)
+    // Flatten the results from all contigs into a single vector
+    Ok(results.into_iter().flatten().collect())
 }
 
 fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<GenomeRange> {