Explorar el Código

GenesBedIndex

Thomas hace 2 semanas
padre
commit
ff594b7c02

+ 43 - 42
src/annotation/cosmic.rs

@@ -15,68 +15,69 @@ pub struct Cosmic {
 impl FromStr for Cosmic {
     type Err = anyhow::Error;
 
-    /// Parses a `Cosmic` instance from a semicolon-delimited string.
+    /// Parses COSMIC annotation fields from a semicolon-delimited string.
     ///
-    /// # Expected Input Format
-    /// The input string must follow the format:
+    /// # Accepted Input
     ///
-    /// ```text
-    /// <field1>;<field2>;CNT=<number>
-    /// ```
+    /// The input is expected to be a `;`-separated list of `key=value` fields.
+    /// The `CNT` field may appear **anywhere** in the string.
     ///
-    /// - The input must contain exactly three parts, separated by semicolons (`;`).
-    /// - The third part must be of the form `CNT=<number>`, where `<number>` can be parsed as a `u64`.
-    /// - If the first part contains the word `"MISSING"`, parsing will fail.
+    /// Examples of valid inputs:
     ///
-    /// Generated with echtvar encode json:
-    /// ```json
-    /// [{"field":"GENOME_SCREEN_SAMPLE_COUNT", "alias": "CNT"}]
+    /// ```text
+    /// CNT=188
+    /// foo=bar;CNT=42
+    /// X=1;Y=2;CNT=7;Z=3
     /// ```
     ///
-    /// # Examples
+    /// # Missing Data
     ///
-    /// ```
-    /// use your_crate::Cosmic;
-    /// use std::str::FromStr;
+    /// - If the string contains the literal `"MISSING"`, parsing fails.
+    /// - If no `CNT` field is present, parsing fails.
     ///
-    /// let input = "ID1;info;CNT=42";
-    /// let cosmic = Cosmic::from_str(input).unwrap();
-    /// assert_eq!(cosmic.cosmic_cnt, 42);
-    /// ```
+    /// Handling of missing COSMIC annotation (i.e. mapping to `Option<Cosmic>`)
+    /// is expected to be performed by the caller.
     ///
     /// # Errors
     ///
-    /// - Returns an error if the string does not contain exactly three semicolon-separated parts.
-    /// - Returns an error if `"MISSING"` is found in the first part.
-    /// - Returns an error if the third part is not in `key=value` format.
-    /// - Returns an error if the value is not a valid `u64`.
+    /// Returns an error if:
+    /// - The input string is empty.
+    /// - A `CNT` field is present but cannot be parsed as `u64`.
+    /// - No `CNT` field is found.
     fn from_str(s: &str) -> anyhow::Result<Self> {
         let s = s.trim();
-        let vs: Vec<&str> = s.split(";").map(str::trim).collect();
-        if vs.len() != 3 {
-            return Err(anyhow::anyhow!(
-                "Expected 3 semicolon-separated parts in Cosmic string, got {}: {s}",
-                vs.len()
-            ));
+        if s.is_empty() {
+            return Err(anyhow::anyhow!("Empty Cosmic string"));
         }
 
-        if vs[0].contains("MISSING") {
+        // Keep your existing semantics: MISSING => parsing fails
+        if s.contains("MISSING") {
             return Err(anyhow::anyhow!("MISSING values in Cosmic results: {s}"));
         }
 
-        let v: Vec<&str> = vs[2].split("=").map(str::trim).collect();
+        // Scan semicolon-delimited key=value fields and look for CNT
+        let mut cnt: Option<u64> = None;
 
-        if v.len() != 2 {
-            return Err(anyhow::anyhow!(
-                "Expected key=value format in third field: {}",
-                vs[2]
-            ));
-        }
+        for part in s.split(';').map(str::trim).filter(|p| !p.is_empty()) {
+            let (k, v) = match part.split_once('=') {
+                Some(kv) => kv,
+                None => continue, // ignore non key=value tokens if any
+            };
 
-        let count = v[1]
-            .parse::<u64>()
-            .map_err(|e| anyhow::anyhow!("Failed to parse COSMIC CNT from '{}': {}", v[1], e))?;
+            let key = k.trim();
+            let val = v.trim();
+
+            if key == "CNT" {
+                let parsed = val
+                    .parse::<u64>()
+                    .map_err(|e| anyhow::anyhow!("Failed to parse COSMIC CNT from '{val}': {e}"))?;
+                cnt = Some(parsed);
+                break;
+            }
+        }
 
-        Ok(Cosmic { cosmic_cnt: count })
+        let cosmic_cnt =
+            cnt.ok_or_else(|| anyhow::anyhow!("Missing CNT field in Cosmic string: {s}"))?;
+        Ok(Cosmic { cosmic_cnt })
     }
 }

+ 71 - 10
src/annotation/echtvar.rs

@@ -80,21 +80,82 @@ pub fn run_echtvar(
 }
 
 pub fn parse_echtvar_val(s: &str) -> Result<(Option<Cosmic>, Option<GnomAD>)> {
-    let mi: Vec<_> = s.match_indices(r";gnomad").collect();
-    let (i, _) = mi[0];
-    let str_cosmic = &s[..i];
-    let str_gnomad = &s[i + 1..];
+    let mut cosmic_parts = Vec::new();
+    let mut gnomad_parts = Vec::new();
 
-    let cosmic = if str_cosmic.contains("MISSING") {
-        None
-    } else {
-        Some(str_cosmic.parse::<Cosmic>()?)
+    for part in s.split(';').map(str::trim).filter(|p| !p.is_empty()) {
+        if part.starts_with("gnomad_") {
+            gnomad_parts.push(part);
+        } else {
+            cosmic_parts.push(part);
+        }
+    }
+
+    // COSMIC: missing if CNT is absent or CNT is -1 or MISSING
+    let cosmic = {
+        let cnt = cosmic_parts.iter().find(|p| p.starts_with("CNT=")).copied();
+        match cnt {
+            Some("CNT=-1") | Some("CNT=MISSING") | None => None,
+            Some(_) => Some(cosmic_parts.join(";").parse::<Cosmic>()?),
+        }
     };
 
-    let gnomad = if str_gnomad.contains("-1") {
+    // gnomAD: your choice; this keeps your "all -1 => None" behavior
+    let gnomad = if gnomad_parts.is_empty() || gnomad_parts.iter().all(|p| p.ends_with("=-1")) {
         None
     } else {
-        Some(str_gnomad.parse::<GnomAD>()?)
+        Some(gnomad_parts.join(";").parse::<GnomAD>()?)
     };
+
     Ok((cosmic, gnomad))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{
+        annotation::{Annotation, Annotations, Caller, Sample},
+        helpers::test_init,
+        variant::{variant_collection::ExternalAnnotation, vcf_variant::VcfVariant},
+    };
+
+    #[test]
+    fn echtvar_parse() -> anyhow::Result<()> {
+        test_init();
+
+        let s = "gnomad_ac=1;gnomad_an=-1;gnomad_af=-1;gnomad_af_oth=-1;gnomad_af_ami=-1;gnomad_af_sas=-1;gnomad_af_fin=-1;gnomad_af_eas=-1;gnomad_af_amr=-1;gnomad_af_afr=-1;gnomad_af_mid=-1;gnomad_af_asj=-1;gnomad_af_nfe=-1;CNT=188";
+        let (cosmic, gnomad) = parse_echtvar_val(s)?;
+
+        println!("{cosmic:#?}");
+
+        Ok(())
+    }
+
+    #[test]
+    fn echtvar_run() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let variants: Vec<VcfVariant> = vec![
+            "chr12\t25116560\t.\tC\tG\t18.2\tPASS\t.\tGT:GQ:DP:AD:VAF:PL\t1/1:18:45:37,7:0.155556:18,25,0".parse()?,
+            "chr1\t5619\trs1470452993\tA\tC \t.\tPASS\t.\t.\t.".parse()?
+        ];
+
+        let annotations = Annotations::default();
+
+        let caller = Annotation::Callers(Caller::ClairS, Sample::Somatic);
+
+         variants.iter().for_each(|v| {
+            annotations.insert_update(v.hash(), std::slice::from_ref(&caller));
+        });
+
+        let ext_annot = ExternalAnnotation::init(&config)?;
+        ext_annot.annotate(&variants, &annotations)?;
+
+        println!("{annotations:#?}");
+
+        Ok(())
+    }
+
+    
+}

+ 5 - 0
src/annotation/gnomad.rs

@@ -20,6 +20,11 @@ use std::str::FromStr;
 ///  { "field": "AF_nfe", "alias": "gnomad_af_nfe", "multiplier": 2000000 }
 ///]
 /// ```
+/// Then encode with
+/// ```
+/// echtvar encode gnomAD_4-2022_10-gnomad.echtvar.zip gnomad_echtvar.json Homo_sapiens-GCA_009914755.4-2022_10-gnomad.vcf.gz
+/// ```
+/// Be careful if your contigs are begining by "chr" you should rename gnomad vcf contigs
 
 #[derive(Debug, PartialEq, Serialize, Deserialize, Clone, Encode, Decode)]
 pub struct GnomAD {

+ 0 - 1
src/callers/savana.rs

@@ -863,7 +863,6 @@ mod tests {
 
         let mut caller = Savana::initialize("DUMCO", &config)?;
         caller.run()?;
-        // crate::runners::Run::run(&mut caller)?;
         Ok(())
     }
 }

+ 1 - 1
src/collection/vcf.rs

@@ -91,7 +91,7 @@ impl Vcf {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct VcfCollection {
     pub vcfs: Vec<Vcf>,
 }

+ 68 - 1
src/io/bed.rs

@@ -1,6 +1,6 @@
 use std::{
     io::{BufRead, BufReader},
-    str::FromStr,
+    str::FromStr, sync::Arc,
 };
 
 use anyhow::Context;
@@ -180,3 +180,70 @@ fn extract_contig_indices_bed(rows: &[BedRow]) -> Vec<(u8, usize, usize)> {
     out.push((current, start, rows.len()));
     out
 }
+
+#[derive(Clone)]
+pub struct GenesBedIndex {
+    // contig (u8) -> BedRows on that contig, sorted by range.start
+    by_contig: Vec<Arc<[BedRow]>>,
+}
+
+impl GenesBedIndex {
+    pub fn new(mut rows: Vec<BedRow>) -> Self {
+        // Ensure deterministic grouping & fast queries
+        rows.sort_unstable_by(|a, b| {
+            a.range.contig
+                .cmp(&b.range.contig)
+                .then_with(|| a.range.range.start.cmp(&b.range.range.start))
+                .then_with(|| a.range.range.end.cmp(&b.range.range.end))
+        });
+
+        // 256 is fine if you encode contigs into u8
+        let mut tmp: Vec<Vec<BedRow>> = vec![Vec::new(); 256];
+        for r in rows {
+            tmp[r.range.contig as usize].push(r);
+        }
+
+        let by_contig = tmp
+            .into_iter()
+            .map(Arc::<[BedRow]>::from)
+            .collect();
+
+        Self { by_contig }
+    }
+
+    #[inline]
+    pub fn query_genes(&self, contig: u8, start: u32, end: u32) -> Vec<String> {
+        let (s, e) = if start <= end { (start, end) } else { (end, start) };
+        let rows = &self.by_contig[contig as usize];
+        if rows.is_empty() {
+            return Vec::new();
+        }
+
+        // lower_bound on start
+        let mut i = match rows.binary_search_by_key(&s, |r| r.range.range.start) {
+            Ok(i) | Err(i) => i,
+        };
+        i = i.saturating_sub(1);
+
+        let mut out: Vec<String> = Vec::new();
+        while i < rows.len() {
+            let r = &rows[i];
+            let rs = r.range.range.start;
+            let re = r.range.range.end;
+
+            if rs > e {
+                break;
+            }
+            if re >= s && rs <= e {
+                if let Some(name) = &r.name {
+                    out.push(name.clone());
+                }
+            }
+            i += 1;
+        }
+
+        out.sort_unstable();
+        out.dedup();
+        out
+    }
+}

+ 2 - 2
src/variant/variant_collection.rs

@@ -1573,7 +1573,7 @@ impl ExternalAnnotation {
 
                 // Run echtvar
                 run_echtvar(&in_tmp, &out_tmp, &config)?;
-                fs::remove_file(in_tmp)?;
+                // fs::remove_file(in_tmp)?;
 
                 // Parse echtvar output
                 let mut reader = ReaderBuilder::new()
@@ -1604,7 +1604,7 @@ impl ExternalAnnotation {
                     chunk_results.push((hash, cosmic, gnomad));
                 }
 
-                fs::remove_file(out_tmp)?;
+                // fs::remove_file(out_tmp)?;
                 Ok(chunk_results)
             })
             .flatten()

+ 1 - 1
src/variant/vcf_variant.rs

@@ -420,7 +420,7 @@ impl VcfVariant {
     /// a `BNDDesc` struct containing detailed information about the breakend.
     ///
     /// # Returns
-    /// - `Ok(BNDDesc)` if parsing is successful
+    /// - `Ok(BNDDesc)` if parsing is successful 1-based
     /// - `Err` if parsing fails or if the alteration is not a BND
     ///
     /// # Errors