Browse Source

VariantsStats

Thomas 9 months ago
parent
commit
088093d58f
5 changed files with 188 additions and 30 deletions
  1. 60 2
      src/annotation/gnomad.rs
  2. 22 1
      src/lib.rs
  3. 20 24
      src/pipes/somatic.rs
  4. 1 1
      src/variant/variant.rs
  5. 85 2
      src/variant/variant_collection.rs

+ 60 - 2
src/annotation/gnomad.rs

@@ -1,6 +1,6 @@
-use std::str::FromStr;
-use serde::{Serialize, Deserialize};
 use anyhow::{anyhow, Ok, Result};
+use serde::{Deserialize, Serialize};
+use std::str::FromStr;
 
 #[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
 pub struct GnomAD {
@@ -59,3 +59,61 @@ impl FromStr for GnomAD {
         })
     }
 }
+
+#[allow(clippy::upper_case_acronyms, non_camel_case_types)]
+#[derive(Debug, PartialEq, Clone)]
+pub enum GnomADValue {
+    AC(u64),
+    AN(u64),
+    AF(f64),
+    AF_OTH(f64),
+    AF_AMI(f64),
+    AF_SAS(f64),
+    AF_FIN(f64),
+    AF_EAS(f64),
+    AF_AMR(f64),
+    AF_AFR(f64),
+    AF_MID(f64),
+    AF_ASJ(f64),
+    AF_NFE(f64),
+}
+
+impl GnomADValue {
+    pub fn to_string_value_pair(&self) -> (String, f64) {
+        match self {
+            GnomADValue::AC(value) => ("AC".to_string(), *value as f64),
+            GnomADValue::AN(value) => ("AN".to_string(), *value as f64),
+            GnomADValue::AF(value) => ("AF".to_string(), *value),
+            GnomADValue::AF_OTH(value) => ("AF_OTH".to_string(), *value),
+            GnomADValue::AF_AMI(value) => ("AF_AMI".to_string(), *value),
+            GnomADValue::AF_SAS(value) => ("AF_SAS".to_string(), *value),
+            GnomADValue::AF_FIN(value) => ("AF_FIN".to_string(), *value),
+            GnomADValue::AF_EAS(value) => ("AF_EAS".to_string(), *value),
+            GnomADValue::AF_AMR(value) => ("AF_AMR".to_string(), *value),
+            GnomADValue::AF_AFR(value) => ("AF_AFR".to_string(), *value),
+            GnomADValue::AF_MID(value) => ("AF_MID".to_string(), *value),
+            GnomADValue::AF_ASJ(value) => ("AF_ASJ".to_string(), *value),
+            GnomADValue::AF_NFE(value) => ("AF_NFE".to_string(), *value),
+        }
+    }
+}
+
+impl GnomAD {
+    pub fn to_vec(&self) -> Vec<GnomADValue> {
+        vec![
+            GnomADValue::AC(self.gnomad_ac),
+            GnomADValue::AN(self.gnomad_an),
+            GnomADValue::AF(self.gnomad_af),
+            GnomADValue::AF_OTH(self.gnomad_af_oth),
+            GnomADValue::AF_AMI(self.gnomad_af_ami),
+            GnomADValue::AF_SAS(self.gnomad_af_sas),
+            GnomADValue::AF_FIN(self.gnomad_af_fin),
+            GnomADValue::AF_EAS(self.gnomad_af_eas),
+            GnomADValue::AF_AMR(self.gnomad_af_amr),
+            GnomADValue::AF_AFR(self.gnomad_af_afr),
+            GnomADValue::AF_MID(self.gnomad_af_mid),
+            GnomADValue::AF_ASJ(self.gnomad_af_asj),
+            GnomADValue::AF_NFE(self.gnomad_af_nfe),
+        ]
+    }
+}

+ 22 - 1
src/lib.rs

@@ -43,7 +43,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, variant::variant::AlterationCategory};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, pipes::somatic::const_stats, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -728,4 +728,25 @@ mod tests {
         Ok(())
 
     }
+
+    #[test]
+    fn variants_stats() -> anyhow::Result<()> {
+        init();
+        let id = "ADJAGBA";
+        let config = Config::default();
+        let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
+        let variants = variant_collection::Variants::load_from_json(&path)?;
+
+        VariantsStats::new(&variants).save_to_json(&format!("{}/{id}/diag/somatic_variants_stats.json.gz", config.result_dir));
+Ok(())
+    }
+
+    #[test]
+    fn constit_stats() {
+        init();
+        let id = "ADJAGBA";
+        let config = Config::default();
+
+        const_stats(id.to_string(), config);
+    }
 }

+ 20 - 24
src/pipes/somatic.rs

@@ -20,7 +20,7 @@ use crate::{
     runners::Run,
     variant::{
         variant::{load_variants, run_variants, CallerBox},
-        variant_collection::{ExternalAnnotation, VariantCollection, Variants},
+        variant_collection::{ExternalAnnotation, VariantCollection, Variants, VariantsStats},
     },
 };
 
@@ -514,27 +514,23 @@ impl Run for Somatic {
     }
 }
 
-// 0-based position
-fn sequence_at(
-    fasta_reader: &mut noodles_fasta::IndexedReader<noodles_fasta::io::BufReader<File>>,
-    contig: &str,
-    position: usize,
-    len: usize,
-) -> anyhow::Result<String> {
-    // convert to 1-based
-    let position = position + 1;
-
-    let start = position.saturating_sub(len / 2).max(1);
-    let end = start + len - 1;
-    // debug!("Region {contig}:{start}-{end} (1-based inclusive)");
-
-    let start = noodles_core::Position::try_from(start)?;
-    let end = noodles_core::Position::try_from(end)?;
-    let interval = noodles_core::region::interval::Interval::from(start..=end);
-
-    let r = noodles_core::Region::new(contig.to_string(), interval);
-    let record = fasta_reader.query(&r)?;
-    let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
-
-    Ok(s)
+
+pub fn const_stats(id: String, config: Config) -> anyhow::Result<()> {
+    info!("Loading Germline");
+    let annotations = Annotations::default();
+        let clairs_germline =
+            ClairS::initialize(&id, config)?.germline(&annotations)?;
+
+    info!("Annotation with Cosmic and GnomAD.");
+    let ext_annot = ExternalAnnotation::init()?;
+    ext_annot.annotate(&clairs_germline.variants, &annotations)?;
+
+    let mut variants = Variants::default();
+    variants.merge(clairs_germline, &annotations);
+    info!("-- {}", variants.data.len());
+
+    let u = VariantsStats::new(&variants);
+    info!("++ {}", u.n);
+
+    Ok(())
 }

+ 1 - 1
src/variant/variant.rs

@@ -361,7 +361,7 @@ impl fmt::Display for AlterationCategory {
                 AlterationCategory::DUP => "DUP",
                 AlterationCategory::INV => "INV",
                 AlterationCategory::CNV => "CNV",
-                AlterationCategory::BND | AlterationCategory::TRL => "BND",
+                AlterationCategory::BND | AlterationCategory::TRL => "TRL",
                 AlterationCategory::Other => "Other",
             }
         )

+ 85 - 2
src/variant/variant_collection.rs

@@ -8,6 +8,7 @@ use std::{
 use anyhow::Context;
 use bgzip::{BGZFReader, BGZFWriter};
 use csv::ReaderBuilder;
+use dashmap::DashMap;
 use log::{debug, error, info, warn};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
@@ -18,7 +19,7 @@ use crate::{
     annotation::{
         cosmic::Cosmic,
         echtvar::{parse_echtvar_val, run_echtvar},
-        gnomad::GnomAD,
+        gnomad::{GnomAD, GnomADValue},
         vep::{run_vep, VepLine, VEP},
         Annotation, Annotations,
     },
@@ -26,7 +27,7 @@ use crate::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
-    helpers::{app_storage_dir, estimate_shannon_entropy, temp_file_path, Hash128},
+    helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
     io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
     positions::GenomePosition,
 };
@@ -724,6 +725,88 @@ impl Variants {
     }
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VariantsStats {
+    pub n: u32,
+    pub alteration_categories: DashMap<String, u32>,
+    pub cosmic: DashMap<u64, u32>,
+    pub gnomad: DashMap<String, Vec<f64>>,
+}
+
+impl VariantsStats {
+    pub fn new(variants: &Variants) -> Self {
+        let n = variants.data.len() as u32;
+        let alteration_categories: DashMap<String, u32> = DashMap::new();
+        let cosmic: DashMap<u64, u32> = DashMap::new();
+        let gnomad: DashMap<String, Vec<f64>> = DashMap::new();
+
+        variants.data.par_iter().for_each(|v| {
+            v.annotations.iter().for_each(|annotation| {
+                match annotation {
+                    Annotation::Cosmic(v) => *cosmic.entry(v.cosmic_cnt).or_default() += 1,
+                    Annotation::GnomAD(v) => {
+                        v.to_vec()
+                            .iter()
+                            .map(|e| e.to_string_value_pair())
+                            .for_each(|(key, value)| {
+                                gnomad.entry(key).or_default().push(value);
+                            });
+                    }
+                    _ => (),
+                };
+            });
+            let mut alteration_category_str = v
+                .alteration_category()
+                .iter()
+                .map(|c| c.to_string())
+                .collect::<Vec<String>>();
+            alteration_category_str.sort();
+            alteration_category_str.dedup();
+
+            *alteration_categories.entry(alteration_category_str.join(", ")).or_default() += 1;
+        });
+
+        gnomad.iter().for_each(|e| {
+            println!("{}\t{}", e.key(), mean(e.value()));
+        });
+
+        Self {
+            n,
+            alteration_categories,
+            cosmic,
+            gnomad,
+        }
+    }
+
+    pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
+        let file = File::create(filename)
+            .with_context(|| format!("Failed to create file: {}", filename))?;
+        let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
+
+        serde_json::to_writer(&mut writer, self)
+            .with_context(|| format!("Failed to serialize JSON to file: {}", filename))?;
+
+        writer
+            .close()
+            .with_context(|| format!("Failed to close BGZF writer for file: {}", filename))?;
+
+        debug!("Successfully saved variants to {}", filename);
+        Ok(())
+    }
+    pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
+        let file =
+            File::open(filename).with_context(|| format!("Failed to open file: {}", filename))?;
+        let mut reader = BGZFReader::new(file)
+            .with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
+
+        let variants: Self = serde_json::from_reader(&mut reader)
+            .with_context(|| format!("Failed to deserialize JSON from file: {}", filename))?;
+
+        debug!("Successfully loaded variants from {}", filename);
+        Ok(variants)
+    }
+}
+
 /// Creates a new Variant instance from a collection of VcfVariants and annotations.
 ///
 /// This function consolidates information from one or more VcfVariants into a single Variant,