Browse Source

HighDepths -> HighDepth / insertions_len

Thomas 8 months ago
parent
commit
57e6bce18b
3 changed files with 65 additions and 5 deletions
  1. 3 3
      src/annotation/mod.rs
  2. 52 1
      src/variant/variant_collection.rs
  3. 10 1
      src/variant/variants_stats.rs

+ 3 - 3
src/annotation/mod.rs

@@ -74,7 +74,7 @@ pub enum Annotation {
     /// Timing of replication for the variant's genomic position.
     ReplicationTiming(ReplicationClass),
 
-    HighDepths,
+    HighDepth,
 
     Panel(String),
 
@@ -210,7 +210,7 @@ impl fmt::Display for Annotation {
                 ReplicationClass::Early => "ReplicationEarly".into(),
                 ReplicationClass::Late => "ReplicationLate".into(),
             },
-            Annotation::HighDepths => "HighDepths".into(),
+            Annotation::HighDepth => "HighDepths".into(),
             Annotation::Panel(name) => format!("Panel_{name}"),
         };
         write!(f, "{}", s)
@@ -353,7 +353,7 @@ impl Annotations {
                     | Annotation::VEP(_)
                     | Annotation::TriNucleotides(_)
                     | Annotation::ReplicationTiming(_)
-                    | Annotation::HighDepths
+                    | Annotation::HighDepth
                     | Annotation::CpG
                     | Annotation::Panel(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),

+ 52 - 1
src/variant/variant_collection.rs

@@ -14,7 +14,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
-use super::variant::{AlterationCategory, ReferenceAlternative, VcfVariant};
+use super::variant::{AlterationCategory, Info, ReferenceAlternative, VcfVariant};
 use crate::{
     annotation::{
         cosmic::Cosmic,
@@ -482,6 +482,57 @@ impl Variant {
             .collect()
     }
 
+    /// Returns the maximum inferred insertion length among all VCF variants.
+    ///
+    /// This method computes the insertion length from either:
+    /// - The `SVINSLEN` entry in the variant's `INFO` field, if present.
+    /// - Or, if `SVINSLEN` is not available and the variant represents a standard
+    ///   nucleotide-to-nucleotides substitution, it estimates the insertion length as
+    ///   `alt.len() - 1`, where `alt` is the alternative allele sequence.
+    ///
+    /// The returned value is the maximum insertion length across all variants,
+    /// or `None` if no valid insertion length could be determined.
+    ///
+    /// # Returns
+    /// * `Some(u32)` - The maximum insertion length found.
+    /// * `None` - If no `SVINSLEN` or inferrable insertion is available in any variant.
+    ///
+    /// # Examples
+    /// ```rust
+    /// let insertion_length = variant_set.insertion_length();
+    /// if let Some(len) = insertion_length {
+    ///     println!("Max insertion length: {}", len);
+    /// }
+    /// ```
+    pub fn insertion_length(&self) -> Option<u32> {
+        self.vcf_variants
+            .iter()
+            .filter_map(|v| {
+                v.infos
+                    .0
+                    .iter()
+                    .find_map(|i| {
+                        if let Info::SVINSLEN(len) = i {
+                            Some(*len)
+                        } else {
+                            None
+                        }
+                    })
+                    .or_else(|| {
+                        if let (
+                            ReferenceAlternative::Nucleotide(_),
+                            ReferenceAlternative::Nucleotides(nt),
+                        ) = (&v.reference, &self.alternative)
+                        {
+                            Some(nt.len().saturating_sub(1) as u32)
+                        } else {
+                            None
+                        }
+                    })
+            })
+            .max()
+    }
+
     pub fn n_alt_depth(&self) -> (f64, f64) {
         let (n_alts, depths): (Vec<u32>, Vec<u32>) = self
             .vcf_variants

+ 10 - 1
src/variant/variants_stats.rs

@@ -56,6 +56,9 @@ pub struct VariantsStats {
     pub somatic_rates: SomaticVariantRates,
     pub high_depth_somatic_rates: SomaticVariantRates,
     pub mutation_rates: Vec<(String, (u32, usize))>,
+
+    #[serde(serialize_with = "serialize_dashmap_sort")]
+    pub insertions_len: DashMap<u32, u32>,
 }
 
 pub fn serialize_dashmap_sort<S, T>(
@@ -88,6 +91,7 @@ impl VariantsStats {
         let cosmic: DashMap<u64, u32> = DashMap::new();
         let gnomads: DashMap<String, Vec<f64>> = DashMap::new();
         let context: DashMap<String, Vec<String>> = DashMap::new();
+        let insertions_len: DashMap<u32, u32> = DashMap::new();
 
         variants.data.par_iter().for_each(|v| {
             // VEP
@@ -156,6 +160,10 @@ impl VariantsStats {
             *alteration_categories
                 .entry(alteration_category_str.join(", "))
                 .or_default() += 1;
+
+            if let Some(len) = v.insertion_length() {
+                *insertions_len.entry(len).or_default() += 1;
+            }
         });
 
         let mut n_gnomad = 0;
@@ -185,7 +193,7 @@ impl VariantsStats {
         let mut mutation_rates = Vec::new();
 
         // HighDepths
-        let ann = Annotation::HighDepths;
+        let ann = Annotation::HighDepth;
         let res = variants.annotate_with_ranges(
             &high_depth_ranges,
             Some(ann.clone()),
@@ -353,6 +361,7 @@ impl VariantsStats {
             somatic_rates: all_somatic_rates,
             high_depth_somatic_rates: high_depth,
             mutation_rates,
+            insertions_len,
         })
     }