Thomas пре 7 месеци
родитељ
комит
3604b225cf
8 измењених фајлова са 256 додато и 43 уклоњено
  1. 3 1
      src/callers/deep_somatic.rs
  2. 6 11
      src/collection/bam.rs
  3. 47 0
      src/helpers.rs
  4. 19 0
      src/io/fasta.rs
  5. 8 0
      src/lib.rs
  6. 7 0
      src/pipes/somatic.rs
  7. 39 2
      src/variant/variant.rs
  8. 127 29
      src/variant/variant_collection.rs

+ 3 - 1
src/callers/deep_somatic.rs

@@ -71,7 +71,8 @@ impl Initialize for DeepSomatic {
 impl ShouldRun for DeepSomatic {
     fn should_run(&self) -> bool {
         let passed_vcf = &self.config.deepsomatic_passed_vcf(&self.id);
-        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true)
+            .unwrap_or(true)
             || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             info!("DeepSomatic should run for id: {}.", self.id);
@@ -189,6 +190,7 @@ impl Variants for DeepSomatic {
         info!("Loading variants from {}: {}", caller, vcf_passed);
         let variants = read_vcf(&vcf_passed)
             .map_err(|e| anyhow::anyhow!("Failed to read DeepSomatic VCF {}.\n{e}", vcf_passed))?;
+
         variants.par_iter().for_each(|v| {
             annotations.insert_update(v.hash(), &add);
         });

+ 6 - 11
src/collection/bam.rs

@@ -672,7 +672,7 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
 }
 
 /// Result of querying a read at a reference position.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, Eq, PartialEq)]
 pub enum PileBase {
     A,
     C,
@@ -680,7 +680,7 @@ pub enum PileBase {
     T,
     N,
     Ins,  // insertion immediately after the queried base
-    Del,  // deletion covering the queried base
+    Del((Vec<u8>, u32)),  // deletion covering the queried base
     Skip, // reference skip (N)
 }
 
@@ -729,7 +729,7 @@ pub fn base_at_new(
         /* Does the queried reference coordinate fall inside this CIGAR op? */
         if ref_pos + consume_ref > at_pos {
             return Ok(Some(match op {
-                Cigar::Del(_) => PileBase::Del,
+                Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e )),
                 Cigar::RefSkip(_) => PileBase::Skip,
                 _ => {
                     let offset = (at_pos - ref_pos) as usize;
@@ -788,19 +788,14 @@ pub fn nt_pileup_new(
         }
 
         for aln in pileup.alignments() {
-            // A CIGAR ‘D’ covering this reference base.
-            if aln.is_del() {
-                pile.push(PileBase::Del);
-                continue;
-            }
-
             // Handle the three indel states reported by HTSlib.
             match aln.indel() {
                 rust_htslib::bam::pileup::Indel::Ins(_) => {
                     pile.push(PileBase::Ins); // insertion *starts at* this base
                 }
-                rust_htslib::bam::pileup::Indel::Del(_) => {
-                    pile.push(PileBase::Del); // deletion length > 1, but still covers here
+                rust_htslib::bam::pileup::Indel::Del(e) => {
+                    let qname = aln.record().qname().to_vec();
+                    pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
                 }
                 rust_htslib::bam::pileup::Indel::None => {
                     // Regular match/mismatch: delegate to `base_at`.

+ 47 - 0
src/helpers.rs

@@ -626,3 +626,50 @@ where
     })
 }
 
+
+/// Represents a string made of either:
+/// - one character repeated N times,
+/// - a two-character block repeated N times,
+/// - or no valid repetition pattern.
+#[derive(Debug, PartialEq)]
+pub enum Repeat {
+    None,
+    RepOne(char, usize),
+    RepTwo(String, usize),
+}
+
+pub fn detect_repetition(s: &str) -> Repeat {
+    let len = s.chars().count();
+    if len == 0 {
+        return Repeat::None;
+    }
+
+    // Check for single-char repetition
+    let mut chars = s.chars();
+    let first = chars.next().unwrap();
+    if s.chars().all(|c| c == first) {
+        return Repeat::RepOne(first, len);
+    }
+
+    // Check for two-char block repetition
+    if len % 2 == 0 {
+        let mut iter = s.chars();
+        let a = iter.next().unwrap();
+        let b = iter.next().unwrap();
+        let mut count = 1;
+
+        while let (Some(c), Some(d)) = (iter.next(), iter.next()) {
+            if c != a || d != b {
+                return Repeat::None;
+            }
+            count += 1;
+        }
+
+        return Repeat::RepTwo(format!("{}{}", a, b), count);
+    }
+
+    Repeat::None
+}
+
+
+

+ 19 - 0
src/io/fasta.rs

@@ -24,3 +24,22 @@ pub fn sequence_at(
 
     Ok(s)
 }
+
+
+pub fn sequence_range(
+    fasta_reader: &mut noodles_fasta::IndexedReader<noodles_fasta::io::BufReader<File>>,
+    contig: &str,
+    start: usize,
+    end: usize,
+) -> anyhow::Result<String> {
+
+    let start = noodles_core::Position::try_from(start + 1)?;
+    let end = noodles_core::Position::try_from(end + 1)?;
+    let interval = noodles_core::region::interval::Interval::from(start..=end);
+
+    let r = noodles_core::Region::new(contig.to_string(), interval);
+    let record = fasta_reader.query(&r)?;
+    let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
+
+    Ok(s)
+}

Разлика између датотеке није приказан због своје велике величине
+ 8 - 0
src/lib.rs


+ 7 - 0
src/pipes/somatic.rs

@@ -285,6 +285,13 @@ impl Run for SomaticPipe {
                 "{stats_dir}/{id}_annotations_02_post_germline.json"
             ))?;
 
+        // Remove deletions stretch
+        // info!("Removing deletions stretchs:");
+        // variants_collections.iter_mut().for_each(|coll| {
+        //     let rm = coll.remove_strech();
+        //     info!("\t - {} {}: {}", coll.vcf.caller, coll.vcf.time, rm);
+        // });
+
         // MASK mapq
         let (mut high_depth_ranges, mut low_quality_ranges) =
             somatic_depth_quality_ranges(&id, &config)?;

+ 39 - 2
src/variant/variant.rs

@@ -259,7 +259,7 @@ impl VcfVariant {
                         if bnd_desc.a_contig != bnd_desc.b_contig {
                             AlterationCategory::TRL
                         } else {
-                            AlterationCategory::BND
+                            AlterationCategory::DEL
                         }
                     } else {
                         AlterationCategory::from(sv_type)
@@ -336,6 +336,18 @@ impl VcfVariant {
             return Some(len);
         }
 
+        match self.bnd_desc()  {
+            Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => {
+                if bnd_desc.a_position < bnd_desc.b_position {
+                    return Some(bnd_desc.b_position - bnd_desc.a_position)
+                } else {
+
+                    return Some(bnd_desc.a_position - bnd_desc.b_position)
+                }
+            },
+            _ => (),
+        }
+
         // Final fallback: infer from reference and alternative
         if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
             (&self.reference, &self.alternative)
@@ -346,7 +358,32 @@ impl VcfVariant {
         if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
             (&self.reference, &self.alternative)
         {
-            return Some(nt.len().saturating_sub(bnt.len()) as u32);
+            if bnt.len() < nt.len() {
+                return Some(nt.len().saturating_sub(bnt.len()) as u32);
+            }
+        }
+
+        None
+    }
+
+    pub fn deletion_seq(&self) -> Option<String> {
+        if self.alteration_category() != AlterationCategory::DEL {
+            return None;
+        }
+
+        // infer from reference and alternative
+        if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
+            (&self.reference, &self.alternative)
+        {
+            return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
+        }
+
+        if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
+            (&self.reference, &self.alternative)
+        {
+            if bnt.len() < nt.len() {
+                return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
+            }
         }
 
         None

+ 127 - 29
src/variant/variant_collection.rs

@@ -9,6 +9,7 @@ use anyhow::Context;
 use bgzip::{BGZFReader, BGZFWriter};
 use bitcode::{Decode, Encode};
 use csv::ReaderBuilder;
+use dashmap::DashMap;
 use log::{debug, error, info, warn};
 use pandora_lib_assembler::assembler::calculate_shannon_entropy;
 use rayon::prelude::*;
@@ -31,7 +32,9 @@ use crate::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
-    helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
+    helpers::{
+        app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, temp_file_path, Hash128, Repeat,
+    },
     io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
     positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
     variant::variant::VariantId,
@@ -316,6 +319,56 @@ impl VariantCollection {
             });
     }
 
+    pub fn remove_strech(&mut self) -> usize {
+    // 1) Concurrently collect indices of RepOne/RepTwo deletions keyed by "contig:pos"
+    let deletions_to_rm: DashMap<String, Vec<usize>> = DashMap::new();
+    self.variants
+        .par_iter()
+        .enumerate()
+        .for_each(|(i, v)| {
+            if let Some(del_seq) = v.deletion_seq() {
+                if matches!(detect_repetition(&del_seq), Repeat::RepOne(_, _) | Repeat::RepTwo(_, _)) {
+                    let key = format!("{}:{}", v.position.contig, v.position.position);
+                    deletions_to_rm
+                        .entry(key)
+                        .or_default()
+                        .value_mut()
+                        .push(i);
+                }
+            }
+        });
+
+    // 2) Build a HashSet of all indices where Vec.len() > 1
+    let to_remove: HashSet<usize> = deletions_to_rm
+        .iter()
+        .filter_map(|entry| {
+            let idxs = entry.value();
+            if idxs.len() > 1 {
+                // clone here is fine since each Vec is small
+                Some(idxs.clone())
+            } else {
+                None
+            }
+        })
+        .flatten()
+        .collect();
+
+    // 3) Drain & rebuild, dropping any variant whose index is in `to_remove`
+    self.variants = self
+        .variants
+        .drain(..)
+        .enumerate()
+        .filter_map(|(i, v)| {
+            if to_remove.contains(&i) {
+                None
+            } else {
+                Some(v)
+            }
+        })
+        .collect();
+
+        to_remove.len()
+}
     /// Annotates variants with information from a constitutional BAM file.
     ///
     /// This function processes variants in parallel chunks and adds annotations
@@ -350,8 +403,9 @@ impl VariantCollection {
         constit_bam_path: &str,
         max_threads: u8,
     ) -> anyhow::Result<()> {
-        fn folder<'a>(alt_seq: &'a str) -> impl Fn((i32, i32), (String, i32)) -> (i32, i32) + 'a {
+        fn folder<'a>(alt_seq: &'a str) -> impl Fn((u32, u32), (String, i32)) -> (u32, u32) + 'a {
             move |(depth_acc, alt_acc), (seq, n): (String, i32)| {
+                let n = n as u32;
                 if seq == alt_seq {
                     (depth_acc + n, alt_acc + n)
                 } else {
@@ -361,7 +415,7 @@ impl VariantCollection {
         }
 
         fn match_repeats(
-            v: &Vec<(String, i32)>,
+            v: &[(String, i32)],
             nt: char,
             n: usize,
             e: usize,
@@ -390,6 +444,11 @@ impl VariantCollection {
                 let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
                     .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
 
+                let c = crate::config::Config::default();
+
+                let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default()
+                    .build_from_path(c.reference)?;
+
                 for var in chunk {
                     let key = var.hash();
                     let mut anns = annotations.store.entry(key).or_default();
@@ -419,40 +478,79 @@ impl VariantCollection {
                             }
                             AlterationCategory::DEL => {
                                 if let Some(del_repr) = var.deletion_desc() {
-                                    let pileup_start = counts_at(
+                                    let len = var.deletion_len().unwrap_or_default();
+
+                                    let pileup_start = crate::collection::bam::nt_pileup_new(
                                         &mut bam,
                                         &var.position.contig(),
                                         del_repr.start.saturating_sub(1),
+                                        false,
                                     )?;
-                                    let (start_depth, start_alt) =
-                                        pileup_start.into_iter().fold((0, 0), folder("D"));
 
-                                    let pileup_end = counts_at(
+                                    let pileup_end = crate::collection::bam::nt_pileup_new(
                                         &mut bam,
                                         &var.position.contig(),
                                         del_repr.end.saturating_sub(1),
+                                        false,
                                     )?;
-                                    let (end_depth, end_alt) =
-                                        pileup_end.into_iter().fold((0, 0), folder("D"));
 
-                                    // outside start (one base upstream)
-                                    let pileup_out_start = counts_at(
-                                        &mut bam,
-                                        &var.position.contig(),
-                                        del_repr.start.saturating_sub(2),
-                                    )?;
-                                    let (_out_start_depth, out_start_alt) =
-                                        pileup_out_start.into_iter().fold((0, 0), folder("D"));
-
-                                    // outside end (one base downstream)
-                                    let pileup_out_end =
-                                        counts_at(&mut bam, &var.position.contig(), del_repr.end)?;
-                                    let (_out_end_depth, out_end_alt) =
-                                        pileup_out_end.into_iter().fold((0, 0), folder("D"));
-
-                                    let depth = start_depth.min(end_depth);
-                                    let alt = start_alt.min(end_alt).saturating_sub(out_start_alt.max(out_end_alt));
-                                    // debug!("{} {alt} / {depth}", var.variant_id());
+                                    let tol = if len > 1 {
+                                        let seq = crate::io::fasta::sequence_range(
+                                            &mut fasta_reader,
+                                            &var.position.contig(),
+                                            del_repr.start as usize - 1,
+                                            del_repr.end as usize - 1,
+                                        )?;
+
+                                        match detect_repetition(&seq) {
+                                            Repeat::None => 0,
+                                            Repeat::RepOne(_, _) => 3,
+                                            Repeat::RepTwo(_, _) => 3,
+                                        }
+                                    } else {
+                                        0
+                                    };
+                                    // println!("TOL {tol}");
+
+                                    let end_qnames: Vec<Vec<u8>> = pileup_end
+                                        .iter()
+                                        // .inspect(|e| {
+                                        //     if let crate::collection::bam::PileBase::Del((_, l)) =
+                                        //         e
+                                        //     {
+                                        //         println!("{l}");
+                                        //     }
+                                        // })
+                                        .filter_map(|e| match e {
+                                            crate::collection::bam::PileBase::Del((qn, l))
+                                                if *l >= len.saturating_sub(tol).max(1)
+                                                    && *l <= len + tol =>
+                                            {
+                                                Some(qn.to_vec())
+                                            }
+                                            _ => None,
+                                        })
+                                        .collect();
+                                    // println!("ends {}", end_qnames.len());
+
+                                    let alt: u32 = pileup_start
+                                        .iter()
+                                        .map(|pb| match pb {
+                                            crate::collection::bam::PileBase::Del((qn, l))
+                                                if /* end_qnames.contains(qn) */
+                                                     *l >= len.saturating_sub(tol).max(1)
+                                                    && *l <= len + tol =>
+                                            {
+                                                1
+                                            }
+                                            _ => 0,
+                                        })
+                                        .sum();
+
+                                    let depth = pileup_start.len().min(pileup_end.len());
+
+                                    debug!("{} {alt} / {depth} {len}", var.variant_id());
+
                                     anns.push(Annotation::ConstitAlt(alt as u16));
                                     anns.push(Annotation::ConstitDepth(depth as u16));
                                 }
@@ -471,8 +569,8 @@ impl VariantCollection {
                                         // If stretch of same nt consider eq +/- 3 nt
                                         let pv = pileup.clone().into_iter().collect::<Vec<_>>();
                                         let res = match_repeats(&pv, repeated, n, 3);
-                                        let depth = pileup.values().sum();
-                                        let alt = res.iter().map(|(_, n)| n).sum();
+                                        let depth = pileup.values().map(|e| *e as u32).sum();
+                                        let alt = res.iter().map(|(_, n)| *n as u32).sum();
                                         (depth, alt)
                                     }
                                     _ => pileup.into_iter().fold((0, 0), folder(&alt_seq)),

Неке датотеке нису приказане због велике количине промена