Przeglądaj źródła

BNDdesc with BNDGraph // Scan verify if bam exists // depth at

Thomas 7 miesięcy temu
rodzic
commit
1a306f5e24
8 zmienionych plików z 679 dodań i 80 usunięć
  1. 21 2
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 143 0
      src/collection/bam.rs
  4. 2 2
      src/collection/mod.rs
  5. 102 6
      src/lib.rs
  6. 11 4
      src/scan/scan.rs
  7. 391 66
      src/variant/variant.rs
  8. 8 0
      src/variant/variant_collection.rs

+ 21 - 2
Cargo.lock

@@ -1395,6 +1395,12 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
 
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
 [[package]]
 name = "flatbuffers"
 version = "25.2.10"
@@ -2853,7 +2859,7 @@ dependencies = [
  "nom",
  "pandora_lib_blastn",
  "pandora_lib_igv",
- "petgraph",
+ "petgraph 0.6.5",
  "rayon",
  "regex",
  "rust-htslib 0.47.1",
@@ -2949,6 +2955,7 @@ dependencies = [
  "pandora_lib_assembler",
  "pandora_lib_scan",
  "pandora_lib_variants",
+ "petgraph 0.8.1",
  "rand 0.9.0",
  "rayon",
  "rusqlite",
@@ -3120,8 +3127,20 @@ version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
- "fixedbitset",
+ "fixedbitset 0.4.2",
+ "indexmap",
+]
+
+[[package]]
+name = "petgraph"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a98c6720655620a521dcc722d0ad66cd8afd5d86e34a89ef691c50b7b24de06"
+dependencies = [
+ "fixedbitset 0.5.7",
+ "hashbrown 0.15.2",
  "indexmap",
+ "serde",
 ]
 
 [[package]]

+ 1 - 0
Cargo.toml

@@ -45,6 +45,7 @@ flatbuffers = "25.2.10"
 ordered-float = { version = "5.0.0", features = ["serde"] }
 bitcode = "0.6.5"
 semver = "1.0.26"
+petgraph = "0.8.1"
 
 [profile.dev]
 opt-level = 0

+ 143 - 0
src/collection/bam.rs

@@ -670,3 +670,146 @@ pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<Has
     }
     Ok(genome)
 }
+
+/// Result of querying a read at a reference position.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum PileBase {
+    A,
+    C,
+    G,
+    T,
+    N,
+    Ins,  // insertion immediately after the queried base
+    Del,  // deletion covering the queried base
+    Skip, // reference skip (N)
+}
+
+/// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
+#[inline]
+fn decode(b: u8) -> PileBase {
+    match b & 0b1111 {
+        0 => PileBase::A,
+        1 => PileBase::C,
+        2 => PileBase::G,
+        3 => PileBase::T,
+        _ => PileBase::N,
+    }
+}
+
+pub fn base_at_new(
+    record: &rust_htslib::bam::Record,
+    at_pos: i64,
+    with_next_ins: bool,
+) -> anyhow::Result<Option<PileBase>> {
+    if record.pos() > at_pos {
+        return Ok(None);
+    }
+
+    let mut ref_pos = record.pos();
+    let mut read_pos = 0_i64;
+    let cigar = record.cigar();
+    let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
+
+    for (i, op) in cigar.iter().enumerate() {
+        let (consume_r, consume_ref) = match op {
+            Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (*l as i64, *l as i64),
+            Cigar::Ins(l) | Cigar::SoftClip(l) => (*l as i64, 0),
+            Cigar::Del(l) | Cigar::RefSkip(l) => (0, *l as i64),
+            _ => (0, 0),
+        };
+
+        /* look ahead for an insertion immediately AFTER the queried base */
+        if with_next_ins
+            && ref_pos + consume_ref == at_pos + 1
+            && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
+        {
+            return Ok(Some(PileBase::Ins));
+        }
+
+        /* Does the queried reference coordinate fall inside this CIGAR op? */
+        if ref_pos + consume_ref > at_pos {
+            return Ok(Some(match op {
+                Cigar::Del(_) => PileBase::Del,
+                Cigar::RefSkip(_) => PileBase::Skip,
+                _ => {
+                    let offset = (at_pos - ref_pos) as usize;
+                    if read_pos as usize + offset >= seq.len() {
+                        return Ok(None); // out‑of‑range, malformed BAM
+                    }
+                    decode(seq[read_pos as usize + offset])
+                }
+            }));
+        }
+
+        read_pos += consume_r;
+        ref_pos += consume_ref;
+    }
+
+    Ok(None) // beyond alignment
+}
+
+/// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
+///
+/// * `bam`           – an open [`rust_htslib::bam::IndexedReader`].  
+/// * `chr` / `pos`   – reference contig name and coordinate.  
+/// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
+///                     **right after** the queried base.
+///
+/// The function bounds the internal pile‑up depth to 10 000 reads to protect
+/// against malformed BAM files that could otherwise allocate unbounded memory.
+///
+/// # Errors
+///
+/// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
+/// [`anyhow::Error`].
+pub fn nt_pileup_new(
+    bam: &mut rust_htslib::bam::IndexedReader,
+    chr: &str,
+    pos: u32,
+    with_next_ins: bool,
+) -> anyhow::Result<Vec<PileBase>> {
+    // Storage for the per‑read observations.
+    let mut pile = Vec::new();
+
+    // Restrict BAM iterator to the one‑base span of interest.
+    bam.fetch((chr, pos, pos + 1))?;
+
+    // Hard cap on depth (adjust if your use‑case needs deeper coverage).
+    bam.pileup().set_max_depth(10_000);
+
+    // Stream the pileup; HTSlib walks the reads for us.
+    for pileup in bam.pileup() {
+        // Attach context to any low‑level error.
+        let pileup = pileup.context(format!("can't pile up BAM at {chr}:{pos}"))?;
+
+        // Skip other positions quickly.
+        if pileup.pos() != pos {
+            continue;
+        }
+
+        for aln in pileup.alignments() {
+            // A CIGAR ‘D’ covering this reference base.
+            if aln.is_del() {
+                pile.push(PileBase::Del);
+                continue;
+            }
+
+            // Handle the three indel states reported by HTSlib.
+            match aln.indel() {
+                rust_htslib::bam::pileup::Indel::Ins(_) => {
+                    pile.push(PileBase::Ins); // insertion *starts at* this base
+                }
+                rust_htslib::bam::pileup::Indel::Del(_) => {
+                    pile.push(PileBase::Del); // deletion length > 1, but still covers here
+                }
+                rust_htslib::bam::pileup::Indel::None => {
+                    // Regular match/mismatch: delegate to `base_at`.
+                    if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins)? {
+                        pile.push(base);
+                    }
+                }
+            }
+        }
+    }
+    Ok(pile)
+}

+ 2 - 2
src/collection/mod.rs

@@ -81,13 +81,13 @@ impl Collections {
             result_dir,
             ..
         } = &config;
-        let pod5 = Pod5Collection::new(pod_dir, corrected_fc_path, result_dir)?;
+        // let pod5 = Pod5Collection::new(pod_dir, corrected_fc_path, result_dir)?;
         let bam = BamCollection::new(result_dir);
         let vcf = VcfCollection::new(result_dir);
         let modbases = ModBasesCollection::new(result_dir);
 
         Ok(Self {
-            pod5,
+            pod5: Pod5Collection::default(),
             bam,
             vcf,
             modbases,

+ 102 - 6
src/lib.rs

@@ -177,7 +177,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam::{self, nt_pileup_new}, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::{AlterationCategory, BNDDesc, BNDGraph, ToBNDGraph}, variants_stats::{self, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -669,9 +669,9 @@ mod tests {
             CollectionsConfig::default()
         )?;
         for (a, _) in collections.bam_pairs().iter() {
-            // if a.id.as_str() != "CHAMPION" {
-            //     continue;
-            // }
+            if  ["AUBERT", "BAFFREAU", "BAILLEUL"].contains(&a.id.as_str()) {
+                continue;
+            }
             if let Err(e) = SomaticPipe::initialize(&a.id, Config::default()).map(|mut p| if p.should_run() {
                 if let Err(e) = p.run() {
                     error!("{e}");
@@ -681,7 +681,7 @@ mod tests {
             }
         }
         Ok(())
-        // let id = "HENAUX";
+        // let id = "VILI";
         // SomaticPipe::initialize(id, Config::default())?.run()
     }
 
@@ -853,6 +853,10 @@ mod tests {
             if id == "BANGA" {
                 continue;
             }
+            if id == "ARM" {
+                continue;
+            }
+
 
             match SomaticPipe::initialize(id, Config::default())?.run() {
                 Ok(_) => (),
@@ -863,7 +867,7 @@ mod tests {
     }
 
     #[test]
-    fn run_somatic_case() -> anyhow::Result<()> {
+    fn somatic_cases() -> anyhow::Result<()> {
         init();
         let id = "ADJAGBA";
         let config = Config { somatic_pipe_force: true, ..Default::default() };
@@ -1224,4 +1228,96 @@ mod tests {
 
         Ok(())
     }
+
+    /// helper to build a forward‑strand BND (same contig) where
+    ///   A = pos,  B = pos + 5
+    fn fwd(contig: &str, pos: u32) -> BNDDesc {
+        BNDDesc {
+            a_contig: contig.into(),
+            a_position: pos,
+            a_sens: true,
+            b_contig: contig.into(),
+            b_position: pos + 5,
+            b_sens: true,
+            added_nt: String::new(),
+        }
+    }
+
+    /// Build a six‑node *forward* chain relying **only** on `auto_connect()`
+    /// (no manual edges) and assert the Hamiltonian path spans all nodes.
+    #[test]
+    fn hamiltonian_chain_auto() {
+        // positions 10,15   20,25   …  60,65  satisfy B(u) ≤ A(v)
+        let bnds: Vec<BNDDesc> = (1..=6).map(|i| fwd("chr1", i * 10)).collect();
+        let g: BNDGraph<()> = bnds.to_bnd_graph(); // trait uses auto_connect()
+        // ensure auto_connect produced 5 edges in a line
+        assert_eq!(g.inner().edge_count(), 5);
+        let path = g.hamiltonian_path().expect("chain should be Hamiltonian");
+        assert_eq!(path.len(), 6);
+    }
+
+    /// Two disconnected "V" shapes -> auto_connect creates 2 edges in each
+    /// component, no Hamiltonian path, components sorted by size.
+    #[test]
+    fn components_after_auto_connect() {
+        // comp1: a->b<-c on reverse strand of chrX
+        let a = BNDDesc { 
+            a_contig: "chrX".into(), a_position: 300, a_sens: false,
+            b_contig: "chrX".into(), b_position: 250, b_sens: false, 
+            added_nt: String::new() 
+        };
+        let b = BNDDesc { 
+            a_contig: "chrX".into(), a_position: 200, a_sens: false, 
+            b_contig: "chrX".into(), b_position: 150, b_sens: false, 
+            added_nt: String::new() 
+        };
+        let c = BNDDesc { 
+            a_contig: "chrX".into(), a_position: 100, a_sens: false, 
+            b_contig: "chrX".into(), b_position:  50, b_sens: false,
+            added_nt: String::new() 
+        };
+        // comp2: three‑node forward chain on chrY
+        let d = fwd("chrY", 10);
+        let e = fwd("chrY", 20);
+        let f = fwd("chrY", 30);
+        let g: BNDGraph<()> = vec![a,b,c,d,e,f].to_bnd_graph();
+        assert!(g.hamiltonian_path().is_none());
+        let comps = g.components_by_size();
+        comps.iter().for_each(|a| println!("{}", g.fmt_path(a) ));
+        assert_eq!(comps.len(), 2);
+        assert_eq!(comps[0].len(), 3);
+        assert_eq!(comps[1].len(), 3);
+    }
+
+    #[test]
+    fn bnd_connections() {
+        // comp1: a->b<-c on reverse strand of chrX
+        let a = BNDDesc { 
+            a_contig: "chrX".into(), a_position: 300, a_sens: true,
+            b_contig: "chr14".into(), b_position: 250, b_sens: false, 
+            added_nt: String::new() 
+        };
+        let b = BNDDesc { 
+            a_contig: "chr14".into(), a_position: 200, a_sens: false, 
+            b_contig: "chrX".into(), b_position: 301, b_sens: true, 
+            added_nt: String::new() 
+        };
+        let c = BNDDesc { 
+            a_contig: "chrX".into(), a_position: 302, a_sens: true, 
+            b_contig: "chrZ".into(), b_position:  50, b_sens: false,
+            added_nt: String::new() 
+        };
+        // comp2: three‑node forward chain on chrY
+        let d = fwd("chrY", 10);
+        let e = fwd("chrY", 20);
+        let f = fwd("chrY", 30);
+        let g: BNDGraph<()> = vec![a,b,c,d,e,f].to_bnd_graph();
+        assert!(g.hamiltonian_path().is_none());
+        let comps = g.components_by_size();
+        comps.iter().for_each(|a| println!("{}", g.fmt_path(a) ));
+        assert_eq!(comps.len(), 2);
+        assert_eq!(comps[0].len(), 3);
+        assert_eq!(comps[1].len(), 3);
+    }
+
 }

+ 11 - 4
src/scan/scan.rs

@@ -1,3 +1,4 @@
+use std::path::Path;
 use std::{fmt, fs, io::Write};
 
 use anyhow::Context;
@@ -562,10 +563,16 @@ impl Run for SomaticScan {
     /// # Returns
     /// An error if the underlying scan function (`par_whole_scan`) fails for either sample.
     fn run(&mut self) -> anyhow::Result<()> {
-        info!("Starting scan for {} normal.", self.id);
-        par_whole_scan(&self.id, &self.config.normal_name, &self.config)?;
-        info!("Starting scan for {} tumoral.", self.id);
-        par_whole_scan(&self.id, &self.config.tumoral_name, &self.config)
+        if Path::new(&self.config.normal_bam(&self.id)).exists() {
+            info!("Starting scan for {} normal.", self.id);
+            par_whole_scan(&self.id, &self.config.normal_name, &self.config)?;
+        }
+
+        if Path::new(&self.config.tumoral_bam(&self.id)).exists() {
+            info!("Starting scan for {} tumoral.", self.id);
+            par_whole_scan(&self.id, &self.config.tumoral_name, &self.config)?;
+        }
+        Ok(())
     }
 }
 

+ 391 - 66
src/variant/variant.rs

@@ -238,9 +238,9 @@ impl VcfVariant {
             (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
                 AlterationCategory::INS
             }
-            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
-                AlterationCategory::Other
-            }
+            // (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
+            //     AlterationCategory::Other
+            // }
             (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
                 AlterationCategory::DEL
             }
@@ -283,67 +283,10 @@ impl VcfVariant {
     /// # Errors
     /// This function will return an error if:
     /// - The alteration category is not BND
-    /// - The alternative string cannot be parsed into exactly 3 parts
-    /// - The b_position cannot be parsed as a number
     pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
         let alt = self.alternative.to_string();
-        if alt.contains('[') || alt.contains(']') {
-            let alt_rep = alt.replace("[", ";").replace("]", ";");
-            let alt_is_joined_after = !alt_rep.starts_with(";");
-            let parts = alt_rep
-                .split(";")
-                .filter(|c| !c.is_empty())
-                .collect::<Vec<&str>>();
-
-            if alt_is_joined_after {
-                // a is ref b is alt
-                let a_sens = true;
-                let a_contig = self.position.contig();
-                let a_position = self.position.position + 1;
-
-                let added_nt = parts[0][1..].to_string();
-
-                let b_sens = alt.contains('[');
-                let (contig, pos) = parts[1].split_once(':').unwrap();
-                let b_contig = contig.to_string();
-                let b_position: u32 = pos.parse()?;
-
-                Ok(BNDDesc {
-                    a_contig,
-                    a_position,
-                    a_sens,
-                    b_contig,
-                    b_position,
-                    b_sens,
-                    added_nt,
-                })
-            } else {
-                // a is alt b is ref
-                let b_sens = true;
-                let b_contig = self.position.contig();
-                let b_position = self.position.position + 1;
-
-                let mut added_nt = parts[1].to_string();
-                added_nt.pop();
-
-                let a_sens = alt.contains(']');
-                let (contig, pos) = parts[0].split_once(':').unwrap();
-                let a_contig = contig.to_string();
-                let a_position: u32 = pos.parse()?;
-
-                Ok(BNDDesc {
-                    a_contig,
-                    a_position,
-                    a_sens,
-                    b_contig,
-                    b_position,
-                    b_sens,
-                    added_nt,
-                })
-            }
-        } else {
-            Err(anyhow::anyhow!("The alteration is not BND: {alt}"))
-        }
+        BNDDesc::from_vcf_breakend(&self.position().contig(), self.position.position + 1, &alt)
+            .context(format!("The alteration is not BND: {alt}"))
     }
 
     /// Returns the length of the deletion if the variant is a deletion (`DEL`).
@@ -412,15 +355,393 @@ impl VcfVariant {
 
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
 pub struct BNDDesc {
+    /// Coordinates of the **5′ end** (A)
     pub a_contig: String,
-    pub a_position: u32, // 1-based
+    pub a_position: u32,
     pub a_sens: bool,
+
+    /// Coordinates of the **3′ end** (B)
     pub b_contig: String,
-    pub b_position: u32, // 1-based
+    pub b_position: u32,
     pub b_sens: bool,
+
+    /// Inserted nucleotides at the junction (may be empty)
     pub added_nt: String,
 }
 
+impl BNDDesc {
+    /// Construct a `BNDDesc` from VCF `CHROM`, `POS`, and break‑end `ALT` string.
+    /// Supports the four VCF break‑end ALT forms:
+    ///
+    /// 1) `t[p[` → A(5′)=t flank, B(3′)=p flank (forward-forward)
+    /// 2) `t]p]` → A(5′)=t flank, B(3′)=p flank but on reverse (`a_sens=true`, `b_sens=false`)
+    /// 3) `]p]t` → A(5′)=p flank, B(3′)=t flank (`a_sens=false`, `b_sens=true`)
+    /// 4) `[p[t` → A(5′)=p flank, B(3′)=t flank with both on reverse (`a_sens=false`, `b_sens=false`)
+    ///
+    /// Here `t` is the local flank sequence, `p` is the remote contig:pos.
+    pub fn from_vcf_breakend(chrom: &str, pos: u32, alt: &str) -> Option<Self> {
+        // locate first bracket
+        let (open, bracket) = alt.char_indices().find(|&(_, c)| c == '[' || c == ']')?;
+        // find matching bracket
+        let close = alt[open + 1..].find(bracket)? + open + 1;
+        // parse remote contig:pos between brackets
+        let addr = &alt[open + 1..close];
+        let mut parts = addr.splitn(2, ':');
+        let rc = parts.next()?;
+        let rp: u32 = parts.next()?.parse().ok()?;
+        // inserted sequence
+        let seq_before = &alt[..open];
+        let seq_after = &alt[close + 1..];
+        let mut added_nt = String::new();
+        added_nt.push_str(seq_before);
+        added_nt.push_str(seq_after);
+        // determine form and orientation
+        // form 1 & 2: bracket after t (open>0): local before remote
+        // form 3 & 4: bracket before t (open==0): remote before local
+        let after_local = open > 0;
+        let (a_sens, b_sens) = match (bracket, after_local) {
+            ('[', true) => (true, true),    // form 1
+            (']', true) => (true, false),   // form 2
+            (']', false) => (false, true),  // form 3
+            ('[', false) => (false, false), // form 4
+            _ => return None,
+        };
+        // assign A/B depending on form
+        let (a_contig, a_position, b_contig, b_position) = if after_local {
+            // forms 1 & 2: A is local, B is remote
+            (chrom.into(), pos, rc.into(), rp)
+        } else {
+            // forms 3 & 4: A is remote, B is local
+            (rc.into(), rp, chrom.into(), pos)
+        };
+        Some(Self {
+            a_contig,
+            a_position,
+            a_sens,
+            b_contig,
+            b_position,
+            b_sens,
+            added_nt,
+        })
+    }
+
+    /// Serialize this `BNDDesc` back into a VCF ALT break‑end string.
+    /// Reconstructs one of the four canonical forms using `added_nt` as the flank
+    /// sequence (`t`) and `b_contig:b_position` as the remote site (`p`).
+    pub fn to_vcf_breakend(&self) -> String {
+        // `t` is inserted NTs, `p` is remote site
+        let t = &self.added_nt;
+        let p = format!("{}:{}", self.b_contig, self.b_position);
+        match (self.a_sens, self.b_sens) {
+            (true, true) => format!("{}[{}[", t, p),   // t[p[
+            (true, false) => format!("{}]{}]", t, p),  // t]p]
+            (false, true) => format!("]{}]{}", p, t),  // ]p]t
+            (false, false) => format!("[{}[{}", p, t), // [p[t
+        }
+    }
+}
+
+impl core::fmt::Display for BNDDesc {
+    /// Compact textual form: `(A:contig:pos<arrow>;B:contig:pos<arrow>;ins=...)`.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        fn arrow(b: bool) -> &'static str {
+            if b {
+                "->"
+            } else {
+                "<-"
+            }
+        }
+        write!(
+            f,
+            "({}:{}{};{}:{}{};ins={})",
+            self.a_contig,
+            self.a_position,
+            arrow(self.a_sens),
+            self.b_contig,
+            self.b_position,
+            arrow(self.b_sens),
+            self.added_nt
+        )
+    }
+}
+
+use petgraph::graph::{DiGraph, NodeIndex};
+use petgraph::visit::{IntoNodeIdentifiers, NodeIndexable};
+use petgraph::Direction;
+
+/// Wrapper around `petgraph::DiGraph` with `BNDDesc` nodes.
+pub struct BNDGraph<E = ()> {
+    graph: DiGraph<BNDDesc, E>,
+}
+
+impl<E> Default for BNDGraph<E> {
+    fn default() -> Self {
+        Self {
+            graph: DiGraph::new(),
+        }
+    }
+}
+
+impl<E> BNDGraph<E> {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn add_breakpoint(&mut self, desc: BNDDesc) -> NodeIndex {
+        self.graph.add_node(desc)
+    }
+
+    pub fn successors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
+        self.graph.neighbors_directed(n, Direction::Outgoing)
+    }
+
+    pub fn predecessors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
+        self.graph.neighbors_directed(n, Direction::Incoming)
+    }
+
+    pub fn inner(&self) -> &DiGraph<BNDDesc, E> {
+        &self.graph
+    }
+
+    /// Stringify all nodes for quick debugging.
+    pub fn nodes_as_strings(&self) -> Vec<String> {
+        self.graph.node_weights().map(|n| n.to_string()).collect()
+    }
+}
+
+#[allow(clippy::collapsible_if)]
+impl<E: Default> BNDGraph<E> {
+    /// Build edges following downstream 5′→3′ logic for two patterns:
+    ///
+    /// **Pattern 1** (B → A, *u precedes v*):
+    ///
+    /// * forward (→)  `u.b ≤ v.a`   → edge **u → v**
+    /// * reverse (←)  `u.b ≥ v.a`   → edge **v → u**
+    ///
+    /// **Pattern 2** (A → B, *v precedes u*):
+    ///
+    /// * forward (→)  `u.a ≥ v.b`   → edge **v → u**
+    /// * reverse (←)  `u.a ≤ v.b`   → edge **u → v**
+    pub fn auto_connect(&mut self)
+    where
+        E: Default,
+    {
+        let nodes: Vec<NodeIndex> = self.graph.node_indices().collect();
+        let mut pending: Vec<(NodeIndex, NodeIndex)> = Vec::new();
+
+        for &u_idx in &nodes {
+            for &v_idx in &nodes {
+                if u_idx == v_idx {
+                    continue;
+                }
+                let u = &self.graph[u_idx];
+                let v = &self.graph[v_idx];
+
+                // Pattern 1: B(u) -> A(v)  (u precedes v)
+                if u.b_contig == v.a_contig && u.b_sens == v.a_sens {
+                    if (u.b_sens && u.b_position <= v.a_position)
+                        || (!u.b_sens && u.b_position >= v.a_position)
+                    {
+                        pending.push((u_idx, v_idx));
+                    }
+                }
+
+                // Pattern 2: A(u) -> B(v)  (v precedes u)
+                if u.a_contig == v.b_contig && u.a_sens == v.b_sens {
+                    if (u.a_sens && u.a_position >= v.b_position)
+                        || (!u.a_sens && u.a_position <= v.b_position)
+                    {
+                        // Edge direction depends on strand logic
+                        let (src, dst) = if u.a_sens {
+                            // forward ⇒ edge v -> u
+                            (v_idx, u_idx)
+                        } else {
+                            // reverse ⇒ edge u -> v
+                            (u_idx, v_idx)
+                        };
+                        pending.push((src, dst));
+                    }
+                }
+            }
+        }
+
+        // second pass – insert, skipping duplicates
+        for (src, dst) in pending {
+            if self.graph.find_edge(src, dst).is_none() {
+                self.graph.add_edge(src, dst, E::default());
+            }
+        }
+    }
+
+    /// Add an edge if absent.
+    pub fn add_edge_if_absent(&mut self, src: NodeIndex, dst: NodeIndex) {
+        if self.graph.find_edge(src, dst).is_none() {
+            self.graph.add_edge(src, dst, E::default());
+        }
+    }
+
+    /// Pretty‑print a sequence of nodes indicating **actual traversal
+    /// direction** between successive nodes:
+    /// * `→` when the graph contains an edge from *previous* → *current*.
+    /// * `←` when the edge exists from *current* → *previous* (path goes
+    ///   "against" the stored direction).
+    ///
+    /// If neither directional edge is present the placeholder `--` is used so
+    /// debugging clearly shows missing links.
+    pub fn fmt_path(&self, path: &[NodeIndex]) -> String {
+        use core::fmt::Write as _;
+        let mut out = String::new();
+        if let Some((&first, rest)) = path.split_first() {
+            let _ = write!(out, "{}", &self.graph[first]);
+            let mut prev = first;
+            for &curr in rest {
+                let arrow = if self.graph.find_edge(prev, curr).is_some() {
+                    " → "
+                } else if self.graph.find_edge(curr, prev).is_some() {
+                    " ← "
+                } else {
+                    " -- " // unexpected: no direct edge
+                };
+                out.push_str(arrow);
+                let _ = write!(out, "{}", &self.graph[curr]);
+                prev = curr;
+            }
+        }
+        out
+    }
+
+    // ------------------------------------------------------------------
+    // Traversal utilities
+    // ------------------------------------------------------------------
+
+    /// Return a directed path that visits **every node exactly once** (Hamiltonian
+    /// path) if one exists.  Uses naive DFS back-tracking which is exponential—
+    /// fine for small graphs (<15 nodes) but aborts early otherwise.
+    pub fn hamiltonian_path(&self) -> Option<Vec<NodeIndex>> {
+        let n = self.graph.node_count();
+        if n == 0 {
+            return Some(Vec::new());
+        }
+        if n > 15 {
+            // avoid combinatorial explosion
+            return None;
+        }
+        let mut path = Vec::with_capacity(n);
+        let mut visited = vec![false; self.graph.node_bound()];
+        for start in self.graph.node_identifiers() {
+            path.push(start);
+            visited[start.index()] = true;
+            if self.dfs_hamiltonian(start, &mut visited, &mut path, n) {
+                return Some(path);
+            }
+            visited[start.index()] = false;
+            path.clear();
+        }
+        None
+    }
+
+    /**
+    Recursive helper for [`hamiltonian_path`](#method.hamiltonian_path).
+
+    * `current` – node we are currently expanding from.
+    * `visited` – mutable bitmap indicating which nodes are already on `path`.
+    * `path`    – growing list of nodes (last element is always `current`).
+    * `target`  – desired final length (= `graph.node_count()`).
+
+    The function explores each outgoing neighbour that hasn’t been visited yet,
+    marking it **visited → recurse → unvisit** (classic DFS back-tracking).  It
+    returns `true` as soon as a full-length path is discovered, which bubbles
+    up through the recursion to terminate the search early.
+    */
+    fn dfs_hamiltonian(
+        &self,
+        current: NodeIndex,
+        visited: &mut [bool],
+        path: &mut Vec<NodeIndex>,
+        target: usize,
+    ) -> bool {
+        // Base‑case: reached required length → success.
+        if path.len() == target {
+            return true;
+        }
+
+        // Explore every outgoing neighbour.
+        for next in self.graph.neighbors(current) {
+            if !visited[next.index()] {
+                // choose
+                visited[next.index()] = true;
+                path.push(next);
+
+                // explore
+                if self.dfs_hamiltonian(next, visited, path, target) {
+                    return true;
+                }
+
+                // un-choose (back-track)
+                path.pop();
+                visited[next.index()] = false;
+            }
+        }
+        false
+    }
+
+    /// Weakly-connected components sorted descending by size (no `Clone` bound).
+    pub fn components_by_size(&self) -> Vec<Vec<NodeIndex>> {
+        let mut visited = vec![false; self.graph.node_bound()];
+        let mut comps: Vec<Vec<NodeIndex>> = Vec::new();
+        for start in self.graph.node_indices() {
+            if visited[start.index()] {
+                continue;
+            }
+            // DFS over undirected neighbours
+            let mut stack = vec![start];
+            let mut comp = Vec::new();
+            visited[start.index()] = true;
+            while let Some(n) = stack.pop() {
+                comp.push(n);
+                for neigh in self.graph.neighbors_undirected(n) {
+                    if !visited[neigh.index()] {
+                        visited[neigh.index()] = true;
+                        stack.push(neigh);
+                    }
+                }
+            }
+            comps.push(comp);
+        }
+        comps.sort_by_key(|c| std::cmp::Reverse(c.len()));
+        comps
+    }
+
+    /// Convenience wrapper: try to return a Hamiltonian path; if impossible,
+    /// return all weakly‑connected subgraphs ordered by size.
+    pub fn path_or_components(&self) -> Result<Vec<NodeIndex>, Vec<Vec<NodeIndex>>> {
+        if let Some(p) = self.hamiltonian_path() {
+            Ok(p)
+        } else {
+            Err(self.components_by_size())
+        }
+    }
+}
+
+pub trait ToBNDGraph<E = ()> {
+    /// Consume the vector and return a `BNDGraph` whose edges are created with
+    /// `auto_connect()`.
+    fn to_bnd_graph(self) -> BNDGraph<E>
+    where
+        E: Default;
+}
+
+impl<E: Default> ToBNDGraph<E> for Vec<BNDDesc> {
+    fn to_bnd_graph(self) -> BNDGraph<E> {
+        let mut g = BNDGraph::<E>::new();
+        for b in self {
+            g.add_breakpoint(b);
+        }
+        g.auto_connect();
+        g
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
 pub struct DeletionDesc {
     pub contig: String,
@@ -1671,13 +1992,17 @@ impl ReferenceAlternative {
         match self {
             ReferenceAlternative::Nucleotide(_) => None,
             ReferenceAlternative::Nucleotides(bases) => {
-                let seq = bases.iter().skip(1).map(|b| b.to_string()).collect::<String>();
+                let seq = bases
+                    .iter()
+                    .skip(1)
+                    .map(|b| b.to_string())
+                    .collect::<String>();
                 if seq.len() > 1 {
                     Some(estimate_shannon_entropy(&seq))
                 } else {
                     None
                 }
-            },
+            }
             ReferenceAlternative::Unstructured(_) => None,
         }
     }

+ 8 - 0
src/variant/variant_collection.rs

@@ -344,10 +344,12 @@ impl VariantCollection {
                         let mut n_alt = 0;
                         let mut depth = 0;
                         let mut alt_seq = var.alternative.to_string();
+                        let mut is_ins = false;
 
                         let c = if var.alteration_category() == AlterationCategory::INS {
                             // TODO: Add stretch comparison
                             alt_seq = alt_seq.split_off(1);
+                            is_ins = true;
                             counts_ins_at(&mut bam, &var.position.contig(), var.position.position)?
                         } else if var.alteration_category() == AlterationCategory::DEL {
                             alt_seq = "D".to_string();
@@ -359,6 +361,12 @@ impl VariantCollection {
                         for (seq, n) in c {
                             if seq == alt_seq {
                                 n_alt = n;
+                            } else if is_ins && seq.len() > 1 {
+                                // insertion sequence DO NOT have to match 
+                                // rule assumed to reduce FP from Solo callers
+                                // with this rule solo callers won't be able to
+                                // call the variant of an insertion.
+                                n_alt = n;
                             }
                             if seq == *"D" && alt_seq != *"D" {
                                 continue;