use crate::{
    annotation::Annotations,
    collection::ShouldRun,
    helpers::{estimate_shannon_entropy, mean, Hash128},
    positions::{GenomePosition, GetGenomePosition, VcfPosition},
    runners::Run,
    variant::variant_collection::VariantCollection,
};
use anyhow::{anyhow, Context};
use bitcode::{Decode, Encode};
use log::{error, info};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::{
    cmp::Ordering,
    collections::{BTreeSet, HashSet},
    fmt,
    hash::Hash,
    str::FromStr,
};

/// Represents a variant in the Variant Call Format (VCF).
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
pub struct VcfVariant {
    /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
    pub hash: Hash128,
    /// The genomic position of the variant.
    pub position: GenomePosition,
    /// The identifier of the variant.
    pub id: String,
    /// The reference allele.
    pub reference: ReferenceAlternative,
    /// The alternative allele.
    pub alternative: ReferenceAlternative,
    /// The quality score of the variant call, if available.
    pub quality: Option<f64>,
    /// The filter status of the variant.
    pub filter: Filter,
    /// Additional information about the variant.
    pub infos: Infos,
    /// Genotype information and other sample-specific data.
    pub formats: Formats,
}

impl PartialEq for VcfVariant {
    /// Compares two `VcfVariant`s for equality.
    ///
    /// Note: this comparison only considers position, reference, and alternative.
    /// It intentionally ignores id, filter, info, format, and quality.
    fn eq(&self, other: &Self) -> bool {
        // Nota bene: id, filter, info, format and quality are intentionally not compared
        self.position == other.position
            && self.reference == other.reference
            && self.alternative == other.alternative
    }
}

impl Eq for VcfVariant {}

impl FromStr for VcfVariant {
    type Err = anyhow::Error;

    /// Parses a `VcfVariant` from a string representation.
    ///
    /// The input string is expected to be a tab-separated VCF line.
    ///
    /// # Errors
    ///
    /// Returns an error if parsing fails for any field.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let v: Vec<&str> = s.split('\t').collect();
        let vcf_position: VcfPosition = (
            *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
            *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
        )
            .try_into()
            .context(format!("Can't parse position from: {s}"))?;
        let formats = if v.len() >= 10 {
            (
                *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
                *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
            )
                .try_into()
                .context(format!("Can't parse formats from: {s}"))?
        } else {
            Formats::default()
        };
        let position: GenomePosition = vcf_position.into();
        let reference: ReferenceAlternative = v
            .get(3)
            .ok_or(anyhow!("Can't parse reference from: {s}"))?
            .parse()
            .context(format!("Can't parse reference from: {s}"))?;
        let alternative: ReferenceAlternative = v
            .get(4)
            .ok_or(anyhow!("Can't parse alternative from: {s}"))?
            .parse()
            .context(format!("Can't parse alternative from: {s}"))?;
        // Blake3-based 128-bit hash (first 16 bytes of the digest)
        let mut hasher = blake3::Hasher::new();
        hasher.update(&position.contig.to_ne_bytes()); // Convert contig to bytes
        hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
        hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
        hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
        let hash = hasher.finalize();
        let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
        Ok(Self {
            hash,
            position,
            id: v
                .get(2)
                .ok_or(anyhow!("Can't parse id from: {s}"))?
.to_string(), reference, alternative, quality: v .get(5) .map(|s| s.parse::().ok()) // Try to parse as f64; returns Option .unwrap_or(None), filter: v .get(6) .ok_or(anyhow!("Can't parse filter from: {s}"))? .parse() .context(format!("Can't parse filter from: {s}"))?, infos: v .get(7) .ok_or(anyhow!("Can't parse infos from: {s}"))? .parse() .context(format!("Can't parse infos from: {s}"))?, formats, }) } } // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag impl VcfVariant { /// Converts the VcfVariant into a VCF-formatted row string. /// /// This method creates a tab-separated string representation of the variant, /// suitable for writing to a VCF file. pub fn into_vcf_row(&self) -> String { let vcf_position: VcfPosition = self.position.clone().into(); let (contig, position) = vcf_position.into(); let mut columns = vec![ contig, position, self.id.to_string(), self.reference.to_string(), self.alternative.to_string(), self.quality .map(|v| v.to_string()) .unwrap_or(".".to_string()), self.filter.to_string(), self.infos.to_string(), ]; if !self.formats.0.is_empty() { let (format, values) = self.formats.clone().into(); columns.push(format); columns.push(values); } columns.join("\t") } /// Returns the hash of the variant. pub fn hash(&self) -> Hash128 { self.hash } /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS. /// /// This method generates a new variant with shared properties, resetting some fields /// to default or empty values. pub fn commun_deepvariant_clairs(&self) -> VcfVariant { VcfVariant { hash: self.hash, position: self.position.clone(), id: self.id.clone(), reference: self.reference.clone(), alternative: self.alternative.clone(), quality: self.quality, filter: Filter::Other(".".to_string()), infos: Infos(vec![Info::Empty]), formats: self.formats.commun_deepvariant_clairs(), } } /// Checks if the variant has an SVTYPE info field. /// /// Returns true if the variant contains structural variation type information. pub fn has_svtype(&self) -> bool { self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_))) } pub fn n_alt_depth(&self) -> Option<(u32, u32)> { let r = self.formats.n_alt_depth(); if r.is_some() { r } else { self.infos.n_alt_depth() } } /// Retrieves the structural variation type of the variant, if present. /// /// Returns Some(SVType) if the variant has an SVTYPE info field, pub fn svtype(&self) -> Option { self.infos.0.iter().find_map(|e| { if let Info::SVTYPE(sv_type) = e { Some(sv_type.clone()) } else { None } }) } /// Determines the alteration category of the variant. /// /// This method analyzes the reference and alternative alleles to classify /// the variant into one of several alteration categories: /// - SNV (Single Nucleotide Variant) /// - INS (Insertion) /// - DEL (Deletion) /// - Other (including structural variants and complex alterations) /// /// The classification is based on the following rules: /// 1. If both reference and alternative are single nucleotides, it's an SNV. /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion. /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion. /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion. /// 5. If none of the above apply, it checks for structural variant types. /// 6. If no structural variant type is found, it's classified as "Other". 
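///
/// # Example
/// A minimal sketch of the expected classification, assuming the `your_crate` paths used in the
/// other doc examples and that `VcfPosition` accepts the `chr1` contig name:
/// ```
/// use your_crate::{AlterationCategory, VcfVariant};
/// use std::str::FromStr;
///
/// // Rule 1: single-nucleotide reference and alternative => SNV.
/// let snv = VcfVariant::from_str("chr1\t100\t.\tA\tT\t.\tPASS\t.").unwrap();
/// assert_eq!(snv.alteration_category(), AlterationCategory::SNV);
///
/// // Rule 2: single-nucleotide reference, multi-nucleotide alternative => INS.
/// let ins = VcfVariant::from_str("chr1\t100\t.\tA\tATT\t.\tPASS\t.").unwrap();
/// assert_eq!(ins.alteration_category(), AlterationCategory::INS);
/// ```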
/// /// # Returns /// An `AlterationCategory` enum representing the type of alteration. pub fn alteration_category(&self) -> AlterationCategory { match (&self.reference, &self.alternative) { (ReferenceAlternative::Nucleotide(a), ReferenceAlternative::Nucleotide(b)) if *a != Base::N && *b != Base::N => { AlterationCategory::SNV } (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => { AlterationCategory::INS } (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => { AlterationCategory::DEL } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() < b.len() => { AlterationCategory::INS } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() > b.len() => { AlterationCategory::DEL } _ => match self.svtype() { Some(sv_type) => { if let Ok(bnd_desc) = self.bnd_desc() { if bnd_desc.a_contig != bnd_desc.b_contig { AlterationCategory::TRL } else if bnd_desc.a_sens != bnd_desc.b_sens { AlterationCategory::DELINV } else { AlterationCategory::DEL } } else { AlterationCategory::from(sv_type) } } None => AlterationCategory::Other, }, } } pub fn inserted_seq(&self) -> Option { if self.alteration_category() != AlterationCategory::INS { return None; } if let Some(ins) = self.infos.0.iter().find_map(|e| match e { Info::SVINSSEQ(ins) => Some(ins.to_string()), _ => None, }) { return Some(ins); } match &self.alternative { ReferenceAlternative::Nucleotides(nt) => nt .get(1..) .map(|slice| slice.iter().map(ToString::to_string).collect()), _ => None, } } /// Parses and constructs a BND (breakend) description from the alternative string. /// /// This function interprets the BND notation in the alternative string and creates /// a `BNDDesc` struct containing detailed information about the breakend. /// /// # Returns /// - `Ok(BNDDesc)` if parsing is successful /// - `Err` if parsing fails or if the alteration is not a BND /// /// # Errors /// This function will return an error if: /// - The alteration category is not BND pub fn bnd_desc(&self) -> anyhow::Result { let alt = self.alternative.to_string(); BNDDesc::from_vcf_breakend(&self.position().contig(), self.position.position + 1, &alt) .context(format!("The alteration is not BND: {alt}")) } /// Returns the length of the deletion if the variant is a deletion (`DEL`). 
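///
/// A minimal sketch, assuming the `your_crate` paths used in the other doc examples and that
/// `VcfPosition` accepts the `chr1` contig name:
/// ```
/// use your_crate::VcfVariant;
/// use std::str::FromStr;
///
/// // "ACGT" -> "A" removes three reference bases, so the deletion length is 3.
/// let del = VcfVariant::from_str("chr1\t100\t.\tACGT\tA\t.\tPASS\t.").unwrap();
/// assert_eq!(del.deletion_len(), Some(3));
/// ```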
pub fn deletion_len(&self) -> Option { if self.alteration_category() != AlterationCategory::DEL { return None; } // Try using the END field if let Some(len) = self.infos.0.iter().find_map(|i| { if let Info::END(end) = i { Some(end.saturating_sub(self.position.position)) } else { None } }) { return Some(len); } // Fallback to SVLEN field if let Some(len) = self.infos.0.iter().find_map(|i| { if let Info::SVLEN(len) = i { Some(len.unsigned_abs()) } else { None } }) { return Some(len); } match self.bnd_desc() { Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => { if bnd_desc.a_position < bnd_desc.b_position { return Some(bnd_desc.b_position - bnd_desc.a_position); } else { return Some(bnd_desc.a_position - bnd_desc.b_position); } } _ => (), } // Final fallback: infer from reference and alternative if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) = (&self.reference, &self.alternative) { return Some(nt.len().saturating_sub(1) as u32); } if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) = (&self.reference, &self.alternative) { if bnt.len() < nt.len() { return Some(nt.len().saturating_sub(bnt.len()) as u32); } } None } pub fn deletion_seq(&self) -> Option { if self.alteration_category() != AlterationCategory::DEL { return None; } // infer from reference and alternative if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) = (&self.reference, &self.alternative) { return Some(nt.iter().skip(1).map(|e| e.to_string()).collect()); } if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) = (&self.reference, &self.alternative) { if bnt.len() < nt.len() { return Some(nt.iter().skip(1).map(|e| e.to_string()).collect()); } } None } pub fn deletion_desc(&self) -> Option { match self.bnd_desc() { Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => { if bnd_desc.a_position < bnd_desc.b_position { return Some(DeletionDesc { contig: bnd_desc.a_contig, start: bnd_desc.a_position, end: bnd_desc.b_position, }); } else { return Some(DeletionDesc { contig: bnd_desc.a_contig, start: bnd_desc.b_position, end: bnd_desc.a_position, }); } } _ => (), } self.deletion_len().map(|len| DeletionDesc { contig: self.position.contig(), start: self.position.position + 2, end: (self.position.position + 1) .checked_add(len) .unwrap_or(self.position.position + 2), }) } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub struct BNDDesc { /// Coordinates of the **5′ end** (A) pub a_contig: String, pub a_position: u32, pub a_sens: bool, /// Coordinates of the **3′ end** (B) pub b_contig: String, pub b_position: u32, pub b_sens: bool, /// Inserted nucleotides at the junction (may be empty) pub added_nt: String, } pub trait GroupByThreshold { fn group_by(&self, threshold: u32) -> Vec>; } impl GroupByThreshold for Vec { fn group_by(&self, threshold: u32) -> Vec> { let mut sorted_vec = self.clone(); sorted_vec.sort_by(|a, b| { (&a.a_contig, a.a_sens, a.a_position).cmp(&(&b.a_contig, b.a_sens, b.a_position)) }); let mut grouped: Vec> = Vec::new(); for desc in sorted_vec { if let Some(last_group) = grouped.last_mut() { if let Some(last_desc) = last_group.last() { if last_desc.a_contig == desc.a_contig && last_desc.a_sens == desc.a_sens && (desc.a_position as i64 - last_desc.a_position as i64).abs() <= threshold as i64 { last_group.push(desc); continue; } } } grouped.push(vec![desc]); } grouped } } impl BNDDesc { /// Construct a `BNDDesc` from VCF `CHROM`, `POS`, and break‑end 
`ALT` string. /// Supports the four VCF break‑end ALT forms: /// /// 1) `t[p[` → A(5′)=t flank, B(3′)=p flank (+ + ) /// 2) `t]p]` → A(5′)=t flank, B(3′)=p flank but on reverse (+ -) /// 3) `]p]t` → A(5′)=p flank, B(3′)=t flank (- -) /// 4) `[p[t` → A(5′)=p flank, B(3′)=t flank with both on reverse (- +) /// /// Here `t` is the local flank sequence, `p` is the remote contig:pos. pub fn from_vcf_breakend(t_chrom: &str, t_pos: u32, alt: &str) -> Option { // locate first bracket let (open, bracket) = alt.char_indices().find(|&(_, c)| c == '[' || c == ']')?; // find matching bracket let close = alt[open + 1..].find(bracket)? + open + 1; // parse remote contig:pos between brackets let addr = &alt[open + 1..close]; let mut parts = addr.splitn(2, ':'); let p_contig = parts.next()?; let p_position: u32 = parts.next()?.parse().ok()?; // inserted sequence let seq_before = &alt[..open]; let seq_after = &alt[close + 1..]; let mut added_nt = String::new(); added_nt.push_str(seq_before); added_nt.push_str(seq_after); added_nt.remove(0); // determine form and orientation // form 1 & 2: bracket after t (open>0): t before p // form 3 & 4: bracket before t (open==0): p before t let after_local = open > 0; let (t_sens, p_sens) = match (bracket, after_local) { ('[', true) => (true, true), // form 1 t[p[ (']', true) => (true, false), // form 2 t]p] (']', false) => (true, true), // form 3 ]p]t ('[', false) => (true, false), // form 4 [p[t _ => return None, }; // assign A/B depending on form let (a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt) = if after_local { ( t_chrom.into(), t_pos, t_sens, p_contig.into(), p_position, p_sens, added_nt, ) } else { ( p_contig.into(), p_position, p_sens, t_chrom.into(), t_pos, t_sens, reverse_complement(&added_nt), ) }; Some(Self { a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt, }) } pub fn rc(&self) -> Self { Self { a_contig: self.b_contig.clone(), a_position: self.b_position, a_sens: !self.b_sens, b_contig: self.a_contig.clone(), b_position: self.a_position, b_sens: !self.a_sens, // TODO change seq to RC nd should find base on fa added_nt: reverse_complement(&self.added_nt), } } } pub fn reverse_complement(dna: &str) -> String { fn complement(base: char) -> char { match base { 'A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C', 'a' => 't', 't' => 'a', 'c' => 'g', 'g' => 'c', 'N' => 'N', 'n' => 'n', _ => 'N', } } dna.chars().rev().map(complement).collect() } impl core::fmt::Display for BNDDesc { /// Compact textual form: `(A:contig:pos;B:contig:pos;ins=...)`. fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { fn arrow(b: bool) -> &'static str { if b { "->" } else { "<-" } } write!( f, "({}{}:{}::{}::{}:{}{})", arrow(self.a_sens), self.a_contig, self.a_position, self.added_nt, self.b_contig, self.b_position, arrow(self.b_sens), ) } } use petgraph::graph::{DiGraph, NodeIndex}; use petgraph::visit::{IntoNodeIdentifiers, NodeIndexable}; use petgraph::Direction; /// Wrapper around `petgraph::DiGraph` with `BNDDesc` nodes. 
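///
/// A minimal sketch, assuming the `your_crate` paths used in the other doc examples and that the
/// graph is generic over an edge-weight type `E` (here `()`):
/// ```
/// use your_crate::{BNDDesc, BNDGraph};
///
/// let mut graph: BNDGraph<()> = BNDGraph::new();
/// if let Some(desc) = BNDDesc::from_vcf_breakend("chr1", 1000, "A[chr2:500[") {
///     let node = graph.add_breakpoint(desc);
///     // No edges yet; they are usually added later via `auto_connect()`.
///     assert_eq!(graph.successors(node).count(), 0);
/// }
/// ```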
pub struct BNDGraph { graph: DiGraph, } impl Default for BNDGraph { fn default() -> Self { Self { graph: DiGraph::new(), } } } impl BNDGraph { pub fn new() -> Self { Self::default() } pub fn add_breakpoint(&mut self, desc: BNDDesc) -> NodeIndex { self.graph.add_node(desc) } pub fn successors(&self, n: NodeIndex) -> impl Iterator + '_ { self.graph.neighbors_directed(n, Direction::Outgoing) } pub fn predecessors(&self, n: NodeIndex) -> impl Iterator + '_ { self.graph.neighbors_directed(n, Direction::Incoming) } pub fn inner(&self) -> &DiGraph { &self.graph } /// Stringify all nodes pub fn nodes_as_strings(&self) -> Vec { self.graph.node_weights().map(|n| n.to_string()).collect() } } #[allow(clippy::collapsible_if)] impl BNDGraph { /// Build edges following downstream 5′→3′ logic for two patterns: /// /// **Pattern 1** (B → A, *u precedes v*): /// /// * forward (→)  `u.b ≤ v.a`   → edge **u → v** /// * reverse (←)  `u.b ≥ v.a`   → edge **v → u** /// /// **Pattern 2** (A → B, *v precedes u*): /// /// * forward (→)  `u.a ≥ v.b`   → edge **v → u** /// * reverse (←)  `u.a ≤ v.b`   → edge **u → v** pub fn auto_connect(&mut self) where E: Default, { let nodes: Vec = self.graph.node_indices().collect(); let mut pending: Vec<(NodeIndex, NodeIndex)> = Vec::new(); for &u_idx in &nodes { for &v_idx in &nodes { if u_idx == v_idx { continue; } let u = &self.graph[u_idx]; let v = &self.graph[v_idx]; // Pattern 1: B(u) -> A(v) (u precedes v) if u.b_contig == v.a_contig && u.b_sens == v.a_sens { if (u.b_sens && u.b_position <= v.a_position) || (!u.b_sens && u.b_position >= v.a_position) { pending.push((u_idx, v_idx)); } } // Pattern 2: A(u) -> B(v) (v precedes u) if u.a_contig == v.b_contig && u.a_sens == v.b_sens { if (u.a_sens && u.a_position >= v.b_position) || (!u.a_sens && u.a_position <= v.b_position) { // Edge direction depends on strand logic let (src, dst) = if u.a_sens { // forward ⇒ edge v -> u (v_idx, u_idx) } else { // reverse ⇒ edge u -> v (u_idx, v_idx) }; pending.push((src, dst)); } } } } // second pass – insert, skipping duplicates for (src, dst) in pending { if self.graph.find_edge(src, dst).is_none() { self.graph.add_edge(src, dst, E::default()); } } } /// Add an edge if absent. pub fn add_edge_if_absent(&mut self, src: NodeIndex, dst: NodeIndex) { if self.graph.find_edge(src, dst).is_none() { self.graph.add_edge(src, dst, E::default()); } } /// Pretty‑print a sequence of nodes indicating **actual traversal /// direction** between successive nodes: /// * `→` when the graph contains an edge from *previous* → *current*. /// * `←` when the edge exists from *current* → *previous* (path goes /// "against" the stored direction). /// /// If neither directional edge is present the placeholder `--` is used so /// debugging clearly shows missing links. 
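///
/// A minimal sketch, assuming the `your_crate` paths used in the other doc examples, `()` edge
/// weights, and two hypothetical breakends on chr1/chr2:
/// ```
/// use your_crate::{BNDDesc, BNDGraph};
///
/// let mut graph: BNDGraph<()> = BNDGraph::new();
/// let a = graph.add_breakpoint(BNDDesc::from_vcf_breakend("chr1", 1000, "A[chr2:500[").unwrap());
/// let b = graph.add_breakpoint(BNDDesc::from_vcf_breakend("chr2", 600, "T[chr1:2000[").unwrap());
/// graph.add_edge_if_absent(a, b);
/// // Nodes are rendered with `BNDDesc`'s `Display` impl and joined by " → ".
/// println!("{}", graph.fmt_path(&[a, b]));
/// ```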
pub fn fmt_path(&self, path: &[NodeIndex]) -> String { use core::fmt::Write as _; let mut out = String::new(); if let Some((&first, rest)) = path.split_first() { let _ = write!(out, "{}", &self.graph[first]); let mut prev = first; for &curr in rest { let arrow = if self.graph.find_edge(prev, curr).is_some() { " → " } else if self.graph.find_edge(curr, prev).is_some() { " ← " } else { " -- " // unexpected: no direct edge }; out.push_str(arrow); let _ = write!(out, "{}", &self.graph[curr]); prev = curr; } } out } // ------------------------------------------------------------------ // Traversal utilities // ------------------------------------------------------------------ /// Return a directed path that visits **every node exactly once** (Hamiltonian /// path) if one exists. Uses naive DFS back-tracking which is exponential— /// fine for small graphs (<15 nodes) but aborts early otherwise. pub fn hamiltonian_path(&self) -> Option> { let n = self.graph.node_count(); if n == 0 { return Some(Vec::new()); } if n > 15 { // avoid combinatorial explosion return None; } let mut path = Vec::with_capacity(n); let mut visited = vec![false; self.graph.node_bound()]; for start in self.graph.node_identifiers() { path.push(start); visited[start.index()] = true; if self.dfs_hamiltonian(start, &mut visited, &mut path, n) { return Some(path); } visited[start.index()] = false; path.clear(); } None } /** Recursive helper for [`hamiltonian_path`](#method.hamiltonian_path). * `current` – node we are currently expanding from. * `visited` – mutable bitmap indicating which nodes are already on `path`. * `path`    – growing list of nodes (last element is always `current`). * `target`  – desired final length (= `graph.node_count()`). The function explores each outgoing neighbour that hasn’t been visited yet, marking it **visited → recurse → unvisit** (classic DFS back-tracking). It returns `true` as soon as a full-length path is discovered, which bubbles up through the recursion to terminate the search early. */ fn dfs_hamiltonian( &self, current: NodeIndex, visited: &mut [bool], path: &mut Vec, target: usize, ) -> bool { // Base‑case: reached required length → success. if path.len() == target { return true; } // Explore every outgoing neighbour. for next in self.graph.neighbors(current) { if !visited[next.index()] { // choose visited[next.index()] = true; path.push(next); // explore if self.dfs_hamiltonian(next, visited, path, target) { return true; } // un-choose (back-track) path.pop(); visited[next.index()] = false; } } false } /// Weakly-connected components sorted descending by size (no `Clone` bound). pub fn components_by_size(&self) -> Vec> { let mut visited = vec![false; self.graph.node_bound()]; let mut comps: Vec> = Vec::new(); for start in self.graph.node_indices() { if visited[start.index()] { continue; } // DFS over undirected neighbours let mut stack = vec![start]; let mut comp = Vec::new(); visited[start.index()] = true; while let Some(n) = stack.pop() { comp.push(n); for neigh in self.graph.neighbors_undirected(n) { if !visited[neigh.index()] { visited[neigh.index()] = true; stack.push(neigh); } } } comps.push(comp); } comps.sort_by_key(|c| std::cmp::Reverse(c.len())); comps } /// Convenience wrapper: try to return a Hamiltonian path; if impossible, /// return all weakly‑connected subgraphs ordered by size. 
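///
/// A minimal sketch, assuming the `your_crate` paths used in the other doc examples and `()` edge
/// weights:
/// ```
/// use your_crate::{BNDDesc, BNDGraph};
///
/// let mut graph: BNDGraph<()> = BNDGraph::new();
/// graph.add_breakpoint(BNDDesc::from_vcf_breakend("chr1", 1000, "A[chr2:500[").unwrap());
/// match graph.path_or_components() {
///     Ok(path) => println!("Hamiltonian path over {} nodes", path.len()),
///     Err(components) => println!("{} weakly-connected components", components.len()),
/// }
/// ```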
pub fn path_or_components(&self) -> Result, Vec>> { if let Some(p) = self.hamiltonian_path() { Ok(p) } else { Err(self.components_by_size()) } } } pub trait ToBNDGraph { /// Consume the vector and return a `BNDGraph` whose edges are created with /// `auto_connect()`. fn to_bnd_graph(self) -> BNDGraph where E: Default; } impl ToBNDGraph for Vec { fn to_bnd_graph(self) -> BNDGraph { let mut g = BNDGraph::::new(); for b in self { g.add_breakpoint(b); } g.auto_connect(); g } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub struct DeletionDesc { pub contig: String, pub start: u32, // 1-based [) pub end: u32, } impl fmt::Display for DeletionDesc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}:{}_{}_del", self.contig, self.start, self.end) } } impl DeletionDesc { pub fn len(&self) -> u32 { self.end.saturating_sub(self.start) + 1 } pub fn is_empty(&self) -> bool { self.len() == 0 } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub enum AlterationCategory { SNV, DEL, INS, DUP, INV, CNV, TRL, BND, DELINV, Other, } impl fmt::Display for AlterationCategory { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { AlterationCategory::SNV => "SNV", AlterationCategory::DEL => "DEL", AlterationCategory::INS => "INS", AlterationCategory::DUP => "DUP", AlterationCategory::INV => "INV", AlterationCategory::CNV => "CNV", AlterationCategory::BND => "BND", AlterationCategory::TRL => "TRL", AlterationCategory::Other => "Other", AlterationCategory::DELINV => "DELINV", } ) } } impl From for AlterationCategory { fn from(sv_type: SVType) -> Self { match sv_type { SVType::DEL => AlterationCategory::DEL, SVType::INS => AlterationCategory::INS, SVType::DUP => AlterationCategory::DUP, SVType::INV => AlterationCategory::INV, SVType::CNV => AlterationCategory::CNV, SVType::BND => AlterationCategory::BND, } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)] pub enum SVType { DEL, INS, DUP, INV, CNV, BND, } impl FromStr for SVType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "DEL" => Ok(SVType::DEL), "INS" => Ok(SVType::INS), "DUP" => Ok(SVType::DUP), "INV" => Ok(SVType::INV), "CNV" => Ok(SVType::CNV), "BND" => Ok(SVType::BND), _ => Err(anyhow!("Can't parse SVTYPE={s}")), } } } impl fmt::Display for SVType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { SVType::DEL => "DEL", SVType::INS => "INS", SVType::DUP => "DUP", SVType::INV => "INV", SVType::CNV => "CNV", SVType::BND => "BND", } ) } } impl VariantId for VcfVariant { fn variant_id(&self) -> String { format!( "{}:{}_{}_{}", self.position.contig(), self.position.position + 1, self.reference, self.alternative ) } } impl GetGenomePosition for VcfVariant { fn position(&self) -> &GenomePosition { &self.position } } impl PartialOrd for VcfVariant { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for VcfVariant { fn cmp(&self, other: &Self) -> Ordering { self.position.cmp(&other.position) } } /// A container for a list of VCF `INFO` fields. /// /// Represents a parsed set of key-value annotations or flags found in the INFO column. 
/// /// # Example /// ``` /// use your_crate::Infos; /// use std::str::FromStr; /// /// let infos = Infos::from_str("SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15").unwrap(); /// println!("{}", infos); // Displays: SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15 /// ``` #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)] pub struct Infos(pub Vec); impl FromStr for Infos { type Err = anyhow::Error; /// Parses a semicolon-separated list of INFO fields from a VCF record. fn from_str(s: &str) -> anyhow::Result { Ok(Self( s.split(";") .map(Info::from_str) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?, )) } } impl fmt::Display for Infos { /// Formats the `Infos` as a semicolon-separated VCF-style INFO string. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let items: Vec<_> = self .0 .iter() .filter(|info| !matches!(info, Info::Empty)) .map(ToString::to_string) .collect(); if items.is_empty() { write!(f, ".") } else { write!(f, "{}", items.join(";")) } } } impl Infos { pub fn n_alt_depth(&self) -> Option<(u32, u32)> { use Info::*; let mut tumor_alt: Option = None; let mut tumor_depth: Option = None; for info in self.0.iter() { match info { TUMOUR_DP_AT(v) => { let m = mean(v); tumor_depth = Some(m.round() as u32); } TUMOUR_READ_SUPPORT(v) => { tumor_alt = Some(*v); } _ => (), } } match (tumor_alt, tumor_depth) { (Some(a), Some(d)) => Some((a, d)), _ => None, } } } /// Enum representing a single INFO field in a VCF record. /// /// Supports both standard fields and Severus-specific structural variant annotations. /// Handles string values, numeric values, vectors, and flags. /// /// Variants with `Vec<_>` represent fields with multiple comma-separated values. #[allow(non_camel_case_types)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)] pub enum Info { Empty, H, F, P, FAU(u32), FCU(u32), FGU(u32), FTU(u32), RAU(u32), RCU(u32), RGU(u32), RTU(u32), SVTYPE(SVType), MATEID(String), NORMAL_READ_SUPPORT(u32), TUMOUR_READ_SUPPORT(u32), NORMAL_ALN_SUPPORT(u32), TUMOUR_ALN_SUPPORT(u32), SVLEN(i32), TUMOUR_DP_BEFORE(Vec), TUMOUR_DP_AT(Vec), TUMOUR_DP_AFTER(Vec), NORMAL_DP_BEFORE(Vec), NORMAL_DP_AT(Vec), NORMAL_DP_AFTER(Vec), TUMOUR_AF(Vec), NORMAL_AF(Vec), BP_NOTATION(String), SOURCE(String), CLUSTERED_READS_TUMOUR(u32), CLUSTERED_READS_NORMAL(u32), TUMOUR_ALT_HP(Vec), TUMOUR_PS(Vec), NORMAL_ALT_HP(Vec), NORMAL_PS(Vec), TUMOUR_TOTAL_HP_AT(Vec), NORMAL_TOTAL_HP_AT(Vec), ORIGIN_STARTS_STD_DEV(f32), ORIGIN_MAPQ_MEAN(f32), ORIGIN_EVENT_SIZE_STD_DEV(f32), ORIGIN_EVENT_SIZE_MEDIAN(f32), ORIGIN_EVENT_SIZE_MEAN(f32), END_STARTS_STD_DEV(f32), END_MAPQ_MEAN(f32), END_EVENT_SIZE_STD_DEV(f32), END_EVENT_SIZE_MEDIAN(f32), END_EVENT_SIZE_MEAN(f32), CLASS(String), END(u32), SVINSLEN(u32), SVINSSEQ(String), // Severus PRECISE, IMPRECISE, STRANDS(String), DETAILED_TYPE(String), INSLEN(i32), MAPQ(u32), PHASESETID(String), HP(u32), CLUSTERID(String), INSSEQ(String), MATE_ID(String), INSIDE_VNTR(String), ALINGED_POS(String), } impl FromStr for Info { type Err = anyhow::Error; /// Parses a single `INFO` key or key=value string into a typed `Info` variant. 
/// /// Handles both presence/absence flags and key-value fields fn from_str(s: &str) -> anyhow::Result { if s.contains('=') { let (key, value) = s .split_once('=') .context(format!("Can't split with `=` in string: {s}"))?; Ok(match key { "FAU" => Info::FAU(parse_value(value, key)?), "FCU" => Info::FCU(parse_value(value, key)?), "FGU" => Info::FGU(parse_value(value, key)?), "FTU" => Info::FTU(parse_value(value, key)?), "RAU" => Info::RAU(parse_value(value, key)?), "RCU" => Info::RCU(parse_value(value, key)?), "RGU" => Info::RGU(parse_value(value, key)?), "RTU" => Info::RTU(parse_value(value, key)?), "SVLEN" => Info::SVLEN(parse_value(value, key)?), "END" => Info::END(parse_value(value, key)?), "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?), "SVTYPE" => Info::SVTYPE(value.parse()?), "MATEID" => Info::MATEID(value.to_string()), "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?), "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?), "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?), "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?), "SVINSSEQ" => Info::SVINSSEQ(value.to_string()), "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?), "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?), "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?), "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?), "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?), "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?), "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?), "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?), "BP_NOTATION" => Info::BP_NOTATION(value.to_string()), "SOURCE" => Info::SOURCE(value.to_string()), "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?), "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?), "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?), "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?), "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?), "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?), "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?), "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?), "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?), "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?), "ORIGIN_EVENT_SIZE_STD_DEV" => { Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?) } "ORIGIN_EVENT_SIZE_MEDIAN" => { Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?) 
} "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?), "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?), "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?), "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?), "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?), "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?), "CLASS" => Info::CLASS(value.to_string()), "PRECISE" => Info::PRECISE, "IMPRECISE" => Info::IMPRECISE, "STRANDS" => Info::STRANDS(value.to_string()), "DETAILED_TYPE" => Info::DETAILED_TYPE(value.to_string()), "INSLEN" => Info::INSLEN(parse_value(value, key)?), "MAPQ" => Info::MAPQ(parse_value(value, key)?), "PHASESETID" => Info::PHASESETID(value.to_string()), "HP" => Info::HP(parse_value(value, key)?), "CLUSTERID" => Info::CLUSTERID(value.to_string()), "INSSEQ" => Info::INSSEQ(value.to_string()), "MATE_ID" => Info::MATE_ID(value.to_string()), "INSIDE_VNTR" => Info::INSIDE_VNTR(value.to_string()), "ALINGED_POS" => Info::ALINGED_POS(value.to_string()), _ => Info::Empty, }) } else { Ok(match s { "H" => Info::H, "F" => Info::F, "P" => Info::P, "PRECISE" => Info::PRECISE, "IMPRECISE" => Info::IMPRECISE, _ => Info::Empty, }) } } } impl fmt::Display for Info { /// Converts the `Info` enum into a VCF-compliant string (key=value or flag). fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Info::Empty => write!(f, "."), Info::H => write!(f, "H"), Info::F => write!(f, "F"), Info::P => write!(f, "P"), // ClairS Info::FAU(v) => write!(f, "FAU={v}"), Info::FCU(v) => write!(f, "FCU={v}"), Info::FGU(v) => write!(f, "FGU={v}"), Info::FTU(v) => write!(f, "FTU={v}"), Info::RAU(v) => write!(f, "RAU={v}"), Info::RCU(v) => write!(f, "RCU={v}"), Info::RGU(v) => write!(f, "RGU={v}"), Info::RTU(v) => write!(f, "RTU={v}"), // Nanomonsv Info::SVTYPE(v) => write!(f, "SVTYPE={v}"), Info::SVLEN(v) => write!(f, "SVLEN={v}"), Info::END(v) => write!(f, "END={v}"), Info::MATEID(v) => write!(f, "MATEID={v}"), Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"), Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"), // SAVANA Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"), Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"), Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"), Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"), Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)), Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)), Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)), Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)), Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)), Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)), Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)), Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)), Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"), Info::SOURCE(v) => write!(f, "SOURCE={v}"), Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"), Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"), Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)), Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")), Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)), 
Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")), Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)), Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)), Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"), Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"), Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"), Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"), Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"), Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"), Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"), Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"), Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"), Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"), Info::CLASS(v) => write!(f, "CLASS={v}"), // Severus Info::PRECISE => write!(f, "PRECISE"), Info::IMPRECISE => write!(f, "IMPRECISE"), Info::STRANDS(v) => write!(f, "STRANDS={v}"), Info::DETAILED_TYPE(v) => write!(f, "DETAILED_TYPE={v}"), Info::INSLEN(v) => write!(f, "INSLEN={v}"), Info::MAPQ(v) => write!(f, "MAPQ={v}"), Info::PHASESETID(v) => write!(f, "PHASESETID={v}"), Info::HP(v) => write!(f, "HP={v}"), Info::CLUSTERID(v) => write!(f, "CLUSTERID={v}"), Info::INSSEQ(v) => write!(f, "INSSEQ={v}"), Info::MATE_ID(v) => write!(f, "MATE_ID={v}"), Info::INSIDE_VNTR(v) => write!(f, "INSIDE_VNTR={v}"), Info::ALINGED_POS(v) => write!(f, "ALINGED_POS={v}"), } } } pub fn concat_numbers(v: &[T]) -> String { v.iter() .map(|n| n.to_string()) .collect::>() .join(",") } impl Info { /// Returns the VCF key name (e.g. "DP", "SVTYPE", "PRECISE") for this INFO field. /// /// This uses the string representation (`Display`) and extracts the part before `=`, /// which is valid for both key=value and flag-only entries. pub fn key(&self) -> String { self.to_string() .split('=') .next() .unwrap_or(".") .to_string() } /// Returns the complete set of known VCF `INFO` header definitions used by `Info` variants. /// /// # Example /// ``` /// let headers = Info::header_definitions(); /// for line in headers { /// println!("{line}"); /// } /// ``` pub fn header_definitions() -> BTreeSet { let mut set = BTreeSet::new(); macro_rules! 
push { ($id:expr, $num:expr, $typ:expr, $desc:expr) => { set.insert(format!( r#"##INFO="#, $id, $num, $typ, $desc )); }; } // Flags push!("H", 0, "Flag", "H flag"); push!("F", 0, "Flag", "F flag"); push!("P", 0, "Flag", "P flag"); // Allelic support push!("FAU", 1, "Integer", "Forward A support in tumour"); push!("FCU", 1, "Integer", "Forward C support in tumour"); push!("FGU", 1, "Integer", "Forward G support in tumour"); push!("FTU", 1, "Integer", "Forward T support in tumour"); push!("RAU", 1, "Integer", "Reverse A support in tumour"); push!("RCU", 1, "Integer", "Reverse C support in tumour"); push!("RGU", 1, "Integer", "Reverse G support in tumour"); push!("RTU", 1, "Integer", "Reverse T support in tumour"); // Structural variant metadata push!("SVTYPE", 1, "String", "Structural variant type"); push!("MATEID", 1, "String", "ID of the mate breakend"); push!("SVLEN", 1, "Integer", "Length of structural variant"); push!("SVINSLEN", 1, "Integer", "Length of inserted sequence"); push!("SVINSSEQ", 1, "String", "Inserted sequence"); // Positions and read support push!("END", 1, "Integer", "End position of the variant"); push!( "NORMAL_READ_SUPPORT", 1, "Integer", "Supporting reads in normal sample" ); push!( "TUMOUR_READ_SUPPORT", 1, "Integer", "Supporting reads in tumour sample" ); push!( "NORMAL_ALN_SUPPORT", 1, "Integer", "Aligned reads in normal sample" ); push!( "TUMOUR_ALN_SUPPORT", 1, "Integer", "Aligned reads in tumour sample" ); // Depth profiles push!( "TUMOUR_DP_BEFORE", ".", "Integer", "Depth before breakpoint in tumour" ); push!( "TUMOUR_DP_AT", ".", "Integer", "Depth at breakpoint in tumour" ); push!( "TUMOUR_DP_AFTER", ".", "Integer", "Depth after breakpoint in tumour" ); push!( "NORMAL_DP_BEFORE", ".", "Integer", "Depth before breakpoint in normal" ); push!( "NORMAL_DP_AT", ".", "Integer", "Depth at breakpoint in normal" ); push!( "NORMAL_DP_AFTER", ".", "Integer", "Depth after breakpoint in normal" ); // Allele frequencies push!( "TUMOUR_AF", ".", "Float", "Variant allele frequencies in tumour" ); push!( "NORMAL_AF", ".", "Float", "Variant allele frequencies in normal" ); // Haplotype/phasing push!( "TUMOUR_ALT_HP", ".", "Integer", "Alternate haplotype support in tumour" ); push!("TUMOUR_PS", ".", "String", "Phasing set in tumour"); push!( "NORMAL_ALT_HP", ".", "Integer", "Alternate haplotype support in normal" ); push!("NORMAL_PS", ".", "String", "Phasing set in normal"); push!( "TUMOUR_TOTAL_HP_AT", ".", "Integer", "Total haplotype depth at breakpoint in tumour" ); push!( "NORMAL_TOTAL_HP_AT", ".", "Integer", "Total haplotype depth at breakpoint in normal" ); // Cluster analysis push!( "CLUSTERED_READS_TUMOUR", 1, "Integer", "Clustered reads in tumour" ); push!( "CLUSTERED_READS_NORMAL", 1, "Integer", "Clustered reads in normal" ); // Origin and end-point statistics push!( "ORIGIN_STARTS_STD_DEV", 1, "Float", "STDDEV of read starts at origin" ); push!("ORIGIN_MAPQ_MEAN", 1, "Float", "Mean MAPQ at origin"); push!( "ORIGIN_EVENT_SIZE_STD_DEV", 1, "Float", "STDDEV of event size at origin" ); push!( "ORIGIN_EVENT_SIZE_MEDIAN", 1, "Float", "Median event size at origin" ); push!( "ORIGIN_EVENT_SIZE_MEAN", 1, "Float", "Mean event size at origin" ); push!( "END_STARTS_STD_DEV", 1, "Float", "STDDEV of read starts at end" ); push!("END_MAPQ_MEAN", 1, "Float", "Mean MAPQ at end"); push!( "END_EVENT_SIZE_STD_DEV", 1, "Float", "STDDEV of event size at end" ); push!( "END_EVENT_SIZE_MEDIAN", 1, "Float", "Median event size at end" ); push!("END_EVENT_SIZE_MEAN", 1, "Float", "Mean event 
size at end"); // Additional push!("BP_NOTATION", 1, "String", "Breakpoint notation"); push!("SOURCE", 1, "String", "Caller source name"); push!("CLASS", 1, "String", "Variant classification"); // Severus push!( "PRECISE", 0, "Flag", "SV with precise breakpoints coordinates and length" ); push!( "IMPRECISE", 0, "Flag", "SV with imprecise breakpoints coordinates and length" ); push!("STRANDS", 1, "String", "Breakpoint strandedness"); push!("DETAILED_TYPE", 1, "String", "Detailed type of the SV"); push!( "INSLEN", 1, "Integer", "Length of the unmapped sequence between breakpoint" ); push!( "MAPQ", 1, "Integer", "Median mapping quality of supporting reads" ); push!( "PHASESETID", 1, "String", "Matching phaseset ID for phased SVs" ); push!("HP", 1, "Integer", "Matching haplotype ID for phased SVs"); push!("CLUSTERID", 1, "String", "Cluster ID in breakpoint_graph"); push!( "INSSEQ", 1, "String", "Insertion sequence between breakpoints" ); push!("MATE_ID", 1, "String", "MATE ID for breakends"); push!( "INSIDE_VNTR", 1, "String", "True if an indel is inside a VNTR" ); push!("ALINGED_POS", 1, "String", "Position in the reference"); set } } /// Format /// Enum representing individual FORMAT fields from a VCF record. /// /// This enum supports common fields used by DeepVariant, Clairs, and nanomonsv, /// as well as a generic fallback for other key-value pairs. /// /// # Examples /// /// ``` /// use your_crate::Format; /// /// let gt = Format::GT("0/1".to_string()); /// let dp = Format::DP(30); /// let ad = Format::AD(vec![10, 20]); /// ``` #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)] pub enum Format { // --- DeepVariant fields --- /// Genotype string, e.g., "0/1", "1/1". GT(String), /// Genotype quality. GQ(u32), /// Read depth (total coverage at the variant position). DP(u32), /// Allelic depths for the ref and alt alleles (e.g., [ref, alt1, alt2...]). AD(Vec), /// Variant allele frequency (e.g., 0.25 for 25%). VAF(f32), /// Phred-scaled genotype likelihoods. PL(Vec), // --- Clairs fields (prefixed with N: for normal sample, or tumor in case of paired) --- /// Normal sample total depth. NDP(u32), /// Normal sample allelic depths (e.g., [ref, alt1, alt2...]). NAD(Vec), /// Allele-specific counts for A, C, G, T bases in tumor sample. AU(u32), CU(u32), GU(u32), TU(u32), /// Allele-specific counts for A, C, G, T bases in normal sample. NAU(u32), NCU(u32), NGU(u32), NTU(u32), // --- nanomonsv fields --- /// Total number of supporting reads in tumor. TR(u32), /// Variant-supporting reads in tumor. VR(u32), // --- Severus fields --- DR(u32), DV(u32), HVAF(Vec), /// Fallback for any other key-value pair not explicitly modeled. /// Contains the raw key and value as strings. Other((String, String)), } /// Container for a list of `Format` items. /// Represents the full FORMAT field and sample value for one sample. /// /// # Examples /// /// ``` /// use your_crate::{Formats, Format}; /// /// let formats = Formats(vec![ /// Format::GT("0/1".to_string()), /// Format::DP(45), /// Format::AD(vec![15, 30]), /// ]); /// ``` #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)] pub struct Formats(pub Vec); impl Formats { /// Returns the tumor alternative read depth and total depth if both are available. /// /// This method looks for: /// - `Format::AD`: to compute the sum of alternative allele depths (excluding reference) /// - `Format::DP`: to get total read depth /// /// Returns `Some((alt_depth, total_depth))` if both are present, else `None`. 
/// /// # Example /// ``` /// use your_crate::{Formats, Format}; /// /// let f = Formats(vec![ /// Format::AD(vec![10, 20, 5]), /// Format::DP(40), /// ]); /// /// assert_eq!(f.n_alt_depth(), Some((25, 40))); /// ``` pub fn n_alt_depth(&self) -> Option<(u32, u32)> { let mut tumor_alt_depth: Option = None; let mut tumor_ref_depth: Option = None; let mut tumor_total_depth: Option = None; for format in &self.0 { match format { Format::AD(values) => { if values.len() > 1 { tumor_alt_depth = Some(values[1..].iter().sum()); } } Format::DP(value) => { tumor_total_depth = Some(*value); } Format::VR(values) => { tumor_alt_depth = Some(*values); } Format::TR(value) => { tumor_ref_depth = Some(*value); } Format::DV(values) => { tumor_alt_depth = Some(*values); } Format::DR(value) => { tumor_ref_depth = Some(*value); } _ => {} } } match (tumor_alt_depth, tumor_total_depth, tumor_ref_depth) { (Some(alt), Some(total), _) => Some((alt, total)), (Some(alt), _, Some(refe)) => Some((alt, alt + refe)), _ => None, } } } impl TryFrom<(&str, &str)> for Formats { type Error = anyhow::Error; /// Attempts to construct a `Formats` from a pair of colon-separated FORMAT keys and values. /// /// # Arguments /// * `k` - FORMAT field names (e.g., "GT:DP:AD") /// * `v` - Corresponding values (e.g., "0/1:35:10,25") /// /// # Errors /// Returns an error if the number of keys and values do not match or if parsing fails. /// /// # Example /// ``` /// use your_crate::Formats; /// use std::convert::TryFrom; /// /// let f = Formats::try_from(("GT:DP:AD", "0/1:40:12,28")).unwrap(); /// ``` fn try_from((k, v): (&str, &str)) -> anyhow::Result { let keys: Vec<&str> = k.split(':').collect(); let values: Vec<&str> = v.split(':').collect(); if keys.len() != values.len() { anyhow::bail!("Mismatch between keys and values count for {k} {v}"); } Ok(Self( keys.into_iter() .zip(values) .map(|(key, value)| Format::try_from((key, value))) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?, )) } } impl From for (String, String) { /// Converts `Formats` back into a `(keys, values)` tuple of colon-separated strings. /// /// This is the inverse of the `TryFrom<(&str, &str)>` implementation. /// /// # Example /// ``` /// use your_crate::{Format, Formats}; /// /// let formats = Formats(vec![ /// Format::GT("0/1".to_string()), /// Format::DP(30), /// ]); /// /// let (k, v): (String, String) = formats.into(); /// assert_eq!(k, "GT:DP"); /// assert_eq!(v, "0/1:30"); /// ``` fn from(formats: Formats) -> Self { let mut keys = Vec::new(); let mut values = Vec::new(); for format in formats.0 { let (key, value): (String, String) = format.into(); keys.push(key); values.push(value); } (keys.join(":"), values.join(":")) } } impl TryFrom<(&str, &str)> for Format { type Error = anyhow::Error; /// Tries to convert a `(key, value)` pair into a typed `Format` variant. /// /// This parser supports known FORMAT keys from DeepVariant, Clairs, and nanomonsv. /// Unknown keys are stored as `Format::Other((key, value))`. 
/// /// # Arguments /// * `key` - FORMAT field name /// * `value` - raw string value associated with the key /// /// # Example /// ``` /// use your_crate::Format; /// use std::convert::TryFrom; /// /// let dp = Format::try_from(("DP", "42")).unwrap(); /// assert!(matches!(dp, Format::DP(42))); /// ``` fn try_from((key, value): (&str, &str)) -> anyhow::Result { let format = match key { // DeepVariant "GT" => Format::GT(value.to_string()), "GQ" => Format::GQ(parse_value(value, key)?), "DP" => Format::DP(parse_value(value, key)?), "AD" => Format::AD(parse_vec_value(value, key)?), "VAF" => Format::VAF(parse_value(value, key)?), "PL" => Format::PL(parse_vec_value(value, key)?), // Clairs "NDP" => Format::NDP(parse_value(value, key)?), "NAD" => Format::NAD(parse_vec_value(value, key)?), "AU" => Format::AU(parse_value(value, key)?), "CU" => Format::CU(parse_value(value, key)?), "GU" => Format::GU(parse_value(value, key)?), "TU" => Format::TU(parse_value(value, key)?), "NAU" => Format::NAU(parse_value(value, key)?), "NCU" => Format::NCU(parse_value(value, key)?), "NGU" => Format::NGU(parse_value(value, key)?), "NTU" => Format::NTU(parse_value(value, key)?), // nanomonsv "TR" => Format::TR(parse_value(value, key)?), "VR" => Format::VR(parse_value(value, key)?), // Severus "DR" => Format::DR(parse_value(value, key)?), "DV" => Format::DV(parse_value(value, key)?), "hVAF" => Format::HVAF(parse_vec_value(value, key)?), // fallback _ => Format::Other((key.to_string(), value.to_string())), }; Ok(format) } } // Helper function to parse a single value (DeepSeek) fn parse_value(value: &str, key: &str) -> anyhow::Result where T::Err: std::fmt::Debug, { value .parse() .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error` .context(format!("Can't parse {}: {}", key, value)) // Add context } // Helper function to parse comma-separated values fn parse_vec_value(value: &str, key: &str) -> anyhow::Result> where T::Err: std::fmt::Debug, { value .split(',') .map(|e| { e.parse() .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error` .context(format!("Failed to parse {}: {}", key, e)) // Add context }) .collect() } impl From for (String, String) { /// Converts a `Format` enum into a `(key, value)` pair, as strings. /// /// This is used to serialize the FORMAT field back into VCF-compatible string values. /// The key corresponds to the field ID (e.g., `"DP"`, `"GT"`), and the value is the encoded string representation. 
/// /// # Examples /// ``` /// use your_crate::Format; /// let f = Format::DP(42); /// let (k, v): (String, String) = f.into(); /// assert_eq!(k, "DP"); /// assert_eq!(v, "42"); /// ``` fn from(format: Format) -> Self { let concat_u32 = |values: Vec| -> String { values .iter() .map(u32::to_string) .collect::>() .join(",") }; let concat_f32 = |values: Vec| -> String { values .iter() .map(|v| format!("{:.5}", v)) // consistent decimal format .collect::>() .join(",") }; match format { // DeepVariant Format::GT(value) => ("GT".to_string(), value), Format::GQ(value) => ("GQ".to_string(), value.to_string()), Format::DP(value) => ("DP".to_string(), value.to_string()), Format::AD(values) => ("AD".to_string(), concat_u32(values)), Format::VAF(value) => ("VAF".to_string(), format!("{:.5}", value)), Format::PL(values) => ("PL".to_string(), concat_u32(values)), // Clairs Format::NDP(value) => ("NDP".to_string(), value.to_string()), Format::NAD(values) => ("NAD".to_string(), concat_u32(values)), Format::AU(value) => ("AU".to_string(), value.to_string()), Format::CU(value) => ("CU".to_string(), value.to_string()), Format::GU(value) => ("GU".to_string(), value.to_string()), Format::TU(value) => ("TU".to_string(), value.to_string()), Format::NAU(value) => ("NAU".to_string(), value.to_string()), Format::NCU(value) => ("NCU".to_string(), value.to_string()), Format::NGU(value) => ("NGU".to_string(), value.to_string()), Format::NTU(value) => ("NTU".to_string(), value.to_string()), // nanomonsv Format::TR(value) => ("TR".to_string(), value.to_string()), Format::VR(value) => ("VR".to_string(), value.to_string()), // Severus Format::DR(value) => ("DR".to_string(), value.to_string()), Format::DV(value) => ("DV".to_string(), value.to_string()), Format::HVAF(values) => ("hVAF".to_string(), concat_f32(values)), // fallback Format::Other((key, value)) => (key, value), } } } impl Formats { pub fn commun_deepvariant_clairs(&self) -> Self { let filtered_vec: Vec = self .0 .clone() .into_iter() .map(|e| { if let Format::VAF(_v) = e { e // Format::AF(v) } else { e } }) .filter(|format| { matches!( format, Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */ ) }) .collect(); Formats(filtered_vec) } /// Returns a sorted set of VCF header definitions for all possible `Format` fields. 
/// /// # Example /// ``` /// let headers = Formats::format_headers(); /// for h in headers { /// println!("{}", h); /// } /// ``` pub fn format_headers() -> BTreeSet { let mut headers = BTreeSet::new(); headers .insert(r#"##FORMAT="#.to_string()); headers.insert( r#"##FORMAT="#.to_string(), ); headers.insert( r#"##FORMAT="#.to_string(), ); headers.insert(r#"##FORMAT="#.to_string()); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert(r#"##FORMAT="#.to_string()); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert(r#"##FORMAT="#.to_string()); headers.insert(r#"##FORMAT="#.to_string()); // Severus headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); headers.insert(r#"##FORMAT="#.to_string()); headers.insert( r#"##FORMAT="#.to_string(), ); headers.insert( r#"##FORMAT="# .to_string(), ); // headers.insert( // r#"##FORMAT="# // .to_string(), // ); headers } } /// Filter #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)] pub enum Filter { PASS, Other(String), } impl FromStr for Filter { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "PASS" => Ok(Filter::PASS), _ => Ok(Filter::Other(s.to_string())), } } } impl fmt::Display for Filter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Filter::PASS => write!(f, "PASS"), Filter::Other(ref s) => write!(f, "{}", s), } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub enum ReferenceAlternative { Nucleotide(Base), Nucleotides(Vec), Unstructured(String), } impl ReferenceAlternative { pub fn len(&self) -> usize { match self { ReferenceAlternative::Nucleotide(_) => 1, ReferenceAlternative::Nucleotides(bases) => bases.len(), ReferenceAlternative::Unstructured(_) => 0, } } pub fn is_empty(&self) -> bool { self.len() == 0 } pub fn ent(&self) -> Option { match self { ReferenceAlternative::Nucleotide(_) => None, ReferenceAlternative::Nucleotides(bases) => { let seq = bases .iter() .skip(1) .map(|b| b.to_string()) .collect::(); if seq.len() > 1 { Some(estimate_shannon_entropy(&seq)) } else { None } } ReferenceAlternative::Unstructured(_) => None, } } } impl FromStr for ReferenceAlternative { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let possible_bases = s.as_bytes().iter(); let mut res: Vec = Vec::new(); for &base in possible_bases { match base.try_into() { std::result::Result::Ok(b) => res.push(b), Err(_) => { return Ok(Self::Unstructured(s.to_string())); } } } if res.len() == 1 { Ok(Self::Nucleotide(res.pop().unwrap())) } else { Ok(Self::Nucleotides(res)) } } } impl fmt::Display for ReferenceAlternative { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let string = match self { ReferenceAlternative::Nucleotide(b) => b.to_string(), ReferenceAlternative::Nucleotides(bases) => bases .iter() .fold(String::new(), |acc, e| format!("{}{}", acc, e)), ReferenceAlternative::Unstructured(s) => s.to_string(), }; write!(f, "{}", string) } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub 
enum Base { A, T, C, G, N, } impl TryFrom for Base { type Error = anyhow::Error; fn try_from(base: u8) -> anyhow::Result { match base { // uppercase b'A' | b'a' => Ok(Base::A), b'T' | b't' => Ok(Base::T), b'C' | b'c' => Ok(Base::C), b'G' | b'g' => Ok(Base::G), b'N' | b'n' => Ok(Base::N), // unknown _ => Err(anyhow::anyhow!( "Unknown base: {}", String::from_utf8_lossy(&[base]) )), } } } impl Base { pub fn into_u8(self) -> u8 { match self { Base::A => b'A', Base::T => b'T', Base::C => b'C', Base::G => b'G', Base::N => b'N', } } } impl fmt::Display for Base { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Use `self.number` to refer to each positional data point. let str = match self { Base::A => "A", Base::T => "T", Base::C => "C", Base::G => "G", Base::N => "N", }; write!(f, "{}", str) } } pub trait Variants { fn variants(&self, annotations: &Annotations) -> anyhow::Result; } pub trait VariantId { fn variant_id(&self) -> String; } pub trait Label { fn label(&self) -> String; } /// A trait alias for all dynamically executable pipeline runners. /// /// This trait represents any component that: /// - Can decide whether it needs to run (`ShouldRun`) /// - Implements the actual execution logic (`Run`) /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently /// /// Components implementing this trait can be boxed as `ShouldRunBox`. pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {} /// Blanket implementation for all compatible types. impl ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {} /// A boxed trait object to hold any runner implementing `ShouldRunTrait`. pub type ShouldRunBox = Box; /// Macro to initialize and box multiple `ShouldRunTrait` components. /// /// # Arguments /// * `$id` - Sample ID (typically a string slice) /// * `$config` - Shared configuration object /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait` /// /// # Returns /// A vector of boxed runner components (`Vec`) /// /// # Example /// ```rust /// let modules: Vec = create_should_run!( /// "sample_42", /// config, /// ClairS, /// Savana, /// DeepSomatic, /// )?; /// ``` /// /// # Errors /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`. #[macro_export] macro_rules! create_should_run { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{ use anyhow::Context; let mut runners: Vec = Vec::new(); $( let runner = <$runner>::initialize($id, $config.clone()) .with_context(|| format!( "Failed to initialize should-run checker {} for {}", stringify!($runner), $id ))?; runners.push(Box::new(runner) as ShouldRunBox); )+ runners }}; } /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`. /// /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`), /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal"). /// /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression. /// /// # Arguments /// - `$id`: The sample ID (`&str`) passed to each initializer /// - `$config`: A `Config` object (must be cloneable) /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`) /// /// # Returns /// A `Vec` with boxed runners initialized per timepoint. 
/// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`.
///
/// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`),
/// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal").
///
/// Each entry must be provided as a pair: the runner type and the timepoint string expression.
///
/// # Arguments
/// - `$id`: The sample ID (`&str`) passed to each initializer
/// - `$config`: A `Config` object (must be cloneable)
/// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`)
///
/// # Returns
/// A `Vec<ShouldRunBox>` with boxed runners initialized per timepoint.
///
/// # Example
/// ```rust
/// let solo_callers = create_should_run_solo!(
///     "sample42",
///     config,
///     DeepVariant, config.tumoral_name,
///     DeepVariant, config.normal_name,
/// )?;
/// ```
///
/// # Notes
/// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd".
///
/// # Errors
/// This macro uses `?` and must be called inside a `Result`-returning context.
#[macro_export]
macro_rules! create_should_run_solo {
    ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
        use anyhow::Context;
        let mut runners: Vec<ShouldRunBox> = Vec::new();
        $(
            let runner = <$runner>::initialize($id, $arg, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize solo should-run checker {} for '{}' and arg '{}'",
                    stringify!($runner),
                    $id,
                    $arg
                ))?;
            runners.push(Box::new(runner) as ShouldRunBox);
        )+
        runners
    }};
}

/// Macro to initialize and box a list of pipeline components that must run once per timepoint
/// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`.
///
/// This is typically used for variant callers like `DeepVariant` that operate in **solo mode**
/// but must be run twice: once for the tumoral sample and once for the normal sample.
///
/// The macro:
/// - Calls `.initialize(id, timepoint, config.clone())` twice per type
/// - Uses `config.tumoral_name` and `config.normal_name` as timepoints
/// - Returns a flat `Vec<ShouldRunBox>` containing both instances for each type
///
/// # Arguments
/// - `$id`: The sample ID (`&str`)
/// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`)
/// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)`
///
/// # Example
/// ```rust
/// let runners = create_should_run_normal_tumoral!(
///     "sample_42",
///     config,
///     DeepVariant,
///     AnotherCaller,
/// )?;
/// ```
///
/// This will expand to:
/// ```rust
/// vec![
///     Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
///     Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
///     Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
///     Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
/// ]
/// ```
///
/// # Errors
/// This macro uses `?`, so it must be called inside a function that returns `Result`.
#[macro_export]
macro_rules! create_should_run_normal_tumoral {
    ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
        use anyhow::Context;
        let mut runners: Vec<ShouldRunBox> = Vec::new();
        $(
            let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize tumoral should-run checker {} for {}",
                    stringify!($runner),
                    $id
                ))?;
            runners.push(Box::new(tumoral) as ShouldRunBox);
            let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize normal should-run checker {} for {}",
                    stringify!($runner),
                    $id
                ))?;
            runners.push(Box::new(normal) as ShouldRunBox);
        )+
        runners
    }};
}
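// Hypothetical wiring sketch showing how the should-run macros are combined with
// `run_if_required` (defined below). `ClairS`, `Savana` and `DeepVariant` are the
// caller types used in the doc examples; `id` and `config` are assumed to be in scope.
//
//     let mut runners: Vec<ShouldRunBox> = create_should_run!(id, config, ClairS, Savana)?;
//     runners.extend(create_should_run_normal_tumoral!(id, config, DeepVariant)?);
//     run_if_required(&mut runners)?;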
/// Executes each runner in the slice only if `should_run()` returns true.
///
/// # Arguments
/// * `iterable` - A mutable slice of boxed `ShouldRunTrait` components
///
/// # Returns
/// * `Ok(())` if all required runners execute successfully
/// * An error if any runner's `run()` method fails
///
/// # Notes
/// - Runners whose `should_run()` returns `false` are skipped and logged.
pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> {
    iterable.iter_mut().try_for_each(|e| {
        if e.should_run() {
            e.run()
                .map_err(|err| anyhow::anyhow!("Failed to run {}.\n{err}", e.label()))
        } else {
            info!("No run required for: {}", e.label());
            Ok(())
        }
    })
}

/// A trait alias for all variant callers that support initialization, execution,
/// conditional re-running, and variant extraction (VCF + annotations).
///
/// Used to enable polymorphic handling of both solo and somatic callers in the pipeline.
pub trait RunnerVariants: Variants + Send + Sync + Label {}

/// Blanket implementation for all compatible types.
impl<T> RunnerVariants for T where T: Variants + Send + Sync + Label {}

pub type CallerBox = Box<dyn RunnerVariants>;

/// Macro to initialize and box somatic variant callers implementing `RunnerVariants`.
#[macro_export]
macro_rules! init_somatic_callers {
    ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
        use anyhow::Context;
        let mut callers: Vec<CallerBox> = Vec::new();
        $(
            let caller = <$runner>::initialize($id, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize somatic caller {} for {}",
                    stringify!($runner),
                    $id
                ))?;
            callers.push(Box::new(caller) as CallerBox);
        )+
        callers
    }};
}

/// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints,
/// where each runner implements `RunnerVariants`.
///
/// This is useful for callers like `DeepVariant` that need to be instantiated with a specific
/// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`).
///
/// Each entry must be a pair: a runner type and a timepoint expression (usually from config).
///
/// # Arguments
/// - `$id`: The sample ID (`&str`)
/// - `$config`: The configuration object (must be cloneable)
/// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs
///
/// # Returns
/// A `Vec<CallerBox>` containing initialized, boxed solo-mode variant callers.
///
/// # Example
/// ```rust
/// let solo_callers = init_solo_callers!(
///     "sample_42",
///     config,
///     DeepVariant, config.tumoral_name,
///     DeepVariant, config.normal_name,
/// )?;
/// ```
///
/// This will expand to:
/// ```rust
/// vec![
///     Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
///     Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
/// ]
/// ```
///
/// # Errors
/// This macro uses `?` internally, so it must be used inside a `Result`-returning context.
#[macro_export]
macro_rules! init_solo_callers {
    ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
        use anyhow::Context;
        let mut callers: Vec<CallerBox> = Vec::new();
        $(
            let caller = <$runner>::initialize($id, $arg, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize caller {} for {}",
                    stringify!($runner),
                    $id
                ))?;
            callers.push(Box::new(caller) as CallerBox);
        )+
        callers
    }};
}
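// Hypothetical wiring sketch for the caller macros together with `load_variants`
// (defined below). `ClairS` and `DeepVariant` are the types from the doc examples;
// `id`, `config` and `annotations` are assumed to be in scope.
//
//     let mut callers: Vec<CallerBox> = init_somatic_callers!(id, config, ClairS)?;
//     callers.extend(init_solo_callers!(id, config, DeepVariant, config.tumoral_name)?);
//     let collections = load_variants(&mut callers, &annotations)?;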
/// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints.
///
/// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require
/// separate execution for each timepoint. It will:
/// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name`
/// - Box the result as `CallerBox`
///
/// # Arguments
/// - `$id`: Sample ID (usually a `&str`)
/// - `$config`: Cloneable configuration object
/// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants`
///
/// # Returns
/// A `Vec<CallerBox>` containing two boxed instances per runner (one for each timepoint).
///
/// # Example
/// ```rust
/// let solo_callers = init_solo_callers_normal_tumoral!(
///     "sample_42",
///     config,
///     DeepVariant,
///     OtherSoloCaller,
/// )?;
/// ```
///
/// This expands to:
/// ```rust
/// vec![
///     Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
///     Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
///     Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
///     Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
/// ]
/// ```
///
/// # Errors
/// This macro uses `?`, so it must be called inside a `Result`-returning context.
#[macro_export]
macro_rules! init_solo_callers_normal_tumoral {
    ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
        use anyhow::Context;
        let mut callers: Vec<CallerBox> = Vec::new();
        $(
            let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize tumoral caller {} for {} '{}'",
                    stringify!($runner),
                    $id,
                    $config.tumoral_name
                ))?;
            callers.push(Box::new(tumoral) as CallerBox);
            let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
                .with_context(|| format!(
                    "Failed to initialize normal caller {} for {} '{}'",
                    stringify!($runner),
                    $id,
                    $config.normal_name
                ))?;
            callers.push(Box::new(normal) as CallerBox);
        )+
        callers
    }};
}

// pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
//     iterable
//         .iter_mut()
//         .try_for_each(|runner| runner.run())
//         .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
// }

/// Loads variant collections from every caller in parallel.
///
/// Callers whose `variants()` call fails are logged and skipped; only
/// successfully loaded collections are returned.
pub fn load_variants(
    iterable: &mut [CallerBox],
    annotations: &Annotations,
) -> anyhow::Result<Vec<VariantCollection>> {
    iterable
        .par_iter()
        .map(|runner| {
            let result = runner.variants(annotations);
            if let Err(ref e) = result {
                error!("Failed to load variants from: {}\n{e}", runner.label());
            } else {
                info!("Variants from {} loaded.", runner.label());
            };
            result
        })
        .filter(|r| r.is_ok())
        .collect::<anyhow::Result<Vec<VariantCollection>>>()
        .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
}

// pub fn par_load_variants(
//     iterable: &mut [Box<dyn RunnerVariants>],
//     annotations: &Annotations,
// ) -> anyhow::Result<Vec<VariantCollection>> {
//     iterable
//         .par_iter()
//         .map(|runner| {
//             let r = runner.variants(annotations);
//             if let Err(ref e) = r {
//                 warn!("{e}");
//             };
//             r
//         })
//         .filter(|r| r.is_ok())
//         .collect::<anyhow::Result<Vec<VariantCollection>>>()
//         .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
// }

/// Splits two slices into their intersection and the elements unique to each side,
/// computing each partition in parallel.
pub fn parallel_intersection<T: Eq + Hash + Clone + Send + Sync>(
    vec1: &[T],
    vec2: &[T],
) -> (Vec<T>, Vec<T>, Vec<T>) {
    let set1: HashSet<_> = vec1.par_iter().cloned().collect();
    let set2: HashSet<_> = vec2.par_iter().cloned().collect();

    let common: Vec<T> = set1
        .par_iter()
        .filter(|item| set2.contains(item))
        .cloned()
        .collect();

    let only_in_first: Vec<T> = set1
        .par_iter()
        .filter(|item| !set2.contains(item))
        .cloned()
        .collect();

    let only_in_second: Vec<T> = set2
        .par_iter()
        .filter(|item| !set1.contains(item))
        .cloned()
        .collect();

    (common, only_in_first, only_in_second)
}
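// Illustrative test sketch for `parallel_intersection`. The module name and the
// sample data are ours; the function itself is defined above.
#[cfg(test)]
mod parallel_intersection_examples {
    use super::*;

    #[test]
    fn splits_common_and_unique_elements() {
        let left = vec![1, 2, 3];
        let right = vec![2, 3, 4];

        let (mut common, mut only_left, mut only_right) = parallel_intersection(&left, &right);

        // The partitions are built from `HashSet`s, so order is not guaranteed.
        common.sort();
        only_left.sort();
        only_right.sort();

        assert_eq!(common, vec![2, 3]);
        assert_eq!(only_left, vec![1]);
        assert_eq!(only_right, vec![4]);
    }
}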