use crate::{ annotation::Annotations, collection::ShouldRun, helpers::Hash128, positions::{GenomePosition, GetGenomePosition, VcfPosition}, runners::Run, variant::variant_collection::VariantCollection, }; use anyhow::{anyhow, Context}; use bitcode::{Decode, Encode}; use log::{error, info}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr}; /// Represents a variant in the Variant Call Format (VCF). #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)] pub struct VcfVariant { /// A 128-bit hash of the variant's key properties for efficient comparison and storage. pub hash: Hash128, /// The genomic position of the variant. pub position: GenomePosition, /// The identifier of the variant. pub id: String, /// The reference allele. pub reference: ReferenceAlternative, /// The alternative allele. pub alternative: ReferenceAlternative, /// The quality score of the variant call, if available. pub quality: Option, /// The filter status of the variant. pub filter: Filter, /// Additional information about the variant. pub infos: Infos, /// Genotype information and other sample-specific data. pub formats: Formats, } impl PartialEq for VcfVariant { /// Compares two VcfVariants for equality. /// /// Note: This comparison only considers position, reference, and alternative. /// It intentionally ignores id, filter, info, format, and quality. fn eq(&self, other: &Self) -> bool { // Nota bene: id, filter, info, format and quality is intentionally not compared self.position == other.position && self.reference == other.reference && self.alternative == other.alternative } } impl Eq for VcfVariant {} impl FromStr for VcfVariant { type Err = anyhow::Error; /// Parses a VcfVariant from a string representation. /// /// The input string is expected to be a tab-separated VCF line. /// /// # Errors /// /// Returns an error if parsing fails for any field. fn from_str(s: &str) -> anyhow::Result { let v: Vec<&str> = s.split('\t').collect(); let vcf_position: VcfPosition = ( *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?, *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?, ) .try_into() .context(format!("Can't parse position from: {s}"))?; let formats = if v.len() >= 10 { ( *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?, *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?, ) .try_into() .context(format!("Can't parse formats from: {s}"))? } else { Formats::default() }; let position: GenomePosition = vcf_position.into(); let reference: ReferenceAlternative = v .get(3) .ok_or(anyhow!("Can't parse reference from: {s}"))? .parse() .context(format!("Can't parse reference from: {s}"))?; let alternative: ReferenceAlternative = v .get(4) .ok_or(anyhow!("Can't parse alternative from: {s}"))? .parse() .context(format!("Can't parse alternative from: {s}"))?; // Blake3 128 bytes Hash let mut hasher = blake3::Hasher::new(); hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes hasher.update(reference.to_string().as_bytes()); // Reference string as bytes hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes let hash = hasher.finalize(); let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap()); Ok(Self { hash, position, id: v .get(2) .ok_or(anyhow!("Can't parse id from: {s}"))? .to_string(), reference, alternative, quality: v .get(5) .map(|s| s.parse::().ok()) // Try to parse as f64; returns Option .unwrap_or(None), filter: v .get(6) .ok_or(anyhow!("Can't parse filter from: {s}"))? .parse() .context(format!("Can't parse filter from: {s}"))?, infos: v .get(7) .ok_or(anyhow!("Can't parse infos from: {s}"))? .parse() .context(format!("Can't parse infos from: {s}"))?, formats, }) } } // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag impl VcfVariant { /// Converts the VcfVariant into a VCF-formatted row string. /// /// This method creates a tab-separated string representation of the variant, /// suitable for writing to a VCF file. pub fn into_vcf_row(&self) -> String { let vcf_position: VcfPosition = self.position.clone().into(); let (contig, position) = vcf_position.into(); let mut columns = vec![ contig, position, self.id.to_string(), self.reference.to_string(), self.alternative.to_string(), self.quality .map(|v| v.to_string()) .unwrap_or(".".to_string()), self.filter.to_string(), self.infos.to_string(), ]; if !self.formats.0.is_empty() { let (format, values) = self.formats.clone().into(); columns.push(format); columns.push(values); } columns.join("\t") } /// Returns the hash of the variant. pub fn hash(&self) -> Hash128 { self.hash } /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS. /// /// This method generates a new variant with shared properties, resetting some fields /// to default or empty values. pub fn commun_deepvariant_clairs(&self) -> VcfVariant { VcfVariant { hash: self.hash, position: self.position.clone(), id: self.id.clone(), reference: self.reference.clone(), alternative: self.alternative.clone(), quality: self.quality, filter: Filter::Other(".".to_string()), infos: Infos(vec![Info::Empty]), formats: self.formats.commun_deepvariant_clairs(), } } /// Checks if the variant has an SVTYPE info field. /// /// Returns true if the variant contains structural variation type information. pub fn has_svtype(&self) -> bool { self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_))) } /// Retrieves the structural variation type of the variant, if present. /// /// Returns Some(SVType) if the variant has an SVTYPE info field, pub fn svtype(&self) -> Option { self.infos.0.iter().find_map(|e| { if let Info::SVTYPE(sv_type) = e { Some(sv_type.clone()) } else { None } }) } /// Determines the alteration category of the variant. /// /// This method analyzes the reference and alternative alleles to classify /// the variant into one of several alteration categories: /// - SNV (Single Nucleotide Variant) /// - INS (Insertion) /// - DEL (Deletion) /// - Other (including structural variants and complex alterations) /// /// The classification is based on the following rules: /// 1. If both reference and alternative are single nucleotides, it's an SNV. /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion. /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion. /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion. /// 5. If none of the above apply, it checks for structural variant types. /// 6. If no structural variant type is found, it's classified as "Other". /// /// # Returns /// An `AlterationCategory` enum representing the type of alteration. pub fn alteration_category(&self) -> AlterationCategory { match (&self.reference, &self.alternative) { (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => { AlterationCategory::SNV } (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => { AlterationCategory::INS } (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => { AlterationCategory::Other } (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => { AlterationCategory::DEL } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() < b.len() => { AlterationCategory::INS } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() > b.len() => { AlterationCategory::DEL } _ => match self.svtype() { Some(sv_type) => { if let Ok(bnd_desc) = self.bnd_desc() { if bnd_desc.a_contig != bnd_desc.b_contig { AlterationCategory::TRL } else { AlterationCategory::BND } } else { AlterationCategory::from(sv_type) } } None => AlterationCategory::Other, }, } } /// Parses and constructs a BND (breakend) description from the alternative string. /// /// This function interprets the BND notation in the alternative string and creates /// a `BNDDesc` struct containing detailed information about the breakend. /// /// # Returns /// - `Ok(BNDDesc)` if parsing is successful /// - `Err` if parsing fails or if the alteration is not a BND /// /// # Errors /// This function will return an error if: /// - The alteration category is not BND /// - The alternative string cannot be parsed into exactly 3 parts /// - The b_position cannot be parsed as a number pub fn bnd_desc(&self) -> anyhow::Result { let alt = self.alternative.to_string(); if alt.contains('[') || alt.contains(']') { let alt_rep = alt.replace("[", ";").replace("]", ";"); let alt_is_joined_after = !alt_rep.starts_with(";"); let parts = alt_rep .split(";") .filter(|c| !c.is_empty()) .collect::>(); if alt_is_joined_after { // a is ref b is alt let a_sens = true; let a_contig = self.position.contig(); let a_position = self.position.position + 1; let added_nt = parts[0][1..].to_string(); let b_sens = alt.contains('['); let (contig, pos) = parts[1].split_once(':').unwrap(); let b_contig = contig.to_string(); let b_position: u32 = pos.parse()?; Ok(BNDDesc { a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt, }) } else { // a is alt b is ref let b_sens = true; let b_contig = self.position.contig(); let b_position = self.position.position + 1; let mut added_nt = parts[1].to_string(); added_nt.pop(); let a_sens = alt.contains(']'); let (contig, pos) = parts[0].split_once(':').unwrap(); let a_contig = contig.to_string(); let a_position: u32 = pos.parse()?; Ok(BNDDesc { a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt, }) } } else { Err(anyhow::anyhow!("The alteration is not BND: {alt}")) } } pub fn deletion_len(&self) -> Option { if self.alteration_category() != AlterationCategory::DEL { return None; } self.infos .0 .iter() .find_map(|i| { if let Info::SVLEN(len) = i { Some(*len as u32) } else { None } }) .or_else(|| { if let ( ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_), ) = (&self.reference, &self.alternative) { Some(nt.len().saturating_sub(1) as u32) } else { None } }) } pub fn deletion_desc(&self) -> Option { if let Some(len) = self.deletion_len() { Some(DeletionDesc { contig: self.position.contig(), start: self.position.position + 1, end: self.position.position + len, }) } else { None } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub struct BNDDesc { pub a_contig: String, pub a_position: u32, // 1-based pub a_sens: bool, pub b_contig: String, pub b_position: u32, // 1-based pub b_sens: bool, pub added_nt: String, } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub struct DeletionDesc { pub contig: String, pub start: u32, // 1-based pub end: u32, } impl DeletionDesc { pub fn len(&self) -> u32 { self.end.saturating_sub(self.start) } pub fn is_empty(&self) -> bool { self.len() == 0 } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub enum AlterationCategory { SNV, DEL, INS, DUP, INV, CNV, TRL, BND, Other, } impl fmt::Display for AlterationCategory { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { AlterationCategory::SNV => "SNV", AlterationCategory::DEL => "DEL", AlterationCategory::INS => "INS", AlterationCategory::DUP => "DUP", AlterationCategory::INV => "INV", AlterationCategory::CNV => "CNV", AlterationCategory::BND | AlterationCategory::TRL => "TRL", AlterationCategory::Other => "Other", } ) } } impl From for AlterationCategory { fn from(sv_type: SVType) -> Self { match sv_type { SVType::DEL => AlterationCategory::DEL, SVType::INS => AlterationCategory::INS, SVType::DUP => AlterationCategory::DUP, SVType::INV => AlterationCategory::INV, SVType::CNV => AlterationCategory::CNV, SVType::BND => AlterationCategory::BND, } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)] pub enum SVType { DEL, INS, DUP, INV, CNV, BND, } impl FromStr for SVType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "DEL" => Ok(SVType::DEL), "INS" => Ok(SVType::INS), "DUP" => Ok(SVType::DUP), "INV" => Ok(SVType::INV), "CNV" => Ok(SVType::CNV), "BND" => Ok(SVType::BND), _ => Err(anyhow!("Can't parse SVTYPE={s}")), } } } impl fmt::Display for SVType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { SVType::DEL => "DEL", SVType::INS => "INS", SVType::DUP => "DUP", SVType::INV => "INV", SVType::CNV => "CNV", SVType::BND => "BND", } ) } } impl VariantId for VcfVariant { fn variant_id(&self) -> String { format!("{}_{}>{}", self.position, self.reference, self.alternative) } } impl GetGenomePosition for VcfVariant { fn position(&self) -> &GenomePosition { &self.position } } impl PartialOrd for VcfVariant { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for VcfVariant { fn cmp(&self, other: &Self) -> Ordering { self.position.cmp(&other.position) } } /// Info #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)] pub struct Infos(pub Vec); impl FromStr for Infos { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { Ok(Self( s.split(";") .map(Info::from_str) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?, )) } } impl fmt::Display for Infos { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", self.0 .iter() .map(|e| e.to_string()) .collect::>() .join(";") ) } } #[allow(non_camel_case_types)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)] pub enum Info { Empty, H, F, P, FAU(u32), FCU(u32), FGU(u32), FTU(u32), RAU(u32), RCU(u32), RGU(u32), RTU(u32), SVTYPE(SVType), MATEID(String), NORMAL_READ_SUPPORT(u32), TUMOUR_READ_SUPPORT(u32), NORMAL_ALN_SUPPORT(u32), TUMOUR_ALN_SUPPORT(u32), SVLEN(i32), TUMOUR_DP_BEFORE(Vec), TUMOUR_DP_AT(Vec), TUMOUR_DP_AFTER(Vec), NORMAL_DP_BEFORE(Vec), NORMAL_DP_AT(Vec), NORMAL_DP_AFTER(Vec), TUMOUR_AF(Vec), NORMAL_AF(Vec), BP_NOTATION(String), SOURCE(String), CLUSTERED_READS_TUMOUR(u32), CLUSTERED_READS_NORMAL(u32), TUMOUR_ALT_HP(Vec), TUMOUR_PS(Vec), NORMAL_ALT_HP(Vec), NORMAL_PS(Vec), TUMOUR_TOTAL_HP_AT(Vec), NORMAL_TOTAL_HP_AT(Vec), ORIGIN_STARTS_STD_DEV(f32), ORIGIN_MAPQ_MEAN(f32), ORIGIN_EVENT_SIZE_STD_DEV(f32), ORIGIN_EVENT_SIZE_MEDIAN(f32), ORIGIN_EVENT_SIZE_MEAN(f32), END_STARTS_STD_DEV(f32), END_MAPQ_MEAN(f32), END_EVENT_SIZE_STD_DEV(f32), END_EVENT_SIZE_MEDIAN(f32), END_EVENT_SIZE_MEAN(f32), CLASS(String), END(u32), SVINSLEN(u32), SVINSSEQ(String), } impl FromStr for Info { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { if s.contains('=') { let (key, value) = s .split_once('=') .context(format!("Can't split with `=` in string: {s}"))?; Ok(match key { "FAU" => Info::FAU(parse_value(value, key)?), "FCU" => Info::FCU(parse_value(value, key)?), "FGU" => Info::FGU(parse_value(value, key)?), "FTU" => Info::FTU(parse_value(value, key)?), "RAU" => Info::RAU(parse_value(value, key)?), "RCU" => Info::RCU(parse_value(value, key)?), "RGU" => Info::RGU(parse_value(value, key)?), "RTU" => Info::RTU(parse_value(value, key)?), "SVLEN" => Info::SVLEN(parse_value(value, key)?), "END" => Info::END(parse_value(value, key)?), "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?), "SVTYPE" => Info::SVTYPE(value.parse()?), "MATEID" => Info::MATEID(value.to_string()), "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?), "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?), "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?), "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?), "SVINSSEQ" => Info::SVINSSEQ(value.to_string()), "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?), "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?), "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?), "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?), "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?), "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?), "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?), "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?), "BP_NOTATION" => Info::BP_NOTATION(value.to_string()), "SOURCE" => Info::SOURCE(value.to_string()), "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?), "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?), "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?), "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?), "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?), "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?), "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?), "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?), "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?), "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?), "ORIGIN_EVENT_SIZE_STD_DEV" => { Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?) } "ORIGIN_EVENT_SIZE_MEDIAN" => { Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?) } "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?), "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?), "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?), "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?), "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?), "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?), "CLASS" => Info::CLASS(value.to_string()), _ => Info::Empty, }) } else { Ok(match s { "H" => Info::H, "F" => Info::F, "P" => Info::P, _ => Info::Empty, }) } } } impl fmt::Display for Info { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Info::Empty => write!(f, "."), Info::H => write!(f, "H"), Info::F => write!(f, "F"), Info::P => write!(f, "P"), Info::FAU(v) => write!(f, "FAU={v}"), Info::FCU(v) => write!(f, "FCU={v}"), Info::FGU(v) => write!(f, "FGU={v}"), Info::FTU(v) => write!(f, "FTU={v}"), Info::RAU(v) => write!(f, "RAU={v}"), Info::RCU(v) => write!(f, "RCU={v}"), Info::RGU(v) => write!(f, "RGU={v}"), Info::RTU(v) => write!(f, "RTU={v}"), Info::SVTYPE(v) => write!(f, "SVTYPE={v}"), Info::SVLEN(v) => write!(f, "SVLEN={v}"), Info::END(v) => write!(f, "END={v}"), Info::MATEID(v) => write!(f, "MATEID={v}"), Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"), Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"), Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"), Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"), Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"), Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"), Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)), Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)), Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)), Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)), Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)), Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)), Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)), Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)), Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"), Info::SOURCE(v) => write!(f, "SOURCE={v}"), Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"), Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"), Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)), Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")), Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)), Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")), Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)), Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)), Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"), Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"), Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"), Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"), Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"), Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"), Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"), Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"), Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"), Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"), Info::CLASS(v) => write!(f, "CLASS={v}"), } } } pub fn concat_numbers(v: &[T]) -> String { v.iter() .map(|n| n.to_string()) .collect::>() .join(",") } /// Format #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)] pub enum Format { // DeepVariant GT(String), GQ(u32), DP(u32), AD(Vec), VAF(f32), PL(Vec), // Clairs // when format begins with N: normal // AF(f32), // NAF(f32), // DP(u32), NDP(u32), NAD(Vec), AU(u32), CU(u32), GU(u32), TU(u32), NAU(u32), NCU(u32), NGU(u32), NTU(u32), // nanomonsv TR(u32), VR(u32), Other((String, String)), // (key, value) } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)] pub struct Formats(pub Vec); impl Formats { /// Get the tumoral alternative read depth and total depth as an Option<(u32, u32)>. pub fn n_alt_depth(&self) -> Option<(u32, u32)> { let mut tumor_alt_depth: Option = None; let mut tumor_total_depth: Option = None; for format in &self.0 { match format { // Tumor Allelic Depth (AD) Format::AD(values) => { if values.len() > 1 { // Sum all alternative allele depths (excluding reference allele) tumor_alt_depth = Some(values[1..].iter().sum()); } } // Tumor Total Depth (DP) Format::DP(value) => { tumor_total_depth = Some(*value); } _ => {} } } // Return a tuple (tumor_alt_depth, tumor_total_depth) if both are available match (tumor_alt_depth, tumor_total_depth) { (Some(alt), Some(total)) => Some((alt, total)), _ => None, } } } impl TryFrom<(&str, &str)> for Formats { type Error = anyhow::Error; fn try_from((k, v): (&str, &str)) -> anyhow::Result { let keys: Vec<&str> = k.split(':').collect(); let values: Vec<&str> = v.split(':').collect(); if keys.len() != values.len() { anyhow::bail!("Mismatch between keys and values count for {k} {v}"); } Ok(Self( keys.into_iter() .zip(values) .map(|(key, value)| Format::try_from((key, value))) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?, )) } } impl From for (String, String) { fn from(formats: Formats) -> Self { let mut keys = Vec::new(); let mut values = Vec::new(); for format in formats.0 { let (key, value): (String, String) = format.into(); keys.push(key); values.push(value); } (keys.join(":"), values.join(":")) } } impl TryFrom<(&str, &str)> for Format { type Error = anyhow::Error; fn try_from((key, value): (&str, &str)) -> anyhow::Result { let format = match key { "GT" => Format::GT(value.to_string()), "GQ" => Format::GQ(parse_value(value, key)?), "DP" => Format::DP(parse_value(value, key)?), "AD" => Format::AD(parse_vec_value(value, key)?), "VAF" => Format::VAF(parse_value(value, key)?), // "AF" => Format::AF(parse_value(value, key)?), // "NAF" => Format::NAF(parse_value(value, key)?), "NDP" => Format::NDP(parse_value(value, key)?), "NAD" => Format::NAD(parse_vec_value(value, key)?), "AU" => Format::AU(parse_value(value, key)?), "CU" => Format::CU(parse_value(value, key)?), "GU" => Format::GU(parse_value(value, key)?), "TU" => Format::TU(parse_value(value, key)?), "NAU" => Format::NAU(parse_value(value, key)?), "NCU" => Format::NCU(parse_value(value, key)?), "NGU" => Format::NGU(parse_value(value, key)?), "NTU" => Format::NTU(parse_value(value, key)?), "PL" => Format::PL(parse_vec_value(value, key)?), "TR" => Format::TR(parse_value(value, key)?), "VR" => Format::VR(parse_value(value, key)?), _ => Format::Other((key.to_string(), value.to_string())), }; Ok(format) } } // Helper function to parse a single value (DeepSeek) fn parse_value(value: &str, key: &str) -> anyhow::Result where T::Err: std::fmt::Debug, { value .parse() .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error` .context(format!("Can't parse {}: {}", key, value)) // Add context } // Helper function to parse comma-separated values (DeepSeek) fn parse_vec_value(value: &str, key: &str) -> anyhow::Result> where T::Err: std::fmt::Debug, { value .split(',') .map(|e| { e.parse() .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error` .context(format!("Failed to parse {}: {}", key, e)) // Add context }) .collect() } impl From for (String, String) { fn from(format: Format) -> Self { let concat = |values: Vec| -> String { values .iter() .map(|v| v.to_string()) .collect::>() .join(",") }; match format { Format::GT(value) => ("GT".to_string(), value), Format::GQ(value) => ("GQ".to_string(), value.to_string()), Format::DP(value) => ("DP".to_string(), value.to_string()), Format::AD(values) => ("AD".to_string(), concat(values)), Format::VAF(value) => ("VAF".to_string(), value.to_string()), Format::PL(values) => ("PL".to_string(), concat(values)), Format::Other((key, value)) => (key, value), // Format::AF(value) => ("AF".to_string(), value.to_string()), // Format::NAF(value) => ("NAF".to_string(), value.to_string()), Format::NDP(value) => ("NDP".to_string(), value.to_string()), Format::NAD(values) => ("NAD".to_string(), concat(values)), Format::AU(value) => ("AU".to_string(), value.to_string()), Format::CU(value) => ("CU".to_string(), value.to_string()), Format::GU(value) => ("GU".to_string(), value.to_string()), Format::TU(value) => ("TU".to_string(), value.to_string()), Format::NAU(value) => ("NAU".to_string(), value.to_string()), Format::NCU(value) => ("NCU".to_string(), value.to_string()), Format::NGU(value) => ("NGU".to_string(), value.to_string()), Format::NTU(value) => ("NTU".to_string(), value.to_string()), Format::TR(value) => ("TR".to_string(), value.to_string()), Format::VR(value) => ("VR".to_string(), value.to_string()), } } } impl Formats { pub fn commun_deepvariant_clairs(&self) -> Self { let filtered_vec: Vec = self .0 .clone() .into_iter() .map(|e| { if let Format::VAF(_v) = e { e // Format::AF(v) } else { e } }) .filter(|format| { matches!( format, Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */ ) }) .collect(); Formats(filtered_vec) } } /// Filter #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)] pub enum Filter { PASS, Other(String), } impl FromStr for Filter { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "PASS" => Ok(Filter::PASS), _ => Ok(Filter::Other(s.to_string())), } } } impl fmt::Display for Filter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Filter::PASS => write!(f, "PASS"), Filter::Other(ref s) => write!(f, "{}", s), } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub enum ReferenceAlternative { Nucleotide(Base), Nucleotides(Vec), Unstructured(String), } impl FromStr for ReferenceAlternative { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let possible_bases = s.as_bytes().iter(); let mut res: Vec = Vec::new(); for &base in possible_bases { match base.try_into() { std::result::Result::Ok(b) => res.push(b), Err(_) => { return Ok(Self::Unstructured(s.to_string())); } } } if res.len() == 1 { Ok(Self::Nucleotide(res.pop().unwrap())) } else { Ok(Self::Nucleotides(res)) } } } impl fmt::Display for ReferenceAlternative { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let string = match self { ReferenceAlternative::Nucleotide(b) => b.to_string(), ReferenceAlternative::Nucleotides(bases) => bases .iter() .fold(String::new(), |acc, e| format!("{}{}", acc, e)), ReferenceAlternative::Unstructured(s) => s.to_string(), }; write!(f, "{}", string) } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)] pub enum Base { A, T, C, G, N, } impl TryFrom for Base { type Error = anyhow::Error; fn try_from(base: u8) -> anyhow::Result { match base { b'A' => Ok(Base::A), b'T' => Ok(Base::T), b'C' => Ok(Base::C), b'G' => Ok(Base::G), b'N' => Ok(Base::N), _ => Err(anyhow::anyhow!( "Unknown base: {}", String::from_utf8_lossy(&[base]) )), } } } impl Base { pub fn into_u8(self) -> u8 { match self { Base::A => b'A', Base::T => b'T', Base::C => b'C', Base::G => b'G', Base::N => b'N', } } } impl fmt::Display for Base { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Use `self.number` to refer to each positional data point. let str = match self { Base::A => "A", Base::T => "T", Base::C => "C", Base::G => "G", Base::N => "N", }; write!(f, "{}", str) } } pub trait Variants { fn variants(&self, annotations: &Annotations) -> anyhow::Result; } pub trait VariantId { fn variant_id(&self) -> String; } pub trait Label { fn label(&self) -> String; } /// A trait alias for all dynamically executable pipeline runners. /// /// This trait represents any component that: /// - Can decide whether it needs to run (`ShouldRun`) /// - Implements the actual execution logic (`Run`) /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently /// /// Components implementing this trait can be boxed as `ShouldRunBox`. pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {} /// Blanket implementation for all compatible types. impl ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {} /// A boxed trait object to hold any runner implementing `ShouldRunTrait`. pub type ShouldRunBox = Box; /// Macro to initialize and box multiple `ShouldRunTrait` components. /// /// # Arguments /// * `$id` - Sample ID (typically a string slice) /// * `$config` - Shared configuration object /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait` /// /// # Returns /// A vector of boxed runner components (`Vec`) /// /// # Example /// ```rust /// let modules: Vec = create_should_run!( /// "sample_42", /// config, /// ClairS, /// Savana, /// DeepSomatic, /// )?; /// ``` /// /// # Errors /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`. #[macro_export] macro_rules! create_should_run { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{ use anyhow::Context; let mut runners: Vec = Vec::new(); $( let runner = <$runner>::initialize($id, $config.clone()) .with_context(|| format!( "Failed to initialize should-run checker {} for {}", stringify!($runner), $id ))?; runners.push(Box::new(runner) as ShouldRunBox); )+ runners }}; } /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`. /// /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`), /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal"). /// /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression. /// /// # Arguments /// - `$id`: The sample ID (`&str`) passed to each initializer /// - `$config`: A `Config` object (must be cloneable) /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`) /// /// # Returns /// A `Vec` with boxed runners initialized per timepoint. /// /// # Example /// ```rust /// let solo_callers = create_should_run_solo!( /// "sample42", /// config, /// DeepVariant, config.tumoral_name, /// DeepVariant, config.normal_name, /// )?; /// ``` /// /// # Notes /// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd". /// /// # Errors /// This macro uses `?` and must be called inside a `Result`-returning context. #[macro_export] macro_rules! create_should_run_solo { ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{ use anyhow::Context; let mut runners: Vec = Vec::new(); $( let runner = <$runner>::initialize($id, $arg, $config.clone()) .with_context(|| format!( "Failed to initialize solo should-run checker {} for '{}' and arg '{}'", stringify!($runner), $id, $arg ))?; runners.push(Box::new(runner) as ShouldRunBox); )+ runners }}; } /// Macro to initialize and box a list of pipeline components that must run once per timepoint /// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`. /// /// This is typically used for variant callers like `DeepVariant` that operate in **solo mode** /// but must be run twice — once for the tumoral sample and once for the normal sample. /// /// The macro: /// - Calls `.initialize(id, timepoint, config.clone())` twice per type /// - Uses `config.tumoral_name` and `config.normal_name` as timepoints /// - Returns a flat `Vec` containing both instances for each type /// /// # Arguments /// - `$id`: The sample ID (`&str`) /// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`) /// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)` /// /// # Example /// ```rust /// let runners = create_should_run_normal_tumoral!( /// "sample_42", /// config, /// DeepVariant, /// AnotherCaller, /// )?; /// ``` /// /// This will expand to: /// ```rust /// vec![ /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox, /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox, /// Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox, /// Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox, /// ] /// ``` /// /// # Errors /// This macro uses `?`, so it must be called inside a function that returns `Result`. #[macro_export] macro_rules! create_should_run_normal_tumoral { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{ use anyhow::Context; let mut runners: Vec = Vec::new(); $( let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone()) .with_context(|| format!( "Failed to initialize tumoral should-run checker {} for {}", stringify!($runner), $id ))?; runners.push(Box::new(tumoral) as ShouldRunBox); let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone()) .with_context(|| format!( "Failed to initialize normal should-run checker {} for {}", stringify!($runner), $id ))?; runners.push(Box::new(normal) as ShouldRunBox); )+ runners }}; } /// Executes each runner in the slice only if `should_run()` returns true. /// /// # Arguments /// * `iterable` - A mutable slice of boxed `InitRun` components /// /// # Returns /// * `Ok(())` if all required runners execute successfully /// * An error if any runner's `run()` method fails /// /// # Notes /// - This function will skip runners that return `false` from `should_run()` pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> { iterable.iter_mut().try_for_each(|e| { if e.should_run() { e.run() } else { info!("Skipping running: {}", e.label()); Ok(()) } }) } /// A trait alias for all variant callers that support initialization, execution, /// conditional re-running, and variant extraction (VCF + annotations). /// /// Used to enable polymorphic handling of both solo and somatic callers in the pipeline. pub trait RunnerVariants: Variants + Send + Sync + Label {} /// Blanket implementation for all compatible types. impl RunnerVariants for T where T: Variants + Send + Sync + Label {} pub type CallerBox = Box; #[macro_export] macro_rules! init_somatic_callers { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{ use anyhow::Context; let mut callers: Vec = Vec::new(); $( let caller = <$runner>::initialize($id, $config.clone()) .with_context(|| format!( "Failed to initialize somatic caller {} for {}", stringify!($runner), $id ))?; callers.push(Box::new(caller) as CallerBox); )+ callers }}; } /// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints, /// where each runner implements `RunnerVariants`. /// /// This is useful for callers like `DeepVariant` that need to be instantiated with a specific /// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`). /// /// Each entry must be a pair: a runner type and a timepoint expression (usually from config). /// /// # Arguments /// - `$id`: The sample ID (`&str`) /// - `$config`: The configuration object (must be cloneable) /// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs /// /// # Returns /// A `Vec` containing initialized, boxed solo-mode variant callers. /// /// # Example /// ```rust /// let solo_callers = init_solo_callers!( /// "sample_42", /// config, /// DeepVariant, config.tumoral_name, /// DeepVariant, config.normal_name, /// )?; /// ``` /// /// This will expand to: /// ```rust /// vec![ /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox, /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox, /// ] /// ``` /// /// # Errors /// This macro uses `?` internally, so it must be used inside a `Result`-returning context. #[macro_export] macro_rules! init_solo_callers { ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{ let mut callers: Vec = Vec::new(); $( let caller = <$runner>::initialize($id, $arg, $config.clone()) .with_context(|| format!( "Failed to initialize caller {} for {}", stringify!($runner), $id ))?; callers.push(Box::new(caller) as CallerBox); )+ callers }}; } /// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints. /// /// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require /// separate execution for each timepoint. It will: /// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name` /// - Box the result as `CallerBox` /// /// # Arguments /// - `$id`: Sample ID (usually a `&str`) /// - `$config`: Cloneable configuration object /// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants` /// /// # Returns /// A `Vec` containing two boxed instances per runner (one for each timepoint). /// /// # Example /// ```rust /// let solo_callers = init_solo_callers_normal_tumoral!( /// "sample_42", /// config, /// DeepVariant, /// OtherSoloCaller, /// )?; /// ``` /// /// This expands to: /// ```rust /// vec![ /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox, /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox, /// Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox, /// Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox, /// ] /// ``` /// /// # Errors /// This macro uses `?`, so it must be called inside a `Result`-returning context. #[macro_export] macro_rules! init_solo_callers_normal_tumoral { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{ use anyhow::Context; let mut callers: Vec = Vec::new(); $( let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone()) .with_context(|| format!( "Failed to initialize tumoral caller {} for {} '{}'", stringify!($runner), $id, $config.tumoral_name ))?; callers.push(Box::new(tumoral) as CallerBox); let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone()) .with_context(|| format!( "Failed to initialize normal caller {} for {} '{}'", stringify!($runner), $id, $config.normal_name ))?; callers.push(Box::new(normal) as CallerBox); )+ callers }}; } // pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> { // iterable // .iter_mut() // .try_for_each(|runner| runner.run()) // .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}")) // } pub fn load_variants( iterable: &mut [CallerBox], annotations: &Annotations, ) -> anyhow::Result> { iterable .par_iter() .map(|runner| { let result = runner.variants(annotations); if let Err(ref e) = result { error!("Failed to load variants from: {}\n{e}", runner.label()); } else { info!("Variants from {} loaded.", runner.label()); }; result }) .filter(|r| r.is_ok()) .collect::>>() .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}")) } // pub fn par_load_variants( // iterable: &mut [Box], // annotations: &Annotations, // ) -> anyhow::Result> { // iterable // .par_iter() // .map(|runner| { // let r = runner.variants(annotations); // if let Err(ref e) = r { // warn!("{e}"); // }; // r // }) // .filter(|r| r.is_ok()) // .collect::>>() // .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}")) // } pub fn parallel_intersection( vec1: &[T], vec2: &[T], ) -> (Vec, Vec, Vec) { let set1: HashSet<_> = vec1.par_iter().cloned().collect(); let set2: HashSet<_> = vec2.par_iter().cloned().collect(); let common: Vec = set1 .par_iter() .filter(|item| set2.contains(item)) .cloned() .collect(); let only_in_first: Vec = set1 .par_iter() .filter(|item| !set2.contains(item)) .cloned() .collect(); let only_in_second: Vec = set2 .par_iter() .filter(|item| !set1.contains(item)) .cloned() .collect(); (common, only_in_first, only_in_second) }