use crate::{ annotation::Annotations, helpers::Hash128, positions::{GenomePosition, GetGenomePosition, VcfPosition}, runners::Run, variant::variant_collection::VariantCollection, }; use anyhow::{anyhow, Context, Ok}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VcfVariant { pub hash: Hash128, pub position: GenomePosition, pub id: String, pub reference: ReferenceAlternative, pub alternative: ReferenceAlternative, pub quality: Option, pub filter: Filter, pub infos: Infos, pub formats: Formats, } impl PartialEq for VcfVariant { fn eq(&self, other: &Self) -> bool { // Nota bene: id, filter, info, format and quality is intentionally not compared self.position == other.position && self.reference == other.reference && self.alternative == other.alternative } } impl Eq for VcfVariant {} impl FromStr for VcfVariant { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let v: Vec<&str> = s.split('\t').collect(); let vcf_position: VcfPosition = ( *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?, *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?, ) .try_into() .context(format!("Can't parse position from: {s}"))?; let formats = if v.len() == 10 { ( *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?, *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?, ) .try_into() .context(format!("Can't parse formats from: {s}"))? } else { Formats::default() }; let position: GenomePosition = vcf_position.into(); let reference: ReferenceAlternative = v .get(3) .ok_or(anyhow!("Can't parse reference from: {s}"))? .parse() .context(format!("Can't parse reference from: {s}"))?; let alternative: ReferenceAlternative = v .get(4) .ok_or(anyhow!("Can't parse alternative from: {s}"))? .parse() .context(format!("Can't parse alternative from: {s}"))?; // Blake3 128 bytes Hash let mut hasher = blake3::Hasher::new(); hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes hasher.update(reference.to_string().as_bytes()); // Reference string as bytes hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes let hash = hasher.finalize(); let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap()); Ok(Self { hash, position, id: v .get(2) .ok_or(anyhow!("Can't parse id from: {s}"))? .to_string(), reference, alternative, quality: v .get(5) .map(|s| s.parse::().ok()) // Try to parse as f64; returns Option .unwrap_or(None), filter: v .get(6) .ok_or(anyhow!("Can't parse filter from: {s}"))? .parse() .context(format!("Can't parse filter from: {s}"))?, infos: v .get(7) .ok_or(anyhow!("Can't parse infos from: {s}"))? .parse() .context(format!("Can't parse infos from: {s}"))?, formats, }) } } // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag impl VcfVariant { pub fn into_vcf_row(&self) -> String { let vcf_position: VcfPosition = self.position.clone().into(); let (contig, position) = vcf_position.into(); let mut columns = vec![ contig, position, self.id.to_string(), self.reference.to_string(), self.alternative.to_string(), self.quality .map(|v| v.to_string()) .unwrap_or(".".to_string()), self.filter.to_string(), self.infos.to_string(), ]; if !self.formats.0.is_empty() { let (format, values) = self.formats.clone().into(); columns.push(format); columns.push(values); } columns.join("\t") } pub fn hash(&self) -> Hash128 { self.hash } pub fn commun_deepvariant_clairs(&self) -> VcfVariant { VcfVariant { hash: self.hash, position: self.position.clone(), id: self.id.clone(), reference: self.reference.clone(), alternative: self.alternative.clone(), quality: self.quality, filter: Filter::Other(".".to_string()), infos: Infos(vec![Info::Empty]), formats: self.formats.commun_deepvariant_clairs(), } } pub fn has_svtype(&self) -> bool { self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_))) } pub fn svtype(&self) -> Option { self.infos.0.iter().find_map(|e| { if let Info::SVTYPE(sv_type) = e { Some(sv_type.clone()) } else { None } }) } pub fn alteration_category(&self) -> AlterationCategory { match (&self.reference, &self.alternative) { (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => { AlterationCategory::SNV } (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => { AlterationCategory::INS } (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => { AlterationCategory::Other } (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => { AlterationCategory::DEL } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() < b.len() => { AlterationCategory::INS } (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) if a.len() > b.len() => { AlterationCategory::DEL } _ => match self.svtype() { Some(sv_type) => AlterationCategory::from(sv_type), None => AlterationCategory::Other, }, // (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotides(_)) => { // AlterationCategory::Rep // } // (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => { // AlterationCategory::Other // } // (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => { // AlterationCategory::Other // } // (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => { // AlterationCategory::Other // } // (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => { // AlterationCategory::Other // } } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub enum AlterationCategory { SNV, DEL, INS, DUP, INV, CNV, BND, Other, } impl fmt::Display for AlterationCategory { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { AlterationCategory::SNV => "SNV", AlterationCategory::DEL => "DEL", AlterationCategory::INS => "INS", AlterationCategory::DUP => "DUP", AlterationCategory::INV => "INV", AlterationCategory::CNV => "CNV", AlterationCategory::BND => "BND", AlterationCategory::Other => "Other", } ) } } impl From for AlterationCategory { fn from(sv_type: SVType) -> Self { match sv_type { SVType::DEL => AlterationCategory::DEL, SVType::INS => AlterationCategory::INS, SVType::DUP => AlterationCategory::DUP, SVType::INV => AlterationCategory::INV, SVType::CNV => AlterationCategory::CNV, SVType::BND => AlterationCategory::BND, } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub enum SVType { DEL, INS, DUP, INV, CNV, BND, } impl FromStr for SVType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "DEL" => Ok(SVType::DEL), "INS" => Ok(SVType::INS), "DUP" => Ok(SVType::DUP), "INV" => Ok(SVType::INV), "CNV" => Ok(SVType::CNV), "BND" => Ok(SVType::BND), _ => Err(anyhow!("Can't parse SVTYPE={s}")), } } } impl fmt::Display for SVType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match self { SVType::DEL => "DEL", SVType::INS => "INS", SVType::DUP => "DUP", SVType::INV => "INV", SVType::CNV => "CNV", SVType::BND => "BND", } ) } } impl VariantId for VcfVariant { fn variant_id(&self) -> String { format!("{}_{}>{}", self.position, self.reference, self.alternative) } } impl GetGenomePosition for VcfVariant { fn position(&self) -> &GenomePosition { &self.position } } impl PartialOrd for VcfVariant { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for VcfVariant { fn cmp(&self, other: &Self) -> Ordering { self.position.cmp(&other.position) } } /// Info #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] pub struct Infos(Vec); impl FromStr for Infos { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { Ok(Self( s.split(";") .map(Info::from_str) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?, )) } } impl fmt::Display for Infos { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", self.0 .iter() .map(|e| e.to_string()) .collect::>() .join(";") ) } } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum Info { Empty, H, F, P, FAU(u32), FCU(u32), FGU(u32), FTU(u32), RAU(u32), RCU(u32), RGU(u32), RTU(u32), SVTYPE(SVType), SVLEN(i32), END(u32), MATEID(String), SVINSLEN(u32), SVINSSEQ(String), } impl FromStr for Info { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { if s.contains("=") { let (key, value) = s .split_once('=') .context(format!("Can't split with `=` {s}"))?; Ok(match key { "FAU" => Info::FAU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "FCU" => Info::FCU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "FGU" => Info::FGU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "FTU" => Info::FTU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "RAU" => Info::RAU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "RCU" => Info::RCU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "RGU" => Info::RGU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "RTU" => Info::RTU( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "SVTYPE" => Info::SVTYPE(value.parse()?), "SVLEN" => Info::SVLEN( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "END" => Info::END( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "MATEID" => Info::MATEID(value.to_string()), "SVINSLEN" => Info::SVINSLEN( value .parse() .context(format!("Can't parse into u32: {value}"))?, ), "SVINSSEQ" => Info::SVINSSEQ(value.to_string()), _ => Info::Empty, }) } else { Ok(match s { "H" => Info::H, "F" => Info::F, "P" => Info::P, _ => Info::Empty, }) } } } impl fmt::Display for Info { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Info::Empty => write!(f, "."), Info::H => write!(f, "H"), Info::F => write!(f, "F"), Info::P => write!(f, "P"), Info::FAU(v) => write!(f, "FAU={v}"), Info::FCU(v) => write!(f, "FCU={v}"), Info::FGU(v) => write!(f, "FGU={v}"), Info::FTU(v) => write!(f, "FTU={v}"), Info::RAU(v) => write!(f, "RAU={v}"), Info::RCU(v) => write!(f, "RCU={v}"), Info::RGU(v) => write!(f, "RGU={v}"), Info::RTU(v) => write!(f, "RTU={v}"), Info::SVTYPE(v) => write!(f, "SVTYPE={v}"), Info::SVLEN(v) => write!(f, "SVLEN={v}"), Info::END(v) => write!(f, "END={v}"), Info::MATEID(v) => write!(f, "MATEID={v}"), Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"), Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"), } } } /// Format #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum Format { // DeepVariant GT(String), GQ(u32), DP(u32), AD(Vec), VAF(f32), PL(Vec), // Clairs AF(f32), NAF(u32), NDP(u32), NAD(Vec), AU(u32), CU(u32), GU(u32), TU(u32), NAU(u32), NCU(u32), NGU(u32), NTU(u32), // nanomonsv TR(u32), VR(u32), Other((String, String)), // (key, value) } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] pub struct Formats(Vec); impl TryFrom<(&str, &str)> for Formats { type Error = anyhow::Error; fn try_from((k, v): (&str, &str)) -> anyhow::Result { let keys: Vec<&str> = k.split(':').collect(); let values: Vec<&str> = v.split(':').collect(); if keys.len() != values.len() { anyhow::bail!("Mismatch between keys and values count for {k} {v}"); } Ok(Self( keys.into_iter() .zip(values) .map(|(key, value)| Format::try_from((key, value))) .collect::, _>>() .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?, )) } } impl From for (String, String) { fn from(formats: Formats) -> Self { let mut keys = Vec::new(); let mut values = Vec::new(); for format in formats.0 { let (key, value): (String, String) = format.into(); keys.push(key); values.push(value); } (keys.join(":"), values.join(":")) } } impl TryFrom<(&str, &str)> for Format { type Error = anyhow::Error; fn try_from((key, value): (&str, &str)) -> anyhow::Result { Ok(match key { "GT" => Format::GT(value.to_string()), "GQ" => Format::GQ(value.parse().context(format!("Can't parse GQ: {value}"))?), "DP" => Format::DP(value.parse().context(format!("Can't parse DP: {value}"))?), "AD" => Format::AD( value .split(',') .map(|e| e.parse().context("Failed to parse AD")) .collect::>>()?, ), "VAF" => Format::VAF(value.parse().context(format!("Can't parse VAF: {value}"))?), "PL" => Format::PL( value .split(',') .map(|e| e.parse().context("Failed to parse AD")) .collect::>>()?, ), "TR" => Format::TR(value.parse()?), "VR" => Format::TR(value.parse()?), _ => Format::Other((key.to_string(), value.to_string())), }) } } impl From for (String, String) { fn from(format: Format) -> Self { let concat = |values: Vec| -> String { values .iter() .map(|v| v.to_string()) .collect::>() .join(",") }; match format { Format::GT(value) => ("GT".to_string(), value), Format::GQ(value) => ("GQ".to_string(), value.to_string()), Format::DP(value) => ("DP".to_string(), value.to_string()), Format::AD(values) => ("AD".to_string(), concat(values)), Format::VAF(value) => ("VAF".to_string(), value.to_string()), Format::PL(values) => ("PL".to_string(), concat(values)), Format::Other((key, value)) => (key, value), Format::AF(value) => ("AF".to_string(), value.to_string()), Format::NAF(value) => ("NAF".to_string(), value.to_string()), Format::NDP(value) => ("NDP".to_string(), value.to_string()), Format::NAD(values) => ("NAD".to_string(), concat(values)), Format::AU(value) => ("AU".to_string(), value.to_string()), Format::CU(value) => ("CU".to_string(), value.to_string()), Format::GU(value) => ("GU".to_string(), value.to_string()), Format::TU(value) => ("TU".to_string(), value.to_string()), Format::NAU(value) => ("NAU".to_string(), value.to_string()), Format::NCU(value) => ("NCU".to_string(), value.to_string()), Format::NGU(value) => ("NGU".to_string(), value.to_string()), Format::NTU(value) => ("NTU".to_string(), value.to_string()), Format::TR(value) => ("TR".to_string(), value.to_string()), Format::VR(value) => ("VR".to_string(), value.to_string()), } } } impl Formats { pub fn commun_deepvariant_clairs(&self) -> Self { let filtered_vec: Vec = self .0 .clone() .into_iter() .map(|e| { if let Format::VAF(v) = e { Format::AF(v) } else { e } }) .filter(|format| { matches!( format, Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) | Format::AF(_) ) }) .collect(); Formats(filtered_vec) } } /// Filter #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub enum Filter { PASS, Other(String), } impl FromStr for Filter { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "PASS" => Ok(Filter::PASS), _ => Ok(Filter::Other(s.to_string())), } } } impl fmt::Display for Filter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Filter::PASS => write!(f, "PASS"), Filter::Other(ref s) => write!(f, "{}", s), } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub enum ReferenceAlternative { Nucleotide(Base), Nucleotides(Vec), Unstructured(String), } impl FromStr for ReferenceAlternative { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let possible_bases = s.as_bytes().iter(); let mut res: Vec = Vec::new(); for &base in possible_bases { match base.try_into() { std::result::Result::Ok(b) => res.push(b), Err(_) => { return Ok(Self::Unstructured(s.to_string())); } } } if res.len() == 1 { Ok(Self::Nucleotide(res.pop().unwrap())) } else { Ok(Self::Nucleotides(res)) } } } impl fmt::Display for ReferenceAlternative { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let string = match self { ReferenceAlternative::Nucleotide(b) => b.to_string(), ReferenceAlternative::Nucleotides(bases) => bases .iter() .fold(String::new(), |acc, e| format!("{}{}", acc, e)), ReferenceAlternative::Unstructured(s) => s.to_string(), }; write!(f, "{}", string) } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)] pub enum Base { A, T, C, G, N, } impl TryFrom for Base { type Error = anyhow::Error; fn try_from(base: u8) -> anyhow::Result { match base { b'A' => Ok(Base::A), b'T' => Ok(Base::T), b'C' => Ok(Base::C), b'G' => Ok(Base::G), b'N' => Ok(Base::N), _ => Err(anyhow::anyhow!( "Unknown base: {}", String::from_utf8_lossy(&[base]) )), } } } impl Base { pub fn into_u8(self) -> u8 { match self { Base::A => b'A', Base::T => b'T', Base::C => b'C', Base::G => b'G', Base::N => b'N', } } } impl fmt::Display for Base { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Use `self.number` to refer to each positional data point. let str = match self { Base::A => "A", Base::T => "T", Base::C => "C", Base::G => "G", Base::N => "N", }; write!(f, "{}", str) } } pub trait Variants { fn variants(&self, annotations: &Annotations) -> anyhow::Result; } pub trait VariantId { fn variant_id(&self) -> String; } pub trait RunnerVariants: Run + Variants + Send + Sync {} pub type CallerBox = Box; #[macro_export] macro_rules! init_somatic_callers { ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => { vec![ $( Box::new(<$runner>::initialize($id, $config.clone())?) as CallerBox ),+ ] }; } #[macro_export] macro_rules! init_solo_callers { ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => { vec![ $( Box::new(<$runner>::initialize($id, $arg, $config.clone())?) as CallerBox ),+ ] }; } pub fn load_variants( iterable: &mut [CallerBox], annotations: &Annotations, ) -> anyhow::Result> { // First, run all items in parallel iterable .par_iter_mut() .try_for_each(|runner| runner.run())?; // Then, collect variants from all items in parallel let variants: Vec = iterable .par_iter() .map(|runner| runner.variants(annotations)) .collect::>>()?; Ok(variants) } pub fn parallel_intersection( vec1: &[T], vec2: &[T], ) -> (Vec, Vec, Vec) { let set1: HashSet<_> = vec1.par_iter().cloned().collect(); let set2: HashSet<_> = vec2.par_iter().cloned().collect(); let common: Vec = set1 .par_iter() .filter(|item| set2.contains(item)) .cloned() .collect(); let only_in_first: Vec = set1 .par_iter() .filter(|item| !set2.contains(item)) .cloned() .collect(); let only_in_second: Vec = set2 .par_iter() .filter(|item| !set1.contains(item)) .cloned() .collect(); (common, only_in_first, only_in_second) }