| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130 |
- use crate::{
- annotation::Annotations,
- helpers::Hash128,
- positions::{GenomePosition, GetGenomePosition, VcfPosition},
- runners::Run,
- variant::variant_collection::VariantCollection,
- };
- use anyhow::{anyhow, Context};
- use log::warn;
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
- /// Represents a variant in the Variant Call Format (VCF).
- #[derive(Debug, Clone, Serialize, Deserialize)]
- pub struct VcfVariant {
- /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
- pub hash: Hash128,
- /// The genomic position of the variant.
- pub position: GenomePosition,
- /// The identifier of the variant.
- pub id: String,
- /// The reference allele.
- pub reference: ReferenceAlternative,
- /// The alternative allele.
- pub alternative: ReferenceAlternative,
- /// The quality score of the variant call, if available.
- pub quality: Option<f32>,
- /// The filter status of the variant.
- pub filter: Filter,
- /// Additional information about the variant.
- pub infos: Infos,
- /// Genotype information and other sample-specific data.
- pub formats: Formats,
- }
- impl PartialEq for VcfVariant {
- /// Compares two VcfVariants for equality.
- ///
- /// Note: This comparison only considers position, reference, and alternative.
- /// It intentionally ignores id, filter, info, format, and quality.
- fn eq(&self, other: &Self) -> bool {
- // Nota bene: id, filter, info, format and quality is intentionally not compared
- self.position == other.position
- && self.reference == other.reference
- && self.alternative == other.alternative
- }
- }
- impl Eq for VcfVariant {}
- impl FromStr for VcfVariant {
- type Err = anyhow::Error;
- /// Parses a VcfVariant from a string representation.
- ///
- /// The input string is expected to be a tab-separated VCF line.
- ///
- /// # Errors
- ///
- /// Returns an error if parsing fails for any field.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let v: Vec<&str> = s.split('\t').collect();
- let vcf_position: VcfPosition = (
- *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
- *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
- )
- .try_into()
- .context(format!("Can't parse position from: {s}"))?;
- let formats = if v.len() >= 10 {
- (
- *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
- *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
- )
- .try_into()
- .context(format!("Can't parse formats from: {s}"))?
- } else {
- Formats::default()
- };
- let position: GenomePosition = vcf_position.into();
- let reference: ReferenceAlternative = v
- .get(3)
- .ok_or(anyhow!("Can't parse reference from: {s}"))?
- .parse()
- .context(format!("Can't parse reference from: {s}"))?;
- let alternative: ReferenceAlternative = v
- .get(4)
- .ok_or(anyhow!("Can't parse alternative from: {s}"))?
- .parse()
- .context(format!("Can't parse alternative from: {s}"))?;
- // Blake3 128 bytes Hash
- let mut hasher = blake3::Hasher::new();
- hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
- hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
- hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
- hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
- let hash = hasher.finalize();
- let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
- Ok(Self {
- hash,
- position,
- id: v
- .get(2)
- .ok_or(anyhow!("Can't parse id from: {s}"))?
- .to_string(),
- reference,
- alternative,
- quality: v
- .get(5)
- .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
- .unwrap_or(None),
- filter: v
- .get(6)
- .ok_or(anyhow!("Can't parse filter from: {s}"))?
- .parse()
- .context(format!("Can't parse filter from: {s}"))?,
- infos: v
- .get(7)
- .ok_or(anyhow!("Can't parse infos from: {s}"))?
- .parse()
- .context(format!("Can't parse infos from: {s}"))?,
- formats,
- })
- }
- }
- // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
- impl VcfVariant {
- /// Converts the VcfVariant into a VCF-formatted row string.
- ///
- /// This method creates a tab-separated string representation of the variant,
- /// suitable for writing to a VCF file.
- pub fn into_vcf_row(&self) -> String {
- let vcf_position: VcfPosition = self.position.clone().into();
- let (contig, position) = vcf_position.into();
- let mut columns = vec![
- contig,
- position,
- self.id.to_string(),
- self.reference.to_string(),
- self.alternative.to_string(),
- self.quality
- .map(|v| v.to_string())
- .unwrap_or(".".to_string()),
- self.filter.to_string(),
- self.infos.to_string(),
- ];
- if !self.formats.0.is_empty() {
- let (format, values) = self.formats.clone().into();
- columns.push(format);
- columns.push(values);
- }
- columns.join("\t")
- }
- /// Returns the hash of the variant.
- pub fn hash(&self) -> Hash128 {
- self.hash
- }
- /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
- ///
- /// This method generates a new variant with shared properties, resetting some fields
- /// to default or empty values.
- pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
- VcfVariant {
- hash: self.hash,
- position: self.position.clone(),
- id: self.id.clone(),
- reference: self.reference.clone(),
- alternative: self.alternative.clone(),
- quality: self.quality,
- filter: Filter::Other(".".to_string()),
- infos: Infos(vec![Info::Empty]),
- formats: self.formats.commun_deepvariant_clairs(),
- }
- }
- /// Checks if the variant has an SVTYPE info field.
- ///
- /// Returns true if the variant contains structural variation type information.
- pub fn has_svtype(&self) -> bool {
- self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
- }
- /// Retrieves the structural variation type of the variant, if present.
- ///
- /// Returns Some(SVType) if the variant has an SVTYPE info field,
- pub fn svtype(&self) -> Option<SVType> {
- self.infos.0.iter().find_map(|e| {
- if let Info::SVTYPE(sv_type) = e {
- Some(sv_type.clone())
- } else {
- None
- }
- })
- }
- /// Determines the alteration category of the variant.
- ///
- /// This method analyzes the reference and alternative alleles to classify
- /// the variant into one of several alteration categories:
- /// - SNV (Single Nucleotide Variant)
- /// - INS (Insertion)
- /// - DEL (Deletion)
- /// - Other (including structural variants and complex alterations)
- ///
- /// The classification is based on the following rules:
- /// 1. If both reference and alternative are single nucleotides, it's an SNV.
- /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
- /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
- /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
- /// 5. If none of the above apply, it checks for structural variant types.
- /// 6. If no structural variant type is found, it's classified as "Other".
- ///
- /// # Returns
- /// An `AlterationCategory` enum representing the type of alteration.
- pub fn alteration_category(&self) -> AlterationCategory {
- match (&self.reference, &self.alternative) {
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::SNV
- }
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
- AlterationCategory::INS
- }
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
- AlterationCategory::Other
- }
- (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::DEL
- }
- (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
- if a.len() < b.len() =>
- {
- AlterationCategory::INS
- }
- (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
- if a.len() > b.len() =>
- {
- AlterationCategory::DEL
- }
- _ => match self.svtype() {
- Some(sv_type) => {
- if let Ok(bnd_desc) = self.bnd_desc() {
- if bnd_desc.a_contig != bnd_desc.b_contig {
- AlterationCategory::TRL
- } else {
- AlterationCategory::BND
- }
- } else {
- AlterationCategory::from(sv_type)
- }
- }
- None => AlterationCategory::Other,
- },
- }
- }
- /// Parses and constructs a BND (breakend) description from the alternative string.
- ///
- /// This function interprets the BND notation in the alternative string and creates
- /// a `BNDDesc` struct containing detailed information about the breakend.
- ///
- /// # Returns
- /// - `Ok(BNDDesc)` if parsing is successful
- /// - `Err` if parsing fails or if the alteration is not a BND
- ///
- /// # Errors
- /// This function will return an error if:
- /// - The alteration category is not BND
- /// - The alternative string cannot be parsed into exactly 3 parts
- /// - The b_position cannot be parsed as a number
- pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
- let alt = self.alternative.to_string();
- if alt.contains('[') || alt.contains(']') {
- let extending_right = alt.contains('[');
- let alt_rep = alt.replace("[", ";").replace("]", ";");
- let alt_is_joined_after = !alt_rep.starts_with(";");
- let parts = alt_rep
- .split(";")
- .filter(|c| !c.is_empty())
- .collect::<Vec<&str>>();
- if alt_is_joined_after {
- // a is ref b is alt
- let a_sens = true;
- let a_contig = self.position.contig();
- let a_position = self.position.position + 1;
- let added_nt = parts[0][1..].to_string();
- let b_sens = alt.contains('[');
- let (contig, pos) = parts[1].split_once(':').unwrap();
- let b_contig = contig.to_string();
- let b_position: u32 = pos.parse()?;
- Ok(BNDDesc {
- a_contig,
- a_position,
- a_sens,
- b_contig,
- b_position,
- b_sens,
- added_nt,
- })
- } else {
- // a is alt b is ref
- let b_sens = true;
- let b_contig = self.position.contig();
- let b_position = self.position.position + 1;
- let mut added_nt = parts[1].to_string();
- added_nt.pop();
- let a_sens = alt.contains(']');
- let (contig, pos) = parts[0].split_once(':').unwrap();
- let a_contig = contig.to_string();
- let a_position: u32 = pos.parse()?;
- Ok(BNDDesc {
- a_contig,
- a_position,
- a_sens,
- b_contig,
- b_position,
- b_sens,
- added_nt,
- })
- }
- // let b_sens = alt.contains('[');
- //
- // let a_sens = if b_sens {
- // !alt.starts_with('[')
- // } else {
- // !alt.starts_with(']')
- // };
- //
- // let parts: Vec<&str> = alt
- // .split(&['[', ']', ':'])
- // .filter(|v| !v.is_empty())
- // .collect();
- //
- // if parts.len() != 3 {
- // return Err(anyhow::anyhow!("Failed to parse parts: {parts:?}"));
- // }
- //
- // let (nt, b_contig, b_position) = if a_sens {
- // (parts[0], parts[1], parts[2])
- // } else {
- // (parts[2], parts[0], parts[1])
- // };
- //
- // let added_nt = if nt.len() > 1 {
- // nt[1..].to_string()
- // } else {
- // nt.to_string()
- // };
- //
- // Ok(BNDDesc {
- // a_contig: self.position.contig(),
- // a_position: self.position.position + 1,
- // a_sens,
- // b_contig: b_contig.to_string(),
- // b_position: b_position
- // .parse()
- // .map_err(|e| anyhow::anyhow!("Failed to parse: {b_position}\n{e}"))?,
- // b_sens,
- // added_nt,
- // })
- } else {
- Err(anyhow::anyhow!("The alteration is not BND: {alt}"))
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub struct BNDDesc {
- pub a_contig: String,
- pub a_position: u32, // 1-based
- pub a_sens: bool,
- pub b_contig: String,
- pub b_position: u32, // 1-based
- pub b_sens: bool,
- pub added_nt: String,
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub enum AlterationCategory {
- SNV,
- DEL,
- INS,
- DUP,
- INV,
- CNV,
- TRL,
- BND,
- Other,
- }
- impl fmt::Display for AlterationCategory {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}",
- match self {
- AlterationCategory::SNV => "SNV",
- AlterationCategory::DEL => "DEL",
- AlterationCategory::INS => "INS",
- AlterationCategory::DUP => "DUP",
- AlterationCategory::INV => "INV",
- AlterationCategory::CNV => "CNV",
- AlterationCategory::BND | AlterationCategory::TRL => "TRL",
- AlterationCategory::Other => "Other",
- }
- )
- }
- }
- impl From<SVType> for AlterationCategory {
- fn from(sv_type: SVType) -> Self {
- match sv_type {
- SVType::DEL => AlterationCategory::DEL,
- SVType::INS => AlterationCategory::INS,
- SVType::DUP => AlterationCategory::DUP,
- SVType::INV => AlterationCategory::INV,
- SVType::CNV => AlterationCategory::CNV,
- SVType::BND => AlterationCategory::BND,
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
- pub enum SVType {
- DEL,
- INS,
- DUP,
- INV,
- CNV,
- BND,
- }
- impl FromStr for SVType {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "DEL" => Ok(SVType::DEL),
- "INS" => Ok(SVType::INS),
- "DUP" => Ok(SVType::DUP),
- "INV" => Ok(SVType::INV),
- "CNV" => Ok(SVType::CNV),
- "BND" => Ok(SVType::BND),
- _ => Err(anyhow!("Can't parse SVTYPE={s}")),
- }
- }
- }
- impl fmt::Display for SVType {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}",
- match self {
- SVType::DEL => "DEL",
- SVType::INS => "INS",
- SVType::DUP => "DUP",
- SVType::INV => "INV",
- SVType::CNV => "CNV",
- SVType::BND => "BND",
- }
- )
- }
- }
- impl VariantId for VcfVariant {
- fn variant_id(&self) -> String {
- format!("{}_{}>{}", self.position, self.reference, self.alternative)
- }
- }
- impl GetGenomePosition for VcfVariant {
- fn position(&self) -> &GenomePosition {
- &self.position
- }
- }
- impl PartialOrd for VcfVariant {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
- }
- impl Ord for VcfVariant {
- fn cmp(&self, other: &Self) -> Ordering {
- self.position.cmp(&other.position)
- }
- }
- /// Info
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
- pub struct Infos(pub Vec<Info>);
- impl FromStr for Infos {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- Ok(Self(
- s.split(";")
- .map(Info::from_str)
- .collect::<Result<Vec<Info>, _>>()
- .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
- ))
- }
- }
- impl fmt::Display for Infos {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}",
- self.0
- .iter()
- .map(|e| e.to_string())
- .collect::<Vec<String>>()
- .join(";")
- )
- }
- }
- #[allow(non_camel_case_types)]
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub enum Info {
- Empty,
- H,
- F,
- P,
- FAU(u32),
- FCU(u32),
- FGU(u32),
- FTU(u32),
- RAU(u32),
- RCU(u32),
- RGU(u32),
- RTU(u32),
- SVTYPE(SVType),
- MATEID(String),
- NORMAL_READ_SUPPORT(u32),
- TUMOUR_READ_SUPPORT(u32),
- NORMAL_ALN_SUPPORT(u32),
- TUMOUR_ALN_SUPPORT(u32),
- SVLEN(i32),
- TUMOUR_DP_BEFORE(Vec<u32>),
- TUMOUR_DP_AT(Vec<u32>),
- TUMOUR_DP_AFTER(Vec<u32>),
- NORMAL_DP_BEFORE(Vec<u32>),
- NORMAL_DP_AT(Vec<u32>),
- NORMAL_DP_AFTER(Vec<u32>),
- TUMOUR_AF(Vec<f32>),
- NORMAL_AF(Vec<f32>),
- BP_NOTATION(String),
- SOURCE(String),
- CLUSTERED_READS_TUMOUR(u32),
- CLUSTERED_READS_NORMAL(u32),
- TUMOUR_ALT_HP(Vec<u32>),
- TUMOUR_PS(Vec<String>),
- NORMAL_ALT_HP(Vec<u32>),
- NORMAL_PS(Vec<String>),
- TUMOUR_TOTAL_HP_AT(Vec<u32>),
- NORMAL_TOTAL_HP_AT(Vec<u32>),
- ORIGIN_STARTS_STD_DEV(f32),
- ORIGIN_MAPQ_MEAN(f32),
- ORIGIN_EVENT_SIZE_STD_DEV(f32),
- ORIGIN_EVENT_SIZE_MEDIAN(f32),
- ORIGIN_EVENT_SIZE_MEAN(f32),
- END_STARTS_STD_DEV(f32),
- END_MAPQ_MEAN(f32),
- END_EVENT_SIZE_STD_DEV(f32),
- END_EVENT_SIZE_MEDIAN(f32),
- END_EVENT_SIZE_MEAN(f32),
- CLASS(String),
- END(u32),
- SVINSLEN(u32),
- SVINSSEQ(String),
- }
- impl FromStr for Info {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- if s.contains('=') {
- let (key, value) = s
- .split_once('=')
- .context(format!("Can't split with `=` in string: {s}"))?;
- Ok(match key {
- "FAU" => Info::FAU(parse_value(value, key)?),
- "FCU" => Info::FCU(parse_value(value, key)?),
- "FGU" => Info::FGU(parse_value(value, key)?),
- "FTU" => Info::FTU(parse_value(value, key)?),
- "RAU" => Info::RAU(parse_value(value, key)?),
- "RCU" => Info::RCU(parse_value(value, key)?),
- "RGU" => Info::RGU(parse_value(value, key)?),
- "RTU" => Info::RTU(parse_value(value, key)?),
- "SVLEN" => Info::SVLEN(parse_value(value, key)?),
- "END" => Info::END(parse_value(value, key)?),
- "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
- "SVTYPE" => Info::SVTYPE(value.parse()?),
- "MATEID" => Info::MATEID(value.to_string()),
- "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
- "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
- "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
- "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
- "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
- "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
- "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
- "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
- "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
- "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
- "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
- "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
- "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
- "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
- "SOURCE" => Info::SOURCE(value.to_string()),
- "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
- "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
- "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
- "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
- "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
- "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
- "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
- "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
- "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
- "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
- "ORIGIN_EVENT_SIZE_STD_DEV" => {
- Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
- }
- "ORIGIN_EVENT_SIZE_MEDIAN" => {
- Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
- }
- "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
- "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
- "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
- "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
- "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
- "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
- "CLASS" => Info::CLASS(value.to_string()),
- _ => Info::Empty,
- })
- } else {
- Ok(match s {
- "H" => Info::H,
- "F" => Info::F,
- "P" => Info::P,
- _ => Info::Empty,
- })
- }
- }
- }
- impl fmt::Display for Info {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match self {
- Info::Empty => write!(f, "."),
- Info::H => write!(f, "H"),
- Info::F => write!(f, "F"),
- Info::P => write!(f, "P"),
- Info::FAU(v) => write!(f, "FAU={v}"),
- Info::FCU(v) => write!(f, "FCU={v}"),
- Info::FGU(v) => write!(f, "FGU={v}"),
- Info::FTU(v) => write!(f, "FTU={v}"),
- Info::RAU(v) => write!(f, "RAU={v}"),
- Info::RCU(v) => write!(f, "RCU={v}"),
- Info::RGU(v) => write!(f, "RGU={v}"),
- Info::RTU(v) => write!(f, "RTU={v}"),
- Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
- Info::SVLEN(v) => write!(f, "SVLEN={v}"),
- Info::END(v) => write!(f, "END={v}"),
- Info::MATEID(v) => write!(f, "MATEID={v}"),
- Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
- Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
- Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
- Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
- Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
- Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
- Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
- Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
- Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
- Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
- Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
- Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
- Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
- Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
- Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
- Info::SOURCE(v) => write!(f, "SOURCE={v}"),
- Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
- Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
- Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
- Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
- Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
- Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
- Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
- Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
- Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
- Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
- Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
- Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
- Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
- Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
- Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
- Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
- Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
- Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
- Info::CLASS(v) => write!(f, "CLASS={v}"),
- }
- }
- }
- pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
- v.iter()
- .map(|n| n.to_string())
- .collect::<Vec<String>>()
- .join(",")
- }
- /// Format
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub enum Format {
- // DeepVariant
- GT(String),
- GQ(u32),
- DP(u32),
- AD(Vec<u32>),
- VAF(f32),
- PL(Vec<u32>),
- // Clairs
- // when format begins with N: normal
- // AF(f32),
- // NAF(f32), // DP(u32),
- NDP(u32),
- NAD(Vec<u32>),
- AU(u32),
- CU(u32),
- GU(u32),
- TU(u32),
- NAU(u32),
- NCU(u32),
- NGU(u32),
- NTU(u32),
- // nanomonsv
- TR(u32),
- VR(u32),
- Other((String, String)), // (key, value)
- }
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
- pub struct Formats(pub Vec<Format>);
- impl TryFrom<(&str, &str)> for Formats {
- type Error = anyhow::Error;
- fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
- let keys: Vec<&str> = k.split(':').collect();
- let values: Vec<&str> = v.split(':').collect();
- if keys.len() != values.len() {
- anyhow::bail!("Mismatch between keys and values count for {k} {v}");
- }
- Ok(Self(
- keys.into_iter()
- .zip(values)
- .map(|(key, value)| Format::try_from((key, value)))
- .collect::<Result<Vec<Format>, _>>()
- .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
- ))
- }
- }
- impl From<Formats> for (String, String) {
- fn from(formats: Formats) -> Self {
- let mut keys = Vec::new();
- let mut values = Vec::new();
- for format in formats.0 {
- let (key, value): (String, String) = format.into();
- keys.push(key);
- values.push(value);
- }
- (keys.join(":"), values.join(":"))
- }
- }
- impl TryFrom<(&str, &str)> for Format {
- type Error = anyhow::Error;
- fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
- let format = match key {
- "GT" => Format::GT(value.to_string()),
- "GQ" => Format::GQ(parse_value(value, key)?),
- "DP" => Format::DP(parse_value(value, key)?),
- "AD" => Format::AD(parse_vec_value(value, key)?),
- "VAF" => Format::VAF(parse_value(value, key)?),
- // "AF" => Format::AF(parse_value(value, key)?),
- // "NAF" => Format::NAF(parse_value(value, key)?),
- "NDP" => Format::NDP(parse_value(value, key)?),
- "NAD" => Format::NAD(parse_vec_value(value, key)?),
- "AU" => Format::AU(parse_value(value, key)?),
- "CU" => Format::CU(parse_value(value, key)?),
- "GU" => Format::GU(parse_value(value, key)?),
- "TU" => Format::TU(parse_value(value, key)?),
- "NAU" => Format::NAU(parse_value(value, key)?),
- "NCU" => Format::NCU(parse_value(value, key)?),
- "NGU" => Format::NGU(parse_value(value, key)?),
- "NTU" => Format::NTU(parse_value(value, key)?),
- "PL" => Format::PL(parse_vec_value(value, key)?),
- "TR" => Format::TR(parse_value(value, key)?),
- "VR" => Format::VR(parse_value(value, key)?),
- _ => Format::Other((key.to_string(), value.to_string())),
- };
- Ok(format)
- }
- }
- // Helper function to parse a single value (DeepSeek)
- fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
- where
- T::Err: std::fmt::Debug,
- {
- value
- .parse()
- .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
- .context(format!("Can't parse {}: {}", key, value)) // Add context
- }
- // Helper function to parse comma-separated values (DeepSeek)
- fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
- where
- T::Err: std::fmt::Debug,
- {
- value
- .split(',')
- .map(|e| {
- e.parse()
- .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
- .context(format!("Failed to parse {}: {}", key, e)) // Add context
- })
- .collect()
- }
- impl From<Format> for (String, String) {
- fn from(format: Format) -> Self {
- let concat = |values: Vec<u32>| -> String {
- values
- .iter()
- .map(|v| v.to_string())
- .collect::<Vec<_>>()
- .join(",")
- };
- match format {
- Format::GT(value) => ("GT".to_string(), value),
- Format::GQ(value) => ("GQ".to_string(), value.to_string()),
- Format::DP(value) => ("DP".to_string(), value.to_string()),
- Format::AD(values) => ("AD".to_string(), concat(values)),
- Format::VAF(value) => ("VAF".to_string(), value.to_string()),
- Format::PL(values) => ("PL".to_string(), concat(values)),
- Format::Other((key, value)) => (key, value),
- // Format::AF(value) => ("AF".to_string(), value.to_string()),
- // Format::NAF(value) => ("NAF".to_string(), value.to_string()),
- Format::NDP(value) => ("NDP".to_string(), value.to_string()),
- Format::NAD(values) => ("NAD".to_string(), concat(values)),
- Format::AU(value) => ("AU".to_string(), value.to_string()),
- Format::CU(value) => ("CU".to_string(), value.to_string()),
- Format::GU(value) => ("GU".to_string(), value.to_string()),
- Format::TU(value) => ("TU".to_string(), value.to_string()),
- Format::NAU(value) => ("NAU".to_string(), value.to_string()),
- Format::NCU(value) => ("NCU".to_string(), value.to_string()),
- Format::NGU(value) => ("NGU".to_string(), value.to_string()),
- Format::NTU(value) => ("NTU".to_string(), value.to_string()),
- Format::TR(value) => ("TR".to_string(), value.to_string()),
- Format::VR(value) => ("VR".to_string(), value.to_string()),
- }
- }
- }
- impl Formats {
- pub fn commun_deepvariant_clairs(&self) -> Self {
- let filtered_vec: Vec<Format> = self
- .0
- .clone()
- .into_iter()
- .map(|e| {
- if let Format::VAF(_v) = e {
- e
- // Format::AF(v)
- } else {
- e
- }
- })
- .filter(|format| {
- matches!(
- format,
- Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
- )
- })
- .collect();
- Formats(filtered_vec)
- }
- }
- /// Filter
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
- pub enum Filter {
- PASS,
- Other(String),
- }
- impl FromStr for Filter {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "PASS" => Ok(Filter::PASS),
- _ => Ok(Filter::Other(s.to_string())),
- }
- }
- }
- impl fmt::Display for Filter {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match self {
- Filter::PASS => write!(f, "PASS"),
- Filter::Other(ref s) => write!(f, "{}", s),
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub enum ReferenceAlternative {
- Nucleotide(Base),
- Nucleotides(Vec<Base>),
- Unstructured(String),
- }
- impl FromStr for ReferenceAlternative {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let possible_bases = s.as_bytes().iter();
- let mut res: Vec<Base> = Vec::new();
- for &base in possible_bases {
- match base.try_into() {
- std::result::Result::Ok(b) => res.push(b),
- Err(_) => {
- return Ok(Self::Unstructured(s.to_string()));
- }
- }
- }
- if res.len() == 1 {
- Ok(Self::Nucleotide(res.pop().unwrap()))
- } else {
- Ok(Self::Nucleotides(res))
- }
- }
- }
- impl fmt::Display for ReferenceAlternative {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let string = match self {
- ReferenceAlternative::Nucleotide(b) => b.to_string(),
- ReferenceAlternative::Nucleotides(bases) => bases
- .iter()
- .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
- ReferenceAlternative::Unstructured(s) => s.to_string(),
- };
- write!(f, "{}", string)
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub enum Base {
- A,
- T,
- C,
- G,
- N,
- }
- impl TryFrom<u8> for Base {
- type Error = anyhow::Error;
- fn try_from(base: u8) -> anyhow::Result<Self> {
- match base {
- b'A' => Ok(Base::A),
- b'T' => Ok(Base::T),
- b'C' => Ok(Base::C),
- b'G' => Ok(Base::G),
- b'N' => Ok(Base::N),
- _ => Err(anyhow::anyhow!(
- "Unknown base: {}",
- String::from_utf8_lossy(&[base])
- )),
- }
- }
- }
- impl Base {
- pub fn into_u8(self) -> u8 {
- match self {
- Base::A => b'A',
- Base::T => b'T',
- Base::C => b'C',
- Base::G => b'G',
- Base::N => b'N',
- }
- }
- }
- impl fmt::Display for Base {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- // Use `self.number` to refer to each positional data point.
- let str = match self {
- Base::A => "A",
- Base::T => "T",
- Base::C => "C",
- Base::G => "G",
- Base::N => "N",
- };
- write!(f, "{}", str)
- }
- }
- pub trait Variants {
- fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
- }
- pub trait VariantId {
- fn variant_id(&self) -> String;
- }
- pub trait RunnerVariants: Run + Variants + Send + Sync {}
- pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
- #[macro_export]
- macro_rules! init_somatic_callers {
- ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {
- vec![
- $(
- Box::new(<$runner>::initialize($id, $config.clone())?) as CallerBox
- ),+
- ]
- };
- }
- #[macro_export]
- macro_rules! init_solo_callers {
- ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {
- vec![
- $(
- Box::new(<$runner>::initialize($id, $arg, $config.clone())?) as CallerBox
- ),+
- ]
- };
- }
- pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
- iterable
- .iter_mut()
- .try_for_each(|runner| runner.run())
- .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
- }
- pub fn load_variants(
- iterable: &mut [CallerBox],
- annotations: &Annotations,
- ) -> anyhow::Result<Vec<VariantCollection>> {
- iterable
- .par_iter()
- .map(|runner| {
- let r = runner.variants(annotations);
- if let Err(ref e) = r {
- warn!("{e}");
- };
- r
- })
- .filter(|r| r.is_ok())
- .collect::<anyhow::Result<Vec<_>>>()
- .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
- }
- pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
- vec1: &[T],
- vec2: &[T],
- ) -> (Vec<T>, Vec<T>, Vec<T>) {
- let set1: HashSet<_> = vec1.par_iter().cloned().collect();
- let set2: HashSet<_> = vec2.par_iter().cloned().collect();
- let common: Vec<T> = set1
- .par_iter()
- .filter(|item| set2.contains(item))
- .cloned()
- .collect();
- let only_in_first: Vec<T> = set1
- .par_iter()
- .filter(|item| !set2.contains(item))
- .cloned()
- .collect();
- let only_in_second: Vec<T> = set2
- .par_iter()
- .filter(|item| !set1.contains(item))
- .cloned()
- .collect();
- (common, only_in_first, only_in_second)
- }
|