| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712 |
- use crate::{
- annotation::Annotations,
- collection::ShouldRun,
- helpers::{estimate_shannon_entropy, mean, Hash128},
- positions::{GenomePosition, GetGenomePosition, VcfPosition},
- runners::Run,
- variant::variant_collection::VariantCollection,
- };
- use anyhow::{anyhow, Context};
- use bitcode::{Decode, Encode};
- use log::{error, info};
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- use std::{
- cmp::Ordering,
- collections::{BTreeSet, HashSet},
- fmt,
- hash::Hash,
- str::FromStr,
- };
- /// Represents a variant in the Variant Call Format (VCF).
- #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
- pub struct VcfVariant {
- /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
- pub hash: Hash128,
- /// The genomic position of the variant.
- pub position: GenomePosition,
- /// The identifier of the variant.
- pub id: String,
- /// The reference allele.
- pub reference: ReferenceAlternative,
- /// The alternative allele.
- pub alternative: ReferenceAlternative,
- /// The quality score of the variant call, if available.
- pub quality: Option<f32>,
- /// The filter status of the variant.
- pub filter: Filter,
- /// Additional information about the variant.
- pub infos: Infos,
- /// Genotype information and other sample-specific data.
- pub formats: Formats,
- }
- impl PartialEq for VcfVariant {
- /// Compares two VcfVariants for equality.
- ///
- /// Note: This comparison only considers position, reference, and alternative.
- /// It intentionally ignores id, filter, info, format, and quality.
- fn eq(&self, other: &Self) -> bool {
- // Nota bene: id, filter, info, format and quality is intentionally not compared
- self.position == other.position
- && self.reference == other.reference
- && self.alternative == other.alternative
- }
- }
- impl Eq for VcfVariant {}
- impl FromStr for VcfVariant {
- type Err = anyhow::Error;
- /// Parses a VcfVariant from a string representation.
- ///
- /// The input string is expected to be a tab-separated VCF line.
- ///
- /// # Errors
- ///
- /// Returns an error if parsing fails for any field.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let v: Vec<&str> = s.split('\t').collect();
- let vcf_position: VcfPosition = (
- *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
- *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
- )
- .try_into()
- .context(format!("Can't parse position from: {s}"))?;
- let formats = if v.len() >= 10 {
- (
- *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
- *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
- )
- .try_into()
- .context(format!("Can't parse formats from: {s}"))?
- } else {
- Formats::default()
- };
- let position: GenomePosition = vcf_position.into();
- let reference: ReferenceAlternative = v
- .get(3)
- .ok_or(anyhow!("Can't parse reference from: {s}"))?
- .parse()
- .context(format!("Can't parse reference from: {s}"))?;
- let alternative: ReferenceAlternative = v
- .get(4)
- .ok_or(anyhow!("Can't parse alternative from: {s}"))?
- .parse()
- .context(format!("Can't parse alternative from: {s}"))?;
- // Blake3 128 bytes Hash
- let mut hasher = blake3::Hasher::new();
- hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
- hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
- hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
- hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
- let hash = hasher.finalize();
- let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
- Ok(Self {
- hash,
- position,
- id: v
- .get(2)
- .ok_or(anyhow!("Can't parse id from: {s}"))?
- .to_string(),
- reference,
- alternative,
- quality: v
- .get(5)
- .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
- .unwrap_or(None),
- filter: v
- .get(6)
- .ok_or(anyhow!("Can't parse filter from: {s}"))?
- .parse()
- .context(format!("Can't parse filter from: {s}"))?,
- infos: v
- .get(7)
- .ok_or(anyhow!("Can't parse infos from: {s}"))?
- .parse()
- .context(format!("Can't parse infos from: {s}"))?,
- formats,
- })
- }
- }
- // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
- impl VcfVariant {
- /// Converts the VcfVariant into a VCF-formatted row string.
- ///
- /// This method creates a tab-separated string representation of the variant,
- /// suitable for writing to a VCF file.
- pub fn into_vcf_row(&self) -> String {
- let vcf_position: VcfPosition = self.position.clone().into();
- let (contig, position) = vcf_position.into();
- let mut columns = vec![
- contig,
- position,
- self.id.to_string(),
- self.reference.to_string(),
- self.alternative.to_string(),
- self.quality
- .map(|v| v.to_string())
- .unwrap_or(".".to_string()),
- self.filter.to_string(),
- self.infos.to_string(),
- ];
- if !self.formats.0.is_empty() {
- let (format, values) = self.formats.clone().into();
- columns.push(format);
- columns.push(values);
- }
- columns.join("\t")
- }
- /// Returns the hash of the variant.
- pub fn hash(&self) -> Hash128 {
- self.hash
- }
- /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
- ///
- /// This method generates a new variant with shared properties, resetting some fields
- /// to default or empty values.
- pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
- VcfVariant {
- hash: self.hash,
- position: self.position.clone(),
- id: self.id.clone(),
- reference: self.reference.clone(),
- alternative: self.alternative.clone(),
- quality: self.quality,
- filter: Filter::Other(".".to_string()),
- infos: Infos(vec![Info::Empty]),
- formats: self.formats.commun_deepvariant_clairs(),
- }
- }
- /// Checks if the variant has an SVTYPE info field.
- ///
- /// Returns true if the variant contains structural variation type information.
- pub fn has_svtype(&self) -> bool {
- self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
- }
- pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
- let r = self.formats.n_alt_depth();
- if r.is_some() {
- r
- } else {
- self.infos.n_alt_depth()
- }
- }
- /// Retrieves the structural variation type of the variant, if present.
- ///
- /// Returns Some(SVType) if the variant has an SVTYPE info field,
- pub fn svtype(&self) -> Option<SVType> {
- self.infos.0.iter().find_map(|e| {
- if let Info::SVTYPE(sv_type) = e {
- Some(sv_type.clone())
- } else {
- None
- }
- })
- }
- /// Determines the alteration category of the variant.
- ///
- /// This method analyzes the reference and alternative alleles to classify
- /// the variant into one of several alteration categories:
- /// - SNV (Single Nucleotide Variant)
- /// - INS (Insertion)
- /// - DEL (Deletion)
- /// - Other (including structural variants and complex alterations)
- ///
- /// The classification is based on the following rules:
- /// 1. If both reference and alternative are single nucleotides, it's an SNV.
- /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
- /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
- /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
- /// 5. If none of the above apply, it checks for structural variant types.
- /// 6. If no structural variant type is found, it's classified as "Other".
- ///
- /// # Returns
- /// An `AlterationCategory` enum representing the type of alteration.
- pub fn alteration_category(&self) -> AlterationCategory {
- match (&self.reference, &self.alternative) {
- (ReferenceAlternative::Nucleotide(a), ReferenceAlternative::Nucleotide(b))
- if *a != Base::N && *b != Base::N =>
- {
- AlterationCategory::SNV
- }
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
- AlterationCategory::INS
- }
- (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::DEL
- }
- (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
- if a.len() < b.len() =>
- {
- AlterationCategory::INS
- }
- (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
- if a.len() > b.len() =>
- {
- AlterationCategory::DEL
- }
- _ => match self.svtype() {
- Some(sv_type) => {
- if let Ok(bnd_desc) = self.bnd_desc() {
- if bnd_desc.a_contig != bnd_desc.b_contig {
- AlterationCategory::TRL
- } else if bnd_desc.a_sens != bnd_desc.b_sens {
- AlterationCategory::DELINV
- } else {
- AlterationCategory::DEL
- }
- } else {
- AlterationCategory::from(sv_type)
- }
- }
- None => AlterationCategory::Other,
- },
- }
- }
- pub fn inserted_seq(&self) -> Option<String> {
- if self.alteration_category() != AlterationCategory::INS {
- return None;
- }
- if let Some(ins) = self.infos.0.iter().find_map(|e| match e {
- Info::SVINSSEQ(ins) => Some(ins.to_string()),
- _ => None,
- }) {
- return Some(ins);
- }
- match &self.alternative {
- ReferenceAlternative::Nucleotides(nt) => nt
- .get(1..)
- .map(|slice| slice.iter().map(ToString::to_string).collect()),
- _ => None,
- }
- }
- /// Parses and constructs a BND (breakend) description from the alternative string.
- ///
- /// This function interprets the BND notation in the alternative string and creates
- /// a `BNDDesc` struct containing detailed information about the breakend.
- ///
- /// # Returns
- /// - `Ok(BNDDesc)` if parsing is successful
- /// - `Err` if parsing fails or if the alteration is not a BND
- ///
- /// # Errors
- /// This function will return an error if:
- /// - The alteration category is not BND
- pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
- let alt = self.alternative.to_string();
- BNDDesc::from_vcf_breakend(&self.position().contig(), self.position.position + 1, &alt)
- .context(format!("The alteration is not BND: {alt}"))
- }
- /// Returns the length of the deletion if the variant is a deletion (`DEL`).
- pub fn deletion_len(&self) -> Option<u32> {
- if self.alteration_category() != AlterationCategory::DEL {
- return None;
- }
- // Try using the END field
- if let Some(len) = self.infos.0.iter().find_map(|i| {
- if let Info::END(end) = i {
- Some(end.saturating_sub(self.position.position))
- } else {
- None
- }
- }) {
- return Some(len);
- }
- // Fallback to SVLEN field
- if let Some(len) = self.infos.0.iter().find_map(|i| {
- if let Info::SVLEN(len) = i {
- Some(len.unsigned_abs())
- } else {
- None
- }
- }) {
- return Some(len);
- }
- match self.bnd_desc() {
- Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => {
- if bnd_desc.a_position < bnd_desc.b_position {
- return Some(bnd_desc.b_position - bnd_desc.a_position);
- } else {
- return Some(bnd_desc.a_position - bnd_desc.b_position);
- }
- }
- _ => (),
- }
- // Final fallback: infer from reference and alternative
- if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
- (&self.reference, &self.alternative)
- {
- return Some(nt.len().saturating_sub(1) as u32);
- }
- if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
- (&self.reference, &self.alternative)
- {
- if bnt.len() < nt.len() {
- return Some(nt.len().saturating_sub(bnt.len()) as u32);
- }
- }
- None
- }
- pub fn deletion_seq(&self) -> Option<String> {
- if self.alteration_category() != AlterationCategory::DEL {
- return None;
- }
- // infer from reference and alternative
- if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
- (&self.reference, &self.alternative)
- {
- return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
- }
- if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
- (&self.reference, &self.alternative)
- {
- if bnt.len() < nt.len() {
- return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
- }
- }
- None
- }
- pub fn deletion_desc(&self) -> Option<DeletionDesc> {
- match self.bnd_desc() {
- Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => {
- if bnd_desc.a_position < bnd_desc.b_position {
- return Some(DeletionDesc {
- contig: bnd_desc.a_contig,
- start: bnd_desc.a_position,
- end: bnd_desc.b_position,
- });
- } else {
- return Some(DeletionDesc {
- contig: bnd_desc.a_contig,
- start: bnd_desc.b_position,
- end: bnd_desc.a_position,
- });
- }
- }
- _ => (),
- }
- self.deletion_len().map(|len| DeletionDesc {
- contig: self.position.contig(),
- start: self.position.position + 2,
- end: (self.position.position + 1)
- .checked_add(len)
- .unwrap_or(self.position.position + 2),
- })
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub struct BNDDesc {
- /// Coordinates of the **5′ end** (A)
- pub a_contig: String,
- pub a_position: u32,
- pub a_sens: bool,
- /// Coordinates of the **3′ end** (B)
- pub b_contig: String,
- pub b_position: u32,
- pub b_sens: bool,
- /// Inserted nucleotides at the junction (may be empty)
- pub added_nt: String,
- }
- pub trait GroupByThreshold {
- fn group_by(&self, threshold: u32) -> Vec<Vec<BNDDesc>>;
- }
- impl GroupByThreshold for Vec<BNDDesc> {
- fn group_by(&self, threshold: u32) -> Vec<Vec<BNDDesc>> {
- let mut sorted_vec = self.clone();
- sorted_vec.sort_by(|a, b| {
- (&a.a_contig, a.a_sens, a.a_position).cmp(&(&b.a_contig, b.a_sens, b.a_position))
- });
- let mut grouped: Vec<Vec<BNDDesc>> = Vec::new();
- for desc in sorted_vec {
- if let Some(last_group) = grouped.last_mut() {
- if let Some(last_desc) = last_group.last() {
- if last_desc.a_contig == desc.a_contig
- && last_desc.a_sens == desc.a_sens
- && (desc.a_position as i64 - last_desc.a_position as i64).abs()
- <= threshold as i64
- {
- last_group.push(desc);
- continue;
- }
- }
- }
- grouped.push(vec![desc]);
- }
- grouped
- }
- }
- impl BNDDesc {
- /// Construct a `BNDDesc` from VCF `CHROM`, `POS`, and break‑end `ALT` string.
- /// Supports the four VCF break‑end ALT forms:
- ///
- /// 1) `t[p[` → A(5′)=t flank, B(3′)=p flank (+ + )
- /// 2) `t]p]` → A(5′)=t flank, B(3′)=p flank but on reverse (+ -)
- /// 3) `]p]t` → A(5′)=p flank, B(3′)=t flank (- -)
- /// 4) `[p[t` → A(5′)=p flank, B(3′)=t flank with both on reverse (- +)
- ///
- /// Here `t` is the local flank sequence, `p` is the remote contig:pos.
- pub fn from_vcf_breakend(t_chrom: &str, t_pos: u32, alt: &str) -> Option<Self> {
- // locate first bracket
- let (open, bracket) = alt.char_indices().find(|&(_, c)| c == '[' || c == ']')?;
- // find matching bracket
- let close = alt[open + 1..].find(bracket)? + open + 1;
- // parse remote contig:pos between brackets
- let addr = &alt[open + 1..close];
- let mut parts = addr.splitn(2, ':');
- let p_contig = parts.next()?;
- let p_position: u32 = parts.next()?.parse().ok()?;
- // inserted sequence
- let seq_before = &alt[..open];
- let seq_after = &alt[close + 1..];
- let mut added_nt = String::new();
- added_nt.push_str(seq_before);
- added_nt.push_str(seq_after);
- added_nt.remove(0);
- // determine form and orientation
- // form 1 & 2: bracket after t (open>0): t before p
- // form 3 & 4: bracket before t (open==0): p before t
- let after_local = open > 0;
- let (t_sens, p_sens) = match (bracket, after_local) {
- ('[', true) => (true, true), // form 1 t[p[
- (']', true) => (true, false), // form 2 t]p]
- (']', false) => (true, true), // form 3 ]p]t
- ('[', false) => (true, false), // form 4 [p[t
- _ => return None,
- };
- // assign A/B depending on form
- let (a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt) = if after_local
- {
- (
- t_chrom.into(),
- t_pos,
- t_sens,
- p_contig.into(),
- p_position,
- p_sens,
- added_nt,
- )
- } else {
- (
- p_contig.into(),
- p_position,
- p_sens,
- t_chrom.into(),
- t_pos,
- t_sens,
- reverse_complement(&added_nt),
- )
- };
- Some(Self {
- a_contig,
- a_position,
- a_sens,
- b_contig,
- b_position,
- b_sens,
- added_nt,
- })
- }
- pub fn rc(&self) -> Self {
- Self {
- a_contig: self.b_contig.clone(),
- a_position: self.b_position,
- a_sens: !self.b_sens,
- b_contig: self.a_contig.clone(),
- b_position: self.a_position,
- b_sens: !self.a_sens,
- // TODO change seq to RC nd should find base on fa
- added_nt: reverse_complement(&self.added_nt),
- }
- }
- }
- pub fn reverse_complement(dna: &str) -> String {
- fn complement(base: char) -> char {
- match base {
- 'A' => 'T',
- 'T' => 'A',
- 'C' => 'G',
- 'G' => 'C',
- 'a' => 't',
- 't' => 'a',
- 'c' => 'g',
- 'g' => 'c',
- 'N' => 'N',
- 'n' => 'n',
- _ => 'N',
- }
- }
- dna.chars().rev().map(complement).collect()
- }
- impl core::fmt::Display for BNDDesc {
- /// Compact textual form: `(A:contig:pos<arrow>;B:contig:pos<arrow>;ins=...)`.
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- fn arrow(b: bool) -> &'static str {
- if b {
- "->"
- } else {
- "<-"
- }
- }
- write!(
- f,
- "({}{}:{}::{}::{}:{}{})",
- arrow(self.a_sens),
- self.a_contig,
- self.a_position,
- self.added_nt,
- self.b_contig,
- self.b_position,
- arrow(self.b_sens),
- )
- }
- }
- use petgraph::graph::{DiGraph, NodeIndex};
- use petgraph::visit::{IntoNodeIdentifiers, NodeIndexable};
- use petgraph::Direction;
- /// Wrapper around `petgraph::DiGraph` with `BNDDesc` nodes.
- pub struct BNDGraph<E = ()> {
- graph: DiGraph<BNDDesc, E>,
- }
- impl<E> Default for BNDGraph<E> {
- fn default() -> Self {
- Self {
- graph: DiGraph::new(),
- }
- }
- }
- impl<E> BNDGraph<E> {
- pub fn new() -> Self {
- Self::default()
- }
- pub fn add_breakpoint(&mut self, desc: BNDDesc) -> NodeIndex {
- self.graph.add_node(desc)
- }
- pub fn successors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
- self.graph.neighbors_directed(n, Direction::Outgoing)
- }
- pub fn predecessors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
- self.graph.neighbors_directed(n, Direction::Incoming)
- }
- pub fn inner(&self) -> &DiGraph<BNDDesc, E> {
- &self.graph
- }
- /// Stringify all nodes
- pub fn nodes_as_strings(&self) -> Vec<String> {
- self.graph.node_weights().map(|n| n.to_string()).collect()
- }
- }
- #[allow(clippy::collapsible_if)]
- impl<E: Default> BNDGraph<E> {
- /// Build edges following downstream 5′→3′ logic for two patterns:
- ///
- /// **Pattern 1** (B → A, *u precedes v*):
- ///
- /// * forward (→) `u.b ≤ v.a` → edge **u → v**
- /// * reverse (←) `u.b ≥ v.a` → edge **v → u**
- ///
- /// **Pattern 2** (A → B, *v precedes u*):
- ///
- /// * forward (→) `u.a ≥ v.b` → edge **v → u**
- /// * reverse (←) `u.a ≤ v.b` → edge **u → v**
- pub fn auto_connect(&mut self)
- where
- E: Default,
- {
- let nodes: Vec<NodeIndex> = self.graph.node_indices().collect();
- let mut pending: Vec<(NodeIndex, NodeIndex)> = Vec::new();
- for &u_idx in &nodes {
- for &v_idx in &nodes {
- if u_idx == v_idx {
- continue;
- }
- let u = &self.graph[u_idx];
- let v = &self.graph[v_idx];
- // Pattern 1: B(u) -> A(v) (u precedes v)
- if u.b_contig == v.a_contig && u.b_sens == v.a_sens {
- if (u.b_sens && u.b_position <= v.a_position)
- || (!u.b_sens && u.b_position >= v.a_position)
- {
- pending.push((u_idx, v_idx));
- }
- }
- // Pattern 2: A(u) -> B(v) (v precedes u)
- if u.a_contig == v.b_contig && u.a_sens == v.b_sens {
- if (u.a_sens && u.a_position >= v.b_position)
- || (!u.a_sens && u.a_position <= v.b_position)
- {
- // Edge direction depends on strand logic
- let (src, dst) = if u.a_sens {
- // forward ⇒ edge v -> u
- (v_idx, u_idx)
- } else {
- // reverse ⇒ edge u -> v
- (u_idx, v_idx)
- };
- pending.push((src, dst));
- }
- }
- }
- }
- // second pass – insert, skipping duplicates
- for (src, dst) in pending {
- if self.graph.find_edge(src, dst).is_none() {
- self.graph.add_edge(src, dst, E::default());
- }
- }
- }
- /// Add an edge if absent.
- pub fn add_edge_if_absent(&mut self, src: NodeIndex, dst: NodeIndex) {
- if self.graph.find_edge(src, dst).is_none() {
- self.graph.add_edge(src, dst, E::default());
- }
- }
- /// Pretty‑print a sequence of nodes indicating **actual traversal
- /// direction** between successive nodes:
- /// * `→` when the graph contains an edge from *previous* → *current*.
- /// * `←` when the edge exists from *current* → *previous* (path goes
- /// "against" the stored direction).
- ///
- /// If neither directional edge is present the placeholder `--` is used so
- /// debugging clearly shows missing links.
- pub fn fmt_path(&self, path: &[NodeIndex]) -> String {
- use core::fmt::Write as _;
- let mut out = String::new();
- if let Some((&first, rest)) = path.split_first() {
- let _ = write!(out, "{}", &self.graph[first]);
- let mut prev = first;
- for &curr in rest {
- let arrow = if self.graph.find_edge(prev, curr).is_some() {
- " → "
- } else if self.graph.find_edge(curr, prev).is_some() {
- " ← "
- } else {
- " -- " // unexpected: no direct edge
- };
- out.push_str(arrow);
- let _ = write!(out, "{}", &self.graph[curr]);
- prev = curr;
- }
- }
- out
- }
- // ------------------------------------------------------------------
- // Traversal utilities
- // ------------------------------------------------------------------
- /// Return a directed path that visits **every node exactly once** (Hamiltonian
- /// path) if one exists. Uses naive DFS back-tracking which is exponential—
- /// fine for small graphs (<15 nodes) but aborts early otherwise.
- pub fn hamiltonian_path(&self) -> Option<Vec<NodeIndex>> {
- let n = self.graph.node_count();
- if n == 0 {
- return Some(Vec::new());
- }
- if n > 15 {
- // avoid combinatorial explosion
- return None;
- }
- let mut path = Vec::with_capacity(n);
- let mut visited = vec![false; self.graph.node_bound()];
- for start in self.graph.node_identifiers() {
- path.push(start);
- visited[start.index()] = true;
- if self.dfs_hamiltonian(start, &mut visited, &mut path, n) {
- return Some(path);
- }
- visited[start.index()] = false;
- path.clear();
- }
- None
- }
- /**
- Recursive helper for [`hamiltonian_path`](#method.hamiltonian_path).
- * `current` – node we are currently expanding from.
- * `visited` – mutable bitmap indicating which nodes are already on `path`.
- * `path` – growing list of nodes (last element is always `current`).
- * `target` – desired final length (= `graph.node_count()`).
- The function explores each outgoing neighbour that hasn’t been visited yet,
- marking it **visited → recurse → unvisit** (classic DFS back-tracking). It
- returns `true` as soon as a full-length path is discovered, which bubbles
- up through the recursion to terminate the search early.
- */
- fn dfs_hamiltonian(
- &self,
- current: NodeIndex,
- visited: &mut [bool],
- path: &mut Vec<NodeIndex>,
- target: usize,
- ) -> bool {
- // Base‑case: reached required length → success.
- if path.len() == target {
- return true;
- }
- // Explore every outgoing neighbour.
- for next in self.graph.neighbors(current) {
- if !visited[next.index()] {
- // choose
- visited[next.index()] = true;
- path.push(next);
- // explore
- if self.dfs_hamiltonian(next, visited, path, target) {
- return true;
- }
- // un-choose (back-track)
- path.pop();
- visited[next.index()] = false;
- }
- }
- false
- }
- /// Weakly-connected components sorted descending by size (no `Clone` bound).
- pub fn components_by_size(&self) -> Vec<Vec<NodeIndex>> {
- let mut visited = vec![false; self.graph.node_bound()];
- let mut comps: Vec<Vec<NodeIndex>> = Vec::new();
- for start in self.graph.node_indices() {
- if visited[start.index()] {
- continue;
- }
- // DFS over undirected neighbours
- let mut stack = vec![start];
- let mut comp = Vec::new();
- visited[start.index()] = true;
- while let Some(n) = stack.pop() {
- comp.push(n);
- for neigh in self.graph.neighbors_undirected(n) {
- if !visited[neigh.index()] {
- visited[neigh.index()] = true;
- stack.push(neigh);
- }
- }
- }
- comps.push(comp);
- }
- comps.sort_by_key(|c| std::cmp::Reverse(c.len()));
- comps
- }
- /// Convenience wrapper: try to return a Hamiltonian path; if impossible,
- /// return all weakly‑connected subgraphs ordered by size.
- pub fn path_or_components(&self) -> Result<Vec<NodeIndex>, Vec<Vec<NodeIndex>>> {
- if let Some(p) = self.hamiltonian_path() {
- Ok(p)
- } else {
- Err(self.components_by_size())
- }
- }
- }
- pub trait ToBNDGraph<E = ()> {
- /// Consume the vector and return a `BNDGraph` whose edges are created with
- /// `auto_connect()`.
- fn to_bnd_graph(self) -> BNDGraph<E>
- where
- E: Default;
- }
- impl<E: Default> ToBNDGraph<E> for Vec<BNDDesc> {
- fn to_bnd_graph(self) -> BNDGraph<E> {
- let mut g = BNDGraph::<E>::new();
- for b in self {
- g.add_breakpoint(b);
- }
- g.auto_connect();
- g
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
- pub struct DeletionDesc {
- pub contig: String,
- pub start: u32, // 1-based [)
- pub end: u32,
- }
- impl fmt::Display for DeletionDesc {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "{}:{}_{}_del", self.contig, self.start, self.end)
- }
- }
- impl DeletionDesc {
- pub fn len(&self) -> u32 {
- self.end.saturating_sub(self.start) + 1
- }
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
- pub enum AlterationCategory {
- SNV,
- DEL,
- INS,
- DUP,
- INV,
- CNV,
- TRL,
- BND,
- DELINV,
- Other,
- }
- impl fmt::Display for AlterationCategory {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}",
- match self {
- AlterationCategory::SNV => "SNV",
- AlterationCategory::DEL => "DEL",
- AlterationCategory::INS => "INS",
- AlterationCategory::DUP => "DUP",
- AlterationCategory::INV => "INV",
- AlterationCategory::CNV => "CNV",
- AlterationCategory::BND => "BND",
- AlterationCategory::TRL => "TRL",
- AlterationCategory::Other => "Other",
- AlterationCategory::DELINV => "DELINV",
- }
- )
- }
- }
- impl From<SVType> for AlterationCategory {
- fn from(sv_type: SVType) -> Self {
- match sv_type {
- SVType::DEL => AlterationCategory::DEL,
- SVType::INS => AlterationCategory::INS,
- SVType::DUP => AlterationCategory::DUP,
- SVType::INV => AlterationCategory::INV,
- SVType::CNV => AlterationCategory::CNV,
- SVType::BND => AlterationCategory::BND,
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
- pub enum SVType {
- DEL,
- INS,
- DUP,
- INV,
- CNV,
- BND,
- }
- impl FromStr for SVType {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "DEL" => Ok(SVType::DEL),
- "INS" => Ok(SVType::INS),
- "DUP" => Ok(SVType::DUP),
- "INV" => Ok(SVType::INV),
- "CNV" => Ok(SVType::CNV),
- "BND" => Ok(SVType::BND),
- _ => Err(anyhow!("Can't parse SVTYPE={s}")),
- }
- }
- }
- impl fmt::Display for SVType {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(
- f,
- "{}",
- match self {
- SVType::DEL => "DEL",
- SVType::INS => "INS",
- SVType::DUP => "DUP",
- SVType::INV => "INV",
- SVType::CNV => "CNV",
- SVType::BND => "BND",
- }
- )
- }
- }
- impl VariantId for VcfVariant {
- fn variant_id(&self) -> String {
- format!(
- "{}:{}_{}_{}",
- self.position.contig(),
- self.position.position + 1,
- self.reference,
- self.alternative
- )
- }
- }
- impl GetGenomePosition for VcfVariant {
- fn position(&self) -> &GenomePosition {
- &self.position
- }
- }
- impl PartialOrd for VcfVariant {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
- }
- impl Ord for VcfVariant {
- fn cmp(&self, other: &Self) -> Ordering {
- self.position.cmp(&other.position)
- }
- }
- /// A container for a list of VCF `INFO` fields.
- ///
- /// Represents a parsed set of key-value annotations or flags found in the INFO column.
- ///
- /// # Example
- /// ```
- /// use your_crate::Infos;
- /// use std::str::FromStr;
- ///
- /// let infos = Infos::from_str("SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15").unwrap();
- /// println!("{}", infos); // Displays: SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15
- /// ```
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
- pub struct Infos(pub Vec<Info>);
- impl FromStr for Infos {
- type Err = anyhow::Error;
- /// Parses a semicolon-separated list of INFO fields from a VCF record.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- Ok(Self(
- s.split(";")
- .map(Info::from_str)
- .collect::<Result<Vec<Info>, _>>()
- .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
- ))
- }
- }
- impl fmt::Display for Infos {
- /// Formats the `Infos` as a semicolon-separated VCF-style INFO string.
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- let items: Vec<_> = self
- .0
- .iter()
- .filter(|info| !matches!(info, Info::Empty))
- .map(ToString::to_string)
- .collect();
- if items.is_empty() {
- write!(f, ".")
- } else {
- write!(f, "{}", items.join(";"))
- }
- }
- }
- impl Infos {
- pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
- use Info::*;
- let mut tumor_alt: Option<u32> = None;
- let mut tumor_depth: Option<u32> = None;
- for info in self.0.iter() {
- match info {
- TUMOUR_DP_AT(v) => {
- let m = mean(v);
- tumor_depth = Some(m.round() as u32);
- }
- TUMOUR_READ_SUPPORT(v) => {
- tumor_alt = Some(*v);
- }
- _ => (),
- }
- }
- match (tumor_alt, tumor_depth) {
- (Some(a), Some(d)) => Some((a, d)),
- _ => None,
- }
- }
- }
- /// Enum representing a single INFO field in a VCF record.
- ///
- /// Supports both standard fields and Severus-specific structural variant annotations.
- /// Handles string values, numeric values, vectors, and flags.
- ///
- /// Variants with `Vec<_>` represent fields with multiple comma-separated values.
- #[allow(non_camel_case_types)]
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
- pub enum Info {
- Empty,
- H,
- F,
- P,
- FAU(u32),
- FCU(u32),
- FGU(u32),
- FTU(u32),
- RAU(u32),
- RCU(u32),
- RGU(u32),
- RTU(u32),
- SVTYPE(SVType),
- MATEID(String),
- NORMAL_READ_SUPPORT(u32),
- TUMOUR_READ_SUPPORT(u32),
- NORMAL_ALN_SUPPORT(u32),
- TUMOUR_ALN_SUPPORT(u32),
- SVLEN(i32),
- TUMOUR_DP_BEFORE(Vec<u32>),
- TUMOUR_DP_AT(Vec<u32>),
- TUMOUR_DP_AFTER(Vec<u32>),
- NORMAL_DP_BEFORE(Vec<u32>),
- NORMAL_DP_AT(Vec<u32>),
- NORMAL_DP_AFTER(Vec<u32>),
- TUMOUR_AF(Vec<f32>),
- NORMAL_AF(Vec<f32>),
- BP_NOTATION(String),
- SOURCE(String),
- CLUSTERED_READS_TUMOUR(u32),
- CLUSTERED_READS_NORMAL(u32),
- TUMOUR_ALT_HP(Vec<u32>),
- TUMOUR_PS(Vec<String>),
- NORMAL_ALT_HP(Vec<u32>),
- NORMAL_PS(Vec<String>),
- TUMOUR_TOTAL_HP_AT(Vec<u32>),
- NORMAL_TOTAL_HP_AT(Vec<u32>),
- ORIGIN_STARTS_STD_DEV(f32),
- ORIGIN_MAPQ_MEAN(f32),
- ORIGIN_EVENT_SIZE_STD_DEV(f32),
- ORIGIN_EVENT_SIZE_MEDIAN(f32),
- ORIGIN_EVENT_SIZE_MEAN(f32),
- END_STARTS_STD_DEV(f32),
- END_MAPQ_MEAN(f32),
- END_EVENT_SIZE_STD_DEV(f32),
- END_EVENT_SIZE_MEDIAN(f32),
- END_EVENT_SIZE_MEAN(f32),
- CLASS(String),
- END(u32),
- SVINSLEN(u32),
- SVINSSEQ(String),
- // Severus
- PRECISE,
- IMPRECISE,
- STRANDS(String),
- DETAILED_TYPE(String),
- INSLEN(i32),
- MAPQ(u32),
- PHASESETID(String),
- HP(u32),
- CLUSTERID(String),
- INSSEQ(String),
- MATE_ID(String),
- INSIDE_VNTR(String),
- ALINGED_POS(String),
- }
- impl FromStr for Info {
- type Err = anyhow::Error;
- /// Parses a single `INFO` key or key=value string into a typed `Info` variant.
- ///
- /// Handles both presence/absence flags and key-value fields
- fn from_str(s: &str) -> anyhow::Result<Self> {
- if s.contains('=') {
- let (key, value) = s
- .split_once('=')
- .context(format!("Can't split with `=` in string: {s}"))?;
- Ok(match key {
- "FAU" => Info::FAU(parse_value(value, key)?),
- "FCU" => Info::FCU(parse_value(value, key)?),
- "FGU" => Info::FGU(parse_value(value, key)?),
- "FTU" => Info::FTU(parse_value(value, key)?),
- "RAU" => Info::RAU(parse_value(value, key)?),
- "RCU" => Info::RCU(parse_value(value, key)?),
- "RGU" => Info::RGU(parse_value(value, key)?),
- "RTU" => Info::RTU(parse_value(value, key)?),
- "SVLEN" => Info::SVLEN(parse_value(value, key)?),
- "END" => Info::END(parse_value(value, key)?),
- "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
- "SVTYPE" => Info::SVTYPE(value.parse()?),
- "MATEID" => Info::MATEID(value.to_string()),
- "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
- "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
- "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
- "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
- "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
- "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
- "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
- "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
- "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
- "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
- "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
- "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
- "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
- "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
- "SOURCE" => Info::SOURCE(value.to_string()),
- "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
- "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
- "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
- "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
- "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
- "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
- "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
- "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
- "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
- "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
- "ORIGIN_EVENT_SIZE_STD_DEV" => {
- Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
- }
- "ORIGIN_EVENT_SIZE_MEDIAN" => {
- Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
- }
- "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
- "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
- "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
- "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
- "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
- "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
- "CLASS" => Info::CLASS(value.to_string()),
- "PRECISE" => Info::PRECISE,
- "IMPRECISE" => Info::IMPRECISE,
- "STRANDS" => Info::STRANDS(value.to_string()),
- "DETAILED_TYPE" => Info::DETAILED_TYPE(value.to_string()),
- "INSLEN" => Info::INSLEN(parse_value(value, key)?),
- "MAPQ" => Info::MAPQ(parse_value(value, key)?),
- "PHASESETID" => Info::PHASESETID(value.to_string()),
- "HP" => Info::HP(parse_value(value, key)?),
- "CLUSTERID" => Info::CLUSTERID(value.to_string()),
- "INSSEQ" => Info::INSSEQ(value.to_string()),
- "MATE_ID" => Info::MATE_ID(value.to_string()),
- "INSIDE_VNTR" => Info::INSIDE_VNTR(value.to_string()),
- "ALINGED_POS" => Info::ALINGED_POS(value.to_string()),
- _ => Info::Empty,
- })
- } else {
- Ok(match s {
- "H" => Info::H,
- "F" => Info::F,
- "P" => Info::P,
- "PRECISE" => Info::PRECISE,
- "IMPRECISE" => Info::IMPRECISE,
- _ => Info::Empty,
- })
- }
- }
- }
- impl fmt::Display for Info {
- /// Converts the `Info` enum into a VCF-compliant string (key=value or flag).
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match self {
- Info::Empty => write!(f, "."),
- Info::H => write!(f, "H"),
- Info::F => write!(f, "F"),
- Info::P => write!(f, "P"),
- // ClairS
- Info::FAU(v) => write!(f, "FAU={v}"),
- Info::FCU(v) => write!(f, "FCU={v}"),
- Info::FGU(v) => write!(f, "FGU={v}"),
- Info::FTU(v) => write!(f, "FTU={v}"),
- Info::RAU(v) => write!(f, "RAU={v}"),
- Info::RCU(v) => write!(f, "RCU={v}"),
- Info::RGU(v) => write!(f, "RGU={v}"),
- Info::RTU(v) => write!(f, "RTU={v}"),
- // Nanomonsv
- Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
- Info::SVLEN(v) => write!(f, "SVLEN={v}"),
- Info::END(v) => write!(f, "END={v}"),
- Info::MATEID(v) => write!(f, "MATEID={v}"),
- Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
- Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
- // SAVANA
- Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
- Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
- Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
- Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
- Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
- Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
- Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
- Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
- Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
- Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
- Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
- Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
- Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
- Info::SOURCE(v) => write!(f, "SOURCE={v}"),
- Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
- Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
- Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
- Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
- Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
- Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
- Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
- Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
- Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
- Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
- Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
- Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
- Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
- Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
- Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
- Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
- Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
- Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
- Info::CLASS(v) => write!(f, "CLASS={v}"),
- // Severus
- Info::PRECISE => write!(f, "PRECISE"),
- Info::IMPRECISE => write!(f, "IMPRECISE"),
- Info::STRANDS(v) => write!(f, "STRANDS={v}"),
- Info::DETAILED_TYPE(v) => write!(f, "DETAILED_TYPE={v}"),
- Info::INSLEN(v) => write!(f, "INSLEN={v}"),
- Info::MAPQ(v) => write!(f, "MAPQ={v}"),
- Info::PHASESETID(v) => write!(f, "PHASESETID={v}"),
- Info::HP(v) => write!(f, "HP={v}"),
- Info::CLUSTERID(v) => write!(f, "CLUSTERID={v}"),
- Info::INSSEQ(v) => write!(f, "INSSEQ={v}"),
- Info::MATE_ID(v) => write!(f, "MATE_ID={v}"),
- Info::INSIDE_VNTR(v) => write!(f, "INSIDE_VNTR={v}"),
- Info::ALINGED_POS(v) => write!(f, "ALINGED_POS={v}"),
- }
- }
- }
- pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
- v.iter()
- .map(|n| n.to_string())
- .collect::<Vec<String>>()
- .join(",")
- }
- impl Info {
- /// Returns the VCF key name (e.g. "DP", "SVTYPE", "PRECISE") for this INFO field.
- ///
- /// This uses the string representation (`Display`) and extracts the part before `=`,
- /// which is valid for both key=value and flag-only entries.
- pub fn key(&self) -> String {
- self.to_string()
- .split('=')
- .next()
- .unwrap_or(".")
- .to_string()
- }
- /// Returns the complete set of known VCF `INFO` header definitions used by `Info` variants.
- ///
- /// # Example
- /// ```
- /// let headers = Info::header_definitions();
- /// for line in headers {
- /// println!("{line}");
- /// }
- /// ```
- pub fn header_definitions() -> BTreeSet<String> {
- let mut set = BTreeSet::new();
- macro_rules! push {
- ($id:expr, $num:expr, $typ:expr, $desc:expr) => {
- set.insert(format!(
- r#"##INFO=<ID={},Number={},Type={},Description="{}">"#,
- $id, $num, $typ, $desc
- ));
- };
- }
- // Flags
- push!("H", 0, "Flag", "H flag");
- push!("F", 0, "Flag", "F flag");
- push!("P", 0, "Flag", "P flag");
- // Allelic support
- push!("FAU", 1, "Integer", "Forward A support in tumour");
- push!("FCU", 1, "Integer", "Forward C support in tumour");
- push!("FGU", 1, "Integer", "Forward G support in tumour");
- push!("FTU", 1, "Integer", "Forward T support in tumour");
- push!("RAU", 1, "Integer", "Reverse A support in tumour");
- push!("RCU", 1, "Integer", "Reverse C support in tumour");
- push!("RGU", 1, "Integer", "Reverse G support in tumour");
- push!("RTU", 1, "Integer", "Reverse T support in tumour");
- // Structural variant metadata
- push!("SVTYPE", 1, "String", "Structural variant type");
- push!("MATEID", 1, "String", "ID of the mate breakend");
- push!("SVLEN", 1, "Integer", "Length of structural variant");
- push!("SVINSLEN", 1, "Integer", "Length of inserted sequence");
- push!("SVINSSEQ", 1, "String", "Inserted sequence");
- // Positions and read support
- push!("END", 1, "Integer", "End position of the variant");
- push!(
- "NORMAL_READ_SUPPORT",
- 1,
- "Integer",
- "Supporting reads in normal sample"
- );
- push!(
- "TUMOUR_READ_SUPPORT",
- 1,
- "Integer",
- "Supporting reads in tumour sample"
- );
- push!(
- "NORMAL_ALN_SUPPORT",
- 1,
- "Integer",
- "Aligned reads in normal sample"
- );
- push!(
- "TUMOUR_ALN_SUPPORT",
- 1,
- "Integer",
- "Aligned reads in tumour sample"
- );
- // Depth profiles
- push!(
- "TUMOUR_DP_BEFORE",
- ".",
- "Integer",
- "Depth before breakpoint in tumour"
- );
- push!(
- "TUMOUR_DP_AT",
- ".",
- "Integer",
- "Depth at breakpoint in tumour"
- );
- push!(
- "TUMOUR_DP_AFTER",
- ".",
- "Integer",
- "Depth after breakpoint in tumour"
- );
- push!(
- "NORMAL_DP_BEFORE",
- ".",
- "Integer",
- "Depth before breakpoint in normal"
- );
- push!(
- "NORMAL_DP_AT",
- ".",
- "Integer",
- "Depth at breakpoint in normal"
- );
- push!(
- "NORMAL_DP_AFTER",
- ".",
- "Integer",
- "Depth after breakpoint in normal"
- );
- // Allele frequencies
- push!(
- "TUMOUR_AF",
- ".",
- "Float",
- "Variant allele frequencies in tumour"
- );
- push!(
- "NORMAL_AF",
- ".",
- "Float",
- "Variant allele frequencies in normal"
- );
- // Haplotype/phasing
- push!(
- "TUMOUR_ALT_HP",
- ".",
- "Integer",
- "Alternate haplotype support in tumour"
- );
- push!("TUMOUR_PS", ".", "String", "Phasing set in tumour");
- push!(
- "NORMAL_ALT_HP",
- ".",
- "Integer",
- "Alternate haplotype support in normal"
- );
- push!("NORMAL_PS", ".", "String", "Phasing set in normal");
- push!(
- "TUMOUR_TOTAL_HP_AT",
- ".",
- "Integer",
- "Total haplotype depth at breakpoint in tumour"
- );
- push!(
- "NORMAL_TOTAL_HP_AT",
- ".",
- "Integer",
- "Total haplotype depth at breakpoint in normal"
- );
- // Cluster analysis
- push!(
- "CLUSTERED_READS_TUMOUR",
- 1,
- "Integer",
- "Clustered reads in tumour"
- );
- push!(
- "CLUSTERED_READS_NORMAL",
- 1,
- "Integer",
- "Clustered reads in normal"
- );
- // Origin and end-point statistics
- push!(
- "ORIGIN_STARTS_STD_DEV",
- 1,
- "Float",
- "STDDEV of read starts at origin"
- );
- push!("ORIGIN_MAPQ_MEAN", 1, "Float", "Mean MAPQ at origin");
- push!(
- "ORIGIN_EVENT_SIZE_STD_DEV",
- 1,
- "Float",
- "STDDEV of event size at origin"
- );
- push!(
- "ORIGIN_EVENT_SIZE_MEDIAN",
- 1,
- "Float",
- "Median event size at origin"
- );
- push!(
- "ORIGIN_EVENT_SIZE_MEAN",
- 1,
- "Float",
- "Mean event size at origin"
- );
- push!(
- "END_STARTS_STD_DEV",
- 1,
- "Float",
- "STDDEV of read starts at end"
- );
- push!("END_MAPQ_MEAN", 1, "Float", "Mean MAPQ at end");
- push!(
- "END_EVENT_SIZE_STD_DEV",
- 1,
- "Float",
- "STDDEV of event size at end"
- );
- push!(
- "END_EVENT_SIZE_MEDIAN",
- 1,
- "Float",
- "Median event size at end"
- );
- push!("END_EVENT_SIZE_MEAN", 1, "Float", "Mean event size at end");
- // Additional
- push!("BP_NOTATION", 1, "String", "Breakpoint notation");
- push!("SOURCE", 1, "String", "Caller source name");
- push!("CLASS", 1, "String", "Variant classification");
- // Severus
- push!(
- "PRECISE",
- 0,
- "Flag",
- "SV with precise breakpoints coordinates and length"
- );
- push!(
- "IMPRECISE",
- 0,
- "Flag",
- "SV with imprecise breakpoints coordinates and length"
- );
- push!("STRANDS", 1, "String", "Breakpoint strandedness");
- push!("DETAILED_TYPE", 1, "String", "Detailed type of the SV");
- push!(
- "INSLEN",
- 1,
- "Integer",
- "Length of the unmapped sequence between breakpoint"
- );
- push!(
- "MAPQ",
- 1,
- "Integer",
- "Median mapping quality of supporting reads"
- );
- push!(
- "PHASESETID",
- 1,
- "String",
- "Matching phaseset ID for phased SVs"
- );
- push!("HP", 1, "Integer", "Matching haplotype ID for phased SVs");
- push!("CLUSTERID", 1, "String", "Cluster ID in breakpoint_graph");
- push!(
- "INSSEQ",
- 1,
- "String",
- "Insertion sequence between breakpoints"
- );
- push!("MATE_ID", 1, "String", "MATE ID for breakends");
- push!(
- "INSIDE_VNTR",
- 1,
- "String",
- "True if an indel is inside a VNTR"
- );
- push!("ALINGED_POS", 1, "String", "Position in the reference");
- set
- }
- }
- /// Format
- /// Enum representing individual FORMAT fields from a VCF record.
- ///
- /// This enum supports common fields used by DeepVariant, Clairs, and nanomonsv,
- /// as well as a generic fallback for other key-value pairs.
- ///
- /// # Examples
- ///
- /// ```
- /// use your_crate::Format;
- ///
- /// let gt = Format::GT("0/1".to_string());
- /// let dp = Format::DP(30);
- /// let ad = Format::AD(vec![10, 20]);
- /// ```
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
- pub enum Format {
- // --- DeepVariant fields ---
- /// Genotype string, e.g., "0/1", "1/1".
- GT(String),
- /// Genotype quality.
- GQ(u32),
- /// Read depth (total coverage at the variant position).
- DP(u32),
- /// Allelic depths for the ref and alt alleles (e.g., [ref, alt1, alt2...]).
- AD(Vec<u32>),
- /// Variant allele frequency (e.g., 0.25 for 25%).
- VAF(f32),
- /// Phred-scaled genotype likelihoods.
- PL(Vec<u32>),
- // --- Clairs fields (prefixed with N: for normal sample, or tumor in case of paired) ---
- /// Normal sample total depth.
- NDP(u32),
- /// Normal sample allelic depths (e.g., [ref, alt1, alt2...]).
- NAD(Vec<u32>),
- /// Allele-specific counts for A, C, G, T bases in tumor sample.
- AU(u32),
- CU(u32),
- GU(u32),
- TU(u32),
- /// Allele-specific counts for A, C, G, T bases in normal sample.
- NAU(u32),
- NCU(u32),
- NGU(u32),
- NTU(u32),
- // --- nanomonsv fields ---
- /// Total number of supporting reads in tumor.
- TR(u32),
- /// Variant-supporting reads in tumor.
- VR(u32),
- // --- Severus fields ---
- DR(u32),
- DV(u32),
- HVAF(Vec<f32>),
- /// Fallback for any other key-value pair not explicitly modeled.
- /// Contains the raw key and value as strings.
- Other((String, String)),
- }
- /// Container for a list of `Format` items.
- /// Represents the full FORMAT field and sample value for one sample.
- ///
- /// # Examples
- ///
- /// ```
- /// use your_crate::{Formats, Format};
- ///
- /// let formats = Formats(vec![
- /// Format::GT("0/1".to_string()),
- /// Format::DP(45),
- /// Format::AD(vec![15, 30]),
- /// ]);
- /// ```
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
- pub struct Formats(pub Vec<Format>);
- impl Formats {
- /// Returns the tumor alternative read depth and total depth if both are available.
- ///
- /// This method looks for:
- /// - `Format::AD`: to compute the sum of alternative allele depths (excluding reference)
- /// - `Format::DP`: to get total read depth
- ///
- /// Returns `Some((alt_depth, total_depth))` if both are present, else `None`.
- ///
- /// # Example
- /// ```
- /// use your_crate::{Formats, Format};
- ///
- /// let f = Formats(vec![
- /// Format::AD(vec![10, 20, 5]),
- /// Format::DP(40),
- /// ]);
- ///
- /// assert_eq!(f.n_alt_depth(), Some((25, 40)));
- /// ```
- pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
- let mut tumor_alt_depth: Option<u32> = None;
- let mut tumor_ref_depth: Option<u32> = None;
- let mut tumor_total_depth: Option<u32> = None;
- for format in &self.0 {
- match format {
- Format::AD(values) => {
- if values.len() > 1 {
- tumor_alt_depth = Some(values[1..].iter().sum());
- }
- }
- Format::DP(value) => {
- tumor_total_depth = Some(*value);
- }
- Format::VR(values) => {
- tumor_alt_depth = Some(*values);
- }
- Format::TR(value) => {
- tumor_ref_depth = Some(*value);
- }
- Format::DV(values) => {
- tumor_alt_depth = Some(*values);
- }
- Format::DR(value) => {
- tumor_ref_depth = Some(*value);
- }
- _ => {}
- }
- }
- match (tumor_alt_depth, tumor_total_depth, tumor_ref_depth) {
- (Some(alt), Some(total), _) => Some((alt, total)),
- (Some(alt), _, Some(refe)) => Some((alt, alt + refe)),
- _ => None,
- }
- }
- }
- impl TryFrom<(&str, &str)> for Formats {
- type Error = anyhow::Error;
- /// Attempts to construct a `Formats` from a pair of colon-separated FORMAT keys and values.
- ///
- /// # Arguments
- /// * `k` - FORMAT field names (e.g., "GT:DP:AD")
- /// * `v` - Corresponding values (e.g., "0/1:35:10,25")
- ///
- /// # Errors
- /// Returns an error if the number of keys and values do not match or if parsing fails.
- ///
- /// # Example
- /// ```
- /// use your_crate::Formats;
- /// use std::convert::TryFrom;
- ///
- /// let f = Formats::try_from(("GT:DP:AD", "0/1:40:12,28")).unwrap();
- /// ```
- fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
- let keys: Vec<&str> = k.split(':').collect();
- let values: Vec<&str> = v.split(':').collect();
- if keys.len() != values.len() {
- anyhow::bail!("Mismatch between keys and values count for {k} {v}");
- }
- Ok(Self(
- keys.into_iter()
- .zip(values)
- .map(|(key, value)| Format::try_from((key, value)))
- .collect::<Result<Vec<Format>, _>>()
- .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
- ))
- }
- }
- impl From<Formats> for (String, String) {
- /// Converts `Formats` back into a `(keys, values)` tuple of colon-separated strings.
- ///
- /// This is the inverse of the `TryFrom<(&str, &str)>` implementation.
- ///
- /// # Example
- /// ```
- /// use your_crate::{Format, Formats};
- ///
- /// let formats = Formats(vec![
- /// Format::GT("0/1".to_string()),
- /// Format::DP(30),
- /// ]);
- ///
- /// let (k, v): (String, String) = formats.into();
- /// assert_eq!(k, "GT:DP");
- /// assert_eq!(v, "0/1:30");
- /// ```
- fn from(formats: Formats) -> Self {
- let mut keys = Vec::new();
- let mut values = Vec::new();
- for format in formats.0 {
- let (key, value): (String, String) = format.into();
- keys.push(key);
- values.push(value);
- }
- (keys.join(":"), values.join(":"))
- }
- }
- impl TryFrom<(&str, &str)> for Format {
- type Error = anyhow::Error;
- /// Tries to convert a `(key, value)` pair into a typed `Format` variant.
- ///
- /// This parser supports known FORMAT keys from DeepVariant, Clairs, and nanomonsv.
- /// Unknown keys are stored as `Format::Other((key, value))`.
- ///
- /// # Arguments
- /// * `key` - FORMAT field name
- /// * `value` - raw string value associated with the key
- ///
- /// # Example
- /// ```
- /// use your_crate::Format;
- /// use std::convert::TryFrom;
- ///
- /// let dp = Format::try_from(("DP", "42")).unwrap();
- /// assert!(matches!(dp, Format::DP(42)));
- /// ```
- fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
- let format = match key {
- // DeepVariant
- "GT" => Format::GT(value.to_string()),
- "GQ" => Format::GQ(parse_value(value, key)?),
- "DP" => Format::DP(parse_value(value, key)?),
- "AD" => Format::AD(parse_vec_value(value, key)?),
- "VAF" => Format::VAF(parse_value(value, key)?),
- "PL" => Format::PL(parse_vec_value(value, key)?),
- // Clairs
- "NDP" => Format::NDP(parse_value(value, key)?),
- "NAD" => Format::NAD(parse_vec_value(value, key)?),
- "AU" => Format::AU(parse_value(value, key)?),
- "CU" => Format::CU(parse_value(value, key)?),
- "GU" => Format::GU(parse_value(value, key)?),
- "TU" => Format::TU(parse_value(value, key)?),
- "NAU" => Format::NAU(parse_value(value, key)?),
- "NCU" => Format::NCU(parse_value(value, key)?),
- "NGU" => Format::NGU(parse_value(value, key)?),
- "NTU" => Format::NTU(parse_value(value, key)?),
- // nanomonsv
- "TR" => Format::TR(parse_value(value, key)?),
- "VR" => Format::VR(parse_value(value, key)?),
- // Severus
- "DR" => Format::DR(parse_value(value, key)?),
- "DV" => Format::DV(parse_value(value, key)?),
- "hVAF" => Format::HVAF(parse_vec_value(value, key)?),
- // fallback
- _ => Format::Other((key.to_string(), value.to_string())),
- };
- Ok(format)
- }
- }
- // Helper function to parse a single value (DeepSeek)
- fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
- where
- T::Err: std::fmt::Debug,
- {
- value
- .parse()
- .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
- .context(format!("Can't parse {}: {}", key, value)) // Add context
- }
- // Helper function to parse comma-separated values
- fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
- where
- T::Err: std::fmt::Debug,
- {
- value
- .split(',')
- .map(|e| {
- e.parse()
- .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
- .context(format!("Failed to parse {}: {}", key, e)) // Add context
- })
- .collect()
- }
- impl From<Format> for (String, String) {
- /// Converts a `Format` enum into a `(key, value)` pair, as strings.
- ///
- /// This is used to serialize the FORMAT field back into VCF-compatible string values.
- /// The key corresponds to the field ID (e.g., `"DP"`, `"GT"`), and the value is the encoded string representation.
- ///
- /// # Examples
- /// ```
- /// use your_crate::Format;
- /// let f = Format::DP(42);
- /// let (k, v): (String, String) = f.into();
- /// assert_eq!(k, "DP");
- /// assert_eq!(v, "42");
- /// ```
- fn from(format: Format) -> Self {
- let concat_u32 = |values: Vec<u32>| -> String {
- values
- .iter()
- .map(u32::to_string)
- .collect::<Vec<_>>()
- .join(",")
- };
- let concat_f32 = |values: Vec<f32>| -> String {
- values
- .iter()
- .map(|v| format!("{:.5}", v)) // consistent decimal format
- .collect::<Vec<_>>()
- .join(",")
- };
- match format {
- // DeepVariant
- Format::GT(value) => ("GT".to_string(), value),
- Format::GQ(value) => ("GQ".to_string(), value.to_string()),
- Format::DP(value) => ("DP".to_string(), value.to_string()),
- Format::AD(values) => ("AD".to_string(), concat_u32(values)),
- Format::VAF(value) => ("VAF".to_string(), format!("{:.5}", value)),
- Format::PL(values) => ("PL".to_string(), concat_u32(values)),
- // Clairs
- Format::NDP(value) => ("NDP".to_string(), value.to_string()),
- Format::NAD(values) => ("NAD".to_string(), concat_u32(values)),
- Format::AU(value) => ("AU".to_string(), value.to_string()),
- Format::CU(value) => ("CU".to_string(), value.to_string()),
- Format::GU(value) => ("GU".to_string(), value.to_string()),
- Format::TU(value) => ("TU".to_string(), value.to_string()),
- Format::NAU(value) => ("NAU".to_string(), value.to_string()),
- Format::NCU(value) => ("NCU".to_string(), value.to_string()),
- Format::NGU(value) => ("NGU".to_string(), value.to_string()),
- Format::NTU(value) => ("NTU".to_string(), value.to_string()),
- // nanomonsv
- Format::TR(value) => ("TR".to_string(), value.to_string()),
- Format::VR(value) => ("VR".to_string(), value.to_string()),
- // Severus
- Format::DR(value) => ("DR".to_string(), value.to_string()),
- Format::DV(value) => ("DV".to_string(), value.to_string()),
- Format::HVAF(values) => ("hVAF".to_string(), concat_f32(values)),
- // fallback
- Format::Other((key, value)) => (key, value),
- }
- }
- }
- impl Formats {
- pub fn commun_deepvariant_clairs(&self) -> Self {
- let filtered_vec: Vec<Format> = self
- .0
- .clone()
- .into_iter()
- .map(|e| {
- if let Format::VAF(_v) = e {
- e
- // Format::AF(v)
- } else {
- e
- }
- })
- .filter(|format| {
- matches!(
- format,
- Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
- )
- })
- .collect();
- Formats(filtered_vec)
- }
- /// Returns a sorted set of VCF header definitions for all possible `Format` fields.
- ///
- /// # Example
- /// ```
- /// let headers = Formats::format_headers();
- /// for h in headers {
- /// println!("{}", h);
- /// }
- /// ```
- pub fn format_headers() -> BTreeSet<String> {
- let mut headers = BTreeSet::new();
- headers
- .insert(r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#.to_string());
- headers.insert(
- r#"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">"#.to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">"#.to_string(),
- );
- headers.insert(r#"##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles">"#.to_string());
- headers.insert(
- r#"##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">"#
- .to_string(),
- );
- headers.insert(r#"##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">"#.to_string());
- headers.insert(
- r#"##FORMAT=<ID=NDP,Number=1,Type=Integer,Description="Normal sample read depth">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NAD,Number=R,Type=Integer,Description="Normal sample allelic depths">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=AU,Number=1,Type=Integer,Description="Tumor A allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=CU,Number=1,Type=Integer,Description="Tumor C allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=GU,Number=1,Type=Integer,Description="Tumor G allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=TU,Number=1,Type=Integer,Description="Tumor T allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NAU,Number=1,Type=Integer,Description="Normal A allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NCU,Number=1,Type=Integer,Description="Normal C allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NGU,Number=1,Type=Integer,Description="Normal G allele count">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NTU,Number=1,Type=Integer,Description="Normal T allele count">"#
- .to_string(),
- );
- headers.insert(r#"##FORMAT=<ID=TR,Number=1,Type=Integer,Description="Total supporting reads (tumor)">"#.to_string());
- headers.insert(r#"##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Variant-supporting reads (tumor)">"#.to_string());
- // Severus
- headers.insert(
- r#"##FORMAT=<ID=DR,Number=1,Type=Integer,Description="Number of reference reads">"#
- .to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of variant reads">"#
- .to_string(),
- );
- headers.insert(r#"##FORMAT=<ID=hVAF,Number=3,Type=Float,Description="Haplotype specific variant Allele frequency (H0,H1,H2)">"#.to_string());
- headers.insert(
- r#"##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">"#.to_string(),
- );
- headers.insert(
- r#"##FORMAT=<ID=NAF,Number=1,Type=Float,Description="Normal Allele Frequency">"#
- .to_string(),
- );
- // headers.insert(
- // r#"##FORMAT=<ID=Other,Number=.,Type=String,Description="Unspecified FORMAT field">"#
- // .to_string(),
- // );
- headers
- }
- }
- /// Filter
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
- pub enum Filter {
- PASS,
- Other(String),
- }
- impl FromStr for Filter {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "PASS" => Ok(Filter::PASS),
- _ => Ok(Filter::Other(s.to_string())),
- }
- }
- }
- impl fmt::Display for Filter {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- match self {
- Filter::PASS => write!(f, "PASS"),
- Filter::Other(ref s) => write!(f, "{}", s),
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
- pub enum ReferenceAlternative {
- Nucleotide(Base),
- Nucleotides(Vec<Base>),
- Unstructured(String),
- }
- impl ReferenceAlternative {
- pub fn len(&self) -> usize {
- match self {
- ReferenceAlternative::Nucleotide(_) => 1,
- ReferenceAlternative::Nucleotides(bases) => bases.len(),
- ReferenceAlternative::Unstructured(_) => 0,
- }
- }
- pub fn is_empty(&self) -> bool {
- self.len() == 0
- }
- pub fn ent(&self) -> Option<f64> {
- match self {
- ReferenceAlternative::Nucleotide(_) => None,
- ReferenceAlternative::Nucleotides(bases) => {
- let seq = bases
- .iter()
- .skip(1)
- .map(|b| b.to_string())
- .collect::<String>();
- if seq.len() > 1 {
- Some(estimate_shannon_entropy(&seq))
- } else {
- None
- }
- }
- ReferenceAlternative::Unstructured(_) => None,
- }
- }
- }
- impl FromStr for ReferenceAlternative {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let possible_bases = s.as_bytes().iter();
- let mut res: Vec<Base> = Vec::new();
- for &base in possible_bases {
- match base.try_into() {
- std::result::Result::Ok(b) => res.push(b),
- Err(_) => {
- return Ok(Self::Unstructured(s.to_string()));
- }
- }
- }
- if res.len() == 1 {
- Ok(Self::Nucleotide(res.pop().unwrap()))
- } else {
- Ok(Self::Nucleotides(res))
- }
- }
- }
- impl fmt::Display for ReferenceAlternative {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let string = match self {
- ReferenceAlternative::Nucleotide(b) => b.to_string(),
- ReferenceAlternative::Nucleotides(bases) => bases
- .iter()
- .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
- ReferenceAlternative::Unstructured(s) => s.to_string(),
- };
- write!(f, "{}", string)
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
- pub enum Base {
- A,
- T,
- C,
- G,
- N,
- }
- impl TryFrom<u8> for Base {
- type Error = anyhow::Error;
- fn try_from(base: u8) -> anyhow::Result<Self> {
- match base {
- // uppercase
- b'A' | b'a' => Ok(Base::A),
- b'T' | b't' => Ok(Base::T),
- b'C' | b'c' => Ok(Base::C),
- b'G' | b'g' => Ok(Base::G),
- b'N' | b'n' => Ok(Base::N),
- // unknown
- _ => Err(anyhow::anyhow!(
- "Unknown base: {}",
- String::from_utf8_lossy(&[base])
- )),
- }
- }
- }
- impl Base {
- pub fn into_u8(self) -> u8 {
- match self {
- Base::A => b'A',
- Base::T => b'T',
- Base::C => b'C',
- Base::G => b'G',
- Base::N => b'N',
- }
- }
- }
- impl fmt::Display for Base {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- // Use `self.number` to refer to each positional data point.
- let str = match self {
- Base::A => "A",
- Base::T => "T",
- Base::C => "C",
- Base::G => "G",
- Base::N => "N",
- };
- write!(f, "{}", str)
- }
- }
- pub trait Variants {
- fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
- }
- pub trait VariantId {
- fn variant_id(&self) -> String;
- }
- pub trait Label {
- fn label(&self) -> String;
- }
- /// A trait alias for all dynamically executable pipeline runners.
- ///
- /// This trait represents any component that:
- /// - Can decide whether it needs to run (`ShouldRun`)
- /// - Implements the actual execution logic (`Run`)
- /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently
- ///
- /// Components implementing this trait can be boxed as `ShouldRunBox`.
- pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {}
- /// Blanket implementation for all compatible types.
- impl<T> ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {}
- /// A boxed trait object to hold any runner implementing `ShouldRunTrait`.
- pub type ShouldRunBox = Box<dyn ShouldRunTrait>;
- /// Macro to initialize and box multiple `ShouldRunTrait` components.
- ///
- /// # Arguments
- /// * `$id` - Sample ID (typically a string slice)
- /// * `$config` - Shared configuration object
- /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait`
- ///
- /// # Returns
- /// A vector of boxed runner components (`Vec<ShouldRunBox>`)
- ///
- /// # Example
- /// ```rust
- /// let modules: Vec<ShouldRunBox> = create_should_run!(
- /// "sample_42",
- /// config,
- /// ClairS,
- /// Savana,
- /// DeepSomatic,
- /// )?;
- /// ```
- ///
- /// # Errors
- /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`.
- #[macro_export]
- macro_rules! create_should_run {
- ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
- use anyhow::Context;
- let mut runners: Vec<ShouldRunBox> = Vec::new();
- $(
- let runner = <$runner>::initialize($id, $config.clone())
- .with_context(|| format!(
- "Failed to initialize should-run checker {} for {}",
- stringify!($runner), $id
- ))?;
- runners.push(Box::new(runner) as ShouldRunBox);
- )+
- runners
- }};
- }
- /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`.
- ///
- /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`),
- /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal").
- ///
- /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression.
- ///
- /// # Arguments
- /// - `$id`: The sample ID (`&str`) passed to each initializer
- /// - `$config`: A `Config` object (must be cloneable)
- /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`)
- ///
- /// # Returns
- /// A `Vec<ShouldRunBox>` with boxed runners initialized per timepoint.
- ///
- /// # Example
- /// ```rust
- /// let solo_callers = create_should_run_solo!(
- /// "sample42",
- /// config,
- /// DeepVariant, config.tumoral_name,
- /// DeepVariant, config.normal_name,
- /// )?;
- /// ```
- ///
- /// # Notes
- /// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd".
- ///
- /// # Errors
- /// This macro uses `?` and must be called inside a `Result`-returning context.
- #[macro_export]
- macro_rules! create_should_run_solo {
- ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
- use anyhow::Context;
- let mut runners: Vec<ShouldRunBox> = Vec::new();
- $(
- let runner = <$runner>::initialize($id, $arg, $config.clone())
- .with_context(|| format!(
- "Failed to initialize solo should-run checker {} for '{}' and arg '{}'",
- stringify!($runner), $id, $arg
- ))?;
- runners.push(Box::new(runner) as ShouldRunBox);
- )+
- runners
- }};
- }
- /// Macro to initialize and box a list of pipeline components that must run once per timepoint
- /// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`.
- ///
- /// This is typically used for variant callers like `DeepVariant` that operate in **solo mode**
- /// but must be run twice — once for the tumoral sample and once for the normal sample.
- ///
- /// The macro:
- /// - Calls `.initialize(id, timepoint, config.clone())` twice per type
- /// - Uses `config.tumoral_name` and `config.normal_name` as timepoints
- /// - Returns a flat `Vec<ShouldRunBox>` containing both instances for each type
- ///
- /// # Arguments
- /// - `$id`: The sample ID (`&str`)
- /// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`)
- /// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)`
- ///
- /// # Example
- /// ```rust
- /// let runners = create_should_run_normal_tumoral!(
- /// "sample_42",
- /// config,
- /// DeepVariant,
- /// AnotherCaller,
- /// )?;
- /// ```
- ///
- /// This will expand to:
- /// ```rust
- /// vec![
- /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
- /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
- /// Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
- /// Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
- /// ]
- /// ```
- ///
- /// # Errors
- /// This macro uses `?`, so it must be called inside a function that returns `Result`.
- #[macro_export]
- macro_rules! create_should_run_normal_tumoral {
- ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
- use anyhow::Context;
- let mut runners: Vec<ShouldRunBox> = Vec::new();
- $(
- let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
- .with_context(|| format!(
- "Failed to initialize tumoral should-run checker {} for {}",
- stringify!($runner), $id
- ))?;
- runners.push(Box::new(tumoral) as ShouldRunBox);
- let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
- .with_context(|| format!(
- "Failed to initialize normal should-run checker {} for {}",
- stringify!($runner), $id
- ))?;
- runners.push(Box::new(normal) as ShouldRunBox);
- )+
- runners
- }};
- }
- /// Executes each runner in the slice only if `should_run()` returns true.
- ///
- /// # Arguments
- /// * `iterable` - A mutable slice of boxed `InitRun` components
- ///
- /// # Returns
- /// * `Ok(())` if all required runners execute successfully
- /// * An error if any runner's `run()` method fails
- ///
- /// # Notes
- /// - This function will skip runners that return `false` from `should_run()`
- pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> {
- iterable.iter_mut().try_for_each(|e| {
- if e.should_run() {
- e.run()
- .map_err(|err| anyhow::anyhow!("Failed to run {}.\n{err}", e.label()))
- } else {
- info!("No run required for: {}", e.label());
- Ok(())
- }
- })
- }
- /// A trait alias for all variant callers that support initialization, execution,
- /// conditional re-running, and variant extraction (VCF + annotations).
- ///
- /// Used to enable polymorphic handling of both solo and somatic callers in the pipeline.
- pub trait RunnerVariants: Variants + Send + Sync + Label {}
- /// Blanket implementation for all compatible types.
- impl<T> RunnerVariants for T where T: Variants + Send + Sync + Label {}
- pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
- #[macro_export]
- macro_rules! init_somatic_callers {
- ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
- use anyhow::Context;
- let mut callers: Vec<CallerBox> = Vec::new();
- $(
- let caller = <$runner>::initialize($id, $config.clone())
- .with_context(|| format!(
- "Failed to initialize somatic caller {} for {}",
- stringify!($runner), $id
- ))?;
- callers.push(Box::new(caller) as CallerBox);
- )+
- callers
- }};
- }
- /// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints,
- /// where each runner implements `RunnerVariants`.
- ///
- /// This is useful for callers like `DeepVariant` that need to be instantiated with a specific
- /// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`).
- ///
- /// Each entry must be a pair: a runner type and a timepoint expression (usually from config).
- ///
- /// # Arguments
- /// - `$id`: The sample ID (`&str`)
- /// - `$config`: The configuration object (must be cloneable)
- /// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs
- ///
- /// # Returns
- /// A `Vec<CallerBox>` containing initialized, boxed solo-mode variant callers.
- ///
- /// # Example
- /// ```rust
- /// let solo_callers = init_solo_callers!(
- /// "sample_42",
- /// config,
- /// DeepVariant, config.tumoral_name,
- /// DeepVariant, config.normal_name,
- /// )?;
- /// ```
- ///
- /// This will expand to:
- /// ```rust
- /// vec![
- /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
- /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
- /// ]
- /// ```
- ///
- /// # Errors
- /// This macro uses `?` internally, so it must be used inside a `Result`-returning context.
- #[macro_export]
- macro_rules! init_solo_callers {
- ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
- let mut callers: Vec<CallerBox> = Vec::new();
- $(
- let caller = <$runner>::initialize($id, $arg, $config.clone())
- .with_context(|| format!(
- "Failed to initialize caller {} for {}",
- stringify!($runner), $id
- ))?;
- callers.push(Box::new(caller) as CallerBox);
- )+
- callers
- }};
- }
- /// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints.
- ///
- /// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require
- /// separate execution for each timepoint. It will:
- /// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name`
- /// - Box the result as `CallerBox`
- ///
- /// # Arguments
- /// - `$id`: Sample ID (usually a `&str`)
- /// - `$config`: Cloneable configuration object
- /// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants`
- ///
- /// # Returns
- /// A `Vec<CallerBox>` containing two boxed instances per runner (one for each timepoint).
- ///
- /// # Example
- /// ```rust
- /// let solo_callers = init_solo_callers_normal_tumoral!(
- /// "sample_42",
- /// config,
- /// DeepVariant,
- /// OtherSoloCaller,
- /// )?;
- /// ```
- ///
- /// This expands to:
- /// ```rust
- /// vec![
- /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
- /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
- /// Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
- /// Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
- /// ]
- /// ```
- ///
- /// # Errors
- /// This macro uses `?`, so it must be called inside a `Result`-returning context.
- #[macro_export]
- macro_rules! init_solo_callers_normal_tumoral {
- ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
- use anyhow::Context;
- let mut callers: Vec<CallerBox> = Vec::new();
- $(
- let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
- .with_context(|| format!(
- "Failed to initialize tumoral caller {} for {} '{}'",
- stringify!($runner), $id, $config.tumoral_name
- ))?;
- callers.push(Box::new(tumoral) as CallerBox);
- let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
- .with_context(|| format!(
- "Failed to initialize normal caller {} for {} '{}'",
- stringify!($runner), $id, $config.normal_name
- ))?;
- callers.push(Box::new(normal) as CallerBox);
- )+
- callers
- }};
- }
- // pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
- // iterable
- // .iter_mut()
- // .try_for_each(|runner| runner.run())
- // .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
- // }
- pub fn load_variants(
- iterable: &mut [CallerBox],
- annotations: &Annotations,
- ) -> anyhow::Result<Vec<VariantCollection>> {
- iterable
- .par_iter()
- .map(|runner| {
- let result = runner.variants(annotations);
- if let Err(ref e) = result {
- error!("Failed to load variants from: {}\n{e}", runner.label());
- } else {
- info!("Variants from {} loaded.", runner.label());
- };
- result
- })
- .filter(|r| r.is_ok())
- .collect::<anyhow::Result<Vec<_>>>()
- .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
- }
- // pub fn par_load_variants(
- // iterable: &mut [Box<dyn Variants + Send + Sync>],
- // annotations: &Annotations,
- // ) -> anyhow::Result<Vec<VariantCollection>> {
- // iterable
- // .par_iter()
- // .map(|runner| {
- // let r = runner.variants(annotations);
- // if let Err(ref e) = r {
- // warn!("{e}");
- // };
- // r
- // })
- // .filter(|r| r.is_ok())
- // .collect::<anyhow::Result<Vec<_>>>()
- // .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
- // }
- pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
- vec1: &[T],
- vec2: &[T],
- ) -> (Vec<T>, Vec<T>, Vec<T>) {
- let set1: HashSet<_> = vec1.par_iter().cloned().collect();
- let set2: HashSet<_> = vec2.par_iter().cloned().collect();
- let common: Vec<T> = set1
- .par_iter()
- .filter(|item| set2.contains(item))
- .cloned()
- .collect();
- let only_in_first: Vec<T> = set1
- .par_iter()
- .filter(|item| !set2.contains(item))
- .cloned()
- .collect();
- let only_in_second: Vec<T> = set2
- .par_iter()
- .filter(|item| !set1.contains(item))
- .cloned()
- .collect();
- (common, only_in_first, only_in_second)
- }
|