variant.rs 90 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712
  1. use crate::{
  2. annotation::Annotations,
  3. collection::ShouldRun,
  4. helpers::{estimate_shannon_entropy, mean, Hash128},
  5. positions::{GenomePosition, GetGenomePosition, VcfPosition},
  6. runners::Run,
  7. variant::variant_collection::VariantCollection,
  8. };
  9. use anyhow::{anyhow, Context};
  10. use bitcode::{Decode, Encode};
  11. use log::{error, info};
  12. use rayon::prelude::*;
  13. use serde::{Deserialize, Serialize};
  14. use std::{
  15. cmp::Ordering,
  16. collections::{BTreeSet, HashSet},
  17. fmt,
  18. hash::Hash,
  19. str::FromStr,
  20. };
  21. /// Represents a variant in the Variant Call Format (VCF).
  22. #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
  23. pub struct VcfVariant {
  24. /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
  25. pub hash: Hash128,
  26. /// The genomic position of the variant.
  27. pub position: GenomePosition,
  28. /// The identifier of the variant.
  29. pub id: String,
  30. /// The reference allele.
  31. pub reference: ReferenceAlternative,
  32. /// The alternative allele.
  33. pub alternative: ReferenceAlternative,
  34. /// The quality score of the variant call, if available.
  35. pub quality: Option<f32>,
  36. /// The filter status of the variant.
  37. pub filter: Filter,
  38. /// Additional information about the variant.
  39. pub infos: Infos,
  40. /// Genotype information and other sample-specific data.
  41. pub formats: Formats,
  42. }
  43. impl PartialEq for VcfVariant {
  44. /// Compares two VcfVariants for equality.
  45. ///
  46. /// Note: This comparison only considers position, reference, and alternative.
  47. /// It intentionally ignores id, filter, info, format, and quality.
  48. fn eq(&self, other: &Self) -> bool {
  49. // Nota bene: id, filter, info, format and quality is intentionally not compared
  50. self.position == other.position
  51. && self.reference == other.reference
  52. && self.alternative == other.alternative
  53. }
  54. }
  55. impl Eq for VcfVariant {}
  56. impl FromStr for VcfVariant {
  57. type Err = anyhow::Error;
  58. /// Parses a VcfVariant from a string representation.
  59. ///
  60. /// The input string is expected to be a tab-separated VCF line.
  61. ///
  62. /// # Errors
  63. ///
  64. /// Returns an error if parsing fails for any field.
  65. fn from_str(s: &str) -> anyhow::Result<Self> {
  66. let v: Vec<&str> = s.split('\t').collect();
  67. let vcf_position: VcfPosition = (
  68. *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
  69. *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
  70. )
  71. .try_into()
  72. .context(format!("Can't parse position from: {s}"))?;
  73. let formats = if v.len() >= 10 {
  74. (
  75. *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  76. *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  77. )
  78. .try_into()
  79. .context(format!("Can't parse formats from: {s}"))?
  80. } else {
  81. Formats::default()
  82. };
  83. let position: GenomePosition = vcf_position.into();
  84. let reference: ReferenceAlternative = v
  85. .get(3)
  86. .ok_or(anyhow!("Can't parse reference from: {s}"))?
  87. .parse()
  88. .context(format!("Can't parse reference from: {s}"))?;
  89. let alternative: ReferenceAlternative = v
  90. .get(4)
  91. .ok_or(anyhow!("Can't parse alternative from: {s}"))?
  92. .parse()
  93. .context(format!("Can't parse alternative from: {s}"))?;
  94. // Blake3 128 bytes Hash
  95. let mut hasher = blake3::Hasher::new();
  96. hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
  97. hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
  98. hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
  99. hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
  100. let hash = hasher.finalize();
  101. let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
  102. Ok(Self {
  103. hash,
  104. position,
  105. id: v
  106. .get(2)
  107. .ok_or(anyhow!("Can't parse id from: {s}"))?
  108. .to_string(),
  109. reference,
  110. alternative,
  111. quality: v
  112. .get(5)
  113. .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
  114. .unwrap_or(None),
  115. filter: v
  116. .get(6)
  117. .ok_or(anyhow!("Can't parse filter from: {s}"))?
  118. .parse()
  119. .context(format!("Can't parse filter from: {s}"))?,
  120. infos: v
  121. .get(7)
  122. .ok_or(anyhow!("Can't parse infos from: {s}"))?
  123. .parse()
  124. .context(format!("Can't parse infos from: {s}"))?,
  125. formats,
  126. })
  127. }
  128. }
  129. // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
  130. impl VcfVariant {
  131. /// Converts the VcfVariant into a VCF-formatted row string.
  132. ///
  133. /// This method creates a tab-separated string representation of the variant,
  134. /// suitable for writing to a VCF file.
  135. pub fn into_vcf_row(&self) -> String {
  136. let vcf_position: VcfPosition = self.position.clone().into();
  137. let (contig, position) = vcf_position.into();
  138. let mut columns = vec![
  139. contig,
  140. position,
  141. self.id.to_string(),
  142. self.reference.to_string(),
  143. self.alternative.to_string(),
  144. self.quality
  145. .map(|v| v.to_string())
  146. .unwrap_or(".".to_string()),
  147. self.filter.to_string(),
  148. self.infos.to_string(),
  149. ];
  150. if !self.formats.0.is_empty() {
  151. let (format, values) = self.formats.clone().into();
  152. columns.push(format);
  153. columns.push(values);
  154. }
  155. columns.join("\t")
  156. }
  157. /// Returns the hash of the variant.
  158. pub fn hash(&self) -> Hash128 {
  159. self.hash
  160. }
  161. /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
  162. ///
  163. /// This method generates a new variant with shared properties, resetting some fields
  164. /// to default or empty values.
  165. pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
  166. VcfVariant {
  167. hash: self.hash,
  168. position: self.position.clone(),
  169. id: self.id.clone(),
  170. reference: self.reference.clone(),
  171. alternative: self.alternative.clone(),
  172. quality: self.quality,
  173. filter: Filter::Other(".".to_string()),
  174. infos: Infos(vec![Info::Empty]),
  175. formats: self.formats.commun_deepvariant_clairs(),
  176. }
  177. }
  178. /// Checks if the variant has an SVTYPE info field.
  179. ///
  180. /// Returns true if the variant contains structural variation type information.
  181. pub fn has_svtype(&self) -> bool {
  182. self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
  183. }
  184. pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
  185. let r = self.formats.n_alt_depth();
  186. if r.is_some() {
  187. r
  188. } else {
  189. self.infos.n_alt_depth()
  190. }
  191. }
  192. /// Retrieves the structural variation type of the variant, if present.
  193. ///
  194. /// Returns Some(SVType) if the variant has an SVTYPE info field,
  195. pub fn svtype(&self) -> Option<SVType> {
  196. self.infos.0.iter().find_map(|e| {
  197. if let Info::SVTYPE(sv_type) = e {
  198. Some(sv_type.clone())
  199. } else {
  200. None
  201. }
  202. })
  203. }
  204. /// Determines the alteration category of the variant.
  205. ///
  206. /// This method analyzes the reference and alternative alleles to classify
  207. /// the variant into one of several alteration categories:
  208. /// - SNV (Single Nucleotide Variant)
  209. /// - INS (Insertion)
  210. /// - DEL (Deletion)
  211. /// - Other (including structural variants and complex alterations)
  212. ///
  213. /// The classification is based on the following rules:
  214. /// 1. If both reference and alternative are single nucleotides, it's an SNV.
  215. /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
  216. /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
  217. /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
  218. /// 5. If none of the above apply, it checks for structural variant types.
  219. /// 6. If no structural variant type is found, it's classified as "Other".
  220. ///
  221. /// # Returns
  222. /// An `AlterationCategory` enum representing the type of alteration.
  223. pub fn alteration_category(&self) -> AlterationCategory {
  224. match (&self.reference, &self.alternative) {
  225. (ReferenceAlternative::Nucleotide(a), ReferenceAlternative::Nucleotide(b))
  226. if *a != Base::N && *b != Base::N =>
  227. {
  228. AlterationCategory::SNV
  229. }
  230. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  231. AlterationCategory::INS
  232. }
  233. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  234. AlterationCategory::DEL
  235. }
  236. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  237. if a.len() < b.len() =>
  238. {
  239. AlterationCategory::INS
  240. }
  241. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  242. if a.len() > b.len() =>
  243. {
  244. AlterationCategory::DEL
  245. }
  246. _ => match self.svtype() {
  247. Some(sv_type) => {
  248. if let Ok(bnd_desc) = self.bnd_desc() {
  249. if bnd_desc.a_contig != bnd_desc.b_contig {
  250. AlterationCategory::TRL
  251. } else if bnd_desc.a_sens != bnd_desc.b_sens {
  252. AlterationCategory::DELINV
  253. } else {
  254. AlterationCategory::DEL
  255. }
  256. } else {
  257. AlterationCategory::from(sv_type)
  258. }
  259. }
  260. None => AlterationCategory::Other,
  261. },
  262. }
  263. }
  264. pub fn inserted_seq(&self) -> Option<String> {
  265. if self.alteration_category() != AlterationCategory::INS {
  266. return None;
  267. }
  268. if let Some(ins) = self.infos.0.iter().find_map(|e| match e {
  269. Info::SVINSSEQ(ins) => Some(ins.to_string()),
  270. _ => None,
  271. }) {
  272. return Some(ins);
  273. }
  274. match &self.alternative {
  275. ReferenceAlternative::Nucleotides(nt) => nt
  276. .get(1..)
  277. .map(|slice| slice.iter().map(ToString::to_string).collect()),
  278. _ => None,
  279. }
  280. }
  281. /// Parses and constructs a BND (breakend) description from the alternative string.
  282. ///
  283. /// This function interprets the BND notation in the alternative string and creates
  284. /// a `BNDDesc` struct containing detailed information about the breakend.
  285. ///
  286. /// # Returns
  287. /// - `Ok(BNDDesc)` if parsing is successful
  288. /// - `Err` if parsing fails or if the alteration is not a BND
  289. ///
  290. /// # Errors
  291. /// This function will return an error if:
  292. /// - The alteration category is not BND
  293. pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
  294. let alt = self.alternative.to_string();
  295. BNDDesc::from_vcf_breakend(&self.position().contig(), self.position.position + 1, &alt)
  296. .context(format!("The alteration is not BND: {alt}"))
  297. }
  298. /// Returns the length of the deletion if the variant is a deletion (`DEL`).
  299. pub fn deletion_len(&self) -> Option<u32> {
  300. if self.alteration_category() != AlterationCategory::DEL {
  301. return None;
  302. }
  303. // Try using the END field
  304. if let Some(len) = self.infos.0.iter().find_map(|i| {
  305. if let Info::END(end) = i {
  306. Some(end.saturating_sub(self.position.position))
  307. } else {
  308. None
  309. }
  310. }) {
  311. return Some(len);
  312. }
  313. // Fallback to SVLEN field
  314. if let Some(len) = self.infos.0.iter().find_map(|i| {
  315. if let Info::SVLEN(len) = i {
  316. Some(len.unsigned_abs())
  317. } else {
  318. None
  319. }
  320. }) {
  321. return Some(len);
  322. }
  323. match self.bnd_desc() {
  324. Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => {
  325. if bnd_desc.a_position < bnd_desc.b_position {
  326. return Some(bnd_desc.b_position - bnd_desc.a_position);
  327. } else {
  328. return Some(bnd_desc.a_position - bnd_desc.b_position);
  329. }
  330. }
  331. _ => (),
  332. }
  333. // Final fallback: infer from reference and alternative
  334. if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
  335. (&self.reference, &self.alternative)
  336. {
  337. return Some(nt.len().saturating_sub(1) as u32);
  338. }
  339. if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
  340. (&self.reference, &self.alternative)
  341. {
  342. if bnt.len() < nt.len() {
  343. return Some(nt.len().saturating_sub(bnt.len()) as u32);
  344. }
  345. }
  346. None
  347. }
  348. pub fn deletion_seq(&self) -> Option<String> {
  349. if self.alteration_category() != AlterationCategory::DEL {
  350. return None;
  351. }
  352. // infer from reference and alternative
  353. if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotide(_)) =
  354. (&self.reference, &self.alternative)
  355. {
  356. return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
  357. }
  358. if let (ReferenceAlternative::Nucleotides(nt), ReferenceAlternative::Nucleotides(bnt)) =
  359. (&self.reference, &self.alternative)
  360. {
  361. if bnt.len() < nt.len() {
  362. return Some(nt.iter().skip(1).map(|e| e.to_string()).collect());
  363. }
  364. }
  365. None
  366. }
  367. pub fn deletion_desc(&self) -> Option<DeletionDesc> {
  368. match self.bnd_desc() {
  369. Ok(bnd_desc) if bnd_desc.a_contig == bnd_desc.b_contig => {
  370. if bnd_desc.a_position < bnd_desc.b_position {
  371. return Some(DeletionDesc {
  372. contig: bnd_desc.a_contig,
  373. start: bnd_desc.a_position,
  374. end: bnd_desc.b_position,
  375. });
  376. } else {
  377. return Some(DeletionDesc {
  378. contig: bnd_desc.a_contig,
  379. start: bnd_desc.b_position,
  380. end: bnd_desc.a_position,
  381. });
  382. }
  383. }
  384. _ => (),
  385. }
  386. self.deletion_len().map(|len| DeletionDesc {
  387. contig: self.position.contig(),
  388. start: self.position.position + 2,
  389. end: (self.position.position + 1)
  390. .checked_add(len)
  391. .unwrap_or(self.position.position + 2),
  392. })
  393. }
  394. }
  395. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  396. pub struct BNDDesc {
  397. /// Coordinates of the **5′ end** (A)
  398. pub a_contig: String,
  399. pub a_position: u32,
  400. pub a_sens: bool,
  401. /// Coordinates of the **3′ end** (B)
  402. pub b_contig: String,
  403. pub b_position: u32,
  404. pub b_sens: bool,
  405. /// Inserted nucleotides at the junction (may be empty)
  406. pub added_nt: String,
  407. }
  408. pub trait GroupByThreshold {
  409. fn group_by(&self, threshold: u32) -> Vec<Vec<BNDDesc>>;
  410. }
  411. impl GroupByThreshold for Vec<BNDDesc> {
  412. fn group_by(&self, threshold: u32) -> Vec<Vec<BNDDesc>> {
  413. let mut sorted_vec = self.clone();
  414. sorted_vec.sort_by(|a, b| {
  415. (&a.a_contig, a.a_sens, a.a_position).cmp(&(&b.a_contig, b.a_sens, b.a_position))
  416. });
  417. let mut grouped: Vec<Vec<BNDDesc>> = Vec::new();
  418. for desc in sorted_vec {
  419. if let Some(last_group) = grouped.last_mut() {
  420. if let Some(last_desc) = last_group.last() {
  421. if last_desc.a_contig == desc.a_contig
  422. && last_desc.a_sens == desc.a_sens
  423. && (desc.a_position as i64 - last_desc.a_position as i64).abs()
  424. <= threshold as i64
  425. {
  426. last_group.push(desc);
  427. continue;
  428. }
  429. }
  430. }
  431. grouped.push(vec![desc]);
  432. }
  433. grouped
  434. }
  435. }
  436. impl BNDDesc {
  437. /// Construct a `BNDDesc` from VCF `CHROM`, `POS`, and break‑end `ALT` string.
  438. /// Supports the four VCF break‑end ALT forms:
  439. ///
  440. /// 1) `t[p[` → A(5′)=t flank, B(3′)=p flank (+ + )
  441. /// 2) `t]p]` → A(5′)=t flank, B(3′)=p flank but on reverse (+ -)
  442. /// 3) `]p]t` → A(5′)=p flank, B(3′)=t flank (- -)
  443. /// 4) `[p[t` → A(5′)=p flank, B(3′)=t flank with both on reverse (- +)
  444. ///
  445. /// Here `t` is the local flank sequence, `p` is the remote contig:pos.
  446. pub fn from_vcf_breakend(t_chrom: &str, t_pos: u32, alt: &str) -> Option<Self> {
  447. // locate first bracket
  448. let (open, bracket) = alt.char_indices().find(|&(_, c)| c == '[' || c == ']')?;
  449. // find matching bracket
  450. let close = alt[open + 1..].find(bracket)? + open + 1;
  451. // parse remote contig:pos between brackets
  452. let addr = &alt[open + 1..close];
  453. let mut parts = addr.splitn(2, ':');
  454. let p_contig = parts.next()?;
  455. let p_position: u32 = parts.next()?.parse().ok()?;
  456. // inserted sequence
  457. let seq_before = &alt[..open];
  458. let seq_after = &alt[close + 1..];
  459. let mut added_nt = String::new();
  460. added_nt.push_str(seq_before);
  461. added_nt.push_str(seq_after);
  462. added_nt.remove(0);
  463. // determine form and orientation
  464. // form 1 & 2: bracket after t (open>0): t before p
  465. // form 3 & 4: bracket before t (open==0): p before t
  466. let after_local = open > 0;
  467. let (t_sens, p_sens) = match (bracket, after_local) {
  468. ('[', true) => (true, true), // form 1 t[p[
  469. (']', true) => (true, false), // form 2 t]p]
  470. (']', false) => (true, true), // form 3 ]p]t
  471. ('[', false) => (true, false), // form 4 [p[t
  472. _ => return None,
  473. };
  474. // assign A/B depending on form
  475. let (a_contig, a_position, a_sens, b_contig, b_position, b_sens, added_nt) = if after_local
  476. {
  477. (
  478. t_chrom.into(),
  479. t_pos,
  480. t_sens,
  481. p_contig.into(),
  482. p_position,
  483. p_sens,
  484. added_nt,
  485. )
  486. } else {
  487. (
  488. p_contig.into(),
  489. p_position,
  490. p_sens,
  491. t_chrom.into(),
  492. t_pos,
  493. t_sens,
  494. reverse_complement(&added_nt),
  495. )
  496. };
  497. Some(Self {
  498. a_contig,
  499. a_position,
  500. a_sens,
  501. b_contig,
  502. b_position,
  503. b_sens,
  504. added_nt,
  505. })
  506. }
  507. pub fn rc(&self) -> Self {
  508. Self {
  509. a_contig: self.b_contig.clone(),
  510. a_position: self.b_position,
  511. a_sens: !self.b_sens,
  512. b_contig: self.a_contig.clone(),
  513. b_position: self.a_position,
  514. b_sens: !self.a_sens,
  515. // TODO change seq to RC nd should find base on fa
  516. added_nt: reverse_complement(&self.added_nt),
  517. }
  518. }
  519. }
  520. pub fn reverse_complement(dna: &str) -> String {
  521. fn complement(base: char) -> char {
  522. match base {
  523. 'A' => 'T',
  524. 'T' => 'A',
  525. 'C' => 'G',
  526. 'G' => 'C',
  527. 'a' => 't',
  528. 't' => 'a',
  529. 'c' => 'g',
  530. 'g' => 'c',
  531. 'N' => 'N',
  532. 'n' => 'n',
  533. _ => 'N',
  534. }
  535. }
  536. dna.chars().rev().map(complement).collect()
  537. }
  538. impl core::fmt::Display for BNDDesc {
  539. /// Compact textual form: `(A:contig:pos<arrow>;B:contig:pos<arrow>;ins=...)`.
  540. fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
  541. fn arrow(b: bool) -> &'static str {
  542. if b {
  543. "->"
  544. } else {
  545. "<-"
  546. }
  547. }
  548. write!(
  549. f,
  550. "({}{}:{}::{}::{}:{}{})",
  551. arrow(self.a_sens),
  552. self.a_contig,
  553. self.a_position,
  554. self.added_nt,
  555. self.b_contig,
  556. self.b_position,
  557. arrow(self.b_sens),
  558. )
  559. }
  560. }
  561. use petgraph::graph::{DiGraph, NodeIndex};
  562. use petgraph::visit::{IntoNodeIdentifiers, NodeIndexable};
  563. use petgraph::Direction;
  564. /// Wrapper around `petgraph::DiGraph` with `BNDDesc` nodes.
  565. pub struct BNDGraph<E = ()> {
  566. graph: DiGraph<BNDDesc, E>,
  567. }
  568. impl<E> Default for BNDGraph<E> {
  569. fn default() -> Self {
  570. Self {
  571. graph: DiGraph::new(),
  572. }
  573. }
  574. }
  575. impl<E> BNDGraph<E> {
  576. pub fn new() -> Self {
  577. Self::default()
  578. }
  579. pub fn add_breakpoint(&mut self, desc: BNDDesc) -> NodeIndex {
  580. self.graph.add_node(desc)
  581. }
  582. pub fn successors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
  583. self.graph.neighbors_directed(n, Direction::Outgoing)
  584. }
  585. pub fn predecessors(&self, n: NodeIndex) -> impl Iterator<Item = NodeIndex> + '_ {
  586. self.graph.neighbors_directed(n, Direction::Incoming)
  587. }
  588. pub fn inner(&self) -> &DiGraph<BNDDesc, E> {
  589. &self.graph
  590. }
  591. /// Stringify all nodes
  592. pub fn nodes_as_strings(&self) -> Vec<String> {
  593. self.graph.node_weights().map(|n| n.to_string()).collect()
  594. }
  595. }
  596. #[allow(clippy::collapsible_if)]
  597. impl<E: Default> BNDGraph<E> {
  598. /// Build edges following downstream 5′→3′ logic for two patterns:
  599. ///
  600. /// **Pattern 1** (B → A, *u precedes v*):
  601. ///
  602. /// * forward (→)  `u.b ≤ v.a`   → edge **u → v**
  603. /// * reverse (←)  `u.b ≥ v.a`   → edge **v → u**
  604. ///
  605. /// **Pattern 2** (A → B, *v precedes u*):
  606. ///
  607. /// * forward (→)  `u.a ≥ v.b`   → edge **v → u**
  608. /// * reverse (←)  `u.a ≤ v.b`   → edge **u → v**
  609. pub fn auto_connect(&mut self)
  610. where
  611. E: Default,
  612. {
  613. let nodes: Vec<NodeIndex> = self.graph.node_indices().collect();
  614. let mut pending: Vec<(NodeIndex, NodeIndex)> = Vec::new();
  615. for &u_idx in &nodes {
  616. for &v_idx in &nodes {
  617. if u_idx == v_idx {
  618. continue;
  619. }
  620. let u = &self.graph[u_idx];
  621. let v = &self.graph[v_idx];
  622. // Pattern 1: B(u) -> A(v) (u precedes v)
  623. if u.b_contig == v.a_contig && u.b_sens == v.a_sens {
  624. if (u.b_sens && u.b_position <= v.a_position)
  625. || (!u.b_sens && u.b_position >= v.a_position)
  626. {
  627. pending.push((u_idx, v_idx));
  628. }
  629. }
  630. // Pattern 2: A(u) -> B(v) (v precedes u)
  631. if u.a_contig == v.b_contig && u.a_sens == v.b_sens {
  632. if (u.a_sens && u.a_position >= v.b_position)
  633. || (!u.a_sens && u.a_position <= v.b_position)
  634. {
  635. // Edge direction depends on strand logic
  636. let (src, dst) = if u.a_sens {
  637. // forward ⇒ edge v -> u
  638. (v_idx, u_idx)
  639. } else {
  640. // reverse ⇒ edge u -> v
  641. (u_idx, v_idx)
  642. };
  643. pending.push((src, dst));
  644. }
  645. }
  646. }
  647. }
  648. // second pass – insert, skipping duplicates
  649. for (src, dst) in pending {
  650. if self.graph.find_edge(src, dst).is_none() {
  651. self.graph.add_edge(src, dst, E::default());
  652. }
  653. }
  654. }
  655. /// Add an edge if absent.
  656. pub fn add_edge_if_absent(&mut self, src: NodeIndex, dst: NodeIndex) {
  657. if self.graph.find_edge(src, dst).is_none() {
  658. self.graph.add_edge(src, dst, E::default());
  659. }
  660. }
  661. /// Pretty‑print a sequence of nodes indicating **actual traversal
  662. /// direction** between successive nodes:
  663. /// * `→` when the graph contains an edge from *previous* → *current*.
  664. /// * `←` when the edge exists from *current* → *previous* (path goes
  665. /// "against" the stored direction).
  666. ///
  667. /// If neither directional edge is present the placeholder `--` is used so
  668. /// debugging clearly shows missing links.
  669. pub fn fmt_path(&self, path: &[NodeIndex]) -> String {
  670. use core::fmt::Write as _;
  671. let mut out = String::new();
  672. if let Some((&first, rest)) = path.split_first() {
  673. let _ = write!(out, "{}", &self.graph[first]);
  674. let mut prev = first;
  675. for &curr in rest {
  676. let arrow = if self.graph.find_edge(prev, curr).is_some() {
  677. " → "
  678. } else if self.graph.find_edge(curr, prev).is_some() {
  679. " ← "
  680. } else {
  681. " -- " // unexpected: no direct edge
  682. };
  683. out.push_str(arrow);
  684. let _ = write!(out, "{}", &self.graph[curr]);
  685. prev = curr;
  686. }
  687. }
  688. out
  689. }
  690. // ------------------------------------------------------------------
  691. // Traversal utilities
  692. // ------------------------------------------------------------------
  693. /// Return a directed path that visits **every node exactly once** (Hamiltonian
  694. /// path) if one exists. Uses naive DFS back-tracking which is exponential—
  695. /// fine for small graphs (<15 nodes) but aborts early otherwise.
  696. pub fn hamiltonian_path(&self) -> Option<Vec<NodeIndex>> {
  697. let n = self.graph.node_count();
  698. if n == 0 {
  699. return Some(Vec::new());
  700. }
  701. if n > 15 {
  702. // avoid combinatorial explosion
  703. return None;
  704. }
  705. let mut path = Vec::with_capacity(n);
  706. let mut visited = vec![false; self.graph.node_bound()];
  707. for start in self.graph.node_identifiers() {
  708. path.push(start);
  709. visited[start.index()] = true;
  710. if self.dfs_hamiltonian(start, &mut visited, &mut path, n) {
  711. return Some(path);
  712. }
  713. visited[start.index()] = false;
  714. path.clear();
  715. }
  716. None
  717. }
  718. /**
  719. Recursive helper for [`hamiltonian_path`](#method.hamiltonian_path).
  720. * `current` – node we are currently expanding from.
  721. * `visited` – mutable bitmap indicating which nodes are already on `path`.
  722. * `path`    – growing list of nodes (last element is always `current`).
  723. * `target`  – desired final length (= `graph.node_count()`).
  724. The function explores each outgoing neighbour that hasn’t been visited yet,
  725. marking it **visited → recurse → unvisit** (classic DFS back-tracking). It
  726. returns `true` as soon as a full-length path is discovered, which bubbles
  727. up through the recursion to terminate the search early.
  728. */
  729. fn dfs_hamiltonian(
  730. &self,
  731. current: NodeIndex,
  732. visited: &mut [bool],
  733. path: &mut Vec<NodeIndex>,
  734. target: usize,
  735. ) -> bool {
  736. // Base‑case: reached required length → success.
  737. if path.len() == target {
  738. return true;
  739. }
  740. // Explore every outgoing neighbour.
  741. for next in self.graph.neighbors(current) {
  742. if !visited[next.index()] {
  743. // choose
  744. visited[next.index()] = true;
  745. path.push(next);
  746. // explore
  747. if self.dfs_hamiltonian(next, visited, path, target) {
  748. return true;
  749. }
  750. // un-choose (back-track)
  751. path.pop();
  752. visited[next.index()] = false;
  753. }
  754. }
  755. false
  756. }
  757. /// Weakly-connected components sorted descending by size (no `Clone` bound).
  758. pub fn components_by_size(&self) -> Vec<Vec<NodeIndex>> {
  759. let mut visited = vec![false; self.graph.node_bound()];
  760. let mut comps: Vec<Vec<NodeIndex>> = Vec::new();
  761. for start in self.graph.node_indices() {
  762. if visited[start.index()] {
  763. continue;
  764. }
  765. // DFS over undirected neighbours
  766. let mut stack = vec![start];
  767. let mut comp = Vec::new();
  768. visited[start.index()] = true;
  769. while let Some(n) = stack.pop() {
  770. comp.push(n);
  771. for neigh in self.graph.neighbors_undirected(n) {
  772. if !visited[neigh.index()] {
  773. visited[neigh.index()] = true;
  774. stack.push(neigh);
  775. }
  776. }
  777. }
  778. comps.push(comp);
  779. }
  780. comps.sort_by_key(|c| std::cmp::Reverse(c.len()));
  781. comps
  782. }
  783. /// Convenience wrapper: try to return a Hamiltonian path; if impossible,
  784. /// return all weakly‑connected subgraphs ordered by size.
  785. pub fn path_or_components(&self) -> Result<Vec<NodeIndex>, Vec<Vec<NodeIndex>>> {
  786. if let Some(p) = self.hamiltonian_path() {
  787. Ok(p)
  788. } else {
  789. Err(self.components_by_size())
  790. }
  791. }
  792. }
  793. pub trait ToBNDGraph<E = ()> {
  794. /// Consume the vector and return a `BNDGraph` whose edges are created with
  795. /// `auto_connect()`.
  796. fn to_bnd_graph(self) -> BNDGraph<E>
  797. where
  798. E: Default;
  799. }
  800. impl<E: Default> ToBNDGraph<E> for Vec<BNDDesc> {
  801. fn to_bnd_graph(self) -> BNDGraph<E> {
  802. let mut g = BNDGraph::<E>::new();
  803. for b in self {
  804. g.add_breakpoint(b);
  805. }
  806. g.auto_connect();
  807. g
  808. }
  809. }
  810. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  811. pub struct DeletionDesc {
  812. pub contig: String,
  813. pub start: u32, // 1-based [)
  814. pub end: u32,
  815. }
  816. impl fmt::Display for DeletionDesc {
  817. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  818. write!(f, "{}:{}_{}_del", self.contig, self.start, self.end)
  819. }
  820. }
  821. impl DeletionDesc {
  822. pub fn len(&self) -> u32 {
  823. self.end.saturating_sub(self.start) + 1
  824. }
  825. pub fn is_empty(&self) -> bool {
  826. self.len() == 0
  827. }
  828. }
  829. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  830. pub enum AlterationCategory {
  831. SNV,
  832. DEL,
  833. INS,
  834. DUP,
  835. INV,
  836. CNV,
  837. TRL,
  838. BND,
  839. DELINV,
  840. Other,
  841. }
  842. impl fmt::Display for AlterationCategory {
  843. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  844. write!(
  845. f,
  846. "{}",
  847. match self {
  848. AlterationCategory::SNV => "SNV",
  849. AlterationCategory::DEL => "DEL",
  850. AlterationCategory::INS => "INS",
  851. AlterationCategory::DUP => "DUP",
  852. AlterationCategory::INV => "INV",
  853. AlterationCategory::CNV => "CNV",
  854. AlterationCategory::BND => "BND",
  855. AlterationCategory::TRL => "TRL",
  856. AlterationCategory::Other => "Other",
  857. AlterationCategory::DELINV => "DELINV",
  858. }
  859. )
  860. }
  861. }
  862. impl From<SVType> for AlterationCategory {
  863. fn from(sv_type: SVType) -> Self {
  864. match sv_type {
  865. SVType::DEL => AlterationCategory::DEL,
  866. SVType::INS => AlterationCategory::INS,
  867. SVType::DUP => AlterationCategory::DUP,
  868. SVType::INV => AlterationCategory::INV,
  869. SVType::CNV => AlterationCategory::CNV,
  870. SVType::BND => AlterationCategory::BND,
  871. }
  872. }
  873. }
  874. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  875. pub enum SVType {
  876. DEL,
  877. INS,
  878. DUP,
  879. INV,
  880. CNV,
  881. BND,
  882. }
  883. impl FromStr for SVType {
  884. type Err = anyhow::Error;
  885. fn from_str(s: &str) -> anyhow::Result<Self> {
  886. match s {
  887. "DEL" => Ok(SVType::DEL),
  888. "INS" => Ok(SVType::INS),
  889. "DUP" => Ok(SVType::DUP),
  890. "INV" => Ok(SVType::INV),
  891. "CNV" => Ok(SVType::CNV),
  892. "BND" => Ok(SVType::BND),
  893. _ => Err(anyhow!("Can't parse SVTYPE={s}")),
  894. }
  895. }
  896. }
  897. impl fmt::Display for SVType {
  898. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  899. write!(
  900. f,
  901. "{}",
  902. match self {
  903. SVType::DEL => "DEL",
  904. SVType::INS => "INS",
  905. SVType::DUP => "DUP",
  906. SVType::INV => "INV",
  907. SVType::CNV => "CNV",
  908. SVType::BND => "BND",
  909. }
  910. )
  911. }
  912. }
  913. impl VariantId for VcfVariant {
  914. fn variant_id(&self) -> String {
  915. format!(
  916. "{}:{}_{}_{}",
  917. self.position.contig(),
  918. self.position.position + 1,
  919. self.reference,
  920. self.alternative
  921. )
  922. }
  923. }
  924. impl GetGenomePosition for VcfVariant {
  925. fn position(&self) -> &GenomePosition {
  926. &self.position
  927. }
  928. }
  929. impl PartialOrd for VcfVariant {
  930. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  931. Some(self.cmp(other))
  932. }
  933. }
  934. impl Ord for VcfVariant {
  935. fn cmp(&self, other: &Self) -> Ordering {
  936. self.position.cmp(&other.position)
  937. }
  938. }
  939. /// A container for a list of VCF `INFO` fields.
  940. ///
  941. /// Represents a parsed set of key-value annotations or flags found in the INFO column.
  942. ///
  943. /// # Example
  944. /// ```
  945. /// use your_crate::Infos;
  946. /// use std::str::FromStr;
  947. ///
  948. /// let infos = Infos::from_str("SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15").unwrap();
  949. /// println!("{}", infos); // Displays: SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15
  950. /// ```
  951. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  952. pub struct Infos(pub Vec<Info>);
  953. impl FromStr for Infos {
  954. type Err = anyhow::Error;
  955. /// Parses a semicolon-separated list of INFO fields from a VCF record.
  956. fn from_str(s: &str) -> anyhow::Result<Self> {
  957. Ok(Self(
  958. s.split(";")
  959. .map(Info::from_str)
  960. .collect::<Result<Vec<Info>, _>>()
  961. .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
  962. ))
  963. }
  964. }
  965. impl fmt::Display for Infos {
  966. /// Formats the `Infos` as a semicolon-separated VCF-style INFO string.
  967. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  968. let items: Vec<_> = self
  969. .0
  970. .iter()
  971. .filter(|info| !matches!(info, Info::Empty))
  972. .map(ToString::to_string)
  973. .collect();
  974. if items.is_empty() {
  975. write!(f, ".")
  976. } else {
  977. write!(f, "{}", items.join(";"))
  978. }
  979. }
  980. }
  981. impl Infos {
  982. pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
  983. use Info::*;
  984. let mut tumor_alt: Option<u32> = None;
  985. let mut tumor_depth: Option<u32> = None;
  986. for info in self.0.iter() {
  987. match info {
  988. TUMOUR_DP_AT(v) => {
  989. let m = mean(v);
  990. tumor_depth = Some(m.round() as u32);
  991. }
  992. TUMOUR_READ_SUPPORT(v) => {
  993. tumor_alt = Some(*v);
  994. }
  995. _ => (),
  996. }
  997. }
  998. match (tumor_alt, tumor_depth) {
  999. (Some(a), Some(d)) => Some((a, d)),
  1000. _ => None,
  1001. }
  1002. }
  1003. }
  1004. /// Enum representing a single INFO field in a VCF record.
  1005. ///
  1006. /// Supports both standard fields and Severus-specific structural variant annotations.
  1007. /// Handles string values, numeric values, vectors, and flags.
  1008. ///
  1009. /// Variants with `Vec<_>` represent fields with multiple comma-separated values.
  1010. #[allow(non_camel_case_types)]
  1011. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  1012. pub enum Info {
  1013. Empty,
  1014. H,
  1015. F,
  1016. P,
  1017. FAU(u32),
  1018. FCU(u32),
  1019. FGU(u32),
  1020. FTU(u32),
  1021. RAU(u32),
  1022. RCU(u32),
  1023. RGU(u32),
  1024. RTU(u32),
  1025. SVTYPE(SVType),
  1026. MATEID(String),
  1027. NORMAL_READ_SUPPORT(u32),
  1028. TUMOUR_READ_SUPPORT(u32),
  1029. NORMAL_ALN_SUPPORT(u32),
  1030. TUMOUR_ALN_SUPPORT(u32),
  1031. SVLEN(i32),
  1032. TUMOUR_DP_BEFORE(Vec<u32>),
  1033. TUMOUR_DP_AT(Vec<u32>),
  1034. TUMOUR_DP_AFTER(Vec<u32>),
  1035. NORMAL_DP_BEFORE(Vec<u32>),
  1036. NORMAL_DP_AT(Vec<u32>),
  1037. NORMAL_DP_AFTER(Vec<u32>),
  1038. TUMOUR_AF(Vec<f32>),
  1039. NORMAL_AF(Vec<f32>),
  1040. BP_NOTATION(String),
  1041. SOURCE(String),
  1042. CLUSTERED_READS_TUMOUR(u32),
  1043. CLUSTERED_READS_NORMAL(u32),
  1044. TUMOUR_ALT_HP(Vec<u32>),
  1045. TUMOUR_PS(Vec<String>),
  1046. NORMAL_ALT_HP(Vec<u32>),
  1047. NORMAL_PS(Vec<String>),
  1048. TUMOUR_TOTAL_HP_AT(Vec<u32>),
  1049. NORMAL_TOTAL_HP_AT(Vec<u32>),
  1050. ORIGIN_STARTS_STD_DEV(f32),
  1051. ORIGIN_MAPQ_MEAN(f32),
  1052. ORIGIN_EVENT_SIZE_STD_DEV(f32),
  1053. ORIGIN_EVENT_SIZE_MEDIAN(f32),
  1054. ORIGIN_EVENT_SIZE_MEAN(f32),
  1055. END_STARTS_STD_DEV(f32),
  1056. END_MAPQ_MEAN(f32),
  1057. END_EVENT_SIZE_STD_DEV(f32),
  1058. END_EVENT_SIZE_MEDIAN(f32),
  1059. END_EVENT_SIZE_MEAN(f32),
  1060. CLASS(String),
  1061. END(u32),
  1062. SVINSLEN(u32),
  1063. SVINSSEQ(String),
  1064. // Severus
  1065. PRECISE,
  1066. IMPRECISE,
  1067. STRANDS(String),
  1068. DETAILED_TYPE(String),
  1069. INSLEN(i32),
  1070. MAPQ(u32),
  1071. PHASESETID(String),
  1072. HP(u32),
  1073. CLUSTERID(String),
  1074. INSSEQ(String),
  1075. MATE_ID(String),
  1076. INSIDE_VNTR(String),
  1077. ALINGED_POS(String),
  1078. }
  1079. impl FromStr for Info {
  1080. type Err = anyhow::Error;
  1081. /// Parses a single `INFO` key or key=value string into a typed `Info` variant.
  1082. ///
  1083. /// Handles both presence/absence flags and key-value fields
  1084. fn from_str(s: &str) -> anyhow::Result<Self> {
  1085. if s.contains('=') {
  1086. let (key, value) = s
  1087. .split_once('=')
  1088. .context(format!("Can't split with `=` in string: {s}"))?;
  1089. Ok(match key {
  1090. "FAU" => Info::FAU(parse_value(value, key)?),
  1091. "FCU" => Info::FCU(parse_value(value, key)?),
  1092. "FGU" => Info::FGU(parse_value(value, key)?),
  1093. "FTU" => Info::FTU(parse_value(value, key)?),
  1094. "RAU" => Info::RAU(parse_value(value, key)?),
  1095. "RCU" => Info::RCU(parse_value(value, key)?),
  1096. "RGU" => Info::RGU(parse_value(value, key)?),
  1097. "RTU" => Info::RTU(parse_value(value, key)?),
  1098. "SVLEN" => Info::SVLEN(parse_value(value, key)?),
  1099. "END" => Info::END(parse_value(value, key)?),
  1100. "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
  1101. "SVTYPE" => Info::SVTYPE(value.parse()?),
  1102. "MATEID" => Info::MATEID(value.to_string()),
  1103. "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
  1104. "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
  1105. "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
  1106. "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
  1107. "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
  1108. "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
  1109. "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
  1110. "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
  1111. "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
  1112. "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
  1113. "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
  1114. "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
  1115. "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
  1116. "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
  1117. "SOURCE" => Info::SOURCE(value.to_string()),
  1118. "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
  1119. "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
  1120. "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
  1121. "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
  1122. "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
  1123. "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
  1124. "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
  1125. "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
  1126. "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
  1127. "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
  1128. "ORIGIN_EVENT_SIZE_STD_DEV" => {
  1129. Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
  1130. }
  1131. "ORIGIN_EVENT_SIZE_MEDIAN" => {
  1132. Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
  1133. }
  1134. "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
  1135. "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
  1136. "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
  1137. "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
  1138. "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
  1139. "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
  1140. "CLASS" => Info::CLASS(value.to_string()),
  1141. "PRECISE" => Info::PRECISE,
  1142. "IMPRECISE" => Info::IMPRECISE,
  1143. "STRANDS" => Info::STRANDS(value.to_string()),
  1144. "DETAILED_TYPE" => Info::DETAILED_TYPE(value.to_string()),
  1145. "INSLEN" => Info::INSLEN(parse_value(value, key)?),
  1146. "MAPQ" => Info::MAPQ(parse_value(value, key)?),
  1147. "PHASESETID" => Info::PHASESETID(value.to_string()),
  1148. "HP" => Info::HP(parse_value(value, key)?),
  1149. "CLUSTERID" => Info::CLUSTERID(value.to_string()),
  1150. "INSSEQ" => Info::INSSEQ(value.to_string()),
  1151. "MATE_ID" => Info::MATE_ID(value.to_string()),
  1152. "INSIDE_VNTR" => Info::INSIDE_VNTR(value.to_string()),
  1153. "ALINGED_POS" => Info::ALINGED_POS(value.to_string()),
  1154. _ => Info::Empty,
  1155. })
  1156. } else {
  1157. Ok(match s {
  1158. "H" => Info::H,
  1159. "F" => Info::F,
  1160. "P" => Info::P,
  1161. "PRECISE" => Info::PRECISE,
  1162. "IMPRECISE" => Info::IMPRECISE,
  1163. _ => Info::Empty,
  1164. })
  1165. }
  1166. }
  1167. }
  1168. impl fmt::Display for Info {
  1169. /// Converts the `Info` enum into a VCF-compliant string (key=value or flag).
  1170. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  1171. match self {
  1172. Info::Empty => write!(f, "."),
  1173. Info::H => write!(f, "H"),
  1174. Info::F => write!(f, "F"),
  1175. Info::P => write!(f, "P"),
  1176. // ClairS
  1177. Info::FAU(v) => write!(f, "FAU={v}"),
  1178. Info::FCU(v) => write!(f, "FCU={v}"),
  1179. Info::FGU(v) => write!(f, "FGU={v}"),
  1180. Info::FTU(v) => write!(f, "FTU={v}"),
  1181. Info::RAU(v) => write!(f, "RAU={v}"),
  1182. Info::RCU(v) => write!(f, "RCU={v}"),
  1183. Info::RGU(v) => write!(f, "RGU={v}"),
  1184. Info::RTU(v) => write!(f, "RTU={v}"),
  1185. // Nanomonsv
  1186. Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
  1187. Info::SVLEN(v) => write!(f, "SVLEN={v}"),
  1188. Info::END(v) => write!(f, "END={v}"),
  1189. Info::MATEID(v) => write!(f, "MATEID={v}"),
  1190. Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
  1191. Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
  1192. // SAVANA
  1193. Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
  1194. Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
  1195. Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
  1196. Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
  1197. Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
  1198. Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
  1199. Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
  1200. Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
  1201. Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
  1202. Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
  1203. Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
  1204. Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
  1205. Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
  1206. Info::SOURCE(v) => write!(f, "SOURCE={v}"),
  1207. Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
  1208. Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
  1209. Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
  1210. Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
  1211. Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
  1212. Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
  1213. Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
  1214. Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
  1215. Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
  1216. Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
  1217. Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
  1218. Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
  1219. Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
  1220. Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
  1221. Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
  1222. Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
  1223. Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
  1224. Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
  1225. Info::CLASS(v) => write!(f, "CLASS={v}"),
  1226. // Severus
  1227. Info::PRECISE => write!(f, "PRECISE"),
  1228. Info::IMPRECISE => write!(f, "IMPRECISE"),
  1229. Info::STRANDS(v) => write!(f, "STRANDS={v}"),
  1230. Info::DETAILED_TYPE(v) => write!(f, "DETAILED_TYPE={v}"),
  1231. Info::INSLEN(v) => write!(f, "INSLEN={v}"),
  1232. Info::MAPQ(v) => write!(f, "MAPQ={v}"),
  1233. Info::PHASESETID(v) => write!(f, "PHASESETID={v}"),
  1234. Info::HP(v) => write!(f, "HP={v}"),
  1235. Info::CLUSTERID(v) => write!(f, "CLUSTERID={v}"),
  1236. Info::INSSEQ(v) => write!(f, "INSSEQ={v}"),
  1237. Info::MATE_ID(v) => write!(f, "MATE_ID={v}"),
  1238. Info::INSIDE_VNTR(v) => write!(f, "INSIDE_VNTR={v}"),
  1239. Info::ALINGED_POS(v) => write!(f, "ALINGED_POS={v}"),
  1240. }
  1241. }
  1242. }
  1243. pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
  1244. v.iter()
  1245. .map(|n| n.to_string())
  1246. .collect::<Vec<String>>()
  1247. .join(",")
  1248. }
  1249. impl Info {
  1250. /// Returns the VCF key name (e.g. "DP", "SVTYPE", "PRECISE") for this INFO field.
  1251. ///
  1252. /// This uses the string representation (`Display`) and extracts the part before `=`,
  1253. /// which is valid for both key=value and flag-only entries.
  1254. pub fn key(&self) -> String {
  1255. self.to_string()
  1256. .split('=')
  1257. .next()
  1258. .unwrap_or(".")
  1259. .to_string()
  1260. }
  1261. /// Returns the complete set of known VCF `INFO` header definitions used by `Info` variants.
  1262. ///
  1263. /// # Example
  1264. /// ```
  1265. /// let headers = Info::header_definitions();
  1266. /// for line in headers {
  1267. /// println!("{line}");
  1268. /// }
  1269. /// ```
  1270. pub fn header_definitions() -> BTreeSet<String> {
  1271. let mut set = BTreeSet::new();
  1272. macro_rules! push {
  1273. ($id:expr, $num:expr, $typ:expr, $desc:expr) => {
  1274. set.insert(format!(
  1275. r#"##INFO=<ID={},Number={},Type={},Description="{}">"#,
  1276. $id, $num, $typ, $desc
  1277. ));
  1278. };
  1279. }
  1280. // Flags
  1281. push!("H", 0, "Flag", "H flag");
  1282. push!("F", 0, "Flag", "F flag");
  1283. push!("P", 0, "Flag", "P flag");
  1284. // Allelic support
  1285. push!("FAU", 1, "Integer", "Forward A support in tumour");
  1286. push!("FCU", 1, "Integer", "Forward C support in tumour");
  1287. push!("FGU", 1, "Integer", "Forward G support in tumour");
  1288. push!("FTU", 1, "Integer", "Forward T support in tumour");
  1289. push!("RAU", 1, "Integer", "Reverse A support in tumour");
  1290. push!("RCU", 1, "Integer", "Reverse C support in tumour");
  1291. push!("RGU", 1, "Integer", "Reverse G support in tumour");
  1292. push!("RTU", 1, "Integer", "Reverse T support in tumour");
  1293. // Structural variant metadata
  1294. push!("SVTYPE", 1, "String", "Structural variant type");
  1295. push!("MATEID", 1, "String", "ID of the mate breakend");
  1296. push!("SVLEN", 1, "Integer", "Length of structural variant");
  1297. push!("SVINSLEN", 1, "Integer", "Length of inserted sequence");
  1298. push!("SVINSSEQ", 1, "String", "Inserted sequence");
  1299. // Positions and read support
  1300. push!("END", 1, "Integer", "End position of the variant");
  1301. push!(
  1302. "NORMAL_READ_SUPPORT",
  1303. 1,
  1304. "Integer",
  1305. "Supporting reads in normal sample"
  1306. );
  1307. push!(
  1308. "TUMOUR_READ_SUPPORT",
  1309. 1,
  1310. "Integer",
  1311. "Supporting reads in tumour sample"
  1312. );
  1313. push!(
  1314. "NORMAL_ALN_SUPPORT",
  1315. 1,
  1316. "Integer",
  1317. "Aligned reads in normal sample"
  1318. );
  1319. push!(
  1320. "TUMOUR_ALN_SUPPORT",
  1321. 1,
  1322. "Integer",
  1323. "Aligned reads in tumour sample"
  1324. );
  1325. // Depth profiles
  1326. push!(
  1327. "TUMOUR_DP_BEFORE",
  1328. ".",
  1329. "Integer",
  1330. "Depth before breakpoint in tumour"
  1331. );
  1332. push!(
  1333. "TUMOUR_DP_AT",
  1334. ".",
  1335. "Integer",
  1336. "Depth at breakpoint in tumour"
  1337. );
  1338. push!(
  1339. "TUMOUR_DP_AFTER",
  1340. ".",
  1341. "Integer",
  1342. "Depth after breakpoint in tumour"
  1343. );
  1344. push!(
  1345. "NORMAL_DP_BEFORE",
  1346. ".",
  1347. "Integer",
  1348. "Depth before breakpoint in normal"
  1349. );
  1350. push!(
  1351. "NORMAL_DP_AT",
  1352. ".",
  1353. "Integer",
  1354. "Depth at breakpoint in normal"
  1355. );
  1356. push!(
  1357. "NORMAL_DP_AFTER",
  1358. ".",
  1359. "Integer",
  1360. "Depth after breakpoint in normal"
  1361. );
  1362. // Allele frequencies
  1363. push!(
  1364. "TUMOUR_AF",
  1365. ".",
  1366. "Float",
  1367. "Variant allele frequencies in tumour"
  1368. );
  1369. push!(
  1370. "NORMAL_AF",
  1371. ".",
  1372. "Float",
  1373. "Variant allele frequencies in normal"
  1374. );
  1375. // Haplotype/phasing
  1376. push!(
  1377. "TUMOUR_ALT_HP",
  1378. ".",
  1379. "Integer",
  1380. "Alternate haplotype support in tumour"
  1381. );
  1382. push!("TUMOUR_PS", ".", "String", "Phasing set in tumour");
  1383. push!(
  1384. "NORMAL_ALT_HP",
  1385. ".",
  1386. "Integer",
  1387. "Alternate haplotype support in normal"
  1388. );
  1389. push!("NORMAL_PS", ".", "String", "Phasing set in normal");
  1390. push!(
  1391. "TUMOUR_TOTAL_HP_AT",
  1392. ".",
  1393. "Integer",
  1394. "Total haplotype depth at breakpoint in tumour"
  1395. );
  1396. push!(
  1397. "NORMAL_TOTAL_HP_AT",
  1398. ".",
  1399. "Integer",
  1400. "Total haplotype depth at breakpoint in normal"
  1401. );
  1402. // Cluster analysis
  1403. push!(
  1404. "CLUSTERED_READS_TUMOUR",
  1405. 1,
  1406. "Integer",
  1407. "Clustered reads in tumour"
  1408. );
  1409. push!(
  1410. "CLUSTERED_READS_NORMAL",
  1411. 1,
  1412. "Integer",
  1413. "Clustered reads in normal"
  1414. );
  1415. // Origin and end-point statistics
  1416. push!(
  1417. "ORIGIN_STARTS_STD_DEV",
  1418. 1,
  1419. "Float",
  1420. "STDDEV of read starts at origin"
  1421. );
  1422. push!("ORIGIN_MAPQ_MEAN", 1, "Float", "Mean MAPQ at origin");
  1423. push!(
  1424. "ORIGIN_EVENT_SIZE_STD_DEV",
  1425. 1,
  1426. "Float",
  1427. "STDDEV of event size at origin"
  1428. );
  1429. push!(
  1430. "ORIGIN_EVENT_SIZE_MEDIAN",
  1431. 1,
  1432. "Float",
  1433. "Median event size at origin"
  1434. );
  1435. push!(
  1436. "ORIGIN_EVENT_SIZE_MEAN",
  1437. 1,
  1438. "Float",
  1439. "Mean event size at origin"
  1440. );
  1441. push!(
  1442. "END_STARTS_STD_DEV",
  1443. 1,
  1444. "Float",
  1445. "STDDEV of read starts at end"
  1446. );
  1447. push!("END_MAPQ_MEAN", 1, "Float", "Mean MAPQ at end");
  1448. push!(
  1449. "END_EVENT_SIZE_STD_DEV",
  1450. 1,
  1451. "Float",
  1452. "STDDEV of event size at end"
  1453. );
  1454. push!(
  1455. "END_EVENT_SIZE_MEDIAN",
  1456. 1,
  1457. "Float",
  1458. "Median event size at end"
  1459. );
  1460. push!("END_EVENT_SIZE_MEAN", 1, "Float", "Mean event size at end");
  1461. // Additional
  1462. push!("BP_NOTATION", 1, "String", "Breakpoint notation");
  1463. push!("SOURCE", 1, "String", "Caller source name");
  1464. push!("CLASS", 1, "String", "Variant classification");
  1465. // Severus
  1466. push!(
  1467. "PRECISE",
  1468. 0,
  1469. "Flag",
  1470. "SV with precise breakpoints coordinates and length"
  1471. );
  1472. push!(
  1473. "IMPRECISE",
  1474. 0,
  1475. "Flag",
  1476. "SV with imprecise breakpoints coordinates and length"
  1477. );
  1478. push!("STRANDS", 1, "String", "Breakpoint strandedness");
  1479. push!("DETAILED_TYPE", 1, "String", "Detailed type of the SV");
  1480. push!(
  1481. "INSLEN",
  1482. 1,
  1483. "Integer",
  1484. "Length of the unmapped sequence between breakpoint"
  1485. );
  1486. push!(
  1487. "MAPQ",
  1488. 1,
  1489. "Integer",
  1490. "Median mapping quality of supporting reads"
  1491. );
  1492. push!(
  1493. "PHASESETID",
  1494. 1,
  1495. "String",
  1496. "Matching phaseset ID for phased SVs"
  1497. );
  1498. push!("HP", 1, "Integer", "Matching haplotype ID for phased SVs");
  1499. push!("CLUSTERID", 1, "String", "Cluster ID in breakpoint_graph");
  1500. push!(
  1501. "INSSEQ",
  1502. 1,
  1503. "String",
  1504. "Insertion sequence between breakpoints"
  1505. );
  1506. push!("MATE_ID", 1, "String", "MATE ID for breakends");
  1507. push!(
  1508. "INSIDE_VNTR",
  1509. 1,
  1510. "String",
  1511. "True if an indel is inside a VNTR"
  1512. );
  1513. push!("ALINGED_POS", 1, "String", "Position in the reference");
  1514. set
  1515. }
  1516. }
  1517. /// Format
  1518. /// Enum representing individual FORMAT fields from a VCF record.
  1519. ///
  1520. /// This enum supports common fields used by DeepVariant, Clairs, and nanomonsv,
  1521. /// as well as a generic fallback for other key-value pairs.
  1522. ///
  1523. /// # Examples
  1524. ///
  1525. /// ```
  1526. /// use your_crate::Format;
  1527. ///
  1528. /// let gt = Format::GT("0/1".to_string());
  1529. /// let dp = Format::DP(30);
  1530. /// let ad = Format::AD(vec![10, 20]);
  1531. /// ```
  1532. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  1533. pub enum Format {
  1534. // --- DeepVariant fields ---
  1535. /// Genotype string, e.g., "0/1", "1/1".
  1536. GT(String),
  1537. /// Genotype quality.
  1538. GQ(u32),
  1539. /// Read depth (total coverage at the variant position).
  1540. DP(u32),
  1541. /// Allelic depths for the ref and alt alleles (e.g., [ref, alt1, alt2...]).
  1542. AD(Vec<u32>),
  1543. /// Variant allele frequency (e.g., 0.25 for 25%).
  1544. VAF(f32),
  1545. /// Phred-scaled genotype likelihoods.
  1546. PL(Vec<u32>),
  1547. // --- Clairs fields (prefixed with N: for normal sample, or tumor in case of paired) ---
  1548. /// Normal sample total depth.
  1549. NDP(u32),
  1550. /// Normal sample allelic depths (e.g., [ref, alt1, alt2...]).
  1551. NAD(Vec<u32>),
  1552. /// Allele-specific counts for A, C, G, T bases in tumor sample.
  1553. AU(u32),
  1554. CU(u32),
  1555. GU(u32),
  1556. TU(u32),
  1557. /// Allele-specific counts for A, C, G, T bases in normal sample.
  1558. NAU(u32),
  1559. NCU(u32),
  1560. NGU(u32),
  1561. NTU(u32),
  1562. // --- nanomonsv fields ---
  1563. /// Total number of supporting reads in tumor.
  1564. TR(u32),
  1565. /// Variant-supporting reads in tumor.
  1566. VR(u32),
  1567. // --- Severus fields ---
  1568. DR(u32),
  1569. DV(u32),
  1570. HVAF(Vec<f32>),
  1571. /// Fallback for any other key-value pair not explicitly modeled.
  1572. /// Contains the raw key and value as strings.
  1573. Other((String, String)),
  1574. }
  1575. /// Container for a list of `Format` items.
  1576. /// Represents the full FORMAT field and sample value for one sample.
  1577. ///
  1578. /// # Examples
  1579. ///
  1580. /// ```
  1581. /// use your_crate::{Formats, Format};
  1582. ///
  1583. /// let formats = Formats(vec![
  1584. /// Format::GT("0/1".to_string()),
  1585. /// Format::DP(45),
  1586. /// Format::AD(vec![15, 30]),
  1587. /// ]);
  1588. /// ```
  1589. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  1590. pub struct Formats(pub Vec<Format>);
  1591. impl Formats {
  1592. /// Returns the tumor alternative read depth and total depth if both are available.
  1593. ///
  1594. /// This method looks for:
  1595. /// - `Format::AD`: to compute the sum of alternative allele depths (excluding reference)
  1596. /// - `Format::DP`: to get total read depth
  1597. ///
  1598. /// Returns `Some((alt_depth, total_depth))` if both are present, else `None`.
  1599. ///
  1600. /// # Example
  1601. /// ```
  1602. /// use your_crate::{Formats, Format};
  1603. ///
  1604. /// let f = Formats(vec![
  1605. /// Format::AD(vec![10, 20, 5]),
  1606. /// Format::DP(40),
  1607. /// ]);
  1608. ///
  1609. /// assert_eq!(f.n_alt_depth(), Some((25, 40)));
  1610. /// ```
  1611. pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
  1612. let mut tumor_alt_depth: Option<u32> = None;
  1613. let mut tumor_ref_depth: Option<u32> = None;
  1614. let mut tumor_total_depth: Option<u32> = None;
  1615. for format in &self.0 {
  1616. match format {
  1617. Format::AD(values) => {
  1618. if values.len() > 1 {
  1619. tumor_alt_depth = Some(values[1..].iter().sum());
  1620. }
  1621. }
  1622. Format::DP(value) => {
  1623. tumor_total_depth = Some(*value);
  1624. }
  1625. Format::VR(values) => {
  1626. tumor_alt_depth = Some(*values);
  1627. }
  1628. Format::TR(value) => {
  1629. tumor_ref_depth = Some(*value);
  1630. }
  1631. Format::DV(values) => {
  1632. tumor_alt_depth = Some(*values);
  1633. }
  1634. Format::DR(value) => {
  1635. tumor_ref_depth = Some(*value);
  1636. }
  1637. _ => {}
  1638. }
  1639. }
  1640. match (tumor_alt_depth, tumor_total_depth, tumor_ref_depth) {
  1641. (Some(alt), Some(total), _) => Some((alt, total)),
  1642. (Some(alt), _, Some(refe)) => Some((alt, alt + refe)),
  1643. _ => None,
  1644. }
  1645. }
  1646. }
  1647. impl TryFrom<(&str, &str)> for Formats {
  1648. type Error = anyhow::Error;
  1649. /// Attempts to construct a `Formats` from a pair of colon-separated FORMAT keys and values.
  1650. ///
  1651. /// # Arguments
  1652. /// * `k` - FORMAT field names (e.g., "GT:DP:AD")
  1653. /// * `v` - Corresponding values (e.g., "0/1:35:10,25")
  1654. ///
  1655. /// # Errors
  1656. /// Returns an error if the number of keys and values do not match or if parsing fails.
  1657. ///
  1658. /// # Example
  1659. /// ```
  1660. /// use your_crate::Formats;
  1661. /// use std::convert::TryFrom;
  1662. ///
  1663. /// let f = Formats::try_from(("GT:DP:AD", "0/1:40:12,28")).unwrap();
  1664. /// ```
  1665. fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
  1666. let keys: Vec<&str> = k.split(':').collect();
  1667. let values: Vec<&str> = v.split(':').collect();
  1668. if keys.len() != values.len() {
  1669. anyhow::bail!("Mismatch between keys and values count for {k} {v}");
  1670. }
  1671. Ok(Self(
  1672. keys.into_iter()
  1673. .zip(values)
  1674. .map(|(key, value)| Format::try_from((key, value)))
  1675. .collect::<Result<Vec<Format>, _>>()
  1676. .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
  1677. ))
  1678. }
  1679. }
  1680. impl From<Formats> for (String, String) {
  1681. /// Converts `Formats` back into a `(keys, values)` tuple of colon-separated strings.
  1682. ///
  1683. /// This is the inverse of the `TryFrom<(&str, &str)>` implementation.
  1684. ///
  1685. /// # Example
  1686. /// ```
  1687. /// use your_crate::{Format, Formats};
  1688. ///
  1689. /// let formats = Formats(vec![
  1690. /// Format::GT("0/1".to_string()),
  1691. /// Format::DP(30),
  1692. /// ]);
  1693. ///
  1694. /// let (k, v): (String, String) = formats.into();
  1695. /// assert_eq!(k, "GT:DP");
  1696. /// assert_eq!(v, "0/1:30");
  1697. /// ```
  1698. fn from(formats: Formats) -> Self {
  1699. let mut keys = Vec::new();
  1700. let mut values = Vec::new();
  1701. for format in formats.0 {
  1702. let (key, value): (String, String) = format.into();
  1703. keys.push(key);
  1704. values.push(value);
  1705. }
  1706. (keys.join(":"), values.join(":"))
  1707. }
  1708. }
  1709. impl TryFrom<(&str, &str)> for Format {
  1710. type Error = anyhow::Error;
  1711. /// Tries to convert a `(key, value)` pair into a typed `Format` variant.
  1712. ///
  1713. /// This parser supports known FORMAT keys from DeepVariant, Clairs, and nanomonsv.
  1714. /// Unknown keys are stored as `Format::Other((key, value))`.
  1715. ///
  1716. /// # Arguments
  1717. /// * `key` - FORMAT field name
  1718. /// * `value` - raw string value associated with the key
  1719. ///
  1720. /// # Example
  1721. /// ```
  1722. /// use your_crate::Format;
  1723. /// use std::convert::TryFrom;
  1724. ///
  1725. /// let dp = Format::try_from(("DP", "42")).unwrap();
  1726. /// assert!(matches!(dp, Format::DP(42)));
  1727. /// ```
  1728. fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
  1729. let format = match key {
  1730. // DeepVariant
  1731. "GT" => Format::GT(value.to_string()),
  1732. "GQ" => Format::GQ(parse_value(value, key)?),
  1733. "DP" => Format::DP(parse_value(value, key)?),
  1734. "AD" => Format::AD(parse_vec_value(value, key)?),
  1735. "VAF" => Format::VAF(parse_value(value, key)?),
  1736. "PL" => Format::PL(parse_vec_value(value, key)?),
  1737. // Clairs
  1738. "NDP" => Format::NDP(parse_value(value, key)?),
  1739. "NAD" => Format::NAD(parse_vec_value(value, key)?),
  1740. "AU" => Format::AU(parse_value(value, key)?),
  1741. "CU" => Format::CU(parse_value(value, key)?),
  1742. "GU" => Format::GU(parse_value(value, key)?),
  1743. "TU" => Format::TU(parse_value(value, key)?),
  1744. "NAU" => Format::NAU(parse_value(value, key)?),
  1745. "NCU" => Format::NCU(parse_value(value, key)?),
  1746. "NGU" => Format::NGU(parse_value(value, key)?),
  1747. "NTU" => Format::NTU(parse_value(value, key)?),
  1748. // nanomonsv
  1749. "TR" => Format::TR(parse_value(value, key)?),
  1750. "VR" => Format::VR(parse_value(value, key)?),
  1751. // Severus
  1752. "DR" => Format::DR(parse_value(value, key)?),
  1753. "DV" => Format::DV(parse_value(value, key)?),
  1754. "hVAF" => Format::HVAF(parse_vec_value(value, key)?),
  1755. // fallback
  1756. _ => Format::Other((key.to_string(), value.to_string())),
  1757. };
  1758. Ok(format)
  1759. }
  1760. }
  1761. // Helper function to parse a single value (DeepSeek)
  1762. fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
  1763. where
  1764. T::Err: std::fmt::Debug,
  1765. {
  1766. value
  1767. .parse()
  1768. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  1769. .context(format!("Can't parse {}: {}", key, value)) // Add context
  1770. }
  1771. // Helper function to parse comma-separated values
  1772. fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
  1773. where
  1774. T::Err: std::fmt::Debug,
  1775. {
  1776. value
  1777. .split(',')
  1778. .map(|e| {
  1779. e.parse()
  1780. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  1781. .context(format!("Failed to parse {}: {}", key, e)) // Add context
  1782. })
  1783. .collect()
  1784. }
  1785. impl From<Format> for (String, String) {
  1786. /// Converts a `Format` enum into a `(key, value)` pair, as strings.
  1787. ///
  1788. /// This is used to serialize the FORMAT field back into VCF-compatible string values.
  1789. /// The key corresponds to the field ID (e.g., `"DP"`, `"GT"`), and the value is the encoded string representation.
  1790. ///
  1791. /// # Examples
  1792. /// ```
  1793. /// use your_crate::Format;
  1794. /// let f = Format::DP(42);
  1795. /// let (k, v): (String, String) = f.into();
  1796. /// assert_eq!(k, "DP");
  1797. /// assert_eq!(v, "42");
  1798. /// ```
  1799. fn from(format: Format) -> Self {
  1800. let concat_u32 = |values: Vec<u32>| -> String {
  1801. values
  1802. .iter()
  1803. .map(u32::to_string)
  1804. .collect::<Vec<_>>()
  1805. .join(",")
  1806. };
  1807. let concat_f32 = |values: Vec<f32>| -> String {
  1808. values
  1809. .iter()
  1810. .map(|v| format!("{:.5}", v)) // consistent decimal format
  1811. .collect::<Vec<_>>()
  1812. .join(",")
  1813. };
  1814. match format {
  1815. // DeepVariant
  1816. Format::GT(value) => ("GT".to_string(), value),
  1817. Format::GQ(value) => ("GQ".to_string(), value.to_string()),
  1818. Format::DP(value) => ("DP".to_string(), value.to_string()),
  1819. Format::AD(values) => ("AD".to_string(), concat_u32(values)),
  1820. Format::VAF(value) => ("VAF".to_string(), format!("{:.5}", value)),
  1821. Format::PL(values) => ("PL".to_string(), concat_u32(values)),
  1822. // Clairs
  1823. Format::NDP(value) => ("NDP".to_string(), value.to_string()),
  1824. Format::NAD(values) => ("NAD".to_string(), concat_u32(values)),
  1825. Format::AU(value) => ("AU".to_string(), value.to_string()),
  1826. Format::CU(value) => ("CU".to_string(), value.to_string()),
  1827. Format::GU(value) => ("GU".to_string(), value.to_string()),
  1828. Format::TU(value) => ("TU".to_string(), value.to_string()),
  1829. Format::NAU(value) => ("NAU".to_string(), value.to_string()),
  1830. Format::NCU(value) => ("NCU".to_string(), value.to_string()),
  1831. Format::NGU(value) => ("NGU".to_string(), value.to_string()),
  1832. Format::NTU(value) => ("NTU".to_string(), value.to_string()),
  1833. // nanomonsv
  1834. Format::TR(value) => ("TR".to_string(), value.to_string()),
  1835. Format::VR(value) => ("VR".to_string(), value.to_string()),
  1836. // Severus
  1837. Format::DR(value) => ("DR".to_string(), value.to_string()),
  1838. Format::DV(value) => ("DV".to_string(), value.to_string()),
  1839. Format::HVAF(values) => ("hVAF".to_string(), concat_f32(values)),
  1840. // fallback
  1841. Format::Other((key, value)) => (key, value),
  1842. }
  1843. }
  1844. }
  1845. impl Formats {
  1846. pub fn commun_deepvariant_clairs(&self) -> Self {
  1847. let filtered_vec: Vec<Format> = self
  1848. .0
  1849. .clone()
  1850. .into_iter()
  1851. .map(|e| {
  1852. if let Format::VAF(_v) = e {
  1853. e
  1854. // Format::AF(v)
  1855. } else {
  1856. e
  1857. }
  1858. })
  1859. .filter(|format| {
  1860. matches!(
  1861. format,
  1862. Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
  1863. )
  1864. })
  1865. .collect();
  1866. Formats(filtered_vec)
  1867. }
  1868. /// Returns a sorted set of VCF header definitions for all possible `Format` fields.
  1869. ///
  1870. /// # Example
  1871. /// ```
  1872. /// let headers = Formats::format_headers();
  1873. /// for h in headers {
  1874. /// println!("{}", h);
  1875. /// }
  1876. /// ```
  1877. pub fn format_headers() -> BTreeSet<String> {
  1878. let mut headers = BTreeSet::new();
  1879. headers
  1880. .insert(r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#.to_string());
  1881. headers.insert(
  1882. r#"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">"#.to_string(),
  1883. );
  1884. headers.insert(
  1885. r#"##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">"#.to_string(),
  1886. );
  1887. headers.insert(r#"##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles">"#.to_string());
  1888. headers.insert(
  1889. r#"##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">"#
  1890. .to_string(),
  1891. );
  1892. headers.insert(r#"##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">"#.to_string());
  1893. headers.insert(
  1894. r#"##FORMAT=<ID=NDP,Number=1,Type=Integer,Description="Normal sample read depth">"#
  1895. .to_string(),
  1896. );
  1897. headers.insert(
  1898. r#"##FORMAT=<ID=NAD,Number=R,Type=Integer,Description="Normal sample allelic depths">"#
  1899. .to_string(),
  1900. );
  1901. headers.insert(
  1902. r#"##FORMAT=<ID=AU,Number=1,Type=Integer,Description="Tumor A allele count">"#
  1903. .to_string(),
  1904. );
  1905. headers.insert(
  1906. r#"##FORMAT=<ID=CU,Number=1,Type=Integer,Description="Tumor C allele count">"#
  1907. .to_string(),
  1908. );
  1909. headers.insert(
  1910. r#"##FORMAT=<ID=GU,Number=1,Type=Integer,Description="Tumor G allele count">"#
  1911. .to_string(),
  1912. );
  1913. headers.insert(
  1914. r#"##FORMAT=<ID=TU,Number=1,Type=Integer,Description="Tumor T allele count">"#
  1915. .to_string(),
  1916. );
  1917. headers.insert(
  1918. r#"##FORMAT=<ID=NAU,Number=1,Type=Integer,Description="Normal A allele count">"#
  1919. .to_string(),
  1920. );
  1921. headers.insert(
  1922. r#"##FORMAT=<ID=NCU,Number=1,Type=Integer,Description="Normal C allele count">"#
  1923. .to_string(),
  1924. );
  1925. headers.insert(
  1926. r#"##FORMAT=<ID=NGU,Number=1,Type=Integer,Description="Normal G allele count">"#
  1927. .to_string(),
  1928. );
  1929. headers.insert(
  1930. r#"##FORMAT=<ID=NTU,Number=1,Type=Integer,Description="Normal T allele count">"#
  1931. .to_string(),
  1932. );
  1933. headers.insert(r#"##FORMAT=<ID=TR,Number=1,Type=Integer,Description="Total supporting reads (tumor)">"#.to_string());
  1934. headers.insert(r#"##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Variant-supporting reads (tumor)">"#.to_string());
  1935. // Severus
  1936. headers.insert(
  1937. r#"##FORMAT=<ID=DR,Number=1,Type=Integer,Description="Number of reference reads">"#
  1938. .to_string(),
  1939. );
  1940. headers.insert(
  1941. r#"##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of variant reads">"#
  1942. .to_string(),
  1943. );
  1944. headers.insert(r#"##FORMAT=<ID=hVAF,Number=3,Type=Float,Description="Haplotype specific variant Allele frequency (H0,H1,H2)">"#.to_string());
  1945. headers.insert(
  1946. r#"##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">"#.to_string(),
  1947. );
  1948. headers.insert(
  1949. r#"##FORMAT=<ID=NAF,Number=1,Type=Float,Description="Normal Allele Frequency">"#
  1950. .to_string(),
  1951. );
  1952. // headers.insert(
  1953. // r#"##FORMAT=<ID=Other,Number=.,Type=String,Description="Unspecified FORMAT field">"#
  1954. // .to_string(),
  1955. // );
  1956. headers
  1957. }
  1958. }
  1959. /// Filter
  1960. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  1961. pub enum Filter {
  1962. PASS,
  1963. Other(String),
  1964. }
  1965. impl FromStr for Filter {
  1966. type Err = anyhow::Error;
  1967. fn from_str(s: &str) -> anyhow::Result<Self> {
  1968. match s {
  1969. "PASS" => Ok(Filter::PASS),
  1970. _ => Ok(Filter::Other(s.to_string())),
  1971. }
  1972. }
  1973. }
  1974. impl fmt::Display for Filter {
  1975. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  1976. match self {
  1977. Filter::PASS => write!(f, "PASS"),
  1978. Filter::Other(ref s) => write!(f, "{}", s),
  1979. }
  1980. }
  1981. }
  1982. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  1983. pub enum ReferenceAlternative {
  1984. Nucleotide(Base),
  1985. Nucleotides(Vec<Base>),
  1986. Unstructured(String),
  1987. }
  1988. impl ReferenceAlternative {
  1989. pub fn len(&self) -> usize {
  1990. match self {
  1991. ReferenceAlternative::Nucleotide(_) => 1,
  1992. ReferenceAlternative::Nucleotides(bases) => bases.len(),
  1993. ReferenceAlternative::Unstructured(_) => 0,
  1994. }
  1995. }
  1996. pub fn is_empty(&self) -> bool {
  1997. self.len() == 0
  1998. }
  1999. pub fn ent(&self) -> Option<f64> {
  2000. match self {
  2001. ReferenceAlternative::Nucleotide(_) => None,
  2002. ReferenceAlternative::Nucleotides(bases) => {
  2003. let seq = bases
  2004. .iter()
  2005. .skip(1)
  2006. .map(|b| b.to_string())
  2007. .collect::<String>();
  2008. if seq.len() > 1 {
  2009. Some(estimate_shannon_entropy(&seq))
  2010. } else {
  2011. None
  2012. }
  2013. }
  2014. ReferenceAlternative::Unstructured(_) => None,
  2015. }
  2016. }
  2017. }
  2018. impl FromStr for ReferenceAlternative {
  2019. type Err = anyhow::Error;
  2020. fn from_str(s: &str) -> anyhow::Result<Self> {
  2021. let possible_bases = s.as_bytes().iter();
  2022. let mut res: Vec<Base> = Vec::new();
  2023. for &base in possible_bases {
  2024. match base.try_into() {
  2025. std::result::Result::Ok(b) => res.push(b),
  2026. Err(_) => {
  2027. return Ok(Self::Unstructured(s.to_string()));
  2028. }
  2029. }
  2030. }
  2031. if res.len() == 1 {
  2032. Ok(Self::Nucleotide(res.pop().unwrap()))
  2033. } else {
  2034. Ok(Self::Nucleotides(res))
  2035. }
  2036. }
  2037. }
  2038. impl fmt::Display for ReferenceAlternative {
  2039. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  2040. let string = match self {
  2041. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  2042. ReferenceAlternative::Nucleotides(bases) => bases
  2043. .iter()
  2044. .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
  2045. ReferenceAlternative::Unstructured(s) => s.to_string(),
  2046. };
  2047. write!(f, "{}", string)
  2048. }
  2049. }
  2050. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  2051. pub enum Base {
  2052. A,
  2053. T,
  2054. C,
  2055. G,
  2056. N,
  2057. }
  2058. impl TryFrom<u8> for Base {
  2059. type Error = anyhow::Error;
  2060. fn try_from(base: u8) -> anyhow::Result<Self> {
  2061. match base {
  2062. // uppercase
  2063. b'A' | b'a' => Ok(Base::A),
  2064. b'T' | b't' => Ok(Base::T),
  2065. b'C' | b'c' => Ok(Base::C),
  2066. b'G' | b'g' => Ok(Base::G),
  2067. b'N' | b'n' => Ok(Base::N),
  2068. // unknown
  2069. _ => Err(anyhow::anyhow!(
  2070. "Unknown base: {}",
  2071. String::from_utf8_lossy(&[base])
  2072. )),
  2073. }
  2074. }
  2075. }
  2076. impl Base {
  2077. pub fn into_u8(self) -> u8 {
  2078. match self {
  2079. Base::A => b'A',
  2080. Base::T => b'T',
  2081. Base::C => b'C',
  2082. Base::G => b'G',
  2083. Base::N => b'N',
  2084. }
  2085. }
  2086. }
  2087. impl fmt::Display for Base {
  2088. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  2089. // Use `self.number` to refer to each positional data point.
  2090. let str = match self {
  2091. Base::A => "A",
  2092. Base::T => "T",
  2093. Base::C => "C",
  2094. Base::G => "G",
  2095. Base::N => "N",
  2096. };
  2097. write!(f, "{}", str)
  2098. }
  2099. }
  2100. pub trait Variants {
  2101. fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
  2102. }
  2103. pub trait VariantId {
  2104. fn variant_id(&self) -> String;
  2105. }
  2106. pub trait Label {
  2107. fn label(&self) -> String;
  2108. }
  2109. /// A trait alias for all dynamically executable pipeline runners.
  2110. ///
  2111. /// This trait represents any component that:
  2112. /// - Can decide whether it needs to run (`ShouldRun`)
  2113. /// - Implements the actual execution logic (`Run`)
  2114. /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently
  2115. ///
  2116. /// Components implementing this trait can be boxed as `ShouldRunBox`.
  2117. pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {}
  2118. /// Blanket implementation for all compatible types.
  2119. impl<T> ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {}
  2120. /// A boxed trait object to hold any runner implementing `ShouldRunTrait`.
  2121. pub type ShouldRunBox = Box<dyn ShouldRunTrait>;
  2122. /// Macro to initialize and box multiple `ShouldRunTrait` components.
  2123. ///
  2124. /// # Arguments
  2125. /// * `$id` - Sample ID (typically a string slice)
  2126. /// * `$config` - Shared configuration object
  2127. /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait`
  2128. ///
  2129. /// # Returns
  2130. /// A vector of boxed runner components (`Vec<ShouldRunBox>`)
  2131. ///
  2132. /// # Example
  2133. /// ```rust
  2134. /// let modules: Vec<ShouldRunBox> = create_should_run!(
  2135. /// "sample_42",
  2136. /// config,
  2137. /// ClairS,
  2138. /// Savana,
  2139. /// DeepSomatic,
  2140. /// )?;
  2141. /// ```
  2142. ///
  2143. /// # Errors
  2144. /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`.
  2145. #[macro_export]
  2146. macro_rules! create_should_run {
  2147. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  2148. use anyhow::Context;
  2149. let mut runners: Vec<ShouldRunBox> = Vec::new();
  2150. $(
  2151. let runner = <$runner>::initialize($id, $config.clone())
  2152. .with_context(|| format!(
  2153. "Failed to initialize should-run checker {} for {}",
  2154. stringify!($runner), $id
  2155. ))?;
  2156. runners.push(Box::new(runner) as ShouldRunBox);
  2157. )+
  2158. runners
  2159. }};
  2160. }
  2161. /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`.
  2162. ///
  2163. /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`),
  2164. /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal").
  2165. ///
  2166. /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression.
  2167. ///
  2168. /// # Arguments
  2169. /// - `$id`: The sample ID (`&str`) passed to each initializer
  2170. /// - `$config`: A `Config` object (must be cloneable)
  2171. /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`)
  2172. ///
  2173. /// # Returns
  2174. /// A `Vec<ShouldRunBox>` with boxed runners initialized per timepoint.
  2175. ///
  2176. /// # Example
  2177. /// ```rust
  2178. /// let solo_callers = create_should_run_solo!(
  2179. /// "sample42",
  2180. /// config,
  2181. /// DeepVariant, config.tumoral_name,
  2182. /// DeepVariant, config.normal_name,
  2183. /// )?;
  2184. /// ```
  2185. ///
  2186. /// # Notes
  2187. /// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd".
  2188. ///
  2189. /// # Errors
  2190. /// This macro uses `?` and must be called inside a `Result`-returning context.
  2191. #[macro_export]
  2192. macro_rules! create_should_run_solo {
  2193. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  2194. use anyhow::Context;
  2195. let mut runners: Vec<ShouldRunBox> = Vec::new();
  2196. $(
  2197. let runner = <$runner>::initialize($id, $arg, $config.clone())
  2198. .with_context(|| format!(
  2199. "Failed to initialize solo should-run checker {} for '{}' and arg '{}'",
  2200. stringify!($runner), $id, $arg
  2201. ))?;
  2202. runners.push(Box::new(runner) as ShouldRunBox);
  2203. )+
  2204. runners
  2205. }};
  2206. }
  2207. /// Macro to initialize and box a list of pipeline components that must run once per timepoint
  2208. /// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`.
  2209. ///
  2210. /// This is typically used for variant callers like `DeepVariant` that operate in **solo mode**
  2211. /// but must be run twice — once for the tumoral sample and once for the normal sample.
  2212. ///
  2213. /// The macro:
  2214. /// - Calls `.initialize(id, timepoint, config.clone())` twice per type
  2215. /// - Uses `config.tumoral_name` and `config.normal_name` as timepoints
  2216. /// - Returns a flat `Vec<ShouldRunBox>` containing both instances for each type
  2217. ///
  2218. /// # Arguments
  2219. /// - `$id`: The sample ID (`&str`)
  2220. /// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`)
  2221. /// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)`
  2222. ///
  2223. /// # Example
  2224. /// ```rust
  2225. /// let runners = create_should_run_normal_tumoral!(
  2226. /// "sample_42",
  2227. /// config,
  2228. /// DeepVariant,
  2229. /// AnotherCaller,
  2230. /// )?;
  2231. /// ```
  2232. ///
  2233. /// This will expand to:
  2234. /// ```rust
  2235. /// vec![
  2236. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  2237. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  2238. /// Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  2239. /// Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  2240. /// ]
  2241. /// ```
  2242. ///
  2243. /// # Errors
  2244. /// This macro uses `?`, so it must be called inside a function that returns `Result`.
  2245. #[macro_export]
  2246. macro_rules! create_should_run_normal_tumoral {
  2247. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  2248. use anyhow::Context;
  2249. let mut runners: Vec<ShouldRunBox> = Vec::new();
  2250. $(
  2251. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  2252. .with_context(|| format!(
  2253. "Failed to initialize tumoral should-run checker {} for {}",
  2254. stringify!($runner), $id
  2255. ))?;
  2256. runners.push(Box::new(tumoral) as ShouldRunBox);
  2257. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  2258. .with_context(|| format!(
  2259. "Failed to initialize normal should-run checker {} for {}",
  2260. stringify!($runner), $id
  2261. ))?;
  2262. runners.push(Box::new(normal) as ShouldRunBox);
  2263. )+
  2264. runners
  2265. }};
  2266. }
  2267. /// Executes each runner in the slice only if `should_run()` returns true.
  2268. ///
  2269. /// # Arguments
  2270. /// * `iterable` - A mutable slice of boxed `InitRun` components
  2271. ///
  2272. /// # Returns
  2273. /// * `Ok(())` if all required runners execute successfully
  2274. /// * An error if any runner's `run()` method fails
  2275. ///
  2276. /// # Notes
  2277. /// - This function will skip runners that return `false` from `should_run()`
  2278. pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> {
  2279. iterable.iter_mut().try_for_each(|e| {
  2280. if e.should_run() {
  2281. e.run()
  2282. .map_err(|err| anyhow::anyhow!("Failed to run {}.\n{err}", e.label()))
  2283. } else {
  2284. info!("No run required for: {}", e.label());
  2285. Ok(())
  2286. }
  2287. })
  2288. }
  2289. /// A trait alias for all variant callers that support initialization, execution,
  2290. /// conditional re-running, and variant extraction (VCF + annotations).
  2291. ///
  2292. /// Used to enable polymorphic handling of both solo and somatic callers in the pipeline.
  2293. pub trait RunnerVariants: Variants + Send + Sync + Label {}
  2294. /// Blanket implementation for all compatible types.
  2295. impl<T> RunnerVariants for T where T: Variants + Send + Sync + Label {}
  2296. pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
  2297. #[macro_export]
  2298. macro_rules! init_somatic_callers {
  2299. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  2300. use anyhow::Context;
  2301. let mut callers: Vec<CallerBox> = Vec::new();
  2302. $(
  2303. let caller = <$runner>::initialize($id, $config.clone())
  2304. .with_context(|| format!(
  2305. "Failed to initialize somatic caller {} for {}",
  2306. stringify!($runner), $id
  2307. ))?;
  2308. callers.push(Box::new(caller) as CallerBox);
  2309. )+
  2310. callers
  2311. }};
  2312. }
  2313. /// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints,
  2314. /// where each runner implements `RunnerVariants`.
  2315. ///
  2316. /// This is useful for callers like `DeepVariant` that need to be instantiated with a specific
  2317. /// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`).
  2318. ///
  2319. /// Each entry must be a pair: a runner type and a timepoint expression (usually from config).
  2320. ///
  2321. /// # Arguments
  2322. /// - `$id`: The sample ID (`&str`)
  2323. /// - `$config`: The configuration object (must be cloneable)
  2324. /// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs
  2325. ///
  2326. /// # Returns
  2327. /// A `Vec<CallerBox>` containing initialized, boxed solo-mode variant callers.
  2328. ///
  2329. /// # Example
  2330. /// ```rust
  2331. /// let solo_callers = init_solo_callers!(
  2332. /// "sample_42",
  2333. /// config,
  2334. /// DeepVariant, config.tumoral_name,
  2335. /// DeepVariant, config.normal_name,
  2336. /// )?;
  2337. /// ```
  2338. ///
  2339. /// This will expand to:
  2340. /// ```rust
  2341. /// vec![
  2342. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  2343. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  2344. /// ]
  2345. /// ```
  2346. ///
  2347. /// # Errors
  2348. /// This macro uses `?` internally, so it must be used inside a `Result`-returning context.
  2349. #[macro_export]
  2350. macro_rules! init_solo_callers {
  2351. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  2352. let mut callers: Vec<CallerBox> = Vec::new();
  2353. $(
  2354. let caller = <$runner>::initialize($id, $arg, $config.clone())
  2355. .with_context(|| format!(
  2356. "Failed to initialize caller {} for {}",
  2357. stringify!($runner), $id
  2358. ))?;
  2359. callers.push(Box::new(caller) as CallerBox);
  2360. )+
  2361. callers
  2362. }};
  2363. }
  2364. /// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints.
  2365. ///
  2366. /// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require
  2367. /// separate execution for each timepoint. It will:
  2368. /// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name`
  2369. /// - Box the result as `CallerBox`
  2370. ///
  2371. /// # Arguments
  2372. /// - `$id`: Sample ID (usually a `&str`)
  2373. /// - `$config`: Cloneable configuration object
  2374. /// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants`
  2375. ///
  2376. /// # Returns
  2377. /// A `Vec<CallerBox>` containing two boxed instances per runner (one for each timepoint).
  2378. ///
  2379. /// # Example
  2380. /// ```rust
  2381. /// let solo_callers = init_solo_callers_normal_tumoral!(
  2382. /// "sample_42",
  2383. /// config,
  2384. /// DeepVariant,
  2385. /// OtherSoloCaller,
  2386. /// )?;
  2387. /// ```
  2388. ///
  2389. /// This expands to:
  2390. /// ```rust
  2391. /// vec![
  2392. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  2393. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  2394. /// Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  2395. /// Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  2396. /// ]
  2397. /// ```
  2398. ///
  2399. /// # Errors
  2400. /// This macro uses `?`, so it must be called inside a `Result`-returning context.
  2401. #[macro_export]
  2402. macro_rules! init_solo_callers_normal_tumoral {
  2403. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  2404. use anyhow::Context;
  2405. let mut callers: Vec<CallerBox> = Vec::new();
  2406. $(
  2407. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  2408. .with_context(|| format!(
  2409. "Failed to initialize tumoral caller {} for {} '{}'",
  2410. stringify!($runner), $id, $config.tumoral_name
  2411. ))?;
  2412. callers.push(Box::new(tumoral) as CallerBox);
  2413. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  2414. .with_context(|| format!(
  2415. "Failed to initialize normal caller {} for {} '{}'",
  2416. stringify!($runner), $id, $config.normal_name
  2417. ))?;
  2418. callers.push(Box::new(normal) as CallerBox);
  2419. )+
  2420. callers
  2421. }};
  2422. }
  2423. // pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
  2424. // iterable
  2425. // .iter_mut()
  2426. // .try_for_each(|runner| runner.run())
  2427. // .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
  2428. // }
  2429. pub fn load_variants(
  2430. iterable: &mut [CallerBox],
  2431. annotations: &Annotations,
  2432. ) -> anyhow::Result<Vec<VariantCollection>> {
  2433. iterable
  2434. .par_iter()
  2435. .map(|runner| {
  2436. let result = runner.variants(annotations);
  2437. if let Err(ref e) = result {
  2438. error!("Failed to load variants from: {}\n{e}", runner.label());
  2439. } else {
  2440. info!("Variants from {} loaded.", runner.label());
  2441. };
  2442. result
  2443. })
  2444. .filter(|r| r.is_ok())
  2445. .collect::<anyhow::Result<Vec<_>>>()
  2446. .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  2447. }
  2448. // pub fn par_load_variants(
  2449. // iterable: &mut [Box<dyn Variants + Send + Sync>],
  2450. // annotations: &Annotations,
  2451. // ) -> anyhow::Result<Vec<VariantCollection>> {
  2452. // iterable
  2453. // .par_iter()
  2454. // .map(|runner| {
  2455. // let r = runner.variants(annotations);
  2456. // if let Err(ref e) = r {
  2457. // warn!("{e}");
  2458. // };
  2459. // r
  2460. // })
  2461. // .filter(|r| r.is_ok())
  2462. // .collect::<anyhow::Result<Vec<_>>>()
  2463. // .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  2464. // }
  2465. pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
  2466. vec1: &[T],
  2467. vec2: &[T],
  2468. ) -> (Vec<T>, Vec<T>, Vec<T>) {
  2469. let set1: HashSet<_> = vec1.par_iter().cloned().collect();
  2470. let set2: HashSet<_> = vec2.par_iter().cloned().collect();
  2471. let common: Vec<T> = set1
  2472. .par_iter()
  2473. .filter(|item| set2.contains(item))
  2474. .cloned()
  2475. .collect();
  2476. let only_in_first: Vec<T> = set1
  2477. .par_iter()
  2478. .filter(|item| !set2.contains(item))
  2479. .cloned()
  2480. .collect();
  2481. let only_in_second: Vec<T> = set2
  2482. .par_iter()
  2483. .filter(|item| !set1.contains(item))
  2484. .cloned()
  2485. .collect();
  2486. (common, only_in_first, only_in_second)
  2487. }