variant.rs 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505
  1. use crate::{
  2. annotation::Annotations,
  3. collection::ShouldRun,
  4. helpers::Hash128,
  5. positions::{GenomePosition, GetGenomePosition, VcfPosition},
  6. runners::Run,
  7. variant::variant_collection::VariantCollection,
  8. };
  9. use anyhow::{anyhow, Context};
  10. use bitcode::{Decode, Encode};
  11. use log::{error, info};
  12. use rayon::prelude::*;
  13. use serde::{Deserialize, Serialize};
  14. use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
  15. /// Represents a variant in the Variant Call Format (VCF).
  16. #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
  17. pub struct VcfVariant {
  18. /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
  19. pub hash: Hash128,
  20. /// The genomic position of the variant.
  21. pub position: GenomePosition,
  22. /// The identifier of the variant.
  23. pub id: String,
  24. /// The reference allele.
  25. pub reference: ReferenceAlternative,
  26. /// The alternative allele.
  27. pub alternative: ReferenceAlternative,
  28. /// The quality score of the variant call, if available.
  29. pub quality: Option<f32>,
  30. /// The filter status of the variant.
  31. pub filter: Filter,
  32. /// Additional information about the variant.
  33. pub infos: Infos,
  34. /// Genotype information and other sample-specific data.
  35. pub formats: Formats,
  36. }
  37. impl PartialEq for VcfVariant {
  38. /// Compares two VcfVariants for equality.
  39. ///
  40. /// Note: This comparison only considers position, reference, and alternative.
  41. /// It intentionally ignores id, filter, info, format, and quality.
  42. fn eq(&self, other: &Self) -> bool {
  43. // Nota bene: id, filter, info, format and quality is intentionally not compared
  44. self.position == other.position
  45. && self.reference == other.reference
  46. && self.alternative == other.alternative
  47. }
  48. }
  49. impl Eq for VcfVariant {}
  50. impl FromStr for VcfVariant {
  51. type Err = anyhow::Error;
  52. /// Parses a VcfVariant from a string representation.
  53. ///
  54. /// The input string is expected to be a tab-separated VCF line.
  55. ///
  56. /// # Errors
  57. ///
  58. /// Returns an error if parsing fails for any field.
  59. fn from_str(s: &str) -> anyhow::Result<Self> {
  60. let v: Vec<&str> = s.split('\t').collect();
  61. let vcf_position: VcfPosition = (
  62. *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
  63. *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
  64. )
  65. .try_into()
  66. .context(format!("Can't parse position from: {s}"))?;
  67. let formats = if v.len() >= 10 {
  68. (
  69. *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  70. *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  71. )
  72. .try_into()
  73. .context(format!("Can't parse formats from: {s}"))?
  74. } else {
  75. Formats::default()
  76. };
  77. let position: GenomePosition = vcf_position.into();
  78. let reference: ReferenceAlternative = v
  79. .get(3)
  80. .ok_or(anyhow!("Can't parse reference from: {s}"))?
  81. .parse()
  82. .context(format!("Can't parse reference from: {s}"))?;
  83. let alternative: ReferenceAlternative = v
  84. .get(4)
  85. .ok_or(anyhow!("Can't parse alternative from: {s}"))?
  86. .parse()
  87. .context(format!("Can't parse alternative from: {s}"))?;
  88. // Blake3 128 bytes Hash
  89. let mut hasher = blake3::Hasher::new();
  90. hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
  91. hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
  92. hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
  93. hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
  94. let hash = hasher.finalize();
  95. let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
  96. Ok(Self {
  97. hash,
  98. position,
  99. id: v
  100. .get(2)
  101. .ok_or(anyhow!("Can't parse id from: {s}"))?
  102. .to_string(),
  103. reference,
  104. alternative,
  105. quality: v
  106. .get(5)
  107. .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
  108. .unwrap_or(None),
  109. filter: v
  110. .get(6)
  111. .ok_or(anyhow!("Can't parse filter from: {s}"))?
  112. .parse()
  113. .context(format!("Can't parse filter from: {s}"))?,
  114. infos: v
  115. .get(7)
  116. .ok_or(anyhow!("Can't parse infos from: {s}"))?
  117. .parse()
  118. .context(format!("Can't parse infos from: {s}"))?,
  119. formats,
  120. })
  121. }
  122. }
  123. // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
  124. impl VcfVariant {
  125. /// Converts the VcfVariant into a VCF-formatted row string.
  126. ///
  127. /// This method creates a tab-separated string representation of the variant,
  128. /// suitable for writing to a VCF file.
  129. pub fn into_vcf_row(&self) -> String {
  130. let vcf_position: VcfPosition = self.position.clone().into();
  131. let (contig, position) = vcf_position.into();
  132. let mut columns = vec![
  133. contig,
  134. position,
  135. self.id.to_string(),
  136. self.reference.to_string(),
  137. self.alternative.to_string(),
  138. self.quality
  139. .map(|v| v.to_string())
  140. .unwrap_or(".".to_string()),
  141. self.filter.to_string(),
  142. self.infos.to_string(),
  143. ];
  144. if !self.formats.0.is_empty() {
  145. let (format, values) = self.formats.clone().into();
  146. columns.push(format);
  147. columns.push(values);
  148. }
  149. columns.join("\t")
  150. }
  151. /// Returns the hash of the variant.
  152. pub fn hash(&self) -> Hash128 {
  153. self.hash
  154. }
  155. /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
  156. ///
  157. /// This method generates a new variant with shared properties, resetting some fields
  158. /// to default or empty values.
  159. pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
  160. VcfVariant {
  161. hash: self.hash,
  162. position: self.position.clone(),
  163. id: self.id.clone(),
  164. reference: self.reference.clone(),
  165. alternative: self.alternative.clone(),
  166. quality: self.quality,
  167. filter: Filter::Other(".".to_string()),
  168. infos: Infos(vec![Info::Empty]),
  169. formats: self.formats.commun_deepvariant_clairs(),
  170. }
  171. }
  172. /// Checks if the variant has an SVTYPE info field.
  173. ///
  174. /// Returns true if the variant contains structural variation type information.
  175. pub fn has_svtype(&self) -> bool {
  176. self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
  177. }
  178. /// Retrieves the structural variation type of the variant, if present.
  179. ///
  180. /// Returns Some(SVType) if the variant has an SVTYPE info field,
  181. pub fn svtype(&self) -> Option<SVType> {
  182. self.infos.0.iter().find_map(|e| {
  183. if let Info::SVTYPE(sv_type) = e {
  184. Some(sv_type.clone())
  185. } else {
  186. None
  187. }
  188. })
  189. }
  190. /// Determines the alteration category of the variant.
  191. ///
  192. /// This method analyzes the reference and alternative alleles to classify
  193. /// the variant into one of several alteration categories:
  194. /// - SNV (Single Nucleotide Variant)
  195. /// - INS (Insertion)
  196. /// - DEL (Deletion)
  197. /// - Other (including structural variants and complex alterations)
  198. ///
  199. /// The classification is based on the following rules:
  200. /// 1. If both reference and alternative are single nucleotides, it's an SNV.
  201. /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
  202. /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
  203. /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
  204. /// 5. If none of the above apply, it checks for structural variant types.
  205. /// 6. If no structural variant type is found, it's classified as "Other".
  206. ///
  207. /// # Returns
  208. /// An `AlterationCategory` enum representing the type of alteration.
  209. pub fn alteration_category(&self) -> AlterationCategory {
  210. match (&self.reference, &self.alternative) {
  211. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  212. AlterationCategory::SNV
  213. }
  214. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  215. AlterationCategory::INS
  216. }
  217. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  218. AlterationCategory::Other
  219. }
  220. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  221. AlterationCategory::DEL
  222. }
  223. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  224. if a.len() < b.len() =>
  225. {
  226. AlterationCategory::INS
  227. }
  228. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  229. if a.len() > b.len() =>
  230. {
  231. AlterationCategory::DEL
  232. }
  233. _ => match self.svtype() {
  234. Some(sv_type) => {
  235. if let Ok(bnd_desc) = self.bnd_desc() {
  236. if bnd_desc.a_contig != bnd_desc.b_contig {
  237. AlterationCategory::TRL
  238. } else {
  239. AlterationCategory::BND
  240. }
  241. } else {
  242. AlterationCategory::from(sv_type)
  243. }
  244. }
  245. None => AlterationCategory::Other,
  246. },
  247. }
  248. }
  249. /// Parses and constructs a BND (breakend) description from the alternative string.
  250. ///
  251. /// This function interprets the BND notation in the alternative string and creates
  252. /// a `BNDDesc` struct containing detailed information about the breakend.
  253. ///
  254. /// # Returns
  255. /// - `Ok(BNDDesc)` if parsing is successful
  256. /// - `Err` if parsing fails or if the alteration is not a BND
  257. ///
  258. /// # Errors
  259. /// This function will return an error if:
  260. /// - The alteration category is not BND
  261. /// - The alternative string cannot be parsed into exactly 3 parts
  262. /// - The b_position cannot be parsed as a number
  263. pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
  264. let alt = self.alternative.to_string();
  265. if alt.contains('[') || alt.contains(']') {
  266. let alt_rep = alt.replace("[", ";").replace("]", ";");
  267. let alt_is_joined_after = !alt_rep.starts_with(";");
  268. let parts = alt_rep
  269. .split(";")
  270. .filter(|c| !c.is_empty())
  271. .collect::<Vec<&str>>();
  272. if alt_is_joined_after {
  273. // a is ref b is alt
  274. let a_sens = true;
  275. let a_contig = self.position.contig();
  276. let a_position = self.position.position + 1;
  277. let added_nt = parts[0][1..].to_string();
  278. let b_sens = alt.contains('[');
  279. let (contig, pos) = parts[1].split_once(':').unwrap();
  280. let b_contig = contig.to_string();
  281. let b_position: u32 = pos.parse()?;
  282. Ok(BNDDesc {
  283. a_contig,
  284. a_position,
  285. a_sens,
  286. b_contig,
  287. b_position,
  288. b_sens,
  289. added_nt,
  290. })
  291. } else {
  292. // a is alt b is ref
  293. let b_sens = true;
  294. let b_contig = self.position.contig();
  295. let b_position = self.position.position + 1;
  296. let mut added_nt = parts[1].to_string();
  297. added_nt.pop();
  298. let a_sens = alt.contains(']');
  299. let (contig, pos) = parts[0].split_once(':').unwrap();
  300. let a_contig = contig.to_string();
  301. let a_position: u32 = pos.parse()?;
  302. Ok(BNDDesc {
  303. a_contig,
  304. a_position,
  305. a_sens,
  306. b_contig,
  307. b_position,
  308. b_sens,
  309. added_nt,
  310. })
  311. }
  312. } else {
  313. Err(anyhow::anyhow!("The alteration is not BND: {alt}"))
  314. }
  315. }
  316. pub fn deletion_len(&self) -> Option<u32> {
  317. if self.alteration_category() != AlterationCategory::DEL {
  318. return None;
  319. }
  320. self.infos
  321. .0
  322. .iter()
  323. .find_map(|i| {
  324. if let Info::SVLEN(len) = i {
  325. Some(*len as u32)
  326. } else {
  327. None
  328. }
  329. })
  330. .or_else(|| {
  331. if let (
  332. ReferenceAlternative::Nucleotides(nt),
  333. ReferenceAlternative::Nucleotide(_),
  334. ) = (&self.reference, &self.alternative)
  335. {
  336. Some(nt.len().saturating_sub(1) as u32)
  337. } else {
  338. None
  339. }
  340. })
  341. }
  342. pub fn deletion_desc(&self) -> Option<DeletionDesc> {
  343. if let Some(len) = self.deletion_len() {
  344. Some(DeletionDesc {
  345. contig: self.position.contig(),
  346. start: self.position.position + 1,
  347. end: self.position.position + len,
  348. })
  349. } else {
  350. None
  351. }
  352. }
  353. }
  354. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  355. pub struct BNDDesc {
  356. pub a_contig: String,
  357. pub a_position: u32, // 1-based
  358. pub a_sens: bool,
  359. pub b_contig: String,
  360. pub b_position: u32, // 1-based
  361. pub b_sens: bool,
  362. pub added_nt: String,
  363. }
  364. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  365. pub struct DeletionDesc {
  366. pub contig: String,
  367. pub start: u32, // 1-based
  368. pub end: u32,
  369. }
  370. impl DeletionDesc {
  371. pub fn len(&self) -> u32 {
  372. self.end.saturating_sub(self.start)
  373. }
  374. pub fn is_empty(&self) -> bool {
  375. self.len() == 0
  376. }
  377. }
  378. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  379. pub enum AlterationCategory {
  380. SNV,
  381. DEL,
  382. INS,
  383. DUP,
  384. INV,
  385. CNV,
  386. TRL,
  387. BND,
  388. Other,
  389. }
  390. impl fmt::Display for AlterationCategory {
  391. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  392. write!(
  393. f,
  394. "{}",
  395. match self {
  396. AlterationCategory::SNV => "SNV",
  397. AlterationCategory::DEL => "DEL",
  398. AlterationCategory::INS => "INS",
  399. AlterationCategory::DUP => "DUP",
  400. AlterationCategory::INV => "INV",
  401. AlterationCategory::CNV => "CNV",
  402. AlterationCategory::BND | AlterationCategory::TRL => "TRL",
  403. AlterationCategory::Other => "Other",
  404. }
  405. )
  406. }
  407. }
  408. impl From<SVType> for AlterationCategory {
  409. fn from(sv_type: SVType) -> Self {
  410. match sv_type {
  411. SVType::DEL => AlterationCategory::DEL,
  412. SVType::INS => AlterationCategory::INS,
  413. SVType::DUP => AlterationCategory::DUP,
  414. SVType::INV => AlterationCategory::INV,
  415. SVType::CNV => AlterationCategory::CNV,
  416. SVType::BND => AlterationCategory::BND,
  417. }
  418. }
  419. }
  420. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  421. pub enum SVType {
  422. DEL,
  423. INS,
  424. DUP,
  425. INV,
  426. CNV,
  427. BND,
  428. }
  429. impl FromStr for SVType {
  430. type Err = anyhow::Error;
  431. fn from_str(s: &str) -> anyhow::Result<Self> {
  432. match s {
  433. "DEL" => Ok(SVType::DEL),
  434. "INS" => Ok(SVType::INS),
  435. "DUP" => Ok(SVType::DUP),
  436. "INV" => Ok(SVType::INV),
  437. "CNV" => Ok(SVType::CNV),
  438. "BND" => Ok(SVType::BND),
  439. _ => Err(anyhow!("Can't parse SVTYPE={s}")),
  440. }
  441. }
  442. }
  443. impl fmt::Display for SVType {
  444. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  445. write!(
  446. f,
  447. "{}",
  448. match self {
  449. SVType::DEL => "DEL",
  450. SVType::INS => "INS",
  451. SVType::DUP => "DUP",
  452. SVType::INV => "INV",
  453. SVType::CNV => "CNV",
  454. SVType::BND => "BND",
  455. }
  456. )
  457. }
  458. }
  459. impl VariantId for VcfVariant {
  460. fn variant_id(&self) -> String {
  461. format!("{}_{}>{}", self.position, self.reference, self.alternative)
  462. }
  463. }
  464. impl GetGenomePosition for VcfVariant {
  465. fn position(&self) -> &GenomePosition {
  466. &self.position
  467. }
  468. }
  469. impl PartialOrd for VcfVariant {
  470. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  471. Some(self.cmp(other))
  472. }
  473. }
  474. impl Ord for VcfVariant {
  475. fn cmp(&self, other: &Self) -> Ordering {
  476. self.position.cmp(&other.position)
  477. }
  478. }
  479. /// Info
  480. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  481. pub struct Infos(pub Vec<Info>);
  482. impl FromStr for Infos {
  483. type Err = anyhow::Error;
  484. fn from_str(s: &str) -> anyhow::Result<Self> {
  485. Ok(Self(
  486. s.split(";")
  487. .map(Info::from_str)
  488. .collect::<Result<Vec<Info>, _>>()
  489. .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
  490. ))
  491. }
  492. }
  493. impl fmt::Display for Infos {
  494. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  495. write!(
  496. f,
  497. "{}",
  498. self.0
  499. .iter()
  500. .map(|e| e.to_string())
  501. .collect::<Vec<String>>()
  502. .join(";")
  503. )
  504. }
  505. }
  506. #[allow(non_camel_case_types)]
  507. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  508. pub enum Info {
  509. Empty,
  510. H,
  511. F,
  512. P,
  513. FAU(u32),
  514. FCU(u32),
  515. FGU(u32),
  516. FTU(u32),
  517. RAU(u32),
  518. RCU(u32),
  519. RGU(u32),
  520. RTU(u32),
  521. SVTYPE(SVType),
  522. MATEID(String),
  523. NORMAL_READ_SUPPORT(u32),
  524. TUMOUR_READ_SUPPORT(u32),
  525. NORMAL_ALN_SUPPORT(u32),
  526. TUMOUR_ALN_SUPPORT(u32),
  527. SVLEN(i32),
  528. TUMOUR_DP_BEFORE(Vec<u32>),
  529. TUMOUR_DP_AT(Vec<u32>),
  530. TUMOUR_DP_AFTER(Vec<u32>),
  531. NORMAL_DP_BEFORE(Vec<u32>),
  532. NORMAL_DP_AT(Vec<u32>),
  533. NORMAL_DP_AFTER(Vec<u32>),
  534. TUMOUR_AF(Vec<f32>),
  535. NORMAL_AF(Vec<f32>),
  536. BP_NOTATION(String),
  537. SOURCE(String),
  538. CLUSTERED_READS_TUMOUR(u32),
  539. CLUSTERED_READS_NORMAL(u32),
  540. TUMOUR_ALT_HP(Vec<u32>),
  541. TUMOUR_PS(Vec<String>),
  542. NORMAL_ALT_HP(Vec<u32>),
  543. NORMAL_PS(Vec<String>),
  544. TUMOUR_TOTAL_HP_AT(Vec<u32>),
  545. NORMAL_TOTAL_HP_AT(Vec<u32>),
  546. ORIGIN_STARTS_STD_DEV(f32),
  547. ORIGIN_MAPQ_MEAN(f32),
  548. ORIGIN_EVENT_SIZE_STD_DEV(f32),
  549. ORIGIN_EVENT_SIZE_MEDIAN(f32),
  550. ORIGIN_EVENT_SIZE_MEAN(f32),
  551. END_STARTS_STD_DEV(f32),
  552. END_MAPQ_MEAN(f32),
  553. END_EVENT_SIZE_STD_DEV(f32),
  554. END_EVENT_SIZE_MEDIAN(f32),
  555. END_EVENT_SIZE_MEAN(f32),
  556. CLASS(String),
  557. END(u32),
  558. SVINSLEN(u32),
  559. SVINSSEQ(String),
  560. }
  561. impl FromStr for Info {
  562. type Err = anyhow::Error;
  563. fn from_str(s: &str) -> anyhow::Result<Self> {
  564. if s.contains('=') {
  565. let (key, value) = s
  566. .split_once('=')
  567. .context(format!("Can't split with `=` in string: {s}"))?;
  568. Ok(match key {
  569. "FAU" => Info::FAU(parse_value(value, key)?),
  570. "FCU" => Info::FCU(parse_value(value, key)?),
  571. "FGU" => Info::FGU(parse_value(value, key)?),
  572. "FTU" => Info::FTU(parse_value(value, key)?),
  573. "RAU" => Info::RAU(parse_value(value, key)?),
  574. "RCU" => Info::RCU(parse_value(value, key)?),
  575. "RGU" => Info::RGU(parse_value(value, key)?),
  576. "RTU" => Info::RTU(parse_value(value, key)?),
  577. "SVLEN" => Info::SVLEN(parse_value(value, key)?),
  578. "END" => Info::END(parse_value(value, key)?),
  579. "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
  580. "SVTYPE" => Info::SVTYPE(value.parse()?),
  581. "MATEID" => Info::MATEID(value.to_string()),
  582. "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
  583. "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
  584. "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
  585. "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
  586. "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
  587. "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
  588. "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
  589. "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
  590. "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
  591. "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
  592. "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
  593. "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
  594. "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
  595. "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
  596. "SOURCE" => Info::SOURCE(value.to_string()),
  597. "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
  598. "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
  599. "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
  600. "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
  601. "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
  602. "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
  603. "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
  604. "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
  605. "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
  606. "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
  607. "ORIGIN_EVENT_SIZE_STD_DEV" => {
  608. Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
  609. }
  610. "ORIGIN_EVENT_SIZE_MEDIAN" => {
  611. Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
  612. }
  613. "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
  614. "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
  615. "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
  616. "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
  617. "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
  618. "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
  619. "CLASS" => Info::CLASS(value.to_string()),
  620. _ => Info::Empty,
  621. })
  622. } else {
  623. Ok(match s {
  624. "H" => Info::H,
  625. "F" => Info::F,
  626. "P" => Info::P,
  627. _ => Info::Empty,
  628. })
  629. }
  630. }
  631. }
  632. impl fmt::Display for Info {
  633. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  634. match self {
  635. Info::Empty => write!(f, "."),
  636. Info::H => write!(f, "H"),
  637. Info::F => write!(f, "F"),
  638. Info::P => write!(f, "P"),
  639. Info::FAU(v) => write!(f, "FAU={v}"),
  640. Info::FCU(v) => write!(f, "FCU={v}"),
  641. Info::FGU(v) => write!(f, "FGU={v}"),
  642. Info::FTU(v) => write!(f, "FTU={v}"),
  643. Info::RAU(v) => write!(f, "RAU={v}"),
  644. Info::RCU(v) => write!(f, "RCU={v}"),
  645. Info::RGU(v) => write!(f, "RGU={v}"),
  646. Info::RTU(v) => write!(f, "RTU={v}"),
  647. Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
  648. Info::SVLEN(v) => write!(f, "SVLEN={v}"),
  649. Info::END(v) => write!(f, "END={v}"),
  650. Info::MATEID(v) => write!(f, "MATEID={v}"),
  651. Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
  652. Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
  653. Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
  654. Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
  655. Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
  656. Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
  657. Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
  658. Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
  659. Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
  660. Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
  661. Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
  662. Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
  663. Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
  664. Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
  665. Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
  666. Info::SOURCE(v) => write!(f, "SOURCE={v}"),
  667. Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
  668. Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
  669. Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
  670. Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
  671. Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
  672. Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
  673. Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
  674. Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
  675. Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
  676. Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
  677. Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
  678. Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
  679. Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
  680. Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
  681. Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
  682. Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
  683. Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
  684. Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
  685. Info::CLASS(v) => write!(f, "CLASS={v}"),
  686. }
  687. }
  688. }
  689. pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
  690. v.iter()
  691. .map(|n| n.to_string())
  692. .collect::<Vec<String>>()
  693. .join(",")
  694. }
  695. /// Format
  696. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  697. pub enum Format {
  698. // DeepVariant
  699. GT(String),
  700. GQ(u32),
  701. DP(u32),
  702. AD(Vec<u32>),
  703. VAF(f32),
  704. PL(Vec<u32>),
  705. // Clairs
  706. // when format begins with N: normal
  707. // AF(f32),
  708. // NAF(f32), // DP(u32),
  709. NDP(u32),
  710. NAD(Vec<u32>),
  711. AU(u32),
  712. CU(u32),
  713. GU(u32),
  714. TU(u32),
  715. NAU(u32),
  716. NCU(u32),
  717. NGU(u32),
  718. NTU(u32),
  719. // nanomonsv
  720. TR(u32),
  721. VR(u32),
  722. Other((String, String)), // (key, value)
  723. }
  724. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  725. pub struct Formats(pub Vec<Format>);
  726. impl Formats {
  727. /// Get the tumoral alternative read depth and total depth as an Option<(u32, u32)>.
  728. pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
  729. let mut tumor_alt_depth: Option<u32> = None;
  730. let mut tumor_total_depth: Option<u32> = None;
  731. for format in &self.0 {
  732. match format {
  733. // Tumor Allelic Depth (AD)
  734. Format::AD(values) => {
  735. if values.len() > 1 {
  736. // Sum all alternative allele depths (excluding reference allele)
  737. tumor_alt_depth = Some(values[1..].iter().sum());
  738. }
  739. }
  740. // Tumor Total Depth (DP)
  741. Format::DP(value) => {
  742. tumor_total_depth = Some(*value);
  743. }
  744. _ => {}
  745. }
  746. }
  747. // Return a tuple (tumor_alt_depth, tumor_total_depth) if both are available
  748. match (tumor_alt_depth, tumor_total_depth) {
  749. (Some(alt), Some(total)) => Some((alt, total)),
  750. _ => None,
  751. }
  752. }
  753. }
  754. impl TryFrom<(&str, &str)> for Formats {
  755. type Error = anyhow::Error;
  756. fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
  757. let keys: Vec<&str> = k.split(':').collect();
  758. let values: Vec<&str> = v.split(':').collect();
  759. if keys.len() != values.len() {
  760. anyhow::bail!("Mismatch between keys and values count for {k} {v}");
  761. }
  762. Ok(Self(
  763. keys.into_iter()
  764. .zip(values)
  765. .map(|(key, value)| Format::try_from((key, value)))
  766. .collect::<Result<Vec<Format>, _>>()
  767. .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
  768. ))
  769. }
  770. }
  771. impl From<Formats> for (String, String) {
  772. fn from(formats: Formats) -> Self {
  773. let mut keys = Vec::new();
  774. let mut values = Vec::new();
  775. for format in formats.0 {
  776. let (key, value): (String, String) = format.into();
  777. keys.push(key);
  778. values.push(value);
  779. }
  780. (keys.join(":"), values.join(":"))
  781. }
  782. }
  783. impl TryFrom<(&str, &str)> for Format {
  784. type Error = anyhow::Error;
  785. fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
  786. let format = match key {
  787. "GT" => Format::GT(value.to_string()),
  788. "GQ" => Format::GQ(parse_value(value, key)?),
  789. "DP" => Format::DP(parse_value(value, key)?),
  790. "AD" => Format::AD(parse_vec_value(value, key)?),
  791. "VAF" => Format::VAF(parse_value(value, key)?),
  792. // "AF" => Format::AF(parse_value(value, key)?),
  793. // "NAF" => Format::NAF(parse_value(value, key)?),
  794. "NDP" => Format::NDP(parse_value(value, key)?),
  795. "NAD" => Format::NAD(parse_vec_value(value, key)?),
  796. "AU" => Format::AU(parse_value(value, key)?),
  797. "CU" => Format::CU(parse_value(value, key)?),
  798. "GU" => Format::GU(parse_value(value, key)?),
  799. "TU" => Format::TU(parse_value(value, key)?),
  800. "NAU" => Format::NAU(parse_value(value, key)?),
  801. "NCU" => Format::NCU(parse_value(value, key)?),
  802. "NGU" => Format::NGU(parse_value(value, key)?),
  803. "NTU" => Format::NTU(parse_value(value, key)?),
  804. "PL" => Format::PL(parse_vec_value(value, key)?),
  805. "TR" => Format::TR(parse_value(value, key)?),
  806. "VR" => Format::VR(parse_value(value, key)?),
  807. _ => Format::Other((key.to_string(), value.to_string())),
  808. };
  809. Ok(format)
  810. }
  811. }
  812. // Helper function to parse a single value (DeepSeek)
  813. fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
  814. where
  815. T::Err: std::fmt::Debug,
  816. {
  817. value
  818. .parse()
  819. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  820. .context(format!("Can't parse {}: {}", key, value)) // Add context
  821. }
  822. // Helper function to parse comma-separated values (DeepSeek)
  823. fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
  824. where
  825. T::Err: std::fmt::Debug,
  826. {
  827. value
  828. .split(',')
  829. .map(|e| {
  830. e.parse()
  831. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  832. .context(format!("Failed to parse {}: {}", key, e)) // Add context
  833. })
  834. .collect()
  835. }
  836. impl From<Format> for (String, String) {
  837. fn from(format: Format) -> Self {
  838. let concat = |values: Vec<u32>| -> String {
  839. values
  840. .iter()
  841. .map(|v| v.to_string())
  842. .collect::<Vec<_>>()
  843. .join(",")
  844. };
  845. match format {
  846. Format::GT(value) => ("GT".to_string(), value),
  847. Format::GQ(value) => ("GQ".to_string(), value.to_string()),
  848. Format::DP(value) => ("DP".to_string(), value.to_string()),
  849. Format::AD(values) => ("AD".to_string(), concat(values)),
  850. Format::VAF(value) => ("VAF".to_string(), value.to_string()),
  851. Format::PL(values) => ("PL".to_string(), concat(values)),
  852. Format::Other((key, value)) => (key, value),
  853. // Format::AF(value) => ("AF".to_string(), value.to_string()),
  854. // Format::NAF(value) => ("NAF".to_string(), value.to_string()),
  855. Format::NDP(value) => ("NDP".to_string(), value.to_string()),
  856. Format::NAD(values) => ("NAD".to_string(), concat(values)),
  857. Format::AU(value) => ("AU".to_string(), value.to_string()),
  858. Format::CU(value) => ("CU".to_string(), value.to_string()),
  859. Format::GU(value) => ("GU".to_string(), value.to_string()),
  860. Format::TU(value) => ("TU".to_string(), value.to_string()),
  861. Format::NAU(value) => ("NAU".to_string(), value.to_string()),
  862. Format::NCU(value) => ("NCU".to_string(), value.to_string()),
  863. Format::NGU(value) => ("NGU".to_string(), value.to_string()),
  864. Format::NTU(value) => ("NTU".to_string(), value.to_string()),
  865. Format::TR(value) => ("TR".to_string(), value.to_string()),
  866. Format::VR(value) => ("VR".to_string(), value.to_string()),
  867. }
  868. }
  869. }
  870. impl Formats {
  871. pub fn commun_deepvariant_clairs(&self) -> Self {
  872. let filtered_vec: Vec<Format> = self
  873. .0
  874. .clone()
  875. .into_iter()
  876. .map(|e| {
  877. if let Format::VAF(_v) = e {
  878. e
  879. // Format::AF(v)
  880. } else {
  881. e
  882. }
  883. })
  884. .filter(|format| {
  885. matches!(
  886. format,
  887. Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
  888. )
  889. })
  890. .collect();
  891. Formats(filtered_vec)
  892. }
  893. }
  894. /// Filter
  895. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  896. pub enum Filter {
  897. PASS,
  898. Other(String),
  899. }
  900. impl FromStr for Filter {
  901. type Err = anyhow::Error;
  902. fn from_str(s: &str) -> anyhow::Result<Self> {
  903. match s {
  904. "PASS" => Ok(Filter::PASS),
  905. _ => Ok(Filter::Other(s.to_string())),
  906. }
  907. }
  908. }
  909. impl fmt::Display for Filter {
  910. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  911. match self {
  912. Filter::PASS => write!(f, "PASS"),
  913. Filter::Other(ref s) => write!(f, "{}", s),
  914. }
  915. }
  916. }
  917. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  918. pub enum ReferenceAlternative {
  919. Nucleotide(Base),
  920. Nucleotides(Vec<Base>),
  921. Unstructured(String),
  922. }
  923. impl FromStr for ReferenceAlternative {
  924. type Err = anyhow::Error;
  925. fn from_str(s: &str) -> anyhow::Result<Self> {
  926. let possible_bases = s.as_bytes().iter();
  927. let mut res: Vec<Base> = Vec::new();
  928. for &base in possible_bases {
  929. match base.try_into() {
  930. std::result::Result::Ok(b) => res.push(b),
  931. Err(_) => {
  932. return Ok(Self::Unstructured(s.to_string()));
  933. }
  934. }
  935. }
  936. if res.len() == 1 {
  937. Ok(Self::Nucleotide(res.pop().unwrap()))
  938. } else {
  939. Ok(Self::Nucleotides(res))
  940. }
  941. }
  942. }
  943. impl fmt::Display for ReferenceAlternative {
  944. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  945. let string = match self {
  946. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  947. ReferenceAlternative::Nucleotides(bases) => bases
  948. .iter()
  949. .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
  950. ReferenceAlternative::Unstructured(s) => s.to_string(),
  951. };
  952. write!(f, "{}", string)
  953. }
  954. }
  955. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  956. pub enum Base {
  957. A,
  958. T,
  959. C,
  960. G,
  961. N,
  962. }
  963. impl TryFrom<u8> for Base {
  964. type Error = anyhow::Error;
  965. fn try_from(base: u8) -> anyhow::Result<Self> {
  966. match base {
  967. b'A' => Ok(Base::A),
  968. b'T' => Ok(Base::T),
  969. b'C' => Ok(Base::C),
  970. b'G' => Ok(Base::G),
  971. b'N' => Ok(Base::N),
  972. _ => Err(anyhow::anyhow!(
  973. "Unknown base: {}",
  974. String::from_utf8_lossy(&[base])
  975. )),
  976. }
  977. }
  978. }
  979. impl Base {
  980. pub fn into_u8(self) -> u8 {
  981. match self {
  982. Base::A => b'A',
  983. Base::T => b'T',
  984. Base::C => b'C',
  985. Base::G => b'G',
  986. Base::N => b'N',
  987. }
  988. }
  989. }
  990. impl fmt::Display for Base {
  991. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  992. // Use `self.number` to refer to each positional data point.
  993. let str = match self {
  994. Base::A => "A",
  995. Base::T => "T",
  996. Base::C => "C",
  997. Base::G => "G",
  998. Base::N => "N",
  999. };
  1000. write!(f, "{}", str)
  1001. }
  1002. }
  1003. pub trait Variants {
  1004. fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
  1005. }
  1006. pub trait VariantId {
  1007. fn variant_id(&self) -> String;
  1008. }
  1009. pub trait Label {
  1010. fn label(&self) -> String;
  1011. }
  1012. /// A trait alias for all dynamically executable pipeline runners.
  1013. ///
  1014. /// This trait represents any component that:
  1015. /// - Can decide whether it needs to run (`ShouldRun`)
  1016. /// - Implements the actual execution logic (`Run`)
  1017. /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently
  1018. ///
  1019. /// Components implementing this trait can be boxed as `ShouldRunBox`.
  1020. pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {}
  1021. /// Blanket implementation for all compatible types.
  1022. impl<T> ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {}
  1023. /// A boxed trait object to hold any runner implementing `ShouldRunTrait`.
  1024. pub type ShouldRunBox = Box<dyn ShouldRunTrait>;
  1025. /// Macro to initialize and box multiple `ShouldRunTrait` components.
  1026. ///
  1027. /// # Arguments
  1028. /// * `$id` - Sample ID (typically a string slice)
  1029. /// * `$config` - Shared configuration object
  1030. /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait`
  1031. ///
  1032. /// # Returns
  1033. /// A vector of boxed runner components (`Vec<ShouldRunBox>`)
  1034. ///
  1035. /// # Example
  1036. /// ```rust
  1037. /// let modules: Vec<ShouldRunBox> = create_should_run!(
  1038. /// "sample_42",
  1039. /// config,
  1040. /// ClairS,
  1041. /// Savana,
  1042. /// DeepSomatic,
  1043. /// )?;
  1044. /// ```
  1045. ///
  1046. /// # Errors
  1047. /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`.
  1048. #[macro_export]
  1049. macro_rules! create_should_run {
  1050. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1051. use anyhow::Context;
  1052. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1053. $(
  1054. let runner = <$runner>::initialize($id, $config.clone())
  1055. .with_context(|| format!(
  1056. "Failed to initialize should-run checker {} for {}",
  1057. stringify!($runner), $id
  1058. ))?;
  1059. runners.push(Box::new(runner) as ShouldRunBox);
  1060. )+
  1061. runners
  1062. }};
  1063. }
  1064. /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`.
  1065. ///
  1066. /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`),
  1067. /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal").
  1068. ///
  1069. /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression.
  1070. ///
  1071. /// # Arguments
  1072. /// - `$id`: The sample ID (`&str`) passed to each initializer
  1073. /// - `$config`: A `Config` object (must be cloneable)
  1074. /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`)
  1075. ///
  1076. /// # Returns
  1077. /// A `Vec<ShouldRunBox>` with boxed runners initialized per timepoint.
  1078. ///
  1079. /// # Example
  1080. /// ```rust
  1081. /// let solo_callers = create_should_run_solo!(
  1082. /// "sample42",
  1083. /// config,
  1084. /// DeepVariant, config.tumoral_name,
  1085. /// DeepVariant, config.normal_name,
  1086. /// )?;
  1087. /// ```
  1088. ///
  1089. /// # Notes
  1090. /// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd".
  1091. ///
  1092. /// # Errors
  1093. /// This macro uses `?` and must be called inside a `Result`-returning context.
  1094. #[macro_export]
  1095. macro_rules! create_should_run_solo {
  1096. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  1097. use anyhow::Context;
  1098. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1099. $(
  1100. let runner = <$runner>::initialize($id, $arg, $config.clone())
  1101. .with_context(|| format!(
  1102. "Failed to initialize solo should-run checker {} for '{}' and arg '{}'",
  1103. stringify!($runner), $id, $arg
  1104. ))?;
  1105. runners.push(Box::new(runner) as ShouldRunBox);
  1106. )+
  1107. runners
  1108. }};
  1109. }
  1110. /// Macro to initialize and box a list of pipeline components that must run once per timepoint
  1111. /// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`.
  1112. ///
  1113. /// This is typically used for variant callers like `DeepVariant` that operate in **solo mode**
  1114. /// but must be run twice — once for the tumoral sample and once for the normal sample.
  1115. ///
  1116. /// The macro:
  1117. /// - Calls `.initialize(id, timepoint, config.clone())` twice per type
  1118. /// - Uses `config.tumoral_name` and `config.normal_name` as timepoints
  1119. /// - Returns a flat `Vec<ShouldRunBox>` containing both instances for each type
  1120. ///
  1121. /// # Arguments
  1122. /// - `$id`: The sample ID (`&str`)
  1123. /// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`)
  1124. /// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)`
  1125. ///
  1126. /// # Example
  1127. /// ```rust
  1128. /// let runners = create_should_run_normal_tumoral!(
  1129. /// "sample_42",
  1130. /// config,
  1131. /// DeepVariant,
  1132. /// AnotherCaller,
  1133. /// )?;
  1134. /// ```
  1135. ///
  1136. /// This will expand to:
  1137. /// ```rust
  1138. /// vec![
  1139. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  1140. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  1141. /// Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  1142. /// Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  1143. /// ]
  1144. /// ```
  1145. ///
  1146. /// # Errors
  1147. /// This macro uses `?`, so it must be called inside a function that returns `Result`.
  1148. #[macro_export]
  1149. macro_rules! create_should_run_normal_tumoral {
  1150. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1151. use anyhow::Context;
  1152. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1153. $(
  1154. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  1155. .with_context(|| format!(
  1156. "Failed to initialize tumoral should-run checker {} for {}",
  1157. stringify!($runner), $id
  1158. ))?;
  1159. runners.push(Box::new(tumoral) as ShouldRunBox);
  1160. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  1161. .with_context(|| format!(
  1162. "Failed to initialize normal should-run checker {} for {}",
  1163. stringify!($runner), $id
  1164. ))?;
  1165. runners.push(Box::new(normal) as ShouldRunBox);
  1166. )+
  1167. runners
  1168. }};
  1169. }
  1170. /// Executes each runner in the slice only if `should_run()` returns true.
  1171. ///
  1172. /// # Arguments
  1173. /// * `iterable` - A mutable slice of boxed `InitRun` components
  1174. ///
  1175. /// # Returns
  1176. /// * `Ok(())` if all required runners execute successfully
  1177. /// * An error if any runner's `run()` method fails
  1178. ///
  1179. /// # Notes
  1180. /// - This function will skip runners that return `false` from `should_run()`
  1181. pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> {
  1182. iterable.iter_mut().try_for_each(|e| {
  1183. if e.should_run() {
  1184. e.run()
  1185. } else {
  1186. info!("Skipping running: {}", e.label());
  1187. Ok(())
  1188. }
  1189. })
  1190. }
  1191. /// A trait alias for all variant callers that support initialization, execution,
  1192. /// conditional re-running, and variant extraction (VCF + annotations).
  1193. ///
  1194. /// Used to enable polymorphic handling of both solo and somatic callers in the pipeline.
  1195. pub trait RunnerVariants: Variants + Send + Sync + Label {}
  1196. /// Blanket implementation for all compatible types.
  1197. impl<T> RunnerVariants for T where T: Variants + Send + Sync + Label {}
  1198. pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
  1199. #[macro_export]
  1200. macro_rules! init_somatic_callers {
  1201. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1202. use anyhow::Context;
  1203. let mut callers: Vec<CallerBox> = Vec::new();
  1204. $(
  1205. let caller = <$runner>::initialize($id, $config.clone())
  1206. .with_context(|| format!(
  1207. "Failed to initialize somatic caller {} for {}",
  1208. stringify!($runner), $id
  1209. ))?;
  1210. callers.push(Box::new(caller) as CallerBox);
  1211. )+
  1212. callers
  1213. }};
  1214. }
  1215. /// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints,
  1216. /// where each runner implements `RunnerVariants`.
  1217. ///
  1218. /// This is useful for callers like `DeepVariant` that need to be instantiated with a specific
  1219. /// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`).
  1220. ///
  1221. /// Each entry must be a pair: a runner type and a timepoint expression (usually from config).
  1222. ///
  1223. /// # Arguments
  1224. /// - `$id`: The sample ID (`&str`)
  1225. /// - `$config`: The configuration object (must be cloneable)
  1226. /// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs
  1227. ///
  1228. /// # Returns
  1229. /// A `Vec<CallerBox>` containing initialized, boxed solo-mode variant callers.
  1230. ///
  1231. /// # Example
  1232. /// ```rust
  1233. /// let solo_callers = init_solo_callers!(
  1234. /// "sample_42",
  1235. /// config,
  1236. /// DeepVariant, config.tumoral_name,
  1237. /// DeepVariant, config.normal_name,
  1238. /// )?;
  1239. /// ```
  1240. ///
  1241. /// This will expand to:
  1242. /// ```rust
  1243. /// vec![
  1244. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1245. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1246. /// ]
  1247. /// ```
  1248. ///
  1249. /// # Errors
  1250. /// This macro uses `?` internally, so it must be used inside a `Result`-returning context.
  1251. #[macro_export]
  1252. macro_rules! init_solo_callers {
  1253. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  1254. let mut callers: Vec<CallerBox> = Vec::new();
  1255. $(
  1256. let caller = <$runner>::initialize($id, $arg, $config.clone())
  1257. .with_context(|| format!(
  1258. "Failed to initialize caller {} for {}",
  1259. stringify!($runner), $id
  1260. ))?;
  1261. callers.push(Box::new(caller) as CallerBox);
  1262. )+
  1263. callers
  1264. }};
  1265. }
  1266. /// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints.
  1267. ///
  1268. /// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require
  1269. /// separate execution for each timepoint. It will:
  1270. /// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name`
  1271. /// - Box the result as `CallerBox`
  1272. ///
  1273. /// # Arguments
  1274. /// - `$id`: Sample ID (usually a `&str`)
  1275. /// - `$config`: Cloneable configuration object
  1276. /// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants`
  1277. ///
  1278. /// # Returns
  1279. /// A `Vec<CallerBox>` containing two boxed instances per runner (one for each timepoint).
  1280. ///
  1281. /// # Example
  1282. /// ```rust
  1283. /// let solo_callers = init_solo_callers_normal_tumoral!(
  1284. /// "sample_42",
  1285. /// config,
  1286. /// DeepVariant,
  1287. /// OtherSoloCaller,
  1288. /// )?;
  1289. /// ```
  1290. ///
  1291. /// This expands to:
  1292. /// ```rust
  1293. /// vec![
  1294. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1295. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1296. /// Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1297. /// Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1298. /// ]
  1299. /// ```
  1300. ///
  1301. /// # Errors
  1302. /// This macro uses `?`, so it must be called inside a `Result`-returning context.
  1303. #[macro_export]
  1304. macro_rules! init_solo_callers_normal_tumoral {
  1305. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1306. use anyhow::Context;
  1307. let mut callers: Vec<CallerBox> = Vec::new();
  1308. $(
  1309. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  1310. .with_context(|| format!(
  1311. "Failed to initialize tumoral caller {} for {} '{}'",
  1312. stringify!($runner), $id, $config.tumoral_name
  1313. ))?;
  1314. callers.push(Box::new(tumoral) as CallerBox);
  1315. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  1316. .with_context(|| format!(
  1317. "Failed to initialize normal caller {} for {} '{}'",
  1318. stringify!($runner), $id, $config.normal_name
  1319. ))?;
  1320. callers.push(Box::new(normal) as CallerBox);
  1321. )+
  1322. callers
  1323. }};
  1324. }
  1325. // pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
  1326. // iterable
  1327. // .iter_mut()
  1328. // .try_for_each(|runner| runner.run())
  1329. // .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
  1330. // }
  1331. pub fn load_variants(
  1332. iterable: &mut [CallerBox],
  1333. annotations: &Annotations,
  1334. ) -> anyhow::Result<Vec<VariantCollection>> {
  1335. iterable
  1336. .par_iter()
  1337. .map(|runner| {
  1338. let result = runner.variants(annotations);
  1339. if let Err(ref e) = result {
  1340. error!("Failed to load variants from: {}\n{e}", runner.label());
  1341. } else {
  1342. info!("Variants from {} loaded.", runner.label());
  1343. };
  1344. result
  1345. })
  1346. .filter(|r| r.is_ok())
  1347. .collect::<anyhow::Result<Vec<_>>>()
  1348. .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  1349. }
  1350. // pub fn par_load_variants(
  1351. // iterable: &mut [Box<dyn Variants + Send + Sync>],
  1352. // annotations: &Annotations,
  1353. // ) -> anyhow::Result<Vec<VariantCollection>> {
  1354. // iterable
  1355. // .par_iter()
  1356. // .map(|runner| {
  1357. // let r = runner.variants(annotations);
  1358. // if let Err(ref e) = r {
  1359. // warn!("{e}");
  1360. // };
  1361. // r
  1362. // })
  1363. // .filter(|r| r.is_ok())
  1364. // .collect::<anyhow::Result<Vec<_>>>()
  1365. // .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  1366. // }
  1367. pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
  1368. vec1: &[T],
  1369. vec2: &[T],
  1370. ) -> (Vec<T>, Vec<T>, Vec<T>) {
  1371. let set1: HashSet<_> = vec1.par_iter().cloned().collect();
  1372. let set2: HashSet<_> = vec2.par_iter().cloned().collect();
  1373. let common: Vec<T> = set1
  1374. .par_iter()
  1375. .filter(|item| set2.contains(item))
  1376. .cloned()
  1377. .collect();
  1378. let only_in_first: Vec<T> = set1
  1379. .par_iter()
  1380. .filter(|item| !set2.contains(item))
  1381. .cloned()
  1382. .collect();
  1383. let only_in_second: Vec<T> = set2
  1384. .par_iter()
  1385. .filter(|item| !set1.contains(item))
  1386. .cloned()
  1387. .collect();
  1388. (common, only_in_first, only_in_second)
  1389. }