variant.rs 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451
  1. use crate::{
  2. annotation::Annotations,
  3. collection::ShouldRun,
  4. helpers::Hash128,
  5. positions::{GenomePosition, GetGenomePosition, VcfPosition},
  6. runners::Run,
  7. variant::variant_collection::VariantCollection,
  8. };
  9. use anyhow::{anyhow, Context};
  10. use bitcode::{Decode, Encode};
  11. use log::{error, info};
  12. use rayon::prelude::*;
  13. use serde::{Deserialize, Serialize};
  14. use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
  15. /// Represents a variant in the Variant Call Format (VCF).
  16. #[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
  17. pub struct VcfVariant {
  18. /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
  19. pub hash: Hash128,
  20. /// The genomic position of the variant.
  21. pub position: GenomePosition,
  22. /// The identifier of the variant.
  23. pub id: String,
  24. /// The reference allele.
  25. pub reference: ReferenceAlternative,
  26. /// The alternative allele.
  27. pub alternative: ReferenceAlternative,
  28. /// The quality score of the variant call, if available.
  29. pub quality: Option<f32>,
  30. /// The filter status of the variant.
  31. pub filter: Filter,
  32. /// Additional information about the variant.
  33. pub infos: Infos,
  34. /// Genotype information and other sample-specific data.
  35. pub formats: Formats,
  36. }
  37. impl PartialEq for VcfVariant {
  38. /// Compares two VcfVariants for equality.
  39. ///
  40. /// Note: This comparison only considers position, reference, and alternative.
  41. /// It intentionally ignores id, filter, info, format, and quality.
  42. fn eq(&self, other: &Self) -> bool {
  43. // Nota bene: id, filter, info, format and quality is intentionally not compared
  44. self.position == other.position
  45. && self.reference == other.reference
  46. && self.alternative == other.alternative
  47. }
  48. }
  49. impl Eq for VcfVariant {}
  50. impl FromStr for VcfVariant {
  51. type Err = anyhow::Error;
  52. /// Parses a VcfVariant from a string representation.
  53. ///
  54. /// The input string is expected to be a tab-separated VCF line.
  55. ///
  56. /// # Errors
  57. ///
  58. /// Returns an error if parsing fails for any field.
  59. fn from_str(s: &str) -> anyhow::Result<Self> {
  60. let v: Vec<&str> = s.split('\t').collect();
  61. let vcf_position: VcfPosition = (
  62. *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
  63. *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
  64. )
  65. .try_into()
  66. .context(format!("Can't parse position from: {s}"))?;
  67. let formats = if v.len() >= 10 {
  68. (
  69. *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  70. *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  71. )
  72. .try_into()
  73. .context(format!("Can't parse formats from: {s}"))?
  74. } else {
  75. Formats::default()
  76. };
  77. let position: GenomePosition = vcf_position.into();
  78. let reference: ReferenceAlternative = v
  79. .get(3)
  80. .ok_or(anyhow!("Can't parse reference from: {s}"))?
  81. .parse()
  82. .context(format!("Can't parse reference from: {s}"))?;
  83. let alternative: ReferenceAlternative = v
  84. .get(4)
  85. .ok_or(anyhow!("Can't parse alternative from: {s}"))?
  86. .parse()
  87. .context(format!("Can't parse alternative from: {s}"))?;
  88. // Blake3 128 bytes Hash
  89. let mut hasher = blake3::Hasher::new();
  90. hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
  91. hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
  92. hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
  93. hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
  94. let hash = hasher.finalize();
  95. let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
  96. Ok(Self {
  97. hash,
  98. position,
  99. id: v
  100. .get(2)
  101. .ok_or(anyhow!("Can't parse id from: {s}"))?
  102. .to_string(),
  103. reference,
  104. alternative,
  105. quality: v
  106. .get(5)
  107. .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
  108. .unwrap_or(None),
  109. filter: v
  110. .get(6)
  111. .ok_or(anyhow!("Can't parse filter from: {s}"))?
  112. .parse()
  113. .context(format!("Can't parse filter from: {s}"))?,
  114. infos: v
  115. .get(7)
  116. .ok_or(anyhow!("Can't parse infos from: {s}"))?
  117. .parse()
  118. .context(format!("Can't parse infos from: {s}"))?,
  119. formats,
  120. })
  121. }
  122. }
  123. // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
  124. impl VcfVariant {
  125. /// Converts the VcfVariant into a VCF-formatted row string.
  126. ///
  127. /// This method creates a tab-separated string representation of the variant,
  128. /// suitable for writing to a VCF file.
  129. pub fn into_vcf_row(&self) -> String {
  130. let vcf_position: VcfPosition = self.position.clone().into();
  131. let (contig, position) = vcf_position.into();
  132. let mut columns = vec![
  133. contig,
  134. position,
  135. self.id.to_string(),
  136. self.reference.to_string(),
  137. self.alternative.to_string(),
  138. self.quality
  139. .map(|v| v.to_string())
  140. .unwrap_or(".".to_string()),
  141. self.filter.to_string(),
  142. self.infos.to_string(),
  143. ];
  144. if !self.formats.0.is_empty() {
  145. let (format, values) = self.formats.clone().into();
  146. columns.push(format);
  147. columns.push(values);
  148. }
  149. columns.join("\t")
  150. }
  151. /// Returns the hash of the variant.
  152. pub fn hash(&self) -> Hash128 {
  153. self.hash
  154. }
  155. /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
  156. ///
  157. /// This method generates a new variant with shared properties, resetting some fields
  158. /// to default or empty values.
  159. pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
  160. VcfVariant {
  161. hash: self.hash,
  162. position: self.position.clone(),
  163. id: self.id.clone(),
  164. reference: self.reference.clone(),
  165. alternative: self.alternative.clone(),
  166. quality: self.quality,
  167. filter: Filter::Other(".".to_string()),
  168. infos: Infos(vec![Info::Empty]),
  169. formats: self.formats.commun_deepvariant_clairs(),
  170. }
  171. }
  172. /// Checks if the variant has an SVTYPE info field.
  173. ///
  174. /// Returns true if the variant contains structural variation type information.
  175. pub fn has_svtype(&self) -> bool {
  176. self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
  177. }
  178. /// Retrieves the structural variation type of the variant, if present.
  179. ///
  180. /// Returns Some(SVType) if the variant has an SVTYPE info field,
  181. pub fn svtype(&self) -> Option<SVType> {
  182. self.infos.0.iter().find_map(|e| {
  183. if let Info::SVTYPE(sv_type) = e {
  184. Some(sv_type.clone())
  185. } else {
  186. None
  187. }
  188. })
  189. }
  190. /// Determines the alteration category of the variant.
  191. ///
  192. /// This method analyzes the reference and alternative alleles to classify
  193. /// the variant into one of several alteration categories:
  194. /// - SNV (Single Nucleotide Variant)
  195. /// - INS (Insertion)
  196. /// - DEL (Deletion)
  197. /// - Other (including structural variants and complex alterations)
  198. ///
  199. /// The classification is based on the following rules:
  200. /// 1. If both reference and alternative are single nucleotides, it's an SNV.
  201. /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
  202. /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
  203. /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
  204. /// 5. If none of the above apply, it checks for structural variant types.
  205. /// 6. If no structural variant type is found, it's classified as "Other".
  206. ///
  207. /// # Returns
  208. /// An `AlterationCategory` enum representing the type of alteration.
  209. pub fn alteration_category(&self) -> AlterationCategory {
  210. match (&self.reference, &self.alternative) {
  211. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  212. AlterationCategory::SNV
  213. }
  214. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  215. AlterationCategory::INS
  216. }
  217. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  218. AlterationCategory::Other
  219. }
  220. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  221. AlterationCategory::DEL
  222. }
  223. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  224. if a.len() < b.len() =>
  225. {
  226. AlterationCategory::INS
  227. }
  228. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  229. if a.len() > b.len() =>
  230. {
  231. AlterationCategory::DEL
  232. }
  233. _ => match self.svtype() {
  234. Some(sv_type) => {
  235. if let Ok(bnd_desc) = self.bnd_desc() {
  236. if bnd_desc.a_contig != bnd_desc.b_contig {
  237. AlterationCategory::TRL
  238. } else {
  239. AlterationCategory::BND
  240. }
  241. } else {
  242. AlterationCategory::from(sv_type)
  243. }
  244. }
  245. None => AlterationCategory::Other,
  246. },
  247. }
  248. }
  249. /// Parses and constructs a BND (breakend) description from the alternative string.
  250. ///
  251. /// This function interprets the BND notation in the alternative string and creates
  252. /// a `BNDDesc` struct containing detailed information about the breakend.
  253. ///
  254. /// # Returns
  255. /// - `Ok(BNDDesc)` if parsing is successful
  256. /// - `Err` if parsing fails or if the alteration is not a BND
  257. ///
  258. /// # Errors
  259. /// This function will return an error if:
  260. /// - The alteration category is not BND
  261. /// - The alternative string cannot be parsed into exactly 3 parts
  262. /// - The b_position cannot be parsed as a number
  263. pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
  264. let alt = self.alternative.to_string();
  265. if alt.contains('[') || alt.contains(']') {
  266. let alt_rep = alt.replace("[", ";").replace("]", ";");
  267. let alt_is_joined_after = !alt_rep.starts_with(";");
  268. let parts = alt_rep
  269. .split(";")
  270. .filter(|c| !c.is_empty())
  271. .collect::<Vec<&str>>();
  272. if alt_is_joined_after {
  273. // a is ref b is alt
  274. let a_sens = true;
  275. let a_contig = self.position.contig();
  276. let a_position = self.position.position + 1;
  277. let added_nt = parts[0][1..].to_string();
  278. let b_sens = alt.contains('[');
  279. let (contig, pos) = parts[1].split_once(':').unwrap();
  280. let b_contig = contig.to_string();
  281. let b_position: u32 = pos.parse()?;
  282. Ok(BNDDesc {
  283. a_contig,
  284. a_position,
  285. a_sens,
  286. b_contig,
  287. b_position,
  288. b_sens,
  289. added_nt,
  290. })
  291. } else {
  292. // a is alt b is ref
  293. let b_sens = true;
  294. let b_contig = self.position.contig();
  295. let b_position = self.position.position + 1;
  296. let mut added_nt = parts[1].to_string();
  297. added_nt.pop();
  298. let a_sens = alt.contains(']');
  299. let (contig, pos) = parts[0].split_once(':').unwrap();
  300. let a_contig = contig.to_string();
  301. let a_position: u32 = pos.parse()?;
  302. Ok(BNDDesc {
  303. a_contig,
  304. a_position,
  305. a_sens,
  306. b_contig,
  307. b_position,
  308. b_sens,
  309. added_nt,
  310. })
  311. }
  312. } else {
  313. Err(anyhow::anyhow!("The alteration is not BND: {alt}"))
  314. }
  315. }
  316. }
  317. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  318. pub struct BNDDesc {
  319. pub a_contig: String,
  320. pub a_position: u32, // 1-based
  321. pub a_sens: bool,
  322. pub b_contig: String,
  323. pub b_position: u32, // 1-based
  324. pub b_sens: bool,
  325. pub added_nt: String,
  326. }
  327. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  328. pub enum AlterationCategory {
  329. SNV,
  330. DEL,
  331. INS,
  332. DUP,
  333. INV,
  334. CNV,
  335. TRL,
  336. BND,
  337. Other,
  338. }
  339. impl fmt::Display for AlterationCategory {
  340. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  341. write!(
  342. f,
  343. "{}",
  344. match self {
  345. AlterationCategory::SNV => "SNV",
  346. AlterationCategory::DEL => "DEL",
  347. AlterationCategory::INS => "INS",
  348. AlterationCategory::DUP => "DUP",
  349. AlterationCategory::INV => "INV",
  350. AlterationCategory::CNV => "CNV",
  351. AlterationCategory::BND | AlterationCategory::TRL => "TRL",
  352. AlterationCategory::Other => "Other",
  353. }
  354. )
  355. }
  356. }
  357. impl From<SVType> for AlterationCategory {
  358. fn from(sv_type: SVType) -> Self {
  359. match sv_type {
  360. SVType::DEL => AlterationCategory::DEL,
  361. SVType::INS => AlterationCategory::INS,
  362. SVType::DUP => AlterationCategory::DUP,
  363. SVType::INV => AlterationCategory::INV,
  364. SVType::CNV => AlterationCategory::CNV,
  365. SVType::BND => AlterationCategory::BND,
  366. }
  367. }
  368. }
  369. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  370. pub enum SVType {
  371. DEL,
  372. INS,
  373. DUP,
  374. INV,
  375. CNV,
  376. BND,
  377. }
  378. impl FromStr for SVType {
  379. type Err = anyhow::Error;
  380. fn from_str(s: &str) -> anyhow::Result<Self> {
  381. match s {
  382. "DEL" => Ok(SVType::DEL),
  383. "INS" => Ok(SVType::INS),
  384. "DUP" => Ok(SVType::DUP),
  385. "INV" => Ok(SVType::INV),
  386. "CNV" => Ok(SVType::CNV),
  387. "BND" => Ok(SVType::BND),
  388. _ => Err(anyhow!("Can't parse SVTYPE={s}")),
  389. }
  390. }
  391. }
  392. impl fmt::Display for SVType {
  393. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  394. write!(
  395. f,
  396. "{}",
  397. match self {
  398. SVType::DEL => "DEL",
  399. SVType::INS => "INS",
  400. SVType::DUP => "DUP",
  401. SVType::INV => "INV",
  402. SVType::CNV => "CNV",
  403. SVType::BND => "BND",
  404. }
  405. )
  406. }
  407. }
  408. impl VariantId for VcfVariant {
  409. fn variant_id(&self) -> String {
  410. format!("{}_{}>{}", self.position, self.reference, self.alternative)
  411. }
  412. }
  413. impl GetGenomePosition for VcfVariant {
  414. fn position(&self) -> &GenomePosition {
  415. &self.position
  416. }
  417. }
  418. impl PartialOrd for VcfVariant {
  419. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  420. Some(self.cmp(other))
  421. }
  422. }
  423. impl Ord for VcfVariant {
  424. fn cmp(&self, other: &Self) -> Ordering {
  425. self.position.cmp(&other.position)
  426. }
  427. }
  428. /// Info
  429. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  430. pub struct Infos(pub Vec<Info>);
  431. impl FromStr for Infos {
  432. type Err = anyhow::Error;
  433. fn from_str(s: &str) -> anyhow::Result<Self> {
  434. Ok(Self(
  435. s.split(";")
  436. .map(Info::from_str)
  437. .collect::<Result<Vec<Info>, _>>()
  438. .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
  439. ))
  440. }
  441. }
  442. impl fmt::Display for Infos {
  443. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  444. write!(
  445. f,
  446. "{}",
  447. self.0
  448. .iter()
  449. .map(|e| e.to_string())
  450. .collect::<Vec<String>>()
  451. .join(";")
  452. )
  453. }
  454. }
  455. #[allow(non_camel_case_types)]
  456. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  457. pub enum Info {
  458. Empty,
  459. H,
  460. F,
  461. P,
  462. FAU(u32),
  463. FCU(u32),
  464. FGU(u32),
  465. FTU(u32),
  466. RAU(u32),
  467. RCU(u32),
  468. RGU(u32),
  469. RTU(u32),
  470. SVTYPE(SVType),
  471. MATEID(String),
  472. NORMAL_READ_SUPPORT(u32),
  473. TUMOUR_READ_SUPPORT(u32),
  474. NORMAL_ALN_SUPPORT(u32),
  475. TUMOUR_ALN_SUPPORT(u32),
  476. SVLEN(i32),
  477. TUMOUR_DP_BEFORE(Vec<u32>),
  478. TUMOUR_DP_AT(Vec<u32>),
  479. TUMOUR_DP_AFTER(Vec<u32>),
  480. NORMAL_DP_BEFORE(Vec<u32>),
  481. NORMAL_DP_AT(Vec<u32>),
  482. NORMAL_DP_AFTER(Vec<u32>),
  483. TUMOUR_AF(Vec<f32>),
  484. NORMAL_AF(Vec<f32>),
  485. BP_NOTATION(String),
  486. SOURCE(String),
  487. CLUSTERED_READS_TUMOUR(u32),
  488. CLUSTERED_READS_NORMAL(u32),
  489. TUMOUR_ALT_HP(Vec<u32>),
  490. TUMOUR_PS(Vec<String>),
  491. NORMAL_ALT_HP(Vec<u32>),
  492. NORMAL_PS(Vec<String>),
  493. TUMOUR_TOTAL_HP_AT(Vec<u32>),
  494. NORMAL_TOTAL_HP_AT(Vec<u32>),
  495. ORIGIN_STARTS_STD_DEV(f32),
  496. ORIGIN_MAPQ_MEAN(f32),
  497. ORIGIN_EVENT_SIZE_STD_DEV(f32),
  498. ORIGIN_EVENT_SIZE_MEDIAN(f32),
  499. ORIGIN_EVENT_SIZE_MEAN(f32),
  500. END_STARTS_STD_DEV(f32),
  501. END_MAPQ_MEAN(f32),
  502. END_EVENT_SIZE_STD_DEV(f32),
  503. END_EVENT_SIZE_MEDIAN(f32),
  504. END_EVENT_SIZE_MEAN(f32),
  505. CLASS(String),
  506. END(u32),
  507. SVINSLEN(u32),
  508. SVINSSEQ(String),
  509. }
  510. impl FromStr for Info {
  511. type Err = anyhow::Error;
  512. fn from_str(s: &str) -> anyhow::Result<Self> {
  513. if s.contains('=') {
  514. let (key, value) = s
  515. .split_once('=')
  516. .context(format!("Can't split with `=` in string: {s}"))?;
  517. Ok(match key {
  518. "FAU" => Info::FAU(parse_value(value, key)?),
  519. "FCU" => Info::FCU(parse_value(value, key)?),
  520. "FGU" => Info::FGU(parse_value(value, key)?),
  521. "FTU" => Info::FTU(parse_value(value, key)?),
  522. "RAU" => Info::RAU(parse_value(value, key)?),
  523. "RCU" => Info::RCU(parse_value(value, key)?),
  524. "RGU" => Info::RGU(parse_value(value, key)?),
  525. "RTU" => Info::RTU(parse_value(value, key)?),
  526. "SVLEN" => Info::SVLEN(parse_value(value, key)?),
  527. "END" => Info::END(parse_value(value, key)?),
  528. "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
  529. "SVTYPE" => Info::SVTYPE(value.parse()?),
  530. "MATEID" => Info::MATEID(value.to_string()),
  531. "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
  532. "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
  533. "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
  534. "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
  535. "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
  536. "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
  537. "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
  538. "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
  539. "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
  540. "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
  541. "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
  542. "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
  543. "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
  544. "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
  545. "SOURCE" => Info::SOURCE(value.to_string()),
  546. "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
  547. "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
  548. "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
  549. "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
  550. "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
  551. "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
  552. "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
  553. "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
  554. "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
  555. "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
  556. "ORIGIN_EVENT_SIZE_STD_DEV" => {
  557. Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
  558. }
  559. "ORIGIN_EVENT_SIZE_MEDIAN" => {
  560. Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
  561. }
  562. "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
  563. "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
  564. "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
  565. "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
  566. "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
  567. "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
  568. "CLASS" => Info::CLASS(value.to_string()),
  569. _ => Info::Empty,
  570. })
  571. } else {
  572. Ok(match s {
  573. "H" => Info::H,
  574. "F" => Info::F,
  575. "P" => Info::P,
  576. _ => Info::Empty,
  577. })
  578. }
  579. }
  580. }
  581. impl fmt::Display for Info {
  582. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  583. match self {
  584. Info::Empty => write!(f, "."),
  585. Info::H => write!(f, "H"),
  586. Info::F => write!(f, "F"),
  587. Info::P => write!(f, "P"),
  588. Info::FAU(v) => write!(f, "FAU={v}"),
  589. Info::FCU(v) => write!(f, "FCU={v}"),
  590. Info::FGU(v) => write!(f, "FGU={v}"),
  591. Info::FTU(v) => write!(f, "FTU={v}"),
  592. Info::RAU(v) => write!(f, "RAU={v}"),
  593. Info::RCU(v) => write!(f, "RCU={v}"),
  594. Info::RGU(v) => write!(f, "RGU={v}"),
  595. Info::RTU(v) => write!(f, "RTU={v}"),
  596. Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
  597. Info::SVLEN(v) => write!(f, "SVLEN={v}"),
  598. Info::END(v) => write!(f, "END={v}"),
  599. Info::MATEID(v) => write!(f, "MATEID={v}"),
  600. Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
  601. Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
  602. Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
  603. Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
  604. Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
  605. Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
  606. Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
  607. Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
  608. Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
  609. Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
  610. Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
  611. Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
  612. Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
  613. Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
  614. Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
  615. Info::SOURCE(v) => write!(f, "SOURCE={v}"),
  616. Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
  617. Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
  618. Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
  619. Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
  620. Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
  621. Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
  622. Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
  623. Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
  624. Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
  625. Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
  626. Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
  627. Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
  628. Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
  629. Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
  630. Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
  631. Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
  632. Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
  633. Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
  634. Info::CLASS(v) => write!(f, "CLASS={v}"),
  635. }
  636. }
  637. }
  638. pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
  639. v.iter()
  640. .map(|n| n.to_string())
  641. .collect::<Vec<String>>()
  642. .join(",")
  643. }
  644. /// Format
  645. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
  646. pub enum Format {
  647. // DeepVariant
  648. GT(String),
  649. GQ(u32),
  650. DP(u32),
  651. AD(Vec<u32>),
  652. VAF(f32),
  653. PL(Vec<u32>),
  654. // Clairs
  655. // when format begins with N: normal
  656. // AF(f32),
  657. // NAF(f32), // DP(u32),
  658. NDP(u32),
  659. NAD(Vec<u32>),
  660. AU(u32),
  661. CU(u32),
  662. GU(u32),
  663. TU(u32),
  664. NAU(u32),
  665. NCU(u32),
  666. NGU(u32),
  667. NTU(u32),
  668. // nanomonsv
  669. TR(u32),
  670. VR(u32),
  671. Other((String, String)), // (key, value)
  672. }
  673. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
  674. pub struct Formats(pub Vec<Format>);
  675. impl Formats {
  676. /// Get the tumoral alternative read depth and total depth as an Option<(u32, u32)>.
  677. pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
  678. let mut tumor_alt_depth: Option<u32> = None;
  679. let mut tumor_total_depth: Option<u32> = None;
  680. for format in &self.0 {
  681. match format {
  682. // Tumor Allelic Depth (AD)
  683. Format::AD(values) => {
  684. if values.len() > 1 {
  685. // Sum all alternative allele depths (excluding reference allele)
  686. tumor_alt_depth = Some(values[1..].iter().sum());
  687. }
  688. }
  689. // Tumor Total Depth (DP)
  690. Format::DP(value) => {
  691. tumor_total_depth = Some(*value);
  692. }
  693. _ => {}
  694. }
  695. }
  696. // Return a tuple (tumor_alt_depth, tumor_total_depth) if both are available
  697. match (tumor_alt_depth, tumor_total_depth) {
  698. (Some(alt), Some(total)) => Some((alt, total)),
  699. _ => None,
  700. }
  701. }
  702. }
  703. impl TryFrom<(&str, &str)> for Formats {
  704. type Error = anyhow::Error;
  705. fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
  706. let keys: Vec<&str> = k.split(':').collect();
  707. let values: Vec<&str> = v.split(':').collect();
  708. if keys.len() != values.len() {
  709. anyhow::bail!("Mismatch between keys and values count for {k} {v}");
  710. }
  711. Ok(Self(
  712. keys.into_iter()
  713. .zip(values)
  714. .map(|(key, value)| Format::try_from((key, value)))
  715. .collect::<Result<Vec<Format>, _>>()
  716. .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
  717. ))
  718. }
  719. }
  720. impl From<Formats> for (String, String) {
  721. fn from(formats: Formats) -> Self {
  722. let mut keys = Vec::new();
  723. let mut values = Vec::new();
  724. for format in formats.0 {
  725. let (key, value): (String, String) = format.into();
  726. keys.push(key);
  727. values.push(value);
  728. }
  729. (keys.join(":"), values.join(":"))
  730. }
  731. }
  732. impl TryFrom<(&str, &str)> for Format {
  733. type Error = anyhow::Error;
  734. fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
  735. let format = match key {
  736. "GT" => Format::GT(value.to_string()),
  737. "GQ" => Format::GQ(parse_value(value, key)?),
  738. "DP" => Format::DP(parse_value(value, key)?),
  739. "AD" => Format::AD(parse_vec_value(value, key)?),
  740. "VAF" => Format::VAF(parse_value(value, key)?),
  741. // "AF" => Format::AF(parse_value(value, key)?),
  742. // "NAF" => Format::NAF(parse_value(value, key)?),
  743. "NDP" => Format::NDP(parse_value(value, key)?),
  744. "NAD" => Format::NAD(parse_vec_value(value, key)?),
  745. "AU" => Format::AU(parse_value(value, key)?),
  746. "CU" => Format::CU(parse_value(value, key)?),
  747. "GU" => Format::GU(parse_value(value, key)?),
  748. "TU" => Format::TU(parse_value(value, key)?),
  749. "NAU" => Format::NAU(parse_value(value, key)?),
  750. "NCU" => Format::NCU(parse_value(value, key)?),
  751. "NGU" => Format::NGU(parse_value(value, key)?),
  752. "NTU" => Format::NTU(parse_value(value, key)?),
  753. "PL" => Format::PL(parse_vec_value(value, key)?),
  754. "TR" => Format::TR(parse_value(value, key)?),
  755. "VR" => Format::VR(parse_value(value, key)?),
  756. _ => Format::Other((key.to_string(), value.to_string())),
  757. };
  758. Ok(format)
  759. }
  760. }
  761. // Helper function to parse a single value (DeepSeek)
  762. fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
  763. where
  764. T::Err: std::fmt::Debug,
  765. {
  766. value
  767. .parse()
  768. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  769. .context(format!("Can't parse {}: {}", key, value)) // Add context
  770. }
  771. // Helper function to parse comma-separated values (DeepSeek)
  772. fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
  773. where
  774. T::Err: std::fmt::Debug,
  775. {
  776. value
  777. .split(',')
  778. .map(|e| {
  779. e.parse()
  780. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  781. .context(format!("Failed to parse {}: {}", key, e)) // Add context
  782. })
  783. .collect()
  784. }
  785. impl From<Format> for (String, String) {
  786. fn from(format: Format) -> Self {
  787. let concat = |values: Vec<u32>| -> String {
  788. values
  789. .iter()
  790. .map(|v| v.to_string())
  791. .collect::<Vec<_>>()
  792. .join(",")
  793. };
  794. match format {
  795. Format::GT(value) => ("GT".to_string(), value),
  796. Format::GQ(value) => ("GQ".to_string(), value.to_string()),
  797. Format::DP(value) => ("DP".to_string(), value.to_string()),
  798. Format::AD(values) => ("AD".to_string(), concat(values)),
  799. Format::VAF(value) => ("VAF".to_string(), value.to_string()),
  800. Format::PL(values) => ("PL".to_string(), concat(values)),
  801. Format::Other((key, value)) => (key, value),
  802. // Format::AF(value) => ("AF".to_string(), value.to_string()),
  803. // Format::NAF(value) => ("NAF".to_string(), value.to_string()),
  804. Format::NDP(value) => ("NDP".to_string(), value.to_string()),
  805. Format::NAD(values) => ("NAD".to_string(), concat(values)),
  806. Format::AU(value) => ("AU".to_string(), value.to_string()),
  807. Format::CU(value) => ("CU".to_string(), value.to_string()),
  808. Format::GU(value) => ("GU".to_string(), value.to_string()),
  809. Format::TU(value) => ("TU".to_string(), value.to_string()),
  810. Format::NAU(value) => ("NAU".to_string(), value.to_string()),
  811. Format::NCU(value) => ("NCU".to_string(), value.to_string()),
  812. Format::NGU(value) => ("NGU".to_string(), value.to_string()),
  813. Format::NTU(value) => ("NTU".to_string(), value.to_string()),
  814. Format::TR(value) => ("TR".to_string(), value.to_string()),
  815. Format::VR(value) => ("VR".to_string(), value.to_string()),
  816. }
  817. }
  818. }
  819. impl Formats {
  820. pub fn commun_deepvariant_clairs(&self) -> Self {
  821. let filtered_vec: Vec<Format> = self
  822. .0
  823. .clone()
  824. .into_iter()
  825. .map(|e| {
  826. if let Format::VAF(_v) = e {
  827. e
  828. // Format::AF(v)
  829. } else {
  830. e
  831. }
  832. })
  833. .filter(|format| {
  834. matches!(
  835. format,
  836. Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
  837. )
  838. })
  839. .collect();
  840. Formats(filtered_vec)
  841. }
  842. }
  843. /// Filter
  844. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Encode, Decode)]
  845. pub enum Filter {
  846. PASS,
  847. Other(String),
  848. }
  849. impl FromStr for Filter {
  850. type Err = anyhow::Error;
  851. fn from_str(s: &str) -> anyhow::Result<Self> {
  852. match s {
  853. "PASS" => Ok(Filter::PASS),
  854. _ => Ok(Filter::Other(s.to_string())),
  855. }
  856. }
  857. }
  858. impl fmt::Display for Filter {
  859. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  860. match self {
  861. Filter::PASS => write!(f, "PASS"),
  862. Filter::Other(ref s) => write!(f, "{}", s),
  863. }
  864. }
  865. }
  866. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  867. pub enum ReferenceAlternative {
  868. Nucleotide(Base),
  869. Nucleotides(Vec<Base>),
  870. Unstructured(String),
  871. }
  872. impl FromStr for ReferenceAlternative {
  873. type Err = anyhow::Error;
  874. fn from_str(s: &str) -> anyhow::Result<Self> {
  875. let possible_bases = s.as_bytes().iter();
  876. let mut res: Vec<Base> = Vec::new();
  877. for &base in possible_bases {
  878. match base.try_into() {
  879. std::result::Result::Ok(b) => res.push(b),
  880. Err(_) => {
  881. return Ok(Self::Unstructured(s.to_string()));
  882. }
  883. }
  884. }
  885. if res.len() == 1 {
  886. Ok(Self::Nucleotide(res.pop().unwrap()))
  887. } else {
  888. Ok(Self::Nucleotides(res))
  889. }
  890. }
  891. }
  892. impl fmt::Display for ReferenceAlternative {
  893. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  894. let string = match self {
  895. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  896. ReferenceAlternative::Nucleotides(bases) => bases
  897. .iter()
  898. .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
  899. ReferenceAlternative::Unstructured(s) => s.to_string(),
  900. };
  901. write!(f, "{}", string)
  902. }
  903. }
  904. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash, Encode, Decode)]
  905. pub enum Base {
  906. A,
  907. T,
  908. C,
  909. G,
  910. N,
  911. }
  912. impl TryFrom<u8> for Base {
  913. type Error = anyhow::Error;
  914. fn try_from(base: u8) -> anyhow::Result<Self> {
  915. match base {
  916. b'A' => Ok(Base::A),
  917. b'T' => Ok(Base::T),
  918. b'C' => Ok(Base::C),
  919. b'G' => Ok(Base::G),
  920. b'N' => Ok(Base::N),
  921. _ => Err(anyhow::anyhow!(
  922. "Unknown base: {}",
  923. String::from_utf8_lossy(&[base])
  924. )),
  925. }
  926. }
  927. }
  928. impl Base {
  929. pub fn into_u8(self) -> u8 {
  930. match self {
  931. Base::A => b'A',
  932. Base::T => b'T',
  933. Base::C => b'C',
  934. Base::G => b'G',
  935. Base::N => b'N',
  936. }
  937. }
  938. }
  939. impl fmt::Display for Base {
  940. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  941. // Use `self.number` to refer to each positional data point.
  942. let str = match self {
  943. Base::A => "A",
  944. Base::T => "T",
  945. Base::C => "C",
  946. Base::G => "G",
  947. Base::N => "N",
  948. };
  949. write!(f, "{}", str)
  950. }
  951. }
  952. pub trait Variants {
  953. fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
  954. }
  955. pub trait VariantId {
  956. fn variant_id(&self) -> String;
  957. }
  958. pub trait Label {
  959. fn label(&self) -> String;
  960. }
  961. /// A trait alias for all dynamically executable pipeline runners.
  962. ///
  963. /// This trait represents any component that:
  964. /// - Can decide whether it needs to run (`ShouldRun`)
  965. /// - Implements the actual execution logic (`Run`)
  966. /// - Is thread-safe (`Send + Sync`) to be boxed and dispatched concurrently
  967. ///
  968. /// Components implementing this trait can be boxed as `ShouldRunBox`.
  969. pub trait ShouldRunTrait: ShouldRun + Run + Send + Sync + Label {}
  970. /// Blanket implementation for all compatible types.
  971. impl<T> ShouldRunTrait for T where T: ShouldRun + Run + Send + Sync + Label {}
  972. /// A boxed trait object to hold any runner implementing `ShouldRunTrait`.
  973. pub type ShouldRunBox = Box<dyn ShouldRunTrait>;
  974. /// Macro to initialize and box multiple `ShouldRunTrait` components.
  975. ///
  976. /// # Arguments
  977. /// * `$id` - Sample ID (typically a string slice)
  978. /// * `$config` - Shared configuration object
  979. /// * `$($runner:ty),+` - One or more runner types implementing `Initialize + ShouldRunTrait`
  980. ///
  981. /// # Returns
  982. /// A vector of boxed runner components (`Vec<ShouldRunBox>`)
  983. ///
  984. /// # Example
  985. /// ```rust
  986. /// let modules: Vec<ShouldRunBox> = create_should_run!(
  987. /// "sample_42",
  988. /// config,
  989. /// ClairS,
  990. /// Savana,
  991. /// DeepSomatic,
  992. /// )?;
  993. /// ```
  994. ///
  995. /// # Errors
  996. /// This macro uses `?`, so it must be called inside a function that returns `anyhow::Result`.
  997. #[macro_export]
  998. macro_rules! create_should_run {
  999. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1000. use anyhow::Context;
  1001. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1002. $(
  1003. let runner = <$runner>::initialize($id, $config.clone())
  1004. .with_context(|| format!(
  1005. "Failed to initialize should-run checker {} for {}",
  1006. stringify!($runner), $id
  1007. ))?;
  1008. runners.push(Box::new(runner) as ShouldRunBox);
  1009. )+
  1010. runners
  1011. }};
  1012. }
  1013. /// Macro to initialize and box a list of solo-mode pipeline components that implement `ShouldRunTrait`.
  1014. ///
  1015. /// This is typically used for per-timepoint variant callers (e.g., `DeepVariant`),
  1016. /// where each runner is instantiated for a specific sample timepoint (e.g., "tumoral", "normal").
  1017. ///
  1018. /// Each entry must be provided as a pair: the type of the runner and the timepoint string expression.
  1019. ///
  1020. /// # Arguments
  1021. /// - `$id`: The sample ID (`&str`) passed to each initializer
  1022. /// - `$config`: A `Config` object (must be cloneable)
  1023. /// - `$($runner:ty, $arg:expr),+`: One or more runner types with timepoint arguments (e.g., `config.tumoral_name`)
  1024. ///
  1025. /// # Returns
  1026. /// A `Vec<ShouldRunBox>` with boxed runners initialized per timepoint.
  1027. ///
  1028. /// # Example
  1029. /// ```rust
  1030. /// let solo_callers = create_should_run_solo!(
  1031. /// "sample42",
  1032. /// config,
  1033. /// DeepVariant, config.tumoral_name,
  1034. /// DeepVariant, config.normal_name,
  1035. /// )?;
  1036. /// ```
  1037. ///
  1038. /// # Notes
  1039. /// Using `config.tumoral_name` and `config.normal_name` is preferred over hardcoded "diag"/"mrd".
  1040. ///
  1041. /// # Errors
  1042. /// This macro uses `?` and must be called inside a `Result`-returning context.
  1043. #[macro_export]
  1044. macro_rules! create_should_run_solo {
  1045. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  1046. use anyhow::Context;
  1047. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1048. $(
  1049. let runner = <$runner>::initialize($id, $arg, $config.clone())
  1050. .with_context(|| format!(
  1051. "Failed to initialize solo should-run checker {} for '{}' and arg '{}'",
  1052. stringify!($runner), $id, $arg
  1053. ))?;
  1054. runners.push(Box::new(runner) as ShouldRunBox);
  1055. )+
  1056. runners
  1057. }};
  1058. }
  1059. /// Macro to initialize and box a list of pipeline components that must run once per timepoint
  1060. /// (i.e., both "tumoral" and "normal") and implement `ShouldRunTrait`.
  1061. ///
  1062. /// This is typically used for variant callers like `DeepVariant` that operate in **solo mode**
  1063. /// but must be run twice — once for the tumoral sample and once for the normal sample.
  1064. ///
  1065. /// The macro:
  1066. /// - Calls `.initialize(id, timepoint, config.clone())` twice per type
  1067. /// - Uses `config.tumoral_name` and `config.normal_name` as timepoints
  1068. /// - Returns a flat `Vec<ShouldRunBox>` containing both instances for each type
  1069. ///
  1070. /// # Arguments
  1071. /// - `$id`: The sample ID (`&str`)
  1072. /// - `$config`: The configuration object (must expose `tumoral_name` and `normal_name`)
  1073. /// - `$($runner:ty),+`: One or more runner types that implement `Initialize` with `(id, timepoint, config)`
  1074. ///
  1075. /// # Example
  1076. /// ```rust
  1077. /// let runners = create_should_run_normal_tumoral!(
  1078. /// "sample_42",
  1079. /// config,
  1080. /// DeepVariant,
  1081. /// AnotherCaller,
  1082. /// )?;
  1083. /// ```
  1084. ///
  1085. /// This will expand to:
  1086. /// ```rust
  1087. /// vec![
  1088. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  1089. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  1090. /// Box::new(AnotherCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as ShouldRunBox,
  1091. /// Box::new(AnotherCaller::initialize("sample_42", config.normal_name, config.clone())?) as ShouldRunBox,
  1092. /// ]
  1093. /// ```
  1094. ///
  1095. /// # Errors
  1096. /// This macro uses `?`, so it must be called inside a function that returns `Result`.
  1097. #[macro_export]
  1098. macro_rules! create_should_run_normal_tumoral {
  1099. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1100. use anyhow::Context;
  1101. let mut runners: Vec<ShouldRunBox> = Vec::new();
  1102. $(
  1103. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  1104. .with_context(|| format!(
  1105. "Failed to initialize tumoral should-run checker {} for {}",
  1106. stringify!($runner), $id
  1107. ))?;
  1108. runners.push(Box::new(tumoral) as ShouldRunBox);
  1109. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  1110. .with_context(|| format!(
  1111. "Failed to initialize normal should-run checker {} for {}",
  1112. stringify!($runner), $id
  1113. ))?;
  1114. runners.push(Box::new(normal) as ShouldRunBox);
  1115. )+
  1116. runners
  1117. }};
  1118. }
  1119. /// Executes each runner in the slice only if `should_run()` returns true.
  1120. ///
  1121. /// # Arguments
  1122. /// * `iterable` - A mutable slice of boxed `InitRun` components
  1123. ///
  1124. /// # Returns
  1125. /// * `Ok(())` if all required runners execute successfully
  1126. /// * An error if any runner's `run()` method fails
  1127. ///
  1128. /// # Notes
  1129. /// - This function will skip runners that return `false` from `should_run()`
  1130. pub fn run_if_required(iterable: &mut [ShouldRunBox]) -> anyhow::Result<()> {
  1131. iterable.iter_mut().try_for_each(|e| {
  1132. if e.should_run() {
  1133. e.run()
  1134. } else {
  1135. info!("Skipping running: {}", e.label());
  1136. Ok(())
  1137. }
  1138. })
  1139. }
  1140. /// A trait alias for all variant callers that support initialization, execution,
  1141. /// conditional re-running, and variant extraction (VCF + annotations).
  1142. ///
  1143. /// Used to enable polymorphic handling of both solo and somatic callers in the pipeline.
  1144. pub trait RunnerVariants: Variants + Send + Sync + Label {}
  1145. /// Blanket implementation for all compatible types.
  1146. impl<T> RunnerVariants for T where T: Variants + Send + Sync + Label {}
  1147. pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
  1148. #[macro_export]
  1149. macro_rules! init_somatic_callers {
  1150. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1151. use anyhow::Context;
  1152. let mut callers: Vec<CallerBox> = Vec::new();
  1153. $(
  1154. let caller = <$runner>::initialize($id, $config.clone())
  1155. .with_context(|| format!(
  1156. "Failed to initialize somatic caller {} for {}",
  1157. stringify!($runner), $id
  1158. ))?;
  1159. callers.push(Box::new(caller) as CallerBox);
  1160. )+
  1161. callers
  1162. }};
  1163. }
  1164. /// Macro to initialize and box a list of **solo-mode variant callers** for specific timepoints,
  1165. /// where each runner implements `RunnerVariants`.
  1166. ///
  1167. /// This is useful for callers like `DeepVariant` that need to be instantiated with a specific
  1168. /// sample timepoint (e.g., `config.tumoral_name` or `config.normal_name`).
  1169. ///
  1170. /// Each entry must be a pair: a runner type and a timepoint expression (usually from config).
  1171. ///
  1172. /// # Arguments
  1173. /// - `$id`: The sample ID (`&str`)
  1174. /// - `$config`: The configuration object (must be cloneable)
  1175. /// - `$($runner:ty, $arg:expr),+`: One or more `(RunnerType, Timepoint)` pairs
  1176. ///
  1177. /// # Returns
  1178. /// A `Vec<CallerBox>` containing initialized, boxed solo-mode variant callers.
  1179. ///
  1180. /// # Example
  1181. /// ```rust
  1182. /// let solo_callers = init_solo_callers!(
  1183. /// "sample_42",
  1184. /// config,
  1185. /// DeepVariant, config.tumoral_name,
  1186. /// DeepVariant, config.normal_name,
  1187. /// )?;
  1188. /// ```
  1189. ///
  1190. /// This will expand to:
  1191. /// ```rust
  1192. /// vec![
  1193. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1194. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1195. /// ]
  1196. /// ```
  1197. ///
  1198. /// # Errors
  1199. /// This macro uses `?` internally, so it must be used inside a `Result`-returning context.
  1200. #[macro_export]
  1201. macro_rules! init_solo_callers {
  1202. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {{
  1203. let mut callers: Vec<CallerBox> = Vec::new();
  1204. $(
  1205. let caller = <$runner>::initialize($id, $arg, $config.clone())
  1206. .with_context(|| format!(
  1207. "Failed to initialize caller {} for {}",
  1208. stringify!($runner), $id
  1209. ))?;
  1210. callers.push(Box::new(caller) as CallerBox);
  1211. )+
  1212. callers
  1213. }};
  1214. }
  1215. /// Macro to initialize and box a list of solo-mode **variant callers** for both `normal` and `tumoral` timepoints.
  1216. ///
  1217. /// This is designed for types like `DeepVariant` that implement `RunnerVariants` and require
  1218. /// separate execution for each timepoint. It will:
  1219. /// - Call `.initialize(id, timepoint, config)` for both `config.tumoral_name` and `config.normal_name`
  1220. /// - Box the result as `CallerBox`
  1221. ///
  1222. /// # Arguments
  1223. /// - `$id`: Sample ID (usually a `&str`)
  1224. /// - `$config`: Cloneable configuration object
  1225. /// - `$($runner:ty),+`: One or more runner types that implement `RunnerVariants`
  1226. ///
  1227. /// # Returns
  1228. /// A `Vec<CallerBox>` containing two boxed instances per runner (one for each timepoint).
  1229. ///
  1230. /// # Example
  1231. /// ```rust
  1232. /// let solo_callers = init_solo_callers_normal_tumoral!(
  1233. /// "sample_42",
  1234. /// config,
  1235. /// DeepVariant,
  1236. /// OtherSoloCaller,
  1237. /// )?;
  1238. /// ```
  1239. ///
  1240. /// This expands to:
  1241. /// ```rust
  1242. /// vec![
  1243. /// Box::new(DeepVariant::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1244. /// Box::new(DeepVariant::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1245. /// Box::new(OtherSoloCaller::initialize("sample_42", config.tumoral_name, config.clone())?) as CallerBox,
  1246. /// Box::new(OtherSoloCaller::initialize("sample_42", config.normal_name, config.clone())?) as CallerBox,
  1247. /// ]
  1248. /// ```
  1249. ///
  1250. /// # Errors
  1251. /// This macro uses `?`, so it must be called inside a `Result`-returning context.
  1252. #[macro_export]
  1253. macro_rules! init_solo_callers_normal_tumoral {
  1254. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {{
  1255. use anyhow::Context;
  1256. let mut callers: Vec<CallerBox> = Vec::new();
  1257. $(
  1258. let tumoral = <$runner>::initialize($id, &$config.tumoral_name, $config.clone())
  1259. .with_context(|| format!(
  1260. "Failed to initialize tumoral caller {} for {} '{}'",
  1261. stringify!($runner), $id, $config.tumoral_name
  1262. ))?;
  1263. callers.push(Box::new(tumoral) as CallerBox);
  1264. let normal = <$runner>::initialize($id, &$config.normal_name, $config.clone())
  1265. .with_context(|| format!(
  1266. "Failed to initialize normal caller {} for {} '{}'",
  1267. stringify!($runner), $id, $config.normal_name
  1268. ))?;
  1269. callers.push(Box::new(normal) as CallerBox);
  1270. )+
  1271. callers
  1272. }};
  1273. }
  1274. // pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
  1275. // iterable
  1276. // .iter_mut()
  1277. // .try_for_each(|runner| runner.run())
  1278. // .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
  1279. // }
  1280. pub fn load_variants(
  1281. iterable: &mut [CallerBox],
  1282. annotations: &Annotations,
  1283. ) -> anyhow::Result<Vec<VariantCollection>> {
  1284. iterable
  1285. .par_iter()
  1286. .map(|runner| {
  1287. let result = runner.variants(annotations);
  1288. if let Err(ref e) = result {
  1289. error!("Failed to load variants from: {}\n{e}", runner.label());
  1290. } else {
  1291. info!("Variants from {} loaded.", runner.label());
  1292. };
  1293. result
  1294. })
  1295. .filter(|r| r.is_ok())
  1296. .collect::<anyhow::Result<Vec<_>>>()
  1297. .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  1298. }
  1299. // pub fn par_load_variants(
  1300. // iterable: &mut [Box<dyn Variants + Send + Sync>],
  1301. // annotations: &Annotations,
  1302. // ) -> anyhow::Result<Vec<VariantCollection>> {
  1303. // iterable
  1304. // .par_iter()
  1305. // .map(|runner| {
  1306. // let r = runner.variants(annotations);
  1307. // if let Err(ref e) = r {
  1308. // warn!("{e}");
  1309. // };
  1310. // r
  1311. // })
  1312. // .filter(|r| r.is_ok())
  1313. // .collect::<anyhow::Result<Vec<_>>>()
  1314. // .map_err(|e| anyhow::anyhow!("Failed to load variants.\n{e}"))
  1315. // }
  1316. pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
  1317. vec1: &[T],
  1318. vec2: &[T],
  1319. ) -> (Vec<T>, Vec<T>, Vec<T>) {
  1320. let set1: HashSet<_> = vec1.par_iter().cloned().collect();
  1321. let set2: HashSet<_> = vec2.par_iter().cloned().collect();
  1322. let common: Vec<T> = set1
  1323. .par_iter()
  1324. .filter(|item| set2.contains(item))
  1325. .cloned()
  1326. .collect();
  1327. let only_in_first: Vec<T> = set1
  1328. .par_iter()
  1329. .filter(|item| !set2.contains(item))
  1330. .cloned()
  1331. .collect();
  1332. let only_in_second: Vec<T> = set2
  1333. .par_iter()
  1334. .filter(|item| !set1.contains(item))
  1335. .cloned()
  1336. .collect();
  1337. (common, only_in_first, only_in_second)
  1338. }