variant.rs 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928
  1. use crate::{
  2. annotation::Annotations,
  3. helpers::Hash128,
  4. positions::{GenomePosition, GetGenomePosition, VcfPosition},
  5. runners::Run,
  6. variant::variant_collection::VariantCollection,
  7. };
  8. use anyhow::{anyhow, Context, Ok};
  9. use rayon::prelude::*;
  10. use serde::{Deserialize, Serialize};
  11. use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
  12. #[derive(Debug, Clone, Serialize, Deserialize)]
  13. pub struct VcfVariant {
  14. pub hash: Hash128,
  15. pub position: GenomePosition,
  16. pub id: String,
  17. pub reference: ReferenceAlternative,
  18. pub alternative: ReferenceAlternative,
  19. pub quality: Option<f32>,
  20. pub filter: Filter,
  21. pub infos: Infos,
  22. pub formats: Formats,
  23. }
  24. impl PartialEq for VcfVariant {
  25. fn eq(&self, other: &Self) -> bool {
  26. // Nota bene: id, filter, info, format and quality is intentionally not compared
  27. self.position == other.position
  28. && self.reference == other.reference
  29. && self.alternative == other.alternative
  30. }
  31. }
  32. impl Eq for VcfVariant {}
  33. impl FromStr for VcfVariant {
  34. type Err = anyhow::Error;
  35. fn from_str(s: &str) -> anyhow::Result<Self> {
  36. let v: Vec<&str> = s.split('\t').collect();
  37. let vcf_position: VcfPosition = (
  38. *v.first().ok_or(anyhow!("Can't get contig from: {s}"))?,
  39. *v.get(1).ok_or(anyhow!("Can't get position from: {s}"))?,
  40. )
  41. .try_into()
  42. .context(format!("Can't parse position from: {s}"))?;
  43. let formats = if v.len() == 10 {
  44. (
  45. *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  46. *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
  47. )
  48. .try_into()
  49. .context(format!("Can't parse formats from: {s}"))?
  50. } else {
  51. Formats::default()
  52. };
  53. let position: GenomePosition = vcf_position.into();
  54. let reference: ReferenceAlternative = v
  55. .get(3)
  56. .ok_or(anyhow!("Can't parse reference from: {s}"))?
  57. .parse()
  58. .context(format!("Can't parse reference from: {s}"))?;
  59. let alternative: ReferenceAlternative = v
  60. .get(4)
  61. .ok_or(anyhow!("Can't parse alternative from: {s}"))?
  62. .parse()
  63. .context(format!("Can't parse alternative from: {s}"))?;
  64. // Blake3 128 bytes Hash
  65. let mut hasher = blake3::Hasher::new();
  66. hasher.update(&position.contig.to_ne_bytes()); // Convert position to bytes
  67. hasher.update(&position.position.to_ne_bytes()); // Convert position to bytes
  68. hasher.update(reference.to_string().as_bytes()); // Reference string as bytes
  69. hasher.update(alternative.to_string().as_bytes()); // Alternative string as bytes
  70. let hash = hasher.finalize();
  71. let hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
  72. Ok(Self {
  73. hash,
  74. position,
  75. id: v
  76. .get(2)
  77. .ok_or(anyhow!("Can't parse id from: {s}"))?
  78. .to_string(),
  79. reference,
  80. alternative,
  81. quality: v
  82. .get(5)
  83. .map(|s| s.parse::<f32>().ok()) // Try to parse as f64; returns Option<f64>
  84. .unwrap_or(None),
  85. filter: v
  86. .get(6)
  87. .ok_or(anyhow!("Can't parse filter from: {s}"))?
  88. .parse()
  89. .context(format!("Can't parse filter from: {s}"))?,
  90. infos: v
  91. .get(7)
  92. .ok_or(anyhow!("Can't parse infos from: {s}"))?
  93. .parse()
  94. .context(format!("Can't parse infos from: {s}"))?,
  95. formats,
  96. })
  97. }
  98. }
  99. // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
  100. impl VcfVariant {
  101. pub fn into_vcf_row(&self) -> String {
  102. let vcf_position: VcfPosition = self.position.clone().into();
  103. let (contig, position) = vcf_position.into();
  104. let mut columns = vec![
  105. contig,
  106. position,
  107. self.id.to_string(),
  108. self.reference.to_string(),
  109. self.alternative.to_string(),
  110. self.quality
  111. .map(|v| v.to_string())
  112. .unwrap_or(".".to_string()),
  113. self.filter.to_string(),
  114. self.infos.to_string(),
  115. ];
  116. if !self.formats.0.is_empty() {
  117. let (format, values) = self.formats.clone().into();
  118. columns.push(format);
  119. columns.push(values);
  120. }
  121. columns.join("\t")
  122. }
  123. pub fn hash(&self) -> Hash128 {
  124. self.hash
  125. }
  126. pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
  127. VcfVariant {
  128. hash: self.hash,
  129. position: self.position.clone(),
  130. id: self.id.clone(),
  131. reference: self.reference.clone(),
  132. alternative: self.alternative.clone(),
  133. quality: self.quality,
  134. filter: Filter::Other(".".to_string()),
  135. infos: Infos(vec![Info::Empty]),
  136. formats: self.formats.commun_deepvariant_clairs(),
  137. }
  138. }
  139. pub fn has_svtype(&self) -> bool {
  140. self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
  141. }
  142. pub fn svtype(&self) -> Option<SVType> {
  143. self.infos.0.iter().find_map(|e| {
  144. if let Info::SVTYPE(sv_type) = e {
  145. Some(sv_type.clone())
  146. } else {
  147. None
  148. }
  149. })
  150. }
  151. pub fn alteration_category(&self) -> AlterationCategory {
  152. match (&self.reference, &self.alternative) {
  153. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  154. AlterationCategory::SNV
  155. }
  156. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  157. AlterationCategory::INS
  158. }
  159. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  160. AlterationCategory::Other
  161. }
  162. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  163. AlterationCategory::DEL
  164. }
  165. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  166. if a.len() < b.len() =>
  167. {
  168. AlterationCategory::INS
  169. }
  170. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  171. if a.len() > b.len() =>
  172. {
  173. AlterationCategory::DEL
  174. }
  175. _ => match self.svtype() {
  176. Some(sv_type) => AlterationCategory::from(sv_type),
  177. None => AlterationCategory::Other,
  178. },
  179. }
  180. }
  181. }
  182. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  183. pub enum AlterationCategory {
  184. SNV,
  185. DEL,
  186. INS,
  187. DUP,
  188. INV,
  189. CNV,
  190. BND,
  191. Other,
  192. }
  193. impl fmt::Display for AlterationCategory {
  194. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  195. write!(
  196. f,
  197. "{}",
  198. match self {
  199. AlterationCategory::SNV => "SNV",
  200. AlterationCategory::DEL => "DEL",
  201. AlterationCategory::INS => "INS",
  202. AlterationCategory::DUP => "DUP",
  203. AlterationCategory::INV => "INV",
  204. AlterationCategory::CNV => "CNV",
  205. AlterationCategory::BND => "BND",
  206. AlterationCategory::Other => "Other",
  207. }
  208. )
  209. }
  210. }
  211. impl From<SVType> for AlterationCategory {
  212. fn from(sv_type: SVType) -> Self {
  213. match sv_type {
  214. SVType::DEL => AlterationCategory::DEL,
  215. SVType::INS => AlterationCategory::INS,
  216. SVType::DUP => AlterationCategory::DUP,
  217. SVType::INV => AlterationCategory::INV,
  218. SVType::CNV => AlterationCategory::CNV,
  219. SVType::BND => AlterationCategory::BND,
  220. }
  221. }
  222. }
  223. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  224. pub enum SVType {
  225. DEL,
  226. INS,
  227. DUP,
  228. INV,
  229. CNV,
  230. BND,
  231. }
  232. impl FromStr for SVType {
  233. type Err = anyhow::Error;
  234. fn from_str(s: &str) -> anyhow::Result<Self> {
  235. match s {
  236. "DEL" => Ok(SVType::DEL),
  237. "INS" => Ok(SVType::INS),
  238. "DUP" => Ok(SVType::DUP),
  239. "INV" => Ok(SVType::INV),
  240. "CNV" => Ok(SVType::CNV),
  241. "BND" => Ok(SVType::BND),
  242. _ => Err(anyhow!("Can't parse SVTYPE={s}")),
  243. }
  244. }
  245. }
  246. impl fmt::Display for SVType {
  247. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  248. write!(
  249. f,
  250. "{}",
  251. match self {
  252. SVType::DEL => "DEL",
  253. SVType::INS => "INS",
  254. SVType::DUP => "DUP",
  255. SVType::INV => "INV",
  256. SVType::CNV => "CNV",
  257. SVType::BND => "BND",
  258. }
  259. )
  260. }
  261. }
  262. impl VariantId for VcfVariant {
  263. fn variant_id(&self) -> String {
  264. format!("{}_{}>{}", self.position, self.reference, self.alternative)
  265. }
  266. }
  267. impl GetGenomePosition for VcfVariant {
  268. fn position(&self) -> &GenomePosition {
  269. &self.position
  270. }
  271. }
  272. impl PartialOrd for VcfVariant {
  273. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  274. Some(self.cmp(other))
  275. }
  276. }
  277. impl Ord for VcfVariant {
  278. fn cmp(&self, other: &Self) -> Ordering {
  279. self.position.cmp(&other.position)
  280. }
  281. }
  282. /// Info
  283. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
  284. pub struct Infos(Vec<Info>);
  285. impl FromStr for Infos {
  286. type Err = anyhow::Error;
  287. fn from_str(s: &str) -> anyhow::Result<Self> {
  288. Ok(Self(
  289. s.split(";")
  290. .map(Info::from_str)
  291. .collect::<Result<Vec<Info>, _>>()
  292. .map_err(|e| anyhow::anyhow!("Failed to parse info: {e}"))?,
  293. ))
  294. }
  295. }
  296. impl fmt::Display for Infos {
  297. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  298. write!(
  299. f,
  300. "{}",
  301. self.0
  302. .iter()
  303. .map(|e| e.to_string())
  304. .collect::<Vec<String>>()
  305. .join(";")
  306. )
  307. }
  308. }
  309. #[allow(non_camel_case_types)]
  310. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  311. pub enum Info {
  312. Empty,
  313. H,
  314. F,
  315. P,
  316. FAU(u32),
  317. FCU(u32),
  318. FGU(u32),
  319. FTU(u32),
  320. RAU(u32),
  321. RCU(u32),
  322. RGU(u32),
  323. RTU(u32),
  324. SVTYPE(SVType),
  325. MATEID(String),
  326. NORMAL_READ_SUPPORT(u32),
  327. TUMOUR_READ_SUPPORT(u32),
  328. NORMAL_ALN_SUPPORT(u32),
  329. TUMOUR_ALN_SUPPORT(u32),
  330. SVLEN(i32),
  331. TUMOUR_DP_BEFORE(Vec<u32>),
  332. TUMOUR_DP_AT(Vec<u32>),
  333. TUMOUR_DP_AFTER(Vec<u32>),
  334. NORMAL_DP_BEFORE(Vec<u32>),
  335. NORMAL_DP_AT(Vec<u32>),
  336. NORMAL_DP_AFTER(Vec<u32>),
  337. TUMOUR_AF(Vec<f32>),
  338. NORMAL_AF(Vec<f32>),
  339. BP_NOTATION(String),
  340. SOURCE(String),
  341. CLUSTERED_READS_TUMOUR(u32),
  342. CLUSTERED_READS_NORMAL(u32),
  343. TUMOUR_ALT_HP(Vec<u32>),
  344. TUMOUR_PS(Vec<String>),
  345. NORMAL_ALT_HP(Vec<u32>),
  346. NORMAL_PS(Vec<String>),
  347. TUMOUR_TOTAL_HP_AT(Vec<u32>),
  348. NORMAL_TOTAL_HP_AT(Vec<u32>),
  349. ORIGIN_STARTS_STD_DEV(f32),
  350. ORIGIN_MAPQ_MEAN(f32),
  351. ORIGIN_EVENT_SIZE_STD_DEV(f32),
  352. ORIGIN_EVENT_SIZE_MEDIAN(f32),
  353. ORIGIN_EVENT_SIZE_MEAN(f32),
  354. END_STARTS_STD_DEV(f32),
  355. END_MAPQ_MEAN(f32),
  356. END_EVENT_SIZE_STD_DEV(f32),
  357. END_EVENT_SIZE_MEDIAN(f32),
  358. END_EVENT_SIZE_MEAN(f32),
  359. CLASS(String),
  360. END(u32),
  361. SVINSLEN(u32),
  362. SVINSSEQ(String),
  363. }
  364. impl FromStr for Info {
  365. type Err = anyhow::Error;
  366. fn from_str(s: &str) -> anyhow::Result<Self> {
  367. if s.contains('=') {
  368. let (key, value) = s
  369. .split_once('=')
  370. .context(format!("Can't split with `=` in string: {s}"))?;
  371. Ok(match key {
  372. "FAU" => Info::FAU(parse_value(value, key)?),
  373. "FCU" => Info::FCU(parse_value(value, key)?),
  374. "FGU" => Info::FGU(parse_value(value, key)?),
  375. "FTU" => Info::FTU(parse_value(value, key)?),
  376. "RAU" => Info::RAU(parse_value(value, key)?),
  377. "RCU" => Info::RCU(parse_value(value, key)?),
  378. "RGU" => Info::RGU(parse_value(value, key)?),
  379. "RTU" => Info::RTU(parse_value(value, key)?),
  380. "SVLEN" => Info::SVLEN(parse_value(value, key)?),
  381. "END" => Info::END(parse_value(value, key)?),
  382. "SVINSLEN" => Info::SVINSLEN(parse_value(value, key)?),
  383. "SVTYPE" => Info::SVTYPE(value.parse()?),
  384. "MATEID" => Info::MATEID(value.to_string()),
  385. "NORMAL_READ_SUPPORT" => Info::NORMAL_READ_SUPPORT(parse_value(value, key)?),
  386. "TUMOUR_READ_SUPPORT" => Info::TUMOUR_READ_SUPPORT(parse_value(value, key)?),
  387. "NORMAL_ALN_SUPPORT" => Info::NORMAL_ALN_SUPPORT(parse_value(value, key)?),
  388. "TUMOUR_ALN_SUPPORT" => Info::TUMOUR_ALN_SUPPORT(parse_value(value, key)?),
  389. "SVINSSEQ" => Info::SVINSSEQ(value.to_string()),
  390. "TUMOUR_DP_BEFORE" => Info::TUMOUR_DP_BEFORE(parse_vec_value(value, key)?),
  391. "TUMOUR_DP_AT" => Info::TUMOUR_DP_AT(parse_vec_value(value, key)?),
  392. "TUMOUR_DP_AFTER" => Info::TUMOUR_DP_AFTER(parse_vec_value(value, key)?),
  393. "NORMAL_DP_BEFORE" => Info::NORMAL_DP_BEFORE(parse_vec_value(value, key)?),
  394. "NORMAL_DP_AT" => Info::NORMAL_DP_AT(parse_vec_value(value, key)?),
  395. "NORMAL_DP_AFTER" => Info::NORMAL_DP_AFTER(parse_vec_value(value, key)?),
  396. "TUMOUR_AF" => Info::TUMOUR_AF(parse_vec_value(value, key)?),
  397. "NORMAL_AF" => Info::NORMAL_AF(parse_vec_value(value, key)?),
  398. "BP_NOTATION" => Info::BP_NOTATION(value.to_string()),
  399. "SOURCE" => Info::SOURCE(value.to_string()),
  400. "CLUSTERED_READS_TUMOUR" => Info::CLUSTERED_READS_TUMOUR(parse_value(value, key)?),
  401. "CLUSTERED_READS_NORMAL" => Info::CLUSTERED_READS_NORMAL(parse_value(value, key)?),
  402. "TUMOUR_ALT_HP" => Info::TUMOUR_ALT_HP(parse_vec_value(value, key)?),
  403. "TUMOUR_PS" => Info::TUMOUR_PS(parse_vec_value(value, key)?),
  404. "NORMAL_ALT_HP" => Info::NORMAL_ALT_HP(parse_vec_value(value, key)?),
  405. "NORMAL_PS" => Info::NORMAL_PS(parse_vec_value(value, key)?),
  406. "TUMOUR_TOTAL_HP_AT" => Info::TUMOUR_TOTAL_HP_AT(parse_vec_value(value, key)?),
  407. "NORMAL_TOTAL_HP_AT" => Info::NORMAL_TOTAL_HP_AT(parse_vec_value(value, key)?),
  408. "ORIGIN_STARTS_STD_DEV" => Info::ORIGIN_STARTS_STD_DEV(parse_value(value, key)?),
  409. "ORIGIN_MAPQ_MEAN" => Info::ORIGIN_MAPQ_MEAN(parse_value(value, key)?),
  410. "ORIGIN_EVENT_SIZE_STD_DEV" => {
  411. Info::ORIGIN_EVENT_SIZE_STD_DEV(parse_value(value, key)?)
  412. }
  413. "ORIGIN_EVENT_SIZE_MEDIAN" => {
  414. Info::ORIGIN_EVENT_SIZE_MEDIAN(parse_value(value, key)?)
  415. }
  416. "ORIGIN_EVENT_SIZE_MEAN" => Info::ORIGIN_EVENT_SIZE_MEAN(parse_value(value, key)?),
  417. "END_STARTS_STD_DEV" => Info::END_STARTS_STD_DEV(parse_value(value, key)?),
  418. "END_MAPQ_MEAN" => Info::END_MAPQ_MEAN(parse_value(value, key)?),
  419. "END_EVENT_SIZE_STD_DEV" => Info::END_EVENT_SIZE_STD_DEV(parse_value(value, key)?),
  420. "END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
  421. "END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
  422. "CLASS" => Info::CLASS(value.to_string()),
  423. _ => Info::Empty,
  424. })
  425. } else {
  426. Ok(match s {
  427. "H" => Info::H,
  428. "F" => Info::F,
  429. "P" => Info::P,
  430. _ => Info::Empty,
  431. })
  432. }
  433. }
  434. }
  435. impl fmt::Display for Info {
  436. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  437. match self {
  438. Info::Empty => write!(f, "."),
  439. Info::H => write!(f, "H"),
  440. Info::F => write!(f, "F"),
  441. Info::P => write!(f, "P"),
  442. Info::FAU(v) => write!(f, "FAU={v}"),
  443. Info::FCU(v) => write!(f, "FCU={v}"),
  444. Info::FGU(v) => write!(f, "FGU={v}"),
  445. Info::FTU(v) => write!(f, "FTU={v}"),
  446. Info::RAU(v) => write!(f, "RAU={v}"),
  447. Info::RCU(v) => write!(f, "RCU={v}"),
  448. Info::RGU(v) => write!(f, "RGU={v}"),
  449. Info::RTU(v) => write!(f, "RTU={v}"),
  450. Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
  451. Info::SVLEN(v) => write!(f, "SVLEN={v}"),
  452. Info::END(v) => write!(f, "END={v}"),
  453. Info::MATEID(v) => write!(f, "MATEID={v}"),
  454. Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
  455. Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
  456. Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
  457. Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
  458. Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
  459. Info::TUMOUR_ALN_SUPPORT(v) => write!(f, "TUMOUR_ALN_SUPPORT={v}"),
  460. Info::TUMOUR_DP_BEFORE(v) => write!(f, "TUMOUR_DP_BEFORE={}", concat_numbers(v)),
  461. Info::TUMOUR_DP_AT(v) => write!(f, "TUMOUR_DP_AT={}", concat_numbers(v)),
  462. Info::TUMOUR_DP_AFTER(v) => write!(f, "TUMOUR_DP_AFTER={}", concat_numbers(v)),
  463. Info::NORMAL_DP_BEFORE(v) => write!(f, "NORMAL_DP_BEFORE={}", concat_numbers(v)),
  464. Info::NORMAL_DP_AT(v) => write!(f, "NORMAL_DP_AT={}", concat_numbers(v)),
  465. Info::NORMAL_DP_AFTER(v) => write!(f, "NORMAL_DP_AFTER={}", concat_numbers(v)),
  466. Info::TUMOUR_AF(v) => write!(f, "TUMOUR_AF={}", concat_numbers(v)),
  467. Info::NORMAL_AF(v) => write!(f, "NORMAL_AF={}", concat_numbers(v)),
  468. Info::BP_NOTATION(v) => write!(f, "BP_NOTATION={v}"),
  469. Info::SOURCE(v) => write!(f, "SOURCE={v}"),
  470. Info::CLUSTERED_READS_TUMOUR(v) => write!(f, "CLUSTERED_READS_TUMOUR={v}"),
  471. Info::CLUSTERED_READS_NORMAL(v) => write!(f, "CLUSTERED_READS_NORMAL={v}"),
  472. Info::TUMOUR_ALT_HP(v) => write!(f, "TUMOUR_ALT_HP={}", concat_numbers(v)),
  473. Info::TUMOUR_PS(v) => write!(f, "TUMOUR_PS={}", v.join(",")),
  474. Info::NORMAL_ALT_HP(v) => write!(f, "NORMAL_ALT_HP={}", concat_numbers(v)),
  475. Info::NORMAL_PS(v) => write!(f, "NORMAL_PS={}", v.join(",")),
  476. Info::TUMOUR_TOTAL_HP_AT(v) => write!(f, "TUMOUR_TOTAL_HP_AT={}", concat_numbers(v)),
  477. Info::NORMAL_TOTAL_HP_AT(v) => write!(f, "NORMAL_TOTAL_HP_AT={}", concat_numbers(v)),
  478. Info::ORIGIN_STARTS_STD_DEV(v) => write!(f, "ORIGIN_STARTS_STD_DEV={v}"),
  479. Info::ORIGIN_MAPQ_MEAN(v) => write!(f, "ORIGIN_MAPQ_MEAN={v}"),
  480. Info::ORIGIN_EVENT_SIZE_STD_DEV(v) => write!(f, "ORIGIN_EVENT_SIZE_STD_DEV={v}"),
  481. Info::ORIGIN_EVENT_SIZE_MEDIAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEDIAN={v}"),
  482. Info::ORIGIN_EVENT_SIZE_MEAN(v) => write!(f, "ORIGIN_EVENT_SIZE_MEAN={v}"),
  483. Info::END_STARTS_STD_DEV(v) => write!(f, "END_STARTS_STD_DEV={v}"),
  484. Info::END_MAPQ_MEAN(v) => write!(f, "END_MAPQ_MEAN={v}"),
  485. Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
  486. Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
  487. Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
  488. Info::CLASS(v) => write!(f, "CLASS={v}"),
  489. }
  490. }
  491. }
  492. pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
  493. v.iter()
  494. .map(|n| n.to_string())
  495. .collect::<Vec<String>>()
  496. .join(",")
  497. }
  498. /// Format
  499. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  500. pub enum Format {
  501. // DeepVariant
  502. GT(String),
  503. GQ(u32),
  504. DP(u32),
  505. AD(Vec<u32>),
  506. VAF(f32),
  507. PL(Vec<u32>),
  508. // Clairs
  509. // when format begins with N: normal
  510. // AF(f32),
  511. // NAF(f32), // DP(u32),
  512. NDP(u32),
  513. NAD(Vec<u32>),
  514. AU(u32),
  515. CU(u32),
  516. GU(u32),
  517. TU(u32),
  518. NAU(u32),
  519. NCU(u32),
  520. NGU(u32),
  521. NTU(u32),
  522. // nanomonsv
  523. TR(u32),
  524. VR(u32),
  525. Other((String, String)), // (key, value)
  526. }
  527. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
  528. pub struct Formats(Vec<Format>);
  529. impl TryFrom<(&str, &str)> for Formats {
  530. type Error = anyhow::Error;
  531. fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
  532. let keys: Vec<&str> = k.split(':').collect();
  533. let values: Vec<&str> = v.split(':').collect();
  534. if keys.len() != values.len() {
  535. anyhow::bail!("Mismatch between keys and values count for {k} {v}");
  536. }
  537. Ok(Self(
  538. keys.into_iter()
  539. .zip(values)
  540. .map(|(key, value)| Format::try_from((key, value)))
  541. .collect::<Result<Vec<Format>, _>>()
  542. .map_err(|e| anyhow::anyhow!("Failed to parse format: {e}"))?,
  543. ))
  544. }
  545. }
  546. impl From<Formats> for (String, String) {
  547. fn from(formats: Formats) -> Self {
  548. let mut keys = Vec::new();
  549. let mut values = Vec::new();
  550. for format in formats.0 {
  551. let (key, value): (String, String) = format.into();
  552. keys.push(key);
  553. values.push(value);
  554. }
  555. (keys.join(":"), values.join(":"))
  556. }
  557. }
  558. impl TryFrom<(&str, &str)> for Format {
  559. type Error = anyhow::Error;
  560. fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
  561. let format = match key {
  562. "GT" => Format::GT(value.to_string()),
  563. "GQ" => Format::GQ(parse_value(value, key)?),
  564. "DP" => Format::DP(parse_value(value, key)?),
  565. "AD" => Format::AD(parse_vec_value(value, key)?),
  566. "VAF" => Format::VAF(parse_value(value, key)?),
  567. // "AF" => Format::AF(parse_value(value, key)?),
  568. // "NAF" => Format::NAF(parse_value(value, key)?),
  569. "NDP" => Format::NDP(parse_value(value, key)?),
  570. "NAD" => Format::NAD(parse_vec_value(value, key)?),
  571. "AU" => Format::AU(parse_value(value, key)?),
  572. "CU" => Format::CU(parse_value(value, key)?),
  573. "GU" => Format::GU(parse_value(value, key)?),
  574. "TU" => Format::TU(parse_value(value, key)?),
  575. "NAU" => Format::NAU(parse_value(value, key)?),
  576. "NCU" => Format::NCU(parse_value(value, key)?),
  577. "NGU" => Format::NGU(parse_value(value, key)?),
  578. "NTU" => Format::NTU(parse_value(value, key)?),
  579. "PL" => Format::PL(parse_vec_value(value, key)?),
  580. "TR" => Format::TR(parse_value(value, key)?),
  581. "VR" => Format::VR(parse_value(value, key)?),
  582. _ => Format::Other((key.to_string(), value.to_string())),
  583. };
  584. Ok(format)
  585. }
  586. }
  587. // Helper function to parse a single value (DeepSeek)
  588. fn parse_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<T>
  589. where
  590. T::Err: std::fmt::Debug,
  591. {
  592. value
  593. .parse()
  594. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  595. .context(format!("Can't parse {}: {}", key, value)) // Add context
  596. }
  597. // Helper function to parse comma-separated values (DeepSeek)
  598. fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
  599. where
  600. T::Err: std::fmt::Debug,
  601. {
  602. value
  603. .split(',')
  604. .map(|e| {
  605. e.parse()
  606. .map_err(|e| anyhow::anyhow!("{:?}", e)) // Convert the error to `anyhow::Error`
  607. .context(format!("Failed to parse {}: {}", key, e)) // Add context
  608. })
  609. .collect()
  610. }
  611. impl From<Format> for (String, String) {
  612. fn from(format: Format) -> Self {
  613. let concat = |values: Vec<u32>| -> String {
  614. values
  615. .iter()
  616. .map(|v| v.to_string())
  617. .collect::<Vec<_>>()
  618. .join(",")
  619. };
  620. match format {
  621. Format::GT(value) => ("GT".to_string(), value),
  622. Format::GQ(value) => ("GQ".to_string(), value.to_string()),
  623. Format::DP(value) => ("DP".to_string(), value.to_string()),
  624. Format::AD(values) => ("AD".to_string(), concat(values)),
  625. Format::VAF(value) => ("VAF".to_string(), value.to_string()),
  626. Format::PL(values) => ("PL".to_string(), concat(values)),
  627. Format::Other((key, value)) => (key, value),
  628. // Format::AF(value) => ("AF".to_string(), value.to_string()),
  629. // Format::NAF(value) => ("NAF".to_string(), value.to_string()),
  630. Format::NDP(value) => ("NDP".to_string(), value.to_string()),
  631. Format::NAD(values) => ("NAD".to_string(), concat(values)),
  632. Format::AU(value) => ("AU".to_string(), value.to_string()),
  633. Format::CU(value) => ("CU".to_string(), value.to_string()),
  634. Format::GU(value) => ("GU".to_string(), value.to_string()),
  635. Format::TU(value) => ("TU".to_string(), value.to_string()),
  636. Format::NAU(value) => ("NAU".to_string(), value.to_string()),
  637. Format::NCU(value) => ("NCU".to_string(), value.to_string()),
  638. Format::NGU(value) => ("NGU".to_string(), value.to_string()),
  639. Format::NTU(value) => ("NTU".to_string(), value.to_string()),
  640. Format::TR(value) => ("TR".to_string(), value.to_string()),
  641. Format::VR(value) => ("VR".to_string(), value.to_string()),
  642. }
  643. }
  644. }
  645. impl Formats {
  646. pub fn commun_deepvariant_clairs(&self) -> Self {
  647. let filtered_vec: Vec<Format> = self
  648. .0
  649. .clone()
  650. .into_iter()
  651. .map(|e| {
  652. if let Format::VAF(_v) = e {
  653. e
  654. // Format::AF(v)
  655. } else {
  656. e
  657. }
  658. })
  659. .filter(|format| {
  660. matches!(
  661. format,
  662. Format::GT(_) | Format::GQ(_) | Format::DP(_) | Format::AD(_) /* | Format::AF(_) */
  663. )
  664. })
  665. .collect();
  666. Formats(filtered_vec)
  667. }
  668. }
  669. /// Filter
  670. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  671. pub enum Filter {
  672. PASS,
  673. Other(String),
  674. }
  675. impl FromStr for Filter {
  676. type Err = anyhow::Error;
  677. fn from_str(s: &str) -> anyhow::Result<Self> {
  678. match s {
  679. "PASS" => Ok(Filter::PASS),
  680. _ => Ok(Filter::Other(s.to_string())),
  681. }
  682. }
  683. }
  684. impl fmt::Display for Filter {
  685. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  686. match self {
  687. Filter::PASS => write!(f, "PASS"),
  688. Filter::Other(ref s) => write!(f, "{}", s),
  689. }
  690. }
  691. }
  692. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  693. pub enum ReferenceAlternative {
  694. Nucleotide(Base),
  695. Nucleotides(Vec<Base>),
  696. Unstructured(String),
  697. }
  698. impl FromStr for ReferenceAlternative {
  699. type Err = anyhow::Error;
  700. fn from_str(s: &str) -> anyhow::Result<Self> {
  701. let possible_bases = s.as_bytes().iter();
  702. let mut res: Vec<Base> = Vec::new();
  703. for &base in possible_bases {
  704. match base.try_into() {
  705. std::result::Result::Ok(b) => res.push(b),
  706. Err(_) => {
  707. return Ok(Self::Unstructured(s.to_string()));
  708. }
  709. }
  710. }
  711. if res.len() == 1 {
  712. Ok(Self::Nucleotide(res.pop().unwrap()))
  713. } else {
  714. Ok(Self::Nucleotides(res))
  715. }
  716. }
  717. }
  718. impl fmt::Display for ReferenceAlternative {
  719. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  720. let string = match self {
  721. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  722. ReferenceAlternative::Nucleotides(bases) => bases
  723. .iter()
  724. .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
  725. ReferenceAlternative::Unstructured(s) => s.to_string(),
  726. };
  727. write!(f, "{}", string)
  728. }
  729. }
  730. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
  731. pub enum Base {
  732. A,
  733. T,
  734. C,
  735. G,
  736. N,
  737. }
  738. impl TryFrom<u8> for Base {
  739. type Error = anyhow::Error;
  740. fn try_from(base: u8) -> anyhow::Result<Self> {
  741. match base {
  742. b'A' => Ok(Base::A),
  743. b'T' => Ok(Base::T),
  744. b'C' => Ok(Base::C),
  745. b'G' => Ok(Base::G),
  746. b'N' => Ok(Base::N),
  747. _ => Err(anyhow::anyhow!(
  748. "Unknown base: {}",
  749. String::from_utf8_lossy(&[base])
  750. )),
  751. }
  752. }
  753. }
  754. impl Base {
  755. pub fn into_u8(self) -> u8 {
  756. match self {
  757. Base::A => b'A',
  758. Base::T => b'T',
  759. Base::C => b'C',
  760. Base::G => b'G',
  761. Base::N => b'N',
  762. }
  763. }
  764. }
  765. impl fmt::Display for Base {
  766. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  767. // Use `self.number` to refer to each positional data point.
  768. let str = match self {
  769. Base::A => "A",
  770. Base::T => "T",
  771. Base::C => "C",
  772. Base::G => "G",
  773. Base::N => "N",
  774. };
  775. write!(f, "{}", str)
  776. }
  777. }
  778. pub trait Variants {
  779. fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection>;
  780. }
  781. pub trait VariantId {
  782. fn variant_id(&self) -> String;
  783. }
  784. pub trait RunnerVariants: Run + Variants + Send + Sync {}
  785. pub type CallerBox = Box<dyn RunnerVariants + Send + Sync>;
  786. #[macro_export]
  787. macro_rules! init_somatic_callers {
  788. ($id:expr, $config:expr, $($runner:ty),+ $(,)?) => {
  789. vec![
  790. $(
  791. Box::new(<$runner>::initialize($id, $config.clone())?) as CallerBox
  792. ),+
  793. ]
  794. };
  795. }
  796. #[macro_export]
  797. macro_rules! init_solo_callers {
  798. ($id:expr, $config:expr, $($runner:ty, $arg:expr),+ $(,)?) => {
  799. vec![
  800. $(
  801. Box::new(<$runner>::initialize($id, $arg, $config.clone())?) as CallerBox
  802. ),+
  803. ]
  804. };
  805. }
  806. pub fn run_variants(iterable: &mut [CallerBox]) -> anyhow::Result<()> {
  807. iterable
  808. .iter_mut()
  809. .try_for_each(|runner| runner.run())
  810. .map_err(|e| anyhow::anyhow!("Error while calling run_variants.\n{e}"))
  811. }
  812. pub fn load_variants(
  813. iterable: &mut [CallerBox],
  814. annotations: &Annotations,
  815. ) -> anyhow::Result<Vec<VariantCollection>> {
  816. iterable
  817. .par_iter()
  818. .map(|runner| runner.variants(annotations))
  819. .collect::<anyhow::Result<Vec<_>>>()
  820. .map_err(|e| anyhow::anyhow!("Error while running load_variants.\n{e}"))
  821. }
  822. pub fn parallel_intersection<T: Hash + Eq + Clone + Send + Sync>(
  823. vec1: &[T],
  824. vec2: &[T],
  825. ) -> (Vec<T>, Vec<T>, Vec<T>) {
  826. let set1: HashSet<_> = vec1.par_iter().cloned().collect();
  827. let set2: HashSet<_> = vec2.par_iter().cloned().collect();
  828. let common: Vec<T> = set1
  829. .par_iter()
  830. .filter(|item| set2.contains(item))
  831. .cloned()
  832. .collect();
  833. let only_in_first: Vec<T> = set1
  834. .par_iter()
  835. .filter(|item| !set2.contains(item))
  836. .cloned()
  837. .collect();
  838. let only_in_second: Vec<T> = set2
  839. .par_iter()
  840. .filter(|item| !set1.contains(item))
  841. .cloned()
  842. .collect();
  843. (common, only_in_first, only_in_second)
  844. }