variants.rs 45 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332
  1. use crate::{
  2. annotations::{
  3. cosmic::Cosmic,
  4. echtvar::{parse_echtvar_val, run_echtvar},
  5. gnomad::GnomAD,
  6. ncbi_gff::NCBIGFF,
  7. vep::{get_best_vep, vep_chunk, VEP},
  8. },
  9. callers::{
  10. clairs::{ClairSFormat, ClairSInfo},
  11. deepvariant::{DeepVariantFormat, DeepVariantInfo},
  12. nanomonsv::{NanomonsvFormat, NanomonsvInfo},
  13. sniffles::{SnifflesFormat, SnifflesInfo},
  14. },
  15. config::Config,
  16. in_out::{
  17. self,
  18. dict_reader::read_dict,
  19. get_reader,
  20. vcf_reader::{read_vcf, read_vcf_progress, VCFRow},
  21. vcf_writer::{vcf_header_from, VariantWritter},
  22. },
  23. sql::{stats_sql::insert_stats, variants_sql::insert_variants},
  24. utils::{
  25. chi_square_test_for_proportions, estimate_shannon_entropy, get_hts_nt_pileup, new_pg,
  26. new_pg_speed, print_stat_cat,
  27. },
  28. };
  29. use anyhow::{anyhow, Context, Error, Ok, Result};
  30. use csv::ReaderBuilder;
  31. use dashmap::DashMap;
  32. use hashbrown::HashMap;
  33. use indicatif::{MultiProgress, ParallelProgressIterator};
  34. use log::{info, warn};
  35. use noodles_core::{region::Region, Position};
  36. use noodles_fasta::indexed_reader::Builder as FastaBuilder;
  37. use noodles_gff as gff;
  38. use rayon::prelude::*;
  39. use serde::{Deserialize, Serialize};
  40. use std::io::Write;
  41. use std::{
  42. env::temp_dir,
  43. fmt,
  44. fs::File,
  45. str::FromStr,
  46. sync::{
  47. atomic::{AtomicI32, Ordering},
  48. Arc,
  49. },
  50. };
  51. // chr12:25116542|G>T KRAS
  52. #[derive(Debug, Clone)]
  53. pub struct Variants {
  54. pub name: String,
  55. pub data: Vec<Variant>,
  56. pub constit: DashMap<String, Variant>,
  57. pub stats_vcf: StatsVCF,
  58. pub stats_bam: StatsBAM,
  59. pub cfg: Config,
  60. pub mp: MultiProgress,
  61. }
  62. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  63. pub struct StatsVCF {
  64. n_tumoral_init: usize,
  65. n_constit_init: usize,
  66. n_constit: i32,
  67. n_loh: i32,
  68. n_low_mrd_depth: i32,
  69. }
  70. impl fmt::Display for StatsVCF {
  71. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  72. let k = 100.0 / self.n_tumoral_init as f64;
  73. let string = format!(
  74. "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
  75. self.n_constit, self.n_constit as f64 * k,
  76. self.n_loh, self.n_loh as f64 * k,
  77. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
  78. );
  79. write!(f, "{}", string)
  80. }
  81. }
  82. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  83. pub struct StatsBAM {
  84. n_lasting: i32,
  85. n_constit: i32,
  86. n_low_mrd_depth: i32,
  87. n_low_diversity: i32,
  88. n_somatic: i32,
  89. }
  90. impl fmt::Display for StatsBAM {
  91. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  92. let k = 100.0 / self.n_lasting as f64;
  93. let string = format!(
  94. "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
  95. self.n_constit, self.n_constit as f64 * k,
  96. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
  97. self.n_low_diversity, self.n_low_diversity as f64 * k,
  98. self.n_somatic, self.n_somatic as f64 * k
  99. );
  100. write!(f, "{}", string)
  101. }
  102. }
  103. impl Variants {
  104. pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
  105. Self {
  106. name,
  107. data,
  108. constit: DashMap::new(),
  109. stats_vcf: StatsVCF::default(),
  110. stats_bam: StatsBAM::default(),
  111. cfg: Config::get().unwrap(),
  112. mp: mp.clone(),
  113. }
  114. }
  115. pub fn from_vcfs(
  116. name: String,
  117. v: Vec<(&str, &VCFSource, &VariantType)>,
  118. cfg: &Config,
  119. mp: MultiProgress,
  120. ) -> Result<Self> {
  121. let pg = mp.add(new_pg(v.len() as u64));
  122. pg.set_message("Reading VCF");
  123. let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
  124. let n_constit = AtomicI32::new(0);
  125. let data: Vec<Variant> = v
  126. .par_iter()
  127. // .progress_count(v.len() as u64)
  128. .flat_map(|(path, source, variant_type)| {
  129. let r = match variant_type {
  130. VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
  131. VariantType::Constitutionnal => {
  132. read_vcf(path, source, variant_type)
  133. .unwrap()
  134. .par_iter()
  135. .for_each(|e| {
  136. n_constit.fetch_add(1, Ordering::SeqCst);
  137. constit.insert(
  138. format!(
  139. "{}:{}|{}>{}",
  140. e.contig, e.position, e.reference, e.alternative
  141. ),
  142. e.clone(),
  143. );
  144. });
  145. vec![]
  146. }
  147. };
  148. pg.inc(1);
  149. r
  150. })
  151. .collect();
  152. let stats_vcf = StatsVCF::default();
  153. let stats_bam = StatsBAM::default();
  154. let constit = Arc::try_unwrap(constit).unwrap();
  155. let elapsed = pg.elapsed();
  156. pg.finish();
  157. info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
  158. let cfg = cfg.clone();
  159. return Ok(Self {
  160. name,
  161. data,
  162. constit,
  163. stats_vcf,
  164. stats_bam,
  165. cfg,
  166. mp: mp.clone(),
  167. });
  168. }
  169. pub fn vcf_filters(&mut self) {
  170. let cfg = &self.cfg;
  171. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  172. pg.set_message("VCF filtering");
  173. let n_tumoral_init = self.len();
  174. let n_constit_init = self.constit_len();
  175. let min_loh_diff = cfg.deepvariant_loh_pval as f64;
  176. let min_mrd_depth = cfg.min_mrd_depth;
  177. info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
  178. let n_constit = AtomicI32::new(0);
  179. let n_loh = AtomicI32::new(0);
  180. let n_low_mrd_depth = AtomicI32::new(0);
  181. self.data = self
  182. .data
  183. .par_iter()
  184. .map(|e| {
  185. let mut tumoral = e.clone();
  186. let k = format!(
  187. "{}:{}|{}>{}",
  188. tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
  189. );
  190. if let Some(mut constit) = self.constit.get_mut(&k) {
  191. if constit.get_depth() < min_mrd_depth {
  192. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  193. tumoral.annotations.push(AnnotationType::VariantCategory(
  194. VariantCategory::LowMRDDepth,
  195. ));
  196. } else if constit.get_n_alt() == constit.get_depth()
  197. && tumoral.get_n_alt() == tumoral.get_depth()
  198. {
  199. n_constit.fetch_add(1, Ordering::SeqCst);
  200. tumoral
  201. .annotations
  202. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  203. } else {
  204. let pval = chi_square_test_for_proportions(
  205. tumoral.get_n_alt() as f64,
  206. tumoral.get_depth() as f64,
  207. constit.get_n_alt() as f64,
  208. constit.get_depth() as f64,
  209. )
  210. .unwrap();
  211. if pval != 0.0 && pval <= min_loh_diff {
  212. n_loh.fetch_add(1, Ordering::SeqCst);
  213. tumoral
  214. .annotations
  215. .push(AnnotationType::VariantCategory(VariantCategory::LOH));
  216. } else {
  217. n_constit.fetch_add(1, Ordering::SeqCst);
  218. tumoral
  219. .annotations
  220. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  221. }
  222. }
  223. // If not un Constit registry, ClairS look for VCF constit depth and n_alt
  224. } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
  225. {
  226. if format.ndp < min_mrd_depth {
  227. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  228. tumoral.annotations.push(AnnotationType::VariantCategory(
  229. VariantCategory::LowMRDDepth,
  230. ));
  231. } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
  232. {
  233. let mrd_n_alt = match alt_base {
  234. Base::A => format.nau,
  235. Base::T => format.ntu,
  236. Base::C => format.ncu,
  237. Base::G => format.ngu,
  238. _ => 0,
  239. };
  240. if mrd_n_alt != 0 {
  241. n_constit.fetch_add(1, Ordering::SeqCst);
  242. tumoral
  243. .annotations
  244. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  245. }
  246. }
  247. }
  248. pg.inc(1);
  249. tumoral
  250. })
  251. .collect();
  252. let n_constit = n_constit.load(Ordering::SeqCst);
  253. let n_loh = n_loh.load(Ordering::SeqCst);
  254. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  255. self.stats_vcf = StatsVCF {
  256. n_tumoral_init,
  257. n_constit_init,
  258. n_constit,
  259. n_loh,
  260. n_low_mrd_depth,
  261. };
  262. // let elapsed = start.elapsed();
  263. let elapsed = pg.elapsed();
  264. pg.finish();
  265. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  266. }
  267. /// Filter variants by reading informations from constist BAM.
  268. pub fn bam_filters(&mut self, mrd_bam: &str) {
  269. let cfg = &self.cfg;
  270. // let start = Instant::now();
  271. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  272. pg.set_message("BAM filtering");
  273. let min_mrd_depth = cfg.min_mrd_depth;
  274. info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
  275. let n_already = AtomicI32::new(0);
  276. let n_constit = AtomicI32::new(0);
  277. let n_low_mrd_depth = AtomicI32::new(0);
  278. let n_low_diversity = AtomicI32::new(0);
  279. let n_somatic = AtomicI32::new(0);
  280. self.data.par_chunks_mut(10_000).for_each(|chunk| {
  281. let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
  282. .context(anyhow!("Reading {}", mrd_bam))
  283. .unwrap();
  284. let mut genome_reader = FastaBuilder::default()
  285. .build_from_path(&cfg.reference_fa)
  286. .unwrap();
  287. for tumoral in chunk.iter_mut() {
  288. pg.inc(1);
  289. if tumoral.annotations.len() > 0 {
  290. n_already.fetch_add(1, Ordering::SeqCst);
  291. continue;
  292. }
  293. let (pos, is_ins) = match tumoral.alt_cat() {
  294. AlterationCategory::INS => (tumoral.position, true),
  295. AlterationCategory::DEL => (tumoral.position, false),
  296. _ => (tumoral.position, false),
  297. };
  298. match get_hts_nt_pileup(
  299. &mut bam,
  300. &tumoral.contig,
  301. pos as i32,
  302. is_ins, // tumoral.position as i32,
  303. ) {
  304. std::result::Result::Ok(bases) => {
  305. let depth = bases.len() as u32;
  306. if depth < min_mrd_depth {
  307. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  308. tumoral.annotations.push(AnnotationType::VariantCategory(
  309. VariantCategory::LowMRDDepth,
  310. ));
  311. } else {
  312. // Check local diversity
  313. let start =
  314. Position::try_from((tumoral.position - 20) as usize).unwrap();
  315. let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
  316. let r = Region::new(tumoral.contig.to_string(), start..=end);
  317. if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
  318. let s = reg.sequence();
  319. let u = s.as_ref();
  320. let s = String::from_utf8(u.to_vec()).unwrap();
  321. let ent = estimate_shannon_entropy(&s.to_lowercase());
  322. if ent < cfg.min_diversity {
  323. if tumoral.position == 148725437 {
  324. warn!("POS {}", ent);
  325. }
  326. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  327. tumoral.annotations.push(AnnotationType::VariantCategory(
  328. VariantCategory::LowDiversity,
  329. ));
  330. continue;
  331. }
  332. }
  333. // Check if the base is in constitutionnal pileup
  334. if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
  335. let alt_b = alt_b.clone().into_u8();
  336. let n_alt_mrd = bases
  337. .clone()
  338. .into_iter()
  339. .filter(|e| *e == alt_b)
  340. .collect::<Vec<_>>()
  341. .len();
  342. if n_alt_mrd > 0 {
  343. n_constit.fetch_add(1, Ordering::SeqCst);
  344. tumoral.annotations.push(AnnotationType::VariantCategory(
  345. VariantCategory::Constit,
  346. ));
  347. } else {
  348. n_somatic.fetch_add(1, Ordering::SeqCst);
  349. tumoral.annotations.push(AnnotationType::VariantCategory(
  350. VariantCategory::Somatic,
  351. ));
  352. }
  353. } else if tumoral.is_ins() {
  354. let n_alt_mrd =
  355. bases.clone().into_iter().filter(|e| *e == b'I').count();
  356. if n_alt_mrd > 0 {
  357. n_constit.fetch_add(1, Ordering::SeqCst);
  358. tumoral.annotations.push(AnnotationType::VariantCategory(
  359. VariantCategory::Constit,
  360. ));
  361. } else {
  362. n_somatic.fetch_add(1, Ordering::SeqCst);
  363. tumoral.annotations.push(AnnotationType::VariantCategory(
  364. VariantCategory::Somatic,
  365. ));
  366. }
  367. } else if tumoral.alt_cat() == AlterationCategory::DEL {
  368. let n_alt_mrd =
  369. bases.clone().into_iter().filter(|e| *e == b'D').count();
  370. if n_alt_mrd > 0 {
  371. n_constit.fetch_add(1, Ordering::SeqCst);
  372. tumoral.annotations.push(AnnotationType::VariantCategory(
  373. VariantCategory::Constit,
  374. ));
  375. } else {
  376. n_somatic.fetch_add(1, Ordering::SeqCst);
  377. tumoral.annotations.push(AnnotationType::VariantCategory(
  378. VariantCategory::Somatic,
  379. ));
  380. }
  381. }
  382. }
  383. }
  384. Err(r) => panic!("{}", r),
  385. }
  386. }
  387. });
  388. let n_constit = n_constit.load(Ordering::SeqCst);
  389. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  390. let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
  391. let n_somatic = n_somatic.load(Ordering::SeqCst);
  392. let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
  393. self.stats_bam = StatsBAM {
  394. n_lasting,
  395. n_constit,
  396. n_low_mrd_depth,
  397. n_low_diversity,
  398. n_somatic,
  399. };
  400. let elapsed = pg.elapsed();
  401. pg.finish();
  402. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  403. }
  404. pub fn get_cat(&mut self, cat: &VariantCategory) -> Vec<Variant> {
  405. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  406. pg.set_message(format!("Get cat {:?}", cat));
  407. self.data
  408. .par_iter()
  409. .progress_with(pg)
  410. .flat_map(|e| {
  411. if e.annotations
  412. .iter()
  413. .filter(|e| match e {
  414. AnnotationType::VariantCategory(vc) => vc == cat,
  415. _ => false,
  416. })
  417. .count()
  418. > 0
  419. {
  420. vec![e.clone()]
  421. } else {
  422. vec![]
  423. }
  424. })
  425. .collect::<Vec<Variant>>()
  426. }
  427. pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
  428. info!("Writing VCF {}", path);
  429. let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
  430. let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
  431. pg.set_message("Writing VCF");
  432. let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
  433. for row in to_write.iter_mut() {
  434. w.write_variant(row)?;
  435. pg.inc(1);
  436. }
  437. w.write_index_finish()?;
  438. Ok(())
  439. }
  440. pub fn keep_somatics_un(&mut self) {
  441. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  442. pg.set_message("Filtering Variants");
  443. self.data = self
  444. .data
  445. .par_iter_mut()
  446. .progress_with(pg)
  447. .flat_map(|e| {
  448. // keep unannotated and somatic
  449. if e.annotations
  450. .iter()
  451. .filter(|a| match a {
  452. AnnotationType::VariantCategory(vc) => match vc {
  453. VariantCategory::Somatic => false,
  454. _ => true,
  455. },
  456. _ => false,
  457. })
  458. .count()
  459. == 0
  460. {
  461. vec![e]
  462. } else {
  463. vec![]
  464. }
  465. })
  466. .map(|e| e.clone())
  467. .collect();
  468. }
  469. pub fn vep(&mut self) {
  470. // let steps = num_integer::div_ceil(self.data.len(), self.cfg.vep_chunk_size);
  471. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  472. pg.set_message("VEP");
  473. self.data
  474. .par_chunks_mut(self.cfg.vep_chunk_size)
  475. .progress_with(pg)
  476. .for_each(|chunks| vep_chunk(chunks).unwrap());
  477. }
  478. pub fn sort(&mut self) -> Result<()> {
  479. let cfg = &self.cfg;
  480. self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
  481. Ok(())
  482. }
  483. pub fn merge(&mut self) {
  484. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  485. pg.set_message("Merging Variants by contig, positions, ref, alt");
  486. let hm: DashMap<String, Variant> = DashMap::new();
  487. self.data.par_iter().progress_with(pg).for_each(|e| {
  488. let k = format!(
  489. "{}:{}|{}>{}",
  490. e.contig, e.position, e.reference, e.alternative
  491. );
  492. if let Some(mut v) = hm.get_mut(&k) {
  493. let v = v.value_mut();
  494. e.callers_data.iter().for_each(|cd| {
  495. v.callers_data.push(cd.clone());
  496. v.callers_data.dedup();
  497. });
  498. v.source.extend(e.source.clone());
  499. v.source.dedup();
  500. } else {
  501. hm.insert(k, e.clone());
  502. }
  503. });
  504. self.data = hm.iter().map(|e| e.value().clone()).collect();
  505. }
  506. pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
  507. let gff_path = gff_path.to_string();
  508. let len = self.data.len();
  509. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  510. pg.set_message("GFF Annotate");
  511. self.data
  512. .par_chunks_mut(len / 33)
  513. .progress_with(pg)
  514. .for_each(|chunk| {
  515. let mut reader = File::open(gff_path.to_string())
  516. .map(noodles_bgzf::Reader::new)
  517. .map(gff::Reader::new)
  518. .unwrap();
  519. let index = noodles_csi::read(format!("{}.csi", gff_path)).unwrap();
  520. for v in chunk.iter_mut() {
  521. let start = Position::try_from(v.position as usize).unwrap();
  522. let r = Region::new(v.contig.to_string(), start..=start);
  523. if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
  524. for row in rows {
  525. let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
  526. v.annotations.push(AnnotationType::NCBIGFF(ncbi));
  527. }
  528. }
  529. }
  530. });
  531. Ok(())
  532. }
  533. pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
  534. let len = self.len();
  535. let header = vcf_header_from(header_path)?;
  536. let pg = self.mp.add(new_pg_speed(len as u64));
  537. pg.set_message("Echtvar Annotate");
  538. self.data
  539. .par_chunks_mut(len / 33)
  540. .progress_with(pg)
  541. .for_each(|chunk| {
  542. let in_tmp = format!(
  543. "{}/echtvar_in_{}.vcf",
  544. temp_dir().to_str().unwrap(),
  545. uuid::Uuid::new_v4()
  546. );
  547. let out_tmp = format!(
  548. "{}/echtvar_in_{}.vcf.gz",
  549. temp_dir().to_str().unwrap(),
  550. uuid::Uuid::new_v4()
  551. );
  552. let mut vcf = File::create(&in_tmp).unwrap();
  553. let _ = writeln!(vcf, "{}", header);
  554. for (i, row) in chunk.iter().enumerate() {
  555. let _ = writeln!(
  556. vcf,
  557. "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
  558. row.contig,
  559. row.position,
  560. i + 1,
  561. row.reference,
  562. row.alternative,
  563. ".",
  564. ".",
  565. "."
  566. );
  567. }
  568. run_echtvar(&in_tmp, &out_tmp).unwrap();
  569. let mut reader = ReaderBuilder::new()
  570. .delimiter(b'\t')
  571. .has_headers(false)
  572. .comment(Some(b'#'))
  573. .flexible(true)
  574. .from_reader(get_reader(&out_tmp).unwrap());
  575. // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
  576. let mut last: usize = 1;
  577. for line in reader.deserialize::<VCFRow>() {
  578. if let std::result::Result::Ok(row) = line {
  579. let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
  580. let id: usize = row.id.parse().unwrap();
  581. if id != last {
  582. panic!("Echtvar output not in input order!");
  583. }
  584. if let Some(c) = cosmic {
  585. chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
  586. }
  587. if let Some(g) = gnomad {
  588. chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
  589. }
  590. last += 1;
  591. }
  592. }
  593. });
  594. Ok(())
  595. }
  596. pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
  597. self.data
  598. .par_iter()
  599. .filter(|v| {
  600. for annotation in v.annotations.iter() {
  601. match annotation {
  602. AnnotationType::VariantCategory(cat) => {
  603. if cat == category {
  604. return true;
  605. }
  606. }
  607. _ => (),
  608. }
  609. }
  610. return false;
  611. })
  612. .collect::<Vec<&Variant>>()
  613. }
  614. pub fn filter_snp(&mut self) -> Result<i32> {
  615. let n_snp = AtomicI32::new(0);
  616. self.data = self
  617. .data
  618. .clone()
  619. .into_par_iter()
  620. .filter(|e| {
  621. let mut res = true;
  622. e.annotations.iter().for_each(|a| {
  623. match a {
  624. AnnotationType::GnomAD(g) => {
  625. res = g.gnomad_af < 0.01;
  626. }
  627. _ => (),
  628. };
  629. });
  630. if !res {
  631. n_snp.fetch_add(1, Ordering::SeqCst);
  632. }
  633. res
  634. })
  635. .collect();
  636. let n = n_snp.load(Ordering::SeqCst);
  637. Ok(n)
  638. }
  639. pub fn len(&self) -> usize {
  640. self.data.len()
  641. }
  642. pub fn constit_len(&self) -> usize {
  643. self.constit.len()
  644. }
  645. pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
  646. self.data
  647. .par_iter()
  648. .filter(|v| v.contig == contig && v.position == pos)
  649. .map(|v| v.clone())
  650. .collect()
  651. }
  652. pub fn stats(&self) -> Result<()> {
  653. let mut callers_cat = HashMap::new();
  654. let mut n_caller_data = 0;
  655. let mut ncbi_feature = HashMap::new();
  656. let mut n_ncbi_feature = 0;
  657. let mut cosmic_sup_1 = HashMap::new();
  658. let mut n_cosmic_sup_1 = 0;
  659. let mut cons_cat = HashMap::new();
  660. let mut n_csq = 0;
  661. let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
  662. let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
  663. *v += 1;
  664. };
  665. for ele in self.data.iter() {
  666. // Callers
  667. let mut callers = Vec::new();
  668. for cd in &ele.callers_data {
  669. callers.push(
  670. match cd.format {
  671. Format::DeepVariant(_) => "DeepVariant",
  672. Format::ClairS(_) => "ClairS",
  673. Format::Sniffles(_) => "Sniffles",
  674. Format::Nanomonsv(_) => "Nanomonsv",
  675. }
  676. .to_string(),
  677. );
  678. }
  679. if callers.len() > 0 {
  680. n_caller_data += 1;
  681. callers.sort();
  682. let k = callers.join(",");
  683. let (_, v) = callers_cat
  684. .raw_entry_mut()
  685. .from_key(&k)
  686. .or_insert(k.clone(), 1);
  687. *v += 1;
  688. }
  689. // Annotations
  690. for annot in ele.annotations.iter() {
  691. let mut features = Vec::new();
  692. let mut cosmic_m1 = false;
  693. match annot {
  694. AnnotationType::NCBIGFF(ncbi) => {
  695. features.push(ncbi.feature.to_string());
  696. }
  697. AnnotationType::Cosmic(c) => {
  698. if c.cosmic_cnt > 1 {
  699. cosmic_m1 = true;
  700. }
  701. }
  702. _ => (),
  703. };
  704. if features.len() > 0 {
  705. features.sort();
  706. add_hm(&mut ncbi_feature, &features.join(","));
  707. n_ncbi_feature += 1;
  708. }
  709. if cosmic_m1 {
  710. add_hm(&mut cosmic_sup_1, "Cosmic > 1");
  711. n_cosmic_sup_1 += 1;
  712. }
  713. }
  714. // VEP
  715. let d: Vec<VEP> = ele
  716. .annotations
  717. .iter()
  718. .flat_map(|e| {
  719. if let AnnotationType::VEP(e) = e {
  720. e.clone()
  721. } else {
  722. vec![]
  723. }
  724. })
  725. .collect();
  726. if let std::result::Result::Ok(vep) = get_best_vep(&d) {
  727. if let Some(csq) = vep.consequence {
  728. n_csq += 1;
  729. let csq = csq.join(",");
  730. let (_, v) = cons_cat
  731. .raw_entry_mut()
  732. .from_key(&csq)
  733. .or_insert(csq.clone(), 1);
  734. *v += 1;
  735. }
  736. }
  737. }
  738. print_stat_cat(&cons_cat, n_csq as u32);
  739. print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
  740. print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
  741. print_stat_cat(&callers_cat, n_caller_data as u32);
  742. // let file = File::create(path)?;
  743. // let mut writer = BufWriter::new(file);
  744. // let tow = Stats::new(
  745. // (n_csq, cons_cat),
  746. // (n_ncbi_feature, ncbi_feature),
  747. // (n_caller_data, callers_cat),
  748. // n_cosmic_sup_1,
  749. // n_total,
  750. // n_constit,
  751. // n_tumoral,
  752. // n_constit_first,
  753. // n_loh_first,
  754. // n_low_mrd_depth_first,
  755. // n_constit_sec,
  756. // n_low_diversity_sec,
  757. // n_low_mrd_depth_sec,
  758. // n_somatic_sec,
  759. // );
  760. // serde_json::to_writer(&mut writer, &tow)?;
  761. Ok(())
  762. }
  763. pub fn save_sql(&self, path: &str) -> Result<()> {
  764. insert_variants(&self, path)
  765. }
  766. pub fn stats_sql(&self, path: &str) -> Result<()> {
  767. insert_stats(
  768. "VCF".to_string(),
  769. serde_json::to_string(&self.stats_vcf)?,
  770. path,
  771. )?;
  772. insert_stats(
  773. "BAM".to_string(),
  774. serde_json::to_string(&self.stats_bam)?,
  775. path,
  776. )?;
  777. Ok(())
  778. }
  779. pub fn save_bytes(&self, path: &str) -> Result<()> {
  780. let serialized = pot::to_vec(&self.data)?;
  781. let mut w =
  782. noodles_bgzf::writer::Builder::default().build_with_writer(File::create(path)?);
  783. w.write_all(&serialized)?;
  784. Ok(())
  785. }
  786. pub fn new_from_bytes(name: &str, path: &str, mp: MultiProgress) -> Result<Self> {
  787. info!("Loading variants from: {path}");
  788. let r = in_out::get_reader_progress(path, &mp)?;
  789. let data: Vec<Variant> = pot::from_reader(r)?;
  790. Ok(Self {
  791. name: name.to_string(),
  792. data,
  793. constit: DashMap::new(),
  794. stats_vcf: StatsVCF::default(),
  795. stats_bam: StatsBAM::default(),
  796. cfg: Config::get()?,
  797. mp,
  798. })
  799. }
  800. }
  801. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  802. pub struct Variant {
  803. pub contig: String,
  804. pub position: u32,
  805. pub reference: ReferenceAlternative,
  806. pub alternative: ReferenceAlternative,
  807. pub callers_data: Vec<CallerData>,
  808. pub n_alt: Option<u32>,
  809. pub n_ref: Option<u32>,
  810. pub vaf: Option<f32>,
  811. pub depth: Option<u32>,
  812. pub variant_type: VariantType,
  813. pub source: Vec<VCFSource>,
  814. pub annotations: Vec<AnnotationType>,
  815. }
  816. #[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
  817. pub struct CallerData {
  818. pub qual: Option<f32>,
  819. pub format: Format,
  820. pub info: Info,
  821. }
  822. impl CallerData {
  823. pub fn get_vaf(&self) -> f64 {
  824. match &self.format {
  825. Format::DeepVariant(v) => v.vaf as f64,
  826. Format::ClairS(v) => v.af,
  827. Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
  828. Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
  829. }
  830. }
  831. pub fn get_depth(&mut self) -> u32 {
  832. match &self.format {
  833. Format::DeepVariant(v) => v.dp,
  834. Format::ClairS(v) => v.dp,
  835. Format::Sniffles(v) => v.dv + v.dr,
  836. Format::Nanomonsv(v) => v.tr,
  837. }
  838. }
  839. pub fn get_n_alt(&mut self) -> u32 {
  840. match &self.format {
  841. Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
  842. Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
  843. Format::Sniffles(v) => v.dv,
  844. Format::Nanomonsv(v) => v.tr - v.vr,
  845. }
  846. }
  847. /// Variants filter rules
  848. pub fn should_filter(&self) -> bool {
  849. if let Info::Sniffles(info) = &self.info {
  850. let imprecise = info
  851. .tags
  852. .iter()
  853. .filter(|s| s.to_string() == "IMPRECISE".to_string())
  854. .count();
  855. let mut n_alt = 0;
  856. if let Format::Sniffles(f) = &self.format {
  857. n_alt = f.dv;
  858. }
  859. if imprecise == 0 && n_alt >= 3 {
  860. return false;
  861. } else {
  862. return true;
  863. }
  864. } else {
  865. return false;
  866. }
  867. }
  868. }
  869. #[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize)]
  870. pub enum VariantType {
  871. Somatic,
  872. Constitutionnal,
  873. }
  874. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  875. pub enum VCFSource {
  876. DeepVariant,
  877. ClairS,
  878. Sniffles,
  879. Nanomonsv,
  880. }
  881. impl FromStr for VCFSource {
  882. type Err = anyhow::Error;
  883. fn from_str(s: &str) -> Result<Self> {
  884. match s {
  885. "DeepVariant" => Ok(VCFSource::DeepVariant),
  886. "ClairS" => Ok(VCFSource::ClairS),
  887. "Sniffles" => Ok(VCFSource::Sniffles),
  888. "Nanomonsv" => Ok(VCFSource::Nanomonsv),
  889. _ => Err(anyhow!("Error parsing VCFSource")),
  890. }
  891. }
  892. }
  893. impl Variant {
  894. pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
  895. let callers_data = vec![CallerData {
  896. qual: row.qual.parse::<f32>().ok(),
  897. info: parse_info(&row.info, &source).context(anyhow!(
  898. "Can't parse {:?} info for {}",
  899. source,
  900. row.info
  901. ))?,
  902. format: parse_format(&source, &row.value).context(anyhow!(
  903. "Can't parse {:?} format for {}",
  904. source,
  905. row.value
  906. ))?,
  907. }];
  908. Ok(Variant {
  909. contig: row.chr.to_string(),
  910. position: row.pos,
  911. reference: row
  912. .reference
  913. .parse()
  914. .context(anyhow!("Error while parsing {}", row.reference))?,
  915. alternative: row
  916. .alt
  917. .parse()
  918. .context(anyhow!("Error while parsing {}", row.alt))?,
  919. n_ref: None,
  920. n_alt: None,
  921. vaf: None,
  922. depth: None,
  923. callers_data,
  924. source: vec![source],
  925. variant_type,
  926. annotations: Vec::new(),
  927. })
  928. }
  929. pub fn get_depth(&mut self) -> u32 {
  930. if let Some(depth) = self.depth {
  931. return depth;
  932. } else {
  933. let depth = self
  934. .callers_data
  935. .iter_mut()
  936. .map(|v| v.get_depth())
  937. .max()
  938. .unwrap();
  939. self.depth = Some(depth);
  940. return depth;
  941. }
  942. }
  943. pub fn get_n_alt(&mut self) -> u32 {
  944. if let Some(n_alt) = self.n_alt {
  945. return n_alt;
  946. } else {
  947. let n_alt = self
  948. .callers_data
  949. .iter_mut()
  950. .map(|v| v.get_n_alt())
  951. .max()
  952. .unwrap();
  953. self.n_alt = Some(n_alt);
  954. return n_alt;
  955. }
  956. }
  957. pub fn vaf(&mut self) -> f64 {
  958. let n_alt = self.get_n_alt() as f64;
  959. let depth = self.get_depth() as f64;
  960. n_alt / depth
  961. }
  962. fn is_ins(&self) -> bool {
  963. match (&self.reference, &self.alternative) {
  964. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
  965. _ => false,
  966. }
  967. }
  968. fn alt_cat(&self) -> AlterationCategory {
  969. match (&self.reference, &self.alternative) {
  970. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  971. AlterationCategory::SNV
  972. }
  973. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  974. AlterationCategory::INS
  975. }
  976. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  977. AlterationCategory::Other
  978. }
  979. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  980. AlterationCategory::DEL
  981. }
  982. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
  983. let a = a.len();
  984. let b = b.len();
  985. if a < b {
  986. AlterationCategory::INS
  987. } else if a > b {
  988. AlterationCategory::DEL
  989. } else {
  990. AlterationCategory::REP
  991. }
  992. }
  993. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
  994. AlterationCategory::Other
  995. }
  996. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
  997. AlterationCategory::Other
  998. }
  999. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
  1000. AlterationCategory::Other
  1001. }
  1002. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
  1003. AlterationCategory::Other
  1004. }
  1005. }
  1006. }
  1007. pub fn to_min_string(&mut self) -> String {
  1008. let depth = self.get_depth();
  1009. let n_alt = self.get_n_alt();
  1010. format!(
  1011. "DP:AD\t{}:{}",
  1012. depth,
  1013. vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
  1014. )
  1015. }
  1016. pub fn get_veps(&self) -> Vec<VEP> {
  1017. self.annotations
  1018. .iter()
  1019. .flat_map(|e| {
  1020. if let AnnotationType::VEP(e) = e {
  1021. e.clone()
  1022. } else {
  1023. vec![]
  1024. }
  1025. })
  1026. .collect()
  1027. }
  1028. pub fn get_best_vep(&self) -> Result<VEP> {
  1029. get_best_vep(&self.get_veps())
  1030. }
  1031. }
  1032. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1033. enum AlterationCategory {
  1034. SNV,
  1035. INS,
  1036. DEL,
  1037. REP,
  1038. Other,
  1039. }
  1040. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  1041. pub enum AnnotationType {
  1042. VariantCategory(VariantCategory),
  1043. VEP(Vec<VEP>),
  1044. Cluster(i32),
  1045. Cosmic(Cosmic),
  1046. GnomAD(GnomAD),
  1047. NCBIGFF(NCBIGFF),
  1048. }
  1049. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  1050. pub enum VariantCategory {
  1051. Somatic,
  1052. LowMRDDepth,
  1053. LOH,
  1054. Constit,
  1055. LowDiversity,
  1056. }
  1057. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1058. pub enum ReferenceAlternative {
  1059. Nucleotide(Base),
  1060. Nucleotides(Vec<Base>),
  1061. Unstructured(String),
  1062. }
  1063. impl FromStr for ReferenceAlternative {
  1064. type Err = anyhow::Error;
  1065. fn from_str(s: &str) -> Result<Self> {
  1066. let mut possible_bases = s.as_bytes().iter();
  1067. let mut res: Vec<Base> = Vec::new();
  1068. while let Some(&base) = possible_bases.next() {
  1069. match base.try_into() {
  1070. std::result::Result::Ok(b) => res.push(b),
  1071. Err(_) => {
  1072. return Ok(Self::Unstructured(s.to_string()));
  1073. }
  1074. }
  1075. }
  1076. if res.len() == 1 {
  1077. return Ok(Self::Nucleotide(res.pop().unwrap()));
  1078. } else {
  1079. return Ok(Self::Nucleotides(res));
  1080. }
  1081. }
  1082. }
  1083. impl fmt::Display for ReferenceAlternative {
  1084. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1085. let string = match self {
  1086. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  1087. ReferenceAlternative::Nucleotides(bases) => bases
  1088. .iter()
  1089. .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
  1090. ReferenceAlternative::Unstructured(s) => s.to_string(),
  1091. };
  1092. write!(f, "{}", string)
  1093. }
  1094. }
  1095. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1096. pub enum Base {
  1097. A,
  1098. T,
  1099. C,
  1100. G,
  1101. N,
  1102. }
  1103. impl TryFrom<u8> for Base {
  1104. type Error = anyhow::Error;
  1105. fn try_from(base: u8) -> Result<Self> {
  1106. match base {
  1107. b'A' => Ok(Base::A),
  1108. b'T' => Ok(Base::T),
  1109. b'C' => Ok(Base::C),
  1110. b'G' => Ok(Base::G),
  1111. b'N' => Ok(Base::N),
  1112. _ => Err(anyhow!(
  1113. "Unknown base: {}",
  1114. String::from_utf8_lossy(&vec![base])
  1115. )),
  1116. }
  1117. }
  1118. }
  1119. impl Base {
  1120. pub fn into_u8(self) -> u8 {
  1121. return match self {
  1122. Base::A => b'A',
  1123. Base::T => b'T',
  1124. Base::C => b'C',
  1125. Base::G => b'G',
  1126. Base::N => b'N',
  1127. };
  1128. }
  1129. }
  1130. impl fmt::Display for Base {
  1131. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1132. // Use `self.number` to refer to each positional data point.
  1133. let str = match self {
  1134. Base::A => "A",
  1135. Base::T => "T",
  1136. Base::C => "C",
  1137. Base::G => "G",
  1138. Base::N => "N",
  1139. };
  1140. write!(f, "{}", str)
  1141. }
  1142. }
  1143. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
  1144. pub enum Format {
  1145. DeepVariant(DeepVariantFormat),
  1146. ClairS(ClairSFormat),
  1147. Sniffles(SnifflesFormat),
  1148. Nanomonsv(NanomonsvFormat),
  1149. }
  1150. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
  1151. pub enum Info {
  1152. DeepVariant(DeepVariantInfo),
  1153. ClairS(ClairSInfo),
  1154. Sniffles(SnifflesInfo),
  1155. Nanomonsv(NanomonsvInfo),
  1156. }
  1157. fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
  1158. match source {
  1159. VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
  1160. VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
  1161. VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
  1162. VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
  1163. }
  1164. }
  1165. fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
  1166. let res = match vcf_source {
  1167. VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
  1168. VCFSource::ClairS => Format::ClairS(data.parse()?),
  1169. VCFSource::Sniffles => Format::Sniffles(data.parse()?),
  1170. VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
  1171. };
  1172. Ok(res)
  1173. }
  1174. pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
  1175. info!("Sorting {} entries", d.len());
  1176. let dict = read_dict(dict_path)?;
  1177. let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
  1178. // add to store
  1179. d.iter().for_each(|e| {
  1180. if let Some(vec) = store.get_mut(&e.contig) {
  1181. vec.push(e.clone());
  1182. } else {
  1183. store.insert(e.contig.to_string(), vec![e.clone()]);
  1184. }
  1185. });
  1186. // sort in each contig
  1187. store
  1188. .iter_mut()
  1189. .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
  1190. // return contig in the order of dict file
  1191. Ok(dict
  1192. .iter()
  1193. .flat_map(|(chr, _)| {
  1194. if let Some((_, vec)) = store.remove_entry(chr) {
  1195. vec
  1196. } else {
  1197. vec![]
  1198. }
  1199. })
  1200. .collect())
  1201. }
  1202. // pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
  1203. // let mut cluster_id = 0;
  1204. // let first = d.get(0).unwrap();
  1205. // let mut last_pos = first.position;
  1206. // let mut last_contig = first.contig.to_string();
  1207. //
  1208. // d.iter_mut().for_each(|e| {
  1209. // if e.contig != last_contig {
  1210. // cluster_id += 1;
  1211. // last_contig = e.contig.to_string();
  1212. // } else if e.position - last_pos > max_dist {
  1213. // cluster_id += 1;
  1214. // }
  1215. // e.annotations.push(AnnotationType::Cluster(cluster_id));
  1216. // last_pos = e.position;
  1217. // });
  1218. //
  1219. // cluster_id
  1220. // }