variants.rs 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295
  1. use crate::{
  2. annotations::{
  3. cosmic::Cosmic,
  4. echtvar::{parse_echtvar_val, run_echtvar},
  5. gnomad::GnomAD,
  6. ncbi_gff::NCBIGFF,
  7. vep::{get_best_vep, vep_chunk, VEP},
  8. },
  9. callers::{
  10. clairs::{ClairSFormat, ClairSInfo},
  11. deepvariant::{DeepVariantFormat, DeepVariantInfo},
  12. nanomonsv::{NanomonsvFormat, NanomonsvInfo},
  13. sniffles::{SnifflesFormat, SnifflesInfo},
  14. },
  15. config::Config,
  16. in_out::{
  17. dict_reader::read_dict,
  18. get_reader,
  19. vcf_reader::{read_vcf, read_vcf_progress, VCFRow},
  20. vcf_writer::{vcf_header_from, VariantWritter},
  21. },
  22. sql::{stats_sql::insert_stats, variants_sql::insert_variants},
  23. utils::{
  24. chi_square_test_for_proportions, estimate_shannon_entropy, get_hts_nt_pileup, new_pg,
  25. new_pg_speed, print_stat_cat,
  26. },
  27. };
  28. use anyhow::{anyhow, Context, Error, Ok, Result};
  29. use csv::ReaderBuilder;
  30. use dashmap::DashMap;
  31. use hashbrown::HashMap;
  32. use indicatif::{MultiProgress, ParallelProgressIterator};
  33. use log::{info, warn};
  34. use noodles_core::{region::Region, Position};
  35. use noodles_fasta::indexed_reader::Builder as FastaBuilder;
  36. use noodles_gff as gff;
  37. use rayon::prelude::*;
  38. use serde::{Deserialize, Serialize};
  39. use std::io::Write;
  40. use std::{
  41. env::temp_dir,
  42. fmt,
  43. fs::File,
  44. str::FromStr,
  45. sync::{
  46. atomic::{AtomicI32, Ordering},
  47. Arc,
  48. },
  49. };
  50. // chr12:25116542|G>T KRAS
  51. #[derive(Debug, Clone)]
  52. pub struct Variants {
  53. pub name: String,
  54. pub data: Vec<Variant>,
  55. constit: DashMap<String, Variant>,
  56. stats_vcf: StatsVCF,
  57. stats_bam: StatsBAM,
  58. cfg: Config,
  59. mp: MultiProgress,
  60. }
  61. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  62. pub struct StatsVCF {
  63. n_tumoral_init: usize,
  64. n_constit_init: usize,
  65. n_constit: i32,
  66. n_loh: i32,
  67. n_low_mrd_depth: i32,
  68. }
  69. impl fmt::Display for StatsVCF {
  70. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  71. let k = 100.0 / self.n_tumoral_init as f64;
  72. let string = format!(
  73. "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
  74. self.n_constit, self.n_constit as f64 * k,
  75. self.n_loh, self.n_loh as f64 * k,
  76. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
  77. );
  78. write!(f, "{}", string)
  79. }
  80. }
  81. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  82. pub struct StatsBAM {
  83. n_lasting: i32,
  84. n_constit: i32,
  85. n_low_mrd_depth: i32,
  86. n_low_diversity: i32,
  87. n_somatic: i32,
  88. }
  89. impl fmt::Display for StatsBAM {
  90. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  91. let k = 100.0 / self.n_lasting as f64;
  92. let string = format!(
  93. "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
  94. self.n_constit, self.n_constit as f64 * k,
  95. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
  96. self.n_low_diversity, self.n_low_diversity as f64 * k,
  97. self.n_somatic, self.n_somatic as f64 * k
  98. );
  99. write!(f, "{}", string)
  100. }
  101. }
  102. impl Variants {
  103. pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
  104. Self {
  105. name,
  106. data,
  107. constit: DashMap::new(),
  108. stats_vcf: StatsVCF::default(),
  109. stats_bam: StatsBAM::default(),
  110. cfg: Config::get().unwrap(),
  111. mp: mp.clone(),
  112. }
  113. }
  114. pub fn from_vcfs(
  115. name: String,
  116. v: Vec<(&str, &VCFSource, &VariantType)>,
  117. cfg: &Config,
  118. mp: MultiProgress,
  119. ) -> Result<Self> {
  120. let pg = mp.add(new_pg(v.len() as u64));
  121. pg.set_message("Reading VCF");
  122. let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
  123. let n_constit = AtomicI32::new(0);
  124. let data: Vec<Variant> = v
  125. .par_iter()
  126. // .progress_count(v.len() as u64)
  127. .flat_map(|(path, source, variant_type)| {
  128. let r = match variant_type {
  129. VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
  130. VariantType::Constitutionnal => {
  131. read_vcf(path, source, variant_type)
  132. .unwrap()
  133. .par_iter()
  134. .for_each(|e| {
  135. n_constit.fetch_add(1, Ordering::SeqCst);
  136. constit.insert(
  137. format!(
  138. "{}:{}|{}>{}",
  139. e.contig, e.position, e.reference, e.alternative
  140. ),
  141. e.clone(),
  142. );
  143. });
  144. vec![]
  145. }
  146. };
  147. pg.inc(1);
  148. r
  149. })
  150. .collect();
  151. let stats_vcf = StatsVCF::default();
  152. let stats_bam = StatsBAM::default();
  153. let constit = Arc::try_unwrap(constit).unwrap();
  154. let elapsed = pg.elapsed();
  155. pg.finish();
  156. info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
  157. let cfg = cfg.clone();
  158. return Ok(Self {
  159. name,
  160. data,
  161. constit,
  162. stats_vcf,
  163. stats_bam,
  164. cfg,
  165. mp: mp.clone(),
  166. });
  167. }
  168. pub fn vcf_filters(&mut self) {
  169. let cfg = &self.cfg;
  170. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  171. pg.set_message("VCF filtering");
  172. let n_tumoral_init = self.len();
  173. let n_constit_init = self.constit_len();
  174. let min_loh_diff = cfg.deepvariant_loh_pval as f64;
  175. let min_mrd_depth = cfg.min_mrd_depth;
  176. info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
  177. let n_constit = AtomicI32::new(0);
  178. let n_loh = AtomicI32::new(0);
  179. let n_low_mrd_depth = AtomicI32::new(0);
  180. self.data = self
  181. .data
  182. .par_iter()
  183. .map(|e| {
  184. let mut tumoral = e.clone();
  185. let k = format!(
  186. "{}:{}|{}>{}",
  187. tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
  188. );
  189. if let Some(mut constit) = self.constit.get_mut(&k) {
  190. if constit.get_depth() < min_mrd_depth {
  191. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  192. tumoral.annotations.push(AnnotationType::VariantCategory(
  193. VariantCategory::LowMRDDepth,
  194. ));
  195. } else if constit.get_n_alt() == constit.get_depth()
  196. && tumoral.get_n_alt() == tumoral.get_depth()
  197. {
  198. n_constit.fetch_add(1, Ordering::SeqCst);
  199. tumoral
  200. .annotations
  201. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  202. } else {
  203. let pval = chi_square_test_for_proportions(
  204. tumoral.get_n_alt() as f64,
  205. tumoral.get_depth() as f64,
  206. constit.get_n_alt() as f64,
  207. constit.get_depth() as f64,
  208. )
  209. .unwrap();
  210. if pval != 0.0 && pval <= min_loh_diff {
  211. n_loh.fetch_add(1, Ordering::SeqCst);
  212. tumoral
  213. .annotations
  214. .push(AnnotationType::VariantCategory(VariantCategory::LOH));
  215. } else {
  216. n_constit.fetch_add(1, Ordering::SeqCst);
  217. tumoral
  218. .annotations
  219. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  220. }
  221. }
  222. // If not un Constit registry, ClairS look for VCF constit depth and n_alt
  223. } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
  224. {
  225. if format.ndp < min_mrd_depth {
  226. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  227. tumoral.annotations.push(AnnotationType::VariantCategory(
  228. VariantCategory::LowMRDDepth,
  229. ));
  230. } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
  231. {
  232. let mrd_n_alt = match alt_base {
  233. Base::A => format.nau,
  234. Base::T => format.ntu,
  235. Base::C => format.ncu,
  236. Base::G => format.ngu,
  237. _ => 0,
  238. };
  239. if mrd_n_alt != 0 {
  240. n_constit.fetch_add(1, Ordering::SeqCst);
  241. tumoral
  242. .annotations
  243. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  244. }
  245. }
  246. }
  247. pg.inc(1);
  248. tumoral
  249. })
  250. .collect();
  251. let n_constit = n_constit.load(Ordering::SeqCst);
  252. let n_loh = n_loh.load(Ordering::SeqCst);
  253. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  254. self.stats_vcf = StatsVCF {
  255. n_tumoral_init,
  256. n_constit_init,
  257. n_constit,
  258. n_loh,
  259. n_low_mrd_depth,
  260. };
  261. // let elapsed = start.elapsed();
  262. let elapsed = pg.elapsed();
  263. pg.finish();
  264. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  265. }
  266. /// Filter variants by reading informations from constist BAM.
  267. pub fn bam_filters(&mut self, mrd_bam: &str) {
  268. let cfg = &self.cfg;
  269. // let start = Instant::now();
  270. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  271. pg.set_message("BAM filtering");
  272. let min_mrd_depth = cfg.min_mrd_depth;
  273. info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
  274. let n_already = AtomicI32::new(0);
  275. let n_constit = AtomicI32::new(0);
  276. let n_low_mrd_depth = AtomicI32::new(0);
  277. let n_low_diversity = AtomicI32::new(0);
  278. let n_somatic = AtomicI32::new(0);
  279. self.data.par_chunks_mut(10_000).for_each(|chunk| {
  280. let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
  281. .context(anyhow!("Reading {}", mrd_bam))
  282. .unwrap();
  283. let mut genome_reader = FastaBuilder::default()
  284. .build_from_path(&cfg.reference_fa)
  285. .unwrap();
  286. for tumoral in chunk.iter_mut() {
  287. pg.inc(1);
  288. if tumoral.annotations.len() > 0 {
  289. n_already.fetch_add(1, Ordering::SeqCst);
  290. continue;
  291. }
  292. let (pos, is_ins) = match tumoral.alt_cat() {
  293. AlterationCategory::INS => (tumoral.position, true),
  294. AlterationCategory::DEL => (tumoral.position, false),
  295. _ => (tumoral.position, false),
  296. };
  297. match get_hts_nt_pileup(
  298. &mut bam,
  299. &tumoral.contig,
  300. pos as i32,
  301. is_ins, // tumoral.position as i32,
  302. ) {
  303. std::result::Result::Ok(bases) => {
  304. let depth = bases.len() as u32;
  305. if depth < min_mrd_depth {
  306. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  307. tumoral.annotations.push(AnnotationType::VariantCategory(
  308. VariantCategory::LowMRDDepth,
  309. ));
  310. } else {
  311. // Check local diversity
  312. let start =
  313. Position::try_from((tumoral.position - 20) as usize).unwrap();
  314. let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
  315. let r = Region::new(tumoral.contig.to_string(), start..=end);
  316. if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
  317. let s = reg.sequence();
  318. let u = s.as_ref();
  319. let s = String::from_utf8(u.to_vec()).unwrap();
  320. let ent = estimate_shannon_entropy(&s.to_lowercase());
  321. if ent < cfg.min_diversity {
  322. if tumoral.position == 148725437 {
  323. warn!("POS {}", ent);
  324. }
  325. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  326. tumoral.annotations.push(AnnotationType::VariantCategory(
  327. VariantCategory::LowDiversity,
  328. ));
  329. continue;
  330. }
  331. }
  332. // Check if the base is in constitutionnal pileup
  333. if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
  334. let alt_b = alt_b.clone().into_u8();
  335. let n_alt_mrd = bases
  336. .clone()
  337. .into_iter()
  338. .filter(|e| *e == alt_b)
  339. .collect::<Vec<_>>()
  340. .len();
  341. if n_alt_mrd > 0 {
  342. n_constit.fetch_add(1, Ordering::SeqCst);
  343. tumoral.annotations.push(AnnotationType::VariantCategory(
  344. VariantCategory::Constit,
  345. ));
  346. } else {
  347. n_somatic.fetch_add(1, Ordering::SeqCst);
  348. tumoral.annotations.push(AnnotationType::VariantCategory(
  349. VariantCategory::Somatic,
  350. ));
  351. }
  352. } else if tumoral.is_ins() {
  353. let n_alt_mrd =
  354. bases.clone().into_iter().filter(|e| *e == b'I').count();
  355. if n_alt_mrd > 0 {
  356. n_constit.fetch_add(1, Ordering::SeqCst);
  357. tumoral.annotations.push(AnnotationType::VariantCategory(
  358. VariantCategory::Constit,
  359. ));
  360. } else {
  361. n_somatic.fetch_add(1, Ordering::SeqCst);
  362. tumoral.annotations.push(AnnotationType::VariantCategory(
  363. VariantCategory::Somatic,
  364. ));
  365. }
  366. } else if tumoral.alt_cat() == AlterationCategory::DEL {
  367. let n_alt_mrd =
  368. bases.clone().into_iter().filter(|e| *e == b'D').count();
  369. if n_alt_mrd > 0 {
  370. n_constit.fetch_add(1, Ordering::SeqCst);
  371. tumoral.annotations.push(AnnotationType::VariantCategory(
  372. VariantCategory::Constit,
  373. ));
  374. } else {
  375. n_somatic.fetch_add(1, Ordering::SeqCst);
  376. tumoral.annotations.push(AnnotationType::VariantCategory(
  377. VariantCategory::Somatic,
  378. ));
  379. }
  380. }
  381. }
  382. }
  383. Err(r) => panic!("{}", r),
  384. }
  385. }
  386. });
  387. let n_constit = n_constit.load(Ordering::SeqCst);
  388. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  389. let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
  390. let n_somatic = n_somatic.load(Ordering::SeqCst);
  391. let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
  392. self.stats_bam = StatsBAM {
  393. n_lasting,
  394. n_constit,
  395. n_low_mrd_depth,
  396. n_low_diversity,
  397. n_somatic,
  398. };
  399. let elapsed = pg.elapsed();
  400. pg.finish();
  401. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  402. }
  403. pub fn get_cat(&mut self, cat: VariantCategory) -> Vec<Variant> {
  404. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  405. pg.set_message(format!("Get cat {:?}", cat));
  406. self.data
  407. .par_iter()
  408. .progress_with(pg)
  409. .flat_map(|e| {
  410. if e.annotations
  411. .iter()
  412. .filter(|e| match e {
  413. AnnotationType::VariantCategory(vc) => *vc == cat,
  414. _ => false,
  415. })
  416. .count()
  417. > 0
  418. {
  419. vec![e.clone()]
  420. } else {
  421. vec![]
  422. }
  423. })
  424. .collect::<Vec<Variant>>()
  425. }
  426. pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
  427. info!("Writing VCF {}", path);
  428. let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
  429. let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
  430. pg.set_message("Writing VCF");
  431. let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
  432. for row in to_write.iter_mut() {
  433. w.write_variant(row)?;
  434. pg.inc(1);
  435. }
  436. w.write_index_finish()?;
  437. Ok(())
  438. }
  439. pub fn keep_somatics_un(&mut self) {
  440. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  441. pg.set_message("Filtering Variants");
  442. self.data = self
  443. .data
  444. .par_iter_mut()
  445. .progress_with(pg)
  446. .flat_map(|e| {
  447. // keep unannotated and somatic
  448. if e.annotations
  449. .iter()
  450. .filter(|a| match a {
  451. AnnotationType::VariantCategory(vc) => match vc {
  452. VariantCategory::Somatic => false,
  453. _ => true,
  454. },
  455. _ => false,
  456. })
  457. .count()
  458. == 0
  459. {
  460. vec![e]
  461. } else {
  462. vec![]
  463. }
  464. })
  465. .map(|e| e.clone())
  466. .collect();
  467. }
  468. pub fn vep(&mut self) {
  469. // let steps = num_integer::div_ceil(self.data.len(), self.cfg.vep_chunk_size);
  470. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  471. pg.set_message("VEP");
  472. self.data
  473. .par_chunks_mut(self.cfg.vep_chunk_size)
  474. .progress_with(pg)
  475. .for_each(|chunks| vep_chunk(chunks).unwrap());
  476. }
  477. pub fn sort(&mut self) -> Result<()> {
  478. let cfg = &self.cfg;
  479. self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
  480. Ok(())
  481. }
  482. pub fn merge(&mut self) {
  483. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  484. pg.set_message("Merging Variants by contig, positions, ref, alt");
  485. let hm: DashMap<String, Variant> = DashMap::new();
  486. self.data.par_iter().progress_with(pg).for_each(|e| {
  487. let k = format!(
  488. "{}:{}|{}>{}",
  489. e.contig, e.position, e.reference, e.alternative
  490. );
  491. if let Some(mut v) = hm.get_mut(&k) {
  492. let v = v.value_mut();
  493. e.callers_data.iter().for_each(|cd| {
  494. v.callers_data.push(cd.clone());
  495. v.callers_data.dedup();
  496. });
  497. v.source.extend(e.source.clone());
  498. v.source.dedup();
  499. } else {
  500. hm.insert(k, e.clone());
  501. }
  502. });
  503. self.data = hm.iter().map(|e| e.value().clone()).collect();
  504. }
  505. pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
  506. let gff_path = gff_path.to_string();
  507. let len = self.data.len();
  508. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  509. pg.set_message("GFF Annotate");
  510. self.data
  511. .par_chunks_mut(len / 33)
  512. .progress_with(pg)
  513. .for_each(|chunk| {
  514. let mut reader = File::open(gff_path.to_string())
  515. .map(noodles_bgzf::Reader::new)
  516. .map(gff::Reader::new)
  517. .unwrap();
  518. let index = noodles_csi::read(format!("{}.csi", gff_path)).unwrap();
  519. for v in chunk.iter_mut() {
  520. let start = Position::try_from(v.position as usize).unwrap();
  521. let r = Region::new(v.contig.to_string(), start..=start);
  522. if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
  523. for row in rows {
  524. let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
  525. v.annotations.push(AnnotationType::NCBIGFF(ncbi));
  526. }
  527. }
  528. }
  529. });
  530. Ok(())
  531. }
  532. pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
  533. let len = self.len();
  534. let header = vcf_header_from(header_path)?;
  535. let pg = self.mp.add(new_pg_speed(len as u64));
  536. pg.set_message("Echtvar Annotate");
  537. self.data
  538. .par_chunks_mut(len / 33)
  539. .progress_with(pg)
  540. .for_each(|chunk| {
  541. let in_tmp = format!(
  542. "{}/echtvar_in_{}.vcf",
  543. temp_dir().to_str().unwrap(),
  544. uuid::Uuid::new_v4()
  545. );
  546. let out_tmp = format!(
  547. "{}/echtvar_in_{}.vcf.gz",
  548. temp_dir().to_str().unwrap(),
  549. uuid::Uuid::new_v4()
  550. );
  551. let mut vcf = File::create(&in_tmp).unwrap();
  552. let _ = writeln!(vcf, "{}", header);
  553. for (i, row) in chunk.iter().enumerate() {
  554. let _ = writeln!(
  555. vcf,
  556. "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
  557. row.contig,
  558. row.position,
  559. i + 1,
  560. row.reference,
  561. row.alternative,
  562. ".",
  563. ".",
  564. "."
  565. );
  566. }
  567. run_echtvar(&in_tmp, &out_tmp).unwrap();
  568. let mut reader = ReaderBuilder::new()
  569. .delimiter(b'\t')
  570. .has_headers(false)
  571. .comment(Some(b'#'))
  572. .flexible(true)
  573. .from_reader(get_reader(&out_tmp).unwrap());
  574. // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
  575. let mut last: usize = 1;
  576. for line in reader.deserialize::<VCFRow>() {
  577. if let std::result::Result::Ok(row) = line {
  578. let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
  579. let id: usize = row.id.parse().unwrap();
  580. if id != last {
  581. panic!("Echtvar output not in input order!");
  582. }
  583. if let Some(c) = cosmic {
  584. chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
  585. }
  586. if let Some(g) = gnomad {
  587. chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
  588. }
  589. last += 1;
  590. }
  591. }
  592. });
  593. Ok(())
  594. }
  595. pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
  596. self.data.par_iter().filter(|v| {
  597. for annotation in v.annotations.iter() {
  598. match annotation {
  599. AnnotationType::VariantCategory(cat) => {
  600. if cat == category {
  601. return true;
  602. }
  603. }
  604. _ => (),
  605. }
  606. }
  607. return false;
  608. }).collect::<Vec<&Variant>>()
  609. }
  610. pub fn filter_snp(&mut self) -> Result<i32> {
  611. let n_snp = AtomicI32::new(0);
  612. self.data = self
  613. .data
  614. .clone()
  615. .into_par_iter()
  616. .filter(|e| {
  617. let mut res = true;
  618. e.annotations.iter().for_each(|a| {
  619. match a {
  620. AnnotationType::GnomAD(g) => {
  621. res = g.gnomad_af < 0.01;
  622. }
  623. _ => (),
  624. };
  625. });
  626. if !res {
  627. n_snp.fetch_add(1, Ordering::SeqCst);
  628. }
  629. res
  630. })
  631. .collect();
  632. let n = n_snp.load(Ordering::SeqCst);
  633. Ok(n)
  634. }
  635. pub fn len(&self) -> usize {
  636. self.data.len()
  637. }
  638. pub fn constit_len(&self) -> usize {
  639. self.constit.len()
  640. }
  641. pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
  642. self.data
  643. .par_iter()
  644. .filter(|v| v.contig == contig && v.position == pos)
  645. .map(|v| v.clone())
  646. .collect()
  647. }
  648. pub fn stats(&self) -> Result<()> {
  649. let mut callers_cat = HashMap::new();
  650. let mut n_caller_data = 0;
  651. let mut ncbi_feature = HashMap::new();
  652. let mut n_ncbi_feature = 0;
  653. let mut cosmic_sup_1 = HashMap::new();
  654. let mut n_cosmic_sup_1 = 0;
  655. let mut cons_cat = HashMap::new();
  656. let mut n_csq = 0;
  657. let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
  658. let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
  659. *v += 1;
  660. };
  661. for ele in self.data.iter() {
  662. // Callers
  663. let mut callers = Vec::new();
  664. for cd in &ele.callers_data {
  665. callers.push(
  666. match cd.format {
  667. Format::DeepVariant(_) => "DeepVariant",
  668. Format::ClairS(_) => "ClairS",
  669. Format::Sniffles(_) => "Sniffles",
  670. Format::Nanomonsv(_) => "Nanomonsv",
  671. }
  672. .to_string(),
  673. );
  674. }
  675. if callers.len() > 0 {
  676. n_caller_data += 1;
  677. callers.sort();
  678. let k = callers.join(",");
  679. let (_, v) = callers_cat
  680. .raw_entry_mut()
  681. .from_key(&k)
  682. .or_insert(k.clone(), 1);
  683. *v += 1;
  684. }
  685. // Annotations
  686. for annot in ele.annotations.iter() {
  687. let mut features = Vec::new();
  688. let mut cosmic_m1 = false;
  689. match annot {
  690. AnnotationType::NCBIGFF(ncbi) => {
  691. features.push(ncbi.feature.to_string());
  692. }
  693. AnnotationType::Cosmic(c) => {
  694. if c.cosmic_cnt > 1 {
  695. cosmic_m1 = true;
  696. }
  697. }
  698. _ => (),
  699. };
  700. if features.len() > 0 {
  701. features.sort();
  702. add_hm(&mut ncbi_feature, &features.join(","));
  703. n_ncbi_feature += 1;
  704. }
  705. if cosmic_m1 {
  706. add_hm(&mut cosmic_sup_1, "Cosmic > 1");
  707. n_cosmic_sup_1 += 1;
  708. }
  709. }
  710. // VEP
  711. let d: Vec<VEP> = ele
  712. .annotations
  713. .iter()
  714. .flat_map(|e| {
  715. if let AnnotationType::VEP(e) = e {
  716. e.clone()
  717. } else {
  718. vec![]
  719. }
  720. })
  721. .collect();
  722. if let std::result::Result::Ok(vep) = get_best_vep(&d) {
  723. if let Some(csq) = vep.consequence {
  724. n_csq += 1;
  725. let csq = csq.join(",");
  726. let (_, v) = cons_cat
  727. .raw_entry_mut()
  728. .from_key(&csq)
  729. .or_insert(csq.clone(), 1);
  730. *v += 1;
  731. }
  732. }
  733. }
  734. print_stat_cat(&cons_cat, n_csq as u32);
  735. print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
  736. print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
  737. print_stat_cat(&callers_cat, n_caller_data as u32);
  738. // let file = File::create(path)?;
  739. // let mut writer = BufWriter::new(file);
  740. // let tow = Stats::new(
  741. // (n_csq, cons_cat),
  742. // (n_ncbi_feature, ncbi_feature),
  743. // (n_caller_data, callers_cat),
  744. // n_cosmic_sup_1,
  745. // n_total,
  746. // n_constit,
  747. // n_tumoral,
  748. // n_constit_first,
  749. // n_loh_first,
  750. // n_low_mrd_depth_first,
  751. // n_constit_sec,
  752. // n_low_diversity_sec,
  753. // n_low_mrd_depth_sec,
  754. // n_somatic_sec,
  755. // );
  756. // serde_json::to_writer(&mut writer, &tow)?;
  757. Ok(())
  758. }
  759. pub fn save_sql(&self, path: &str) -> Result<()> {
  760. insert_variants(&self, path)
  761. }
  762. pub fn stats_sql(&self, path: &str) -> Result<()> {
  763. insert_stats(
  764. "VCF".to_string(),
  765. serde_json::to_string(&self.stats_vcf)?,
  766. path,
  767. )?;
  768. insert_stats(
  769. "BAM".to_string(),
  770. serde_json::to_string(&self.stats_bam)?,
  771. path,
  772. )?;
  773. Ok(())
  774. }
  775. }
  776. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  777. pub struct Variant {
  778. pub contig: String,
  779. pub position: u32,
  780. pub reference: ReferenceAlternative,
  781. pub alternative: ReferenceAlternative,
  782. pub callers_data: Vec<CallerData>,
  783. pub n_alt: Option<u32>,
  784. pub n_ref: Option<u32>,
  785. pub vaf: Option<f32>,
  786. pub depth: Option<u32>,
  787. pub variant_type: VariantType,
  788. pub source: Vec<VCFSource>,
  789. pub annotations: Vec<AnnotationType>,
  790. }
  791. #[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
  792. pub struct CallerData {
  793. pub qual: Option<f32>,
  794. pub format: Format,
  795. pub info: Info,
  796. }
  797. impl CallerData {
  798. pub fn get_vaf(&self) -> f64 {
  799. match &self.format {
  800. Format::DeepVariant(v) => v.vaf as f64,
  801. Format::ClairS(v) => v.af,
  802. Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
  803. Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
  804. }
  805. }
  806. pub fn get_depth(&mut self) -> u32 {
  807. match &self.format {
  808. Format::DeepVariant(v) => v.dp,
  809. Format::ClairS(v) => v.dp,
  810. Format::Sniffles(v) => v.dv + v.dr,
  811. Format::Nanomonsv(v) => v.tr,
  812. }
  813. }
  814. pub fn get_n_alt(&mut self) -> u32 {
  815. match &self.format {
  816. Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
  817. Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
  818. Format::Sniffles(v) => v.dv,
  819. Format::Nanomonsv(v) => v.tr - v.vr,
  820. }
  821. }
  822. /// Variants filter rules
  823. pub fn should_filter(&self) -> bool {
  824. if let Info::Sniffles(info) = &self.info {
  825. let imprecise = info
  826. .tags
  827. .iter()
  828. .filter(|s| s.to_string() == "IMPRECISE".to_string())
  829. .count();
  830. let mut n_alt = 0;
  831. if let Format::Sniffles(f) = &self.format {
  832. n_alt = f.dv;
  833. }
  834. if imprecise == 0 && n_alt >= 3 {
  835. return false;
  836. } else {
  837. return true;
  838. }
  839. } else {
  840. return false;
  841. }
  842. }
  843. }
  844. #[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize)]
  845. pub enum VariantType {
  846. Somatic,
  847. Constitutionnal,
  848. }
  849. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  850. pub enum VCFSource {
  851. DeepVariant,
  852. ClairS,
  853. Sniffles,
  854. Nanomonsv,
  855. }
  856. impl FromStr for VCFSource {
  857. type Err = anyhow::Error;
  858. fn from_str(s: &str) -> Result<Self> {
  859. match s {
  860. "DeepVariant" => Ok(VCFSource::DeepVariant),
  861. "ClairS" => Ok(VCFSource::ClairS),
  862. "Sniffles" => Ok(VCFSource::Sniffles),
  863. "Nanomonsv" => Ok(VCFSource::Nanomonsv),
  864. _ => Err(anyhow!("Error parsing VCFSource")),
  865. }
  866. }
  867. }
  868. impl Variant {
  869. pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
  870. let callers_data = vec![CallerData {
  871. qual: row.qual.parse::<f32>().ok(),
  872. info: parse_info(&row.info, &source).context(anyhow!(
  873. "Can't parse {:?} info for {}",
  874. source,
  875. row.info
  876. ))?,
  877. format: parse_format(&source, &row.value).context(anyhow!(
  878. "Can't parse {:?} format for {}",
  879. source,
  880. row.value
  881. ))?,
  882. }];
  883. Ok(Variant {
  884. contig: row.chr.to_string(),
  885. position: row.pos,
  886. reference: row
  887. .reference
  888. .parse()
  889. .context(anyhow!("Error while parsing {}", row.reference))?,
  890. alternative: row
  891. .alt
  892. .parse()
  893. .context(anyhow!("Error while parsing {}", row.alt))?,
  894. n_ref: None,
  895. n_alt: None,
  896. vaf: None,
  897. depth: None,
  898. callers_data,
  899. source: vec![source],
  900. variant_type,
  901. annotations: Vec::new(),
  902. })
  903. }
  904. pub fn get_depth(&mut self) -> u32 {
  905. if let Some(depth) = self.depth {
  906. return depth;
  907. } else {
  908. let depth = self
  909. .callers_data
  910. .iter_mut()
  911. .map(|v| v.get_depth())
  912. .max()
  913. .unwrap();
  914. self.depth = Some(depth);
  915. return depth;
  916. }
  917. }
  918. pub fn get_n_alt(&mut self) -> u32 {
  919. if let Some(n_alt) = self.n_alt {
  920. return n_alt;
  921. } else {
  922. let n_alt = self
  923. .callers_data
  924. .iter_mut()
  925. .map(|v| v.get_n_alt())
  926. .max()
  927. .unwrap();
  928. self.n_alt = Some(n_alt);
  929. return n_alt;
  930. }
  931. }
  932. fn is_ins(&self) -> bool {
  933. match (&self.reference, &self.alternative) {
  934. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
  935. _ => false,
  936. }
  937. }
  938. fn alt_cat(&self) -> AlterationCategory {
  939. match (&self.reference, &self.alternative) {
  940. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  941. AlterationCategory::SNV
  942. }
  943. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  944. AlterationCategory::INS
  945. }
  946. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  947. AlterationCategory::Other
  948. }
  949. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  950. AlterationCategory::DEL
  951. }
  952. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
  953. let a = a.len();
  954. let b = b.len();
  955. if a < b {
  956. AlterationCategory::INS
  957. } else if a > b {
  958. AlterationCategory::DEL
  959. } else {
  960. AlterationCategory::REP
  961. }
  962. }
  963. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
  964. AlterationCategory::Other
  965. }
  966. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
  967. AlterationCategory::Other
  968. }
  969. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
  970. AlterationCategory::Other
  971. }
  972. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
  973. AlterationCategory::Other
  974. }
  975. }
  976. }
  977. pub fn to_min_string(&mut self) -> String {
  978. let depth = self.get_depth();
  979. let n_alt = self.get_n_alt();
  980. format!(
  981. "DP:AD\t{}:{}",
  982. depth,
  983. vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
  984. )
  985. }
  986. pub fn get_veps(&self) -> Vec<VEP> {
  987. self.annotations
  988. .iter()
  989. .flat_map(|e| {
  990. if let AnnotationType::VEP(e) = e {
  991. e.clone()
  992. } else {
  993. vec![]
  994. }
  995. })
  996. .collect()
  997. }
  998. pub fn get_best_vep(&self) -> Result<VEP> {
  999. get_best_vep(&self.get_veps())
  1000. }
  1001. }
  1002. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1003. enum AlterationCategory {
  1004. SNV,
  1005. INS,
  1006. DEL,
  1007. REP,
  1008. Other,
  1009. }
  1010. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  1011. pub enum AnnotationType {
  1012. VariantCategory(VariantCategory),
  1013. VEP(Vec<VEP>),
  1014. Cluster(i32),
  1015. Cosmic(Cosmic),
  1016. GnomAD(GnomAD),
  1017. NCBIGFF(NCBIGFF),
  1018. }
  1019. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
  1020. pub enum VariantCategory {
  1021. Somatic,
  1022. LowMRDDepth,
  1023. LOH,
  1024. Constit,
  1025. LowDiversity,
  1026. }
  1027. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1028. pub enum ReferenceAlternative {
  1029. Nucleotide(Base),
  1030. Nucleotides(Vec<Base>),
  1031. Unstructured(String),
  1032. }
  1033. impl FromStr for ReferenceAlternative {
  1034. type Err = anyhow::Error;
  1035. fn from_str(s: &str) -> Result<Self> {
  1036. let mut possible_bases = s.as_bytes().iter();
  1037. let mut res: Vec<Base> = Vec::new();
  1038. while let Some(&base) = possible_bases.next() {
  1039. match base.try_into() {
  1040. std::result::Result::Ok(b) => res.push(b),
  1041. Err(_) => {
  1042. return Ok(Self::Unstructured(s.to_string()));
  1043. }
  1044. }
  1045. }
  1046. if res.len() == 1 {
  1047. return Ok(Self::Nucleotide(res.pop().unwrap()));
  1048. } else {
  1049. return Ok(Self::Nucleotides(res));
  1050. }
  1051. }
  1052. }
  1053. impl fmt::Display for ReferenceAlternative {
  1054. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1055. let string = match self {
  1056. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  1057. ReferenceAlternative::Nucleotides(bases) => bases
  1058. .iter()
  1059. .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
  1060. ReferenceAlternative::Unstructured(s) => s.to_string(),
  1061. };
  1062. write!(f, "{}", string)
  1063. }
  1064. }
  1065. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1066. pub enum Base {
  1067. A,
  1068. T,
  1069. C,
  1070. G,
  1071. N,
  1072. }
  1073. impl TryFrom<u8> for Base {
  1074. type Error = anyhow::Error;
  1075. fn try_from(base: u8) -> Result<Self> {
  1076. match base {
  1077. b'A' => Ok(Base::A),
  1078. b'T' => Ok(Base::T),
  1079. b'C' => Ok(Base::C),
  1080. b'G' => Ok(Base::G),
  1081. b'N' => Ok(Base::N),
  1082. _ => Err(anyhow!(
  1083. "Unknown base: {}",
  1084. String::from_utf8_lossy(&vec![base])
  1085. )),
  1086. }
  1087. }
  1088. }
  1089. impl Base {
  1090. pub fn into_u8(self) -> u8 {
  1091. return match self {
  1092. Base::A => b'A',
  1093. Base::T => b'T',
  1094. Base::C => b'C',
  1095. Base::G => b'G',
  1096. Base::N => b'N',
  1097. };
  1098. }
  1099. }
  1100. impl fmt::Display for Base {
  1101. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1102. // Use `self.number` to refer to each positional data point.
  1103. let str = match self {
  1104. Base::A => "A",
  1105. Base::T => "T",
  1106. Base::C => "C",
  1107. Base::G => "G",
  1108. Base::N => "N",
  1109. };
  1110. write!(f, "{}", str)
  1111. }
  1112. }
  1113. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
  1114. pub enum Format {
  1115. DeepVariant(DeepVariantFormat),
  1116. ClairS(ClairSFormat),
  1117. Sniffles(SnifflesFormat),
  1118. Nanomonsv(NanomonsvFormat),
  1119. }
  1120. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
  1121. pub enum Info {
  1122. DeepVariant(DeepVariantInfo),
  1123. ClairS(ClairSInfo),
  1124. Sniffles(SnifflesInfo),
  1125. Nanomonsv(NanomonsvInfo),
  1126. }
  1127. fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
  1128. match source {
  1129. VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
  1130. VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
  1131. VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
  1132. VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
  1133. }
  1134. }
  1135. fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
  1136. let res = match vcf_source {
  1137. VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
  1138. VCFSource::ClairS => Format::ClairS(data.parse()?),
  1139. VCFSource::Sniffles => Format::Sniffles(data.parse()?),
  1140. VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
  1141. };
  1142. Ok(res)
  1143. }
  1144. pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
  1145. info!("Sorting {} entries", d.len());
  1146. let dict = read_dict(dict_path)?;
  1147. let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
  1148. // add to store
  1149. d.iter().for_each(|e| {
  1150. if let Some(vec) = store.get_mut(&e.contig) {
  1151. vec.push(e.clone());
  1152. } else {
  1153. store.insert(e.contig.to_string(), vec![e.clone()]);
  1154. }
  1155. });
  1156. // sort in each contig
  1157. store
  1158. .iter_mut()
  1159. .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
  1160. // return contig in the order of dict file
  1161. Ok(dict
  1162. .iter()
  1163. .flat_map(|(chr, _)| {
  1164. if let Some((_, vec)) = store.remove_entry(chr) {
  1165. vec
  1166. } else {
  1167. vec![]
  1168. }
  1169. })
  1170. .collect())
  1171. }
  1172. // pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
  1173. // let mut cluster_id = 0;
  1174. // let first = d.get(0).unwrap();
  1175. // let mut last_pos = first.position;
  1176. // let mut last_contig = first.contig.to_string();
  1177. //
  1178. // d.iter_mut().for_each(|e| {
  1179. // if e.contig != last_contig {
  1180. // cluster_id += 1;
  1181. // last_contig = e.contig.to_string();
  1182. // } else if e.position - last_pos > max_dist {
  1183. // cluster_id += 1;
  1184. // }
  1185. // e.annotations.push(AnnotationType::Cluster(cluster_id));
  1186. // last_pos = e.position;
  1187. // });
  1188. //
  1189. // cluster_id
  1190. // }