: 61 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764
  1. use crate::{
  2. annotations::{
  3. cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, ncbi_gff::NCBIGFF, pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin}, vep::{get_best_vep, vep_chunk, VEP}
  4. },
  5. callers::{
  6. clairs::{ClairSFormat, ClairSInfo},
  7. deepvariant::{DeepVariantFormat, DeepVariantInfo},
  8. nanomonsv::{NanomonsvFormat, NanomonsvInfo},
  9. sniffles::{SnifflesFormat, SnifflesInfo},
  10. },
  11. config::{self, Config},
  12. in_out::{
  13. self,
  14. dict_reader::read_dict,
  15. get_reader,
  16. vcf_reader::{read_vcf, VCFRow},
  17. vcf_writer::{vcf_header_from, VariantWritter},
  18. },
  19. sql::{stats_sql::insert_stats, variants_sql::insert_variants},
  20. utils::{
  21. chi_square_test_for_proportions, count_repetitions, estimate_shannon_entropy,
  22. get_hts_nt_pileup, new_pg, new_pg_speed, print_stat_cat,
  23. },
  24. };
  25. use anyhow::{anyhow, Context, Ok, Result};
  26. use csv::ReaderBuilder;
  27. use dashmap::DashMap;
  28. use hashbrown::HashMap;
  29. use indicatif::{MultiProgress, ParallelProgressIterator};
  30. use log::{info, warn};
  31. use noodles_core::{region::Region, Position};
  32. use noodles_fasta::indexed_reader::Builder as FastaBuilder;
  33. use noodles_gff as gff;
  34. use rayon::prelude::*;
  35. use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
  36. use std::{fs, io::Write};
  37. use std::{
  38. env::temp_dir,
  39. fmt,
  40. fs::File,
  41. str::FromStr,
  42. sync::{
  43. atomic::{AtomicI32, Ordering},
  44. Arc,
  45. },
  46. };
  47. use utoipa::{openapi::schema, ToSchema};
  48. // chr12:25116542|G>T KRAS
  49. #[derive(Debug, Clone)]
  50. pub struct Variants {
  51. pub name: String,
  52. pub data: Vec<Variant>,
  53. pub constit: DashMap<String, Variant>,
  54. pub stats_vcf: StatsVCF,
  55. pub stats_bam: StatsBAM,
  56. pub cfg: Config,
  57. pub mp: MultiProgress,
  58. }
  59. impl Serialize for Variants {
  60. fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  61. where
  62. S: Serializer,
  63. {
  64. // 3 is the number of fields in the struct.
  65. let mut state = serializer.serialize_struct("Variants", 5)?;
  66. state.serialize_field("name", &self.name)?;
  67. state.serialize_field("data", &self.data)?;
  68. state.serialize_field("constit", &self.constit)?;
  69. state.serialize_field("stats_vcf", &self.stats_vcf)?;
  70. state.serialize_field("stats_bam", &self.stats_bam)?;
  71. state.end()
  72. }
  73. }
  74. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  75. pub struct StatsVCF {
  76. n_tumoral_init: usize,
  77. n_constit_init: usize,
  78. n_constit: i32,
  79. n_loh: i32,
  80. n_low_mrd_depth: i32,
  81. }
  82. impl fmt::Display for StatsVCF {
  83. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  84. let k = 100.0 / self.n_tumoral_init as f64;
  85. let string = format!(
  86. "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
  87. self.n_constit, self.n_constit as f64 * k,
  88. self.n_loh, self.n_loh as f64 * k,
  89. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
  90. );
  91. write!(f, "{}", string)
  92. }
  93. }
  94. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  95. pub struct StatsBAM {
  96. n_lasting: i32,
  97. n_constit: i32,
  98. n_low_mrd_depth: i32,
  99. n_low_diversity: i32,
  100. n_somatic: i32,
  101. }
  102. impl fmt::Display for StatsBAM {
  103. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  104. let k = 100.0 / self.n_lasting as f64;
  105. let string = format!(
  106. "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
  107. self.n_constit, self.n_constit as f64 * k,
  108. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
  109. self.n_low_diversity, self.n_low_diversity as f64 * k,
  110. self.n_somatic, self.n_somatic as f64 * k
  111. );
  112. write!(f, "{}", string)
  113. }
  114. }
  115. impl Variants {
  116. pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
  117. Self {
  118. name,
  119. data,
  120. constit: DashMap::new(),
  121. stats_vcf: StatsVCF::default(),
  122. stats_bam: StatsBAM::default(),
  123. cfg: Config::get().unwrap(),
  124. mp: mp.clone(),
  125. }
  126. }
  127. pub fn from_vcfs(
  128. name: String,
  129. v: Vec<(&str, &VCFSource, &VariantType)>,
  130. cfg: &Config,
  131. mp: MultiProgress,
  132. ) -> Result<Self> {
  133. let pg = mp.add(new_pg(v.len() as u64));
  134. pg.set_message("Reading VCF");
  135. let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
  136. let n_constit = AtomicI32::new(0);
  137. let data: Vec<Variant> = v
  138. .par_iter()
  139. // .progress_count(v.len() as u64)
  140. .flat_map(|(path, source, variant_type)| {
  141. let r = match variant_type {
  142. VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
  143. VariantType::Constitutionnal => {
  144. read_vcf(path, source, variant_type)
  145. .unwrap()
  146. .par_iter()
  147. .for_each(|e| {
  148. n_constit.fetch_add(1, Ordering::SeqCst);
  149. constit.insert(
  150. format!(
  151. "{}:{}|{}>{}",
  152. e.contig, e.position, e.reference, e.alternative
  153. ),
  154. e.clone(),
  155. );
  156. });
  157. vec![]
  158. }
  159. };
  160. pg.inc(1);
  161. r
  162. })
  163. .collect();
  164. let stats_vcf = StatsVCF::default();
  165. let stats_bam = StatsBAM::default();
  166. let constit = Arc::try_unwrap(constit).unwrap();
  167. let elapsed = pg.elapsed();
  168. pg.finish();
  169. info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
  170. let cfg = cfg.clone();
  171. return Ok(Self {
  172. name,
  173. data,
  174. constit,
  175. stats_vcf,
  176. stats_bam,
  177. cfg,
  178. mp: mp.clone(),
  179. });
  180. }
  181. pub fn vcf_filters(&mut self) {
  182. let cfg = &self.cfg;
  183. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  184. pg.set_message("VCF filtering");
  185. let n_tumoral_init = self.len();
  186. let n_constit_init = self.constit_len();
  187. let min_loh_diff = cfg.deepvariant_loh_pval as f64;
  188. let min_mrd_depth = cfg.min_mrd_depth;
  189. info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
  190. let n_constit = AtomicI32::new(0);
  191. let n_loh = AtomicI32::new(0);
  192. let n_low_mrd_depth = AtomicI32::new(0);
  193. self.data = self
  194. .data
  195. .par_iter()
  196. .map(|e| {
  197. let mut tumoral = e.clone();
  198. let k = format!(
  199. "{}:{}|{}>{}",
  200. tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
  201. );
  202. if let Some(mut constit) = self.constit.get_mut(&k) {
  203. if constit.get_depth() < min_mrd_depth {
  204. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  205. tumoral.annotations.push(AnnotationType::VariantCategory(
  206. VariantCategory::LowMRDDepth,
  207. ));
  208. } else if constit.get_n_alt() == constit.get_depth()
  209. && tumoral.get_n_alt() == tumoral.get_depth()
  210. {
  211. n_constit.fetch_add(1, Ordering::SeqCst);
  212. tumoral
  213. .annotations
  214. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  215. } else {
  216. let pval = chi_square_test_for_proportions(
  217. tumoral.get_n_alt() as f64,
  218. tumoral.get_depth() as f64,
  219. constit.get_n_alt() as f64,
  220. constit.get_depth() as f64,
  221. )
  222. .unwrap();
  223. if pval != 0.0 && pval <= min_loh_diff {
  224. n_loh.fetch_add(1, Ordering::SeqCst);
  225. tumoral
  226. .annotations
  227. .push(AnnotationType::VariantCategory(VariantCategory::LOH));
  228. } else {
  229. n_constit.fetch_add(1, Ordering::SeqCst);
  230. tumoral
  231. .annotations
  232. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  233. }
  234. }
  235. // If not un Constit registry, ClairS look for VCF constit depth and n_alt
  236. } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
  237. {
  238. if format.ndp < min_mrd_depth {
  239. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  240. tumoral.annotations.push(AnnotationType::VariantCategory(
  241. VariantCategory::LowMRDDepth,
  242. ));
  243. } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
  244. {
  245. let mrd_n_alt = match alt_base {
  246. Base::A => format.nau,
  247. Base::T => format.ntu,
  248. Base::C => format.ncu,
  249. Base::G => format.ngu,
  250. _ => 0,
  251. };
  252. if mrd_n_alt != 0 {
  253. n_constit.fetch_add(1, Ordering::SeqCst);
  254. tumoral
  255. .annotations
  256. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  257. }
  258. }
  259. }
  260. pg.inc(1);
  261. tumoral
  262. })
  263. .collect();
  264. let n_constit = n_constit.load(Ordering::SeqCst);
  265. let n_loh = n_loh.load(Ordering::SeqCst);
  266. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  267. self.stats_vcf = StatsVCF {
  268. n_tumoral_init,
  269. n_constit_init,
  270. n_constit,
  271. n_loh,
  272. n_low_mrd_depth,
  273. };
  274. // let elapsed = start.elapsed();
  275. let elapsed = pg.elapsed();
  276. pg.finish();
  277. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  278. }
  279. /// Filter variants by reading informations from constist BAM.
  280. pub fn bam_filters(&mut self, mrd_bam: &str) {
  281. let cfg = &self.cfg;
  282. // let start = Instant::now();
  283. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  284. pg.set_message("BAM filtering");
  285. let min_mrd_depth = cfg.min_mrd_depth;
  286. info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
  287. let n_already = AtomicI32::new(0);
  288. let n_constit = AtomicI32::new(0);
  289. let n_low_mrd_depth = AtomicI32::new(0);
  290. let n_low_diversity = AtomicI32::new(0);
  291. let n_somatic = AtomicI32::new(0);
  292. self.data.par_chunks_mut(10_000).for_each(|chunk| {
  293. let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
  294. .context(anyhow!("Reading {}", mrd_bam))
  295. .unwrap();
  296. let mut genome_reader = FastaBuilder::default()
  297. .build_from_path(&cfg.reference_fa)
  298. .unwrap();
  299. for tumoral in chunk.iter_mut() {
  300. pg.inc(1);
  301. if tumoral.annotations.len() > 0 {
  302. n_already.fetch_add(1, Ordering::SeqCst);
  303. continue;
  304. }
  305. let (pos, is_ins) = match tumoral.alt_cat() {
  306. AlterationCategory::INS => (tumoral.position, true),
  307. AlterationCategory::DEL => (tumoral.position, false),
  308. _ => (tumoral.position, false),
  309. };
  310. match get_hts_nt_pileup(
  311. &mut bam,
  312. &tumoral.contig,
  313. pos as i32,
  314. is_ins, // tumoral.position as i32,
  315. ) {
  316. std::result::Result::Ok(bases) => {
  317. let depth = bases.len() as u32;
  318. if depth < min_mrd_depth {
  319. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  320. tumoral.annotations.push(AnnotationType::VariantCategory(
  321. VariantCategory::LowMRDDepth,
  322. ));
  323. } else {
  324. // Check local diversity
  325. let start =
  326. Position::try_from((tumoral.position - 20) as usize).unwrap();
  327. let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
  328. let r = Region::new(tumoral.contig.to_string(), start..=end);
  329. if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
  330. let s = reg.sequence();
  331. let u = s.as_ref();
  332. let s = String::from_utf8(u.to_vec()).unwrap();
  333. let ent = estimate_shannon_entropy(&s.to_lowercase());
  334. if ent < cfg.min_diversity {
  335. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  336. tumoral.annotations.push(AnnotationType::VariantCategory(
  337. VariantCategory::LowDiversity,
  338. ));
  339. continue;
  340. }
  341. // Check triplets or doublets if DeepVariant
  342. let callers = tumoral.callers();
  343. if callers.len() == 1 {
  344. if callers[0] == "DeepVariant".to_string() {
  345. let seq_left = &s[0..20];
  346. let seq_right = &s[21..s.len() - 1];
  347. // Triplet right
  348. if count_repetitions(seq_right, 3) >= 3 {
  349. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  350. tumoral.annotations.push(
  351. AnnotationType::VariantCategory(
  352. VariantCategory::LowDiversity,
  353. ),
  354. );
  355. continue;
  356. }
  357. // Doublet right
  358. if count_repetitions(seq_right, 2) >= 4 {
  359. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  360. tumoral.annotations.push(
  361. AnnotationType::VariantCategory(
  362. VariantCategory::LowDiversity,
  363. ),
  364. );
  365. continue;
  366. }
  367. // Triplet left
  368. if count_repetitions(seq_left, 3) >= 3 {
  369. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  370. tumoral.annotations.push(
  371. AnnotationType::VariantCategory(
  372. VariantCategory::LowDiversity,
  373. ),
  374. );
  375. continue;
  376. }
  377. // Doublet left
  378. if count_repetitions(seq_left, 2) >= 4 {
  379. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  380. tumoral.annotations.push(
  381. AnnotationType::VariantCategory(
  382. VariantCategory::LowDiversity,
  383. ),
  384. );
  385. continue;
  386. }
  387. }
  388. }
  389. }
  390. // Check if the base is in constitutionnal pileup
  391. if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
  392. let alt_b = alt_b.clone().into_u8();
  393. let n_alt_mrd = bases
  394. .clone()
  395. .into_iter()
  396. .filter(|e| *e == alt_b)
  397. .collect::<Vec<_>>()
  398. .len();
  399. if n_alt_mrd > 0 {
  400. n_constit.fetch_add(1, Ordering::SeqCst);
  401. tumoral.annotations.push(AnnotationType::VariantCategory(
  402. VariantCategory::Constit,
  403. ));
  404. } else {
  405. n_somatic.fetch_add(1, Ordering::SeqCst);
  406. tumoral.annotations.push(AnnotationType::VariantCategory(
  407. VariantCategory::Somatic,
  408. ));
  409. }
  410. } else if tumoral.is_ins() {
  411. let n_alt_mrd =
  412. bases.clone().into_iter().filter(|e| *e == b'I').count();
  413. if n_alt_mrd > 0 {
  414. n_constit.fetch_add(1, Ordering::SeqCst);
  415. tumoral.annotations.push(AnnotationType::VariantCategory(
  416. VariantCategory::Constit,
  417. ));
  418. } else {
  419. n_somatic.fetch_add(1, Ordering::SeqCst);
  420. tumoral.annotations.push(AnnotationType::VariantCategory(
  421. VariantCategory::Somatic,
  422. ));
  423. }
  424. } else if tumoral.alt_cat() == AlterationCategory::DEL {
  425. let n_alt_mrd =
  426. bases.clone().into_iter().filter(|e| *e == b'D').count();
  427. if n_alt_mrd > 0 {
  428. n_constit.fetch_add(1, Ordering::SeqCst);
  429. tumoral.annotations.push(AnnotationType::VariantCategory(
  430. VariantCategory::Constit,
  431. ));
  432. } else {
  433. n_somatic.fetch_add(1, Ordering::SeqCst);
  434. tumoral.annotations.push(AnnotationType::VariantCategory(
  435. VariantCategory::Somatic,
  436. ));
  437. }
  438. }
  439. }
  440. }
  441. Err(r) => panic!("{}", r),
  442. }
  443. }
  444. });
  445. let n_constit = n_constit.load(Ordering::SeqCst);
  446. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  447. let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
  448. let n_somatic = n_somatic.load(Ordering::SeqCst);
  449. let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
  450. self.stats_bam = StatsBAM {
  451. n_lasting,
  452. n_constit,
  453. n_low_mrd_depth,
  454. n_low_diversity,
  455. n_somatic,
  456. };
  457. let elapsed = pg.elapsed();
  458. pg.finish();
  459. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  460. }
  461. pub fn get_cat(&mut self, cat: &VariantCategory) -> Vec<Variant> {
  462. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  463. pg.set_message(format!("Get cat {:?}", cat));
  464. self.data
  465. .par_iter()
  466. .progress_with(pg)
  467. .flat_map(|e| {
  468. if e.annotations
  469. .iter()
  470. .filter(|e| match e {
  471. AnnotationType::VariantCategory(vc) => vc == cat,
  472. _ => false,
  473. })
  474. .count()
  475. > 0
  476. {
  477. vec![e.clone()]
  478. } else {
  479. vec![]
  480. }
  481. })
  482. .collect::<Vec<Variant>>()
  483. }
  484. pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
  485. info!("Writing VCF {}", path);
  486. let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
  487. let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
  488. pg.set_message("Writing VCF");
  489. let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
  490. for row in to_write.iter_mut() {
  491. w.write_variant(row)?;
  492. pg.inc(1);
  493. }
  494. w.write_index_finish()?;
  495. Ok(())
  496. }
  497. /// Keep variants annotated Somatic
  498. pub fn keep_somatics_un(&mut self) {
  499. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  500. pg.set_message("Filtering Variants");
  501. self.data = self
  502. .data
  503. .par_iter_mut()
  504. .progress_with(pg)
  505. .flat_map(|e| {
  506. // keep unannotated and somatic
  507. if e.annotations
  508. .iter()
  509. .filter(|a| match a {
  510. AnnotationType::VariantCategory(vc) => match vc {
  511. VariantCategory::Somatic => false,
  512. _ => true,
  513. },
  514. _ => false,
  515. })
  516. .count()
  517. == 0
  518. {
  519. vec![e]
  520. } else {
  521. vec![]
  522. }
  523. })
  524. .map(|e| e.clone())
  525. .collect();
  526. }
  527. /// Annotate with VEP
  528. pub fn vep(&mut self) {
  529. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  530. pg.set_message("VEP");
  531. self.data
  532. .par_chunks_mut(self.cfg.vep_chunk_size)
  533. .progress_with(pg)
  534. .for_each(|chunks| vep_chunk(chunks).unwrap());
  535. }
  536. /// sort_variants TODO
  537. pub fn sort(&mut self) -> Result<()> {
  538. let cfg = &self.cfg;
  539. self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
  540. Ok(())
  541. }
  542. ///
  543. pub fn merge(&mut self) {
  544. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  545. pg.set_message("Merging Variants by contig, positions, ref, alt");
  546. let hm: DashMap<String, Variant> = DashMap::new();
  547. self.data.par_iter().progress_with(pg).for_each(|e| {
  548. let k = format!(
  549. "{}:{}|{}>{}",
  550. e.contig, e.position, e.reference, e.alternative
  551. );
  552. if let Some(mut v) = hm.get_mut(&k) {
  553. let v = v.value_mut();
  554. e.callers_data.iter().for_each(|cd| {
  555. v.callers_data.push(cd.clone());
  556. v.callers_data.dedup();
  557. });
  558. v.source.extend(e.source.clone());
  559. v.source.dedup();
  560. } else {
  561. hm.insert(k, e.clone());
  562. }
  563. });
  564. self.data = hm.iter().map(|e| e.value().clone()).collect();
  565. }
  566. pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
  567. let gff_path = gff_path.to_string();
  568. let len = self.data.len();
  569. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  570. pg.set_message("GFF Annotate");
  571. self.data
  572. .par_chunks_mut(len / 33)
  573. .progress_with(pg)
  574. .for_each(|chunk| {
  575. let mut reader = File::open(gff_path.to_string())
  576. .map(noodles_bgzf::Reader::new)
  577. .map(gff::Reader::new)
  578. .unwrap();
  579. let index = noodles_csi::read(format!("{}.csi", gff_path)).unwrap();
  580. for v in chunk.iter_mut() {
  581. let start = Position::try_from(v.position as usize).unwrap();
  582. let r = Region::new(v.contig.to_string(), start..=start);
  583. if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
  584. for row in rows {
  585. let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
  586. v.annotations.push(AnnotationType::NCBIGFF(ncbi));
  587. }
  588. }
  589. }
  590. });
  591. Ok(())
  592. }
  593. pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
  594. let len = self.len();
  595. let header = vcf_header_from(header_path)?;
  596. let pg = self.mp.add(new_pg_speed(len as u64));
  597. pg.set_message("Echtvar Annotate");
  598. self.data
  599. .par_chunks_mut(len / 33)
  600. .progress_with(pg)
  601. .for_each(|chunk| {
  602. let in_tmp = format!(
  603. "{}/echtvar_in_{}.vcf",
  604. temp_dir().to_str().unwrap(),
  605. uuid::Uuid::new_v4()
  606. );
  607. let out_tmp = format!(
  608. "{}/echtvar_in_{}.vcf.gz",
  609. temp_dir().to_str().unwrap(),
  610. uuid::Uuid::new_v4()
  611. );
  612. let mut vcf = File::create(&in_tmp).unwrap();
  613. let _ = writeln!(vcf, "{}", header);
  614. for (i, row) in chunk.iter().enumerate() {
  615. let _ = writeln!(
  616. vcf,
  617. "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
  618. row.contig,
  619. row.position,
  620. i + 1,
  621. row.reference,
  622. row.alternative,
  623. ".",
  624. ".",
  625. "."
  626. );
  627. }
  628. run_echtvar(&in_tmp, &out_tmp).unwrap();
  629. let mut reader = ReaderBuilder::new()
  630. .delimiter(b'\t')
  631. .has_headers(false)
  632. .comment(Some(b'#'))
  633. .flexible(true)
  634. .from_reader(get_reader(&out_tmp).unwrap());
  635. // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
  636. let mut last: usize = 1;
  637. for line in reader.deserialize::<VCFRow>() {
  638. if let std::result::Result::Ok(row) = line {
  639. let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
  640. let id: usize = row.id.parse().unwrap();
  641. if id != last {
  642. panic!("Echtvar output not in input order!");
  643. }
  644. if let Some(c) = cosmic {
  645. chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
  646. }
  647. if let Some(g) = gnomad {
  648. chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
  649. }
  650. last += 1;
  651. }
  652. }
  653. });
  654. Ok(())
  655. }
  656. pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
  657. self.data
  658. .par_iter()
  659. .filter(|v| {
  660. for annotation in v.annotations.iter() {
  661. match annotation {
  662. AnnotationType::VariantCategory(cat) => {
  663. if cat == category {
  664. return true;
  665. }
  666. }
  667. _ => (),
  668. }
  669. }
  670. return false;
  671. })
  672. .collect::<Vec<&Variant>>()
  673. }
  674. /// Filter based on GnomAD if gnomad_af < max_gnomad_af
  675. pub fn filter_snp(&mut self) -> Result<i32> {
  676. let n_snp = AtomicI32::new(0);
  677. self.data = self
  678. .data
  679. .clone()
  680. .into_par_iter()
  681. .filter(|e| {
  682. let mut res = true;
  683. e.annotations.iter().for_each(|a| {
  684. match a {
  685. AnnotationType::GnomAD(g) => {
  686. res = g.gnomad_af < self.cfg.max_gnomad_af;
  687. }
  688. _ => (),
  689. };
  690. });
  691. if !res {
  692. n_snp.fetch_add(1, Ordering::SeqCst);
  693. }
  694. res
  695. })
  696. .collect();
  697. let n = n_snp.load(Ordering::SeqCst);
  698. Ok(n)
  699. }
  700. pub fn pangolin(&mut self) -> Result<()> {
  701. let tmp_file = pangolin_save_variants(&self)?;
  702. let res_file = run_pangolin(&tmp_file)?;
  703. fs::remove_file(tmp_file)?;
  704. let res = pangolin_parse_results(&res_file)?;
  705. let mut res = res.iter();
  706. fs::remove_file(res_file)?;
  707. info!("Adding pangolin results for {} variants.", res.len());
  708. let mut n_added = 0;
  709. if let Some(r) = res.next() {
  710. let mut curr = r.clone();
  711. for variant in self.data.iter_mut() {
  712. if variant.contig == curr.0 && variant.position == curr.1 {
  713. if variant.reference == curr.2 {
  714. if variant.alternative == curr.3 {
  715. variant.annotations.push(AnnotationType::Pangolin(curr.4));
  716. n_added += 1;
  717. if let Some(r) = res.next() {
  718. curr = r.clone();
  719. } else {
  720. break;
  721. }
  722. }
  723. }
  724. }
  725. }
  726. }
  727. assert_eq!(res.len(), 0);
  728. Ok(())
  729. }
  730. pub fn len(&self) -> usize {
  731. self.data.len()
  732. }
  733. pub fn constit_len(&self) -> usize {
  734. self.constit.len()
  735. }
  736. pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
  737. self.data
  738. .par_iter()
  739. .filter(|v| v.contig == contig && v.position == pos)
  740. .map(|v| v.clone())
  741. .collect()
  742. }
  743. pub fn stats(&self) -> Result<Vec<Stat>> {
  744. let mut callers_cat = HashMap::new();
  745. let mut n_caller_data = 0;
  746. let mut variants_cat = HashMap::new();
  747. let mut n_variants_wcat = 0;
  748. let mut ncbi_feature = HashMap::new();
  749. let mut n_ncbi_feature = 0;
  750. let mut cosmic_sup_1 = HashMap::new();
  751. let mut n_cosmic_sup_1 = 0;
  752. let mut cons_cat = HashMap::new();
  753. let mut n_csq = 0;
  754. let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
  755. let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
  756. *v += 1;
  757. };
  758. for ele in self.data.iter() {
  759. // Callers
  760. let mut callers = Vec::new();
  761. for cd in &ele.callers_data {
  762. callers.push(
  763. match cd.format {
  764. Format::DeepVariant(_) => "DeepVariant",
  765. Format::ClairS(_) => "ClairS",
  766. Format::Sniffles(_) => "Sniffles",
  767. Format::Nanomonsv(_) => "Nanomonsv",
  768. }
  769. .to_string(),
  770. );
  771. }
  772. if !callers.is_empty() {
  773. n_caller_data += 1;
  774. callers.sort();
  775. let k = callers.join(",");
  776. let (_, v) = callers_cat
  777. .raw_entry_mut()
  778. .from_key(&k)
  779. .or_insert(k.clone(), 1);
  780. *v += 1;
  781. }
  782. // Var cat
  783. // Annotations
  784. for annot in ele.annotations.iter() {
  785. let mut features = Vec::new();
  786. let mut variant_cat = Vec::new();
  787. let mut cosmic_m1 = false;
  788. match annot {
  789. AnnotationType::NCBIGFF(ncbi) => {
  790. features.push(ncbi.feature.to_string());
  791. }
  792. AnnotationType::Cosmic(c) => {
  793. if c.cosmic_cnt > 1 {
  794. cosmic_m1 = true;
  795. }
  796. }
  797. AnnotationType::VariantCategory(vc) => {
  798. let s = serde_json::to_string(vc)?;
  799. variant_cat.push(s);
  800. }
  801. _ => (),
  802. };
  803. if !features.is_empty() {
  804. features.sort();
  805. add_hm(&mut ncbi_feature, &features.join(","));
  806. n_ncbi_feature += 1;
  807. }
  808. if !variant_cat.is_empty() {
  809. add_hm(&mut variants_cat, &variant_cat.join(","));
  810. n_variants_wcat += 1;
  811. }
  812. if cosmic_m1 {
  813. add_hm(&mut cosmic_sup_1, "Cosmic > 1");
  814. n_cosmic_sup_1 += 1;
  815. }
  816. }
  817. // VEP
  818. let d: Vec<VEP> = ele
  819. .annotations
  820. .iter()
  821. .flat_map(|e| {
  822. if let AnnotationType::VEP(e) = e {
  823. e.clone()
  824. } else {
  825. vec![]
  826. }
  827. })
  828. .collect();
  829. if let std::result::Result::Ok(vep) = get_best_vep(&d) {
  830. if let Some(csq) = vep.consequence {
  831. n_csq += 1;
  832. let csq = csq.join(",");
  833. let (_, v) = cons_cat
  834. .raw_entry_mut()
  835. .from_key(&csq)
  836. .or_insert(csq.clone(), 1);
  837. *v += 1;
  838. }
  839. }
  840. }
  841. print_stat_cat(&cons_cat, n_csq as u32);
  842. print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
  843. print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
  844. print_stat_cat(&callers_cat, n_caller_data as u32);
  845. // let file = File::create(path)?;
  846. // let mut writer = BufWriter::new(file);
  847. let mut results = Vec::new();
  848. results.push(Stat::new(
  849. "consequences".to_string(),
  850. cons_cat,
  851. n_csq as u32,
  852. ));
  853. results.push(Stat::new(
  854. "variants_cat".to_string(),
  855. variants_cat,
  856. n_variants_wcat as u32,
  857. ));
  858. results.push(Stat::new(
  859. "ncbi_feature".to_string(),
  860. ncbi_feature,
  861. n_ncbi_feature as u32,
  862. ));
  863. results.push(Stat::new(
  864. "callers_cat".to_string(),
  865. callers_cat,
  866. n_caller_data as u32,
  867. ));
  868. // let res = serde_json::to_string(&results)?;
  869. Ok(results)
  870. }
  871. pub fn save_sql(&self, path: &str) -> Result<()> {
  872. insert_variants(self, path)
  873. }
  874. pub fn stats_sql(&self, path: &str) -> Result<()> {
  875. insert_stats(
  876. "VCF".to_string(),
  877. serde_json::to_string(&self.stats_vcf)?,
  878. path,
  879. )?;
  880. insert_stats(
  881. "BAM".to_string(),
  882. serde_json::to_string(&self.stats_bam)?,
  883. path,
  884. )?;
  885. Ok(())
  886. }
  887. pub fn save_bytes(&self, path: &str) -> Result<()> {
  888. let serialized = pot::to_vec(&self.data)?;
  889. let mut w = noodles_bgzf::writer::Builder::default().build_with_writer(File::create(path)?);
  890. w.write_all(&serialized)?;
  891. Ok(())
  892. }
  893. pub fn new_from_bytes(name: &str, path: &str, mp: MultiProgress) -> Result<Self> {
  894. info!("Loading variants from: {path}");
  895. let r = in_out::get_reader_progress(path, &mp)?;
  896. let data: Vec<Variant> = pot::from_reader(r)?;
  897. Ok(Self {
  898. name: name.to_string(),
  899. data,
  900. constit: DashMap::new(),
  901. stats_vcf: StatsVCF::default(),
  902. stats_bam: StatsBAM::default(),
  903. cfg: Config::get()?,
  904. mp,
  905. })
  906. }
  907. pub fn filter_category(&self, and_categories: &Vec<Category>) -> Vec<&Variant> {
  908. self.data
  909. .par_iter()
  910. .flat_map(|v| {
  911. if v.is_from_category(and_categories) {
  912. vec![v]
  913. } else {
  914. vec![]
  915. }
  916. })
  917. .collect()
  918. }
  919. }
  920. #[derive(Debug, Clone, Serialize, ToSchema)]
  921. pub struct Stat {
  922. name: String,
  923. counts: HashMap<String, u32>,
  924. n_with_annotation: u32,
  925. }
  926. impl Stat {
  927. pub fn new(name: String, counts: HashMap<String, u32>, n_with_annotation: u32) -> Self {
  928. Stat {
  929. counts,
  930. n_with_annotation,
  931. name,
  932. }
  933. }
  934. }
  935. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  936. pub struct Variant {
  937. pub contig: String,
  938. pub position: u32,
  939. pub reference: ReferenceAlternative,
  940. pub alternative: ReferenceAlternative,
  941. pub callers_data: Vec<CallerData>,
  942. pub n_alt: Option<u32>,
  943. pub n_ref: Option<u32>,
  944. pub vaf: Option<f32>,
  945. pub depth: Option<u32>,
  946. pub variant_type: VariantType,
  947. pub source: Vec<VCFSource>,
  948. pub annotations: Vec<AnnotationType>,
  949. }
  950. #[derive(Debug, PartialEq, Serialize, Deserialize, Clone, ToSchema)]
  951. pub struct CallerData {
  952. pub qual: Option<f32>,
  953. pub format: Format,
  954. pub info: Info,
  955. }
  956. impl CallerData {
  957. pub fn get_vaf(&self) -> f64 {
  958. match &self.format {
  959. Format::DeepVariant(v) => v.vaf as f64,
  960. Format::ClairS(v) => v.af,
  961. Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
  962. Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
  963. }
  964. }
  965. pub fn get_depth(&mut self) -> u32 {
  966. match &self.format {
  967. Format::DeepVariant(v) => v.dp,
  968. Format::ClairS(v) => v.dp,
  969. Format::Sniffles(v) => v.dv + v.dr,
  970. Format::Nanomonsv(v) => v.tr,
  971. }
  972. }
  973. pub fn get_n_alt(&mut self) -> u32 {
  974. match &self.format {
  975. Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
  976. Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
  977. Format::Sniffles(v) => v.dv,
  978. Format::Nanomonsv(v) => v.tr - v.vr,
  979. }
  980. }
  981. /// Variants filter rules
  982. pub fn should_filter(&self) -> bool {
  983. if let Info::Sniffles(info) = &self.info {
  984. let imprecise = info
  985. .tags
  986. .iter()
  987. .filter(|s| s.to_string() == "IMPRECISE".to_string())
  988. .count();
  989. let mut n_alt = 0;
  990. if let Format::Sniffles(f) = &self.format {
  991. n_alt = f.dv;
  992. }
  993. if imprecise == 0 && n_alt >= 3 {
  994. return false;
  995. } else {
  996. return true;
  997. }
  998. } else {
  999. return false;
  1000. }
  1001. }
  1002. }
  1003. #[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize, ToSchema)]
  1004. pub enum VariantType {
  1005. Somatic,
  1006. Constitutionnal,
  1007. }
  1008. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1009. pub enum VCFSource {
  1010. DeepVariant,
  1011. ClairS,
  1012. Sniffles,
  1013. Nanomonsv,
  1014. }
  1015. impl FromStr for VCFSource {
  1016. type Err = anyhow::Error;
  1017. fn from_str(s: &str) -> Result<Self> {
  1018. match s {
  1019. "DeepVariant" => Ok(VCFSource::DeepVariant),
  1020. "ClairS" => Ok(VCFSource::ClairS),
  1021. "Sniffles" => Ok(VCFSource::Sniffles),
  1022. "Nanomonsv" => Ok(VCFSource::Nanomonsv),
  1023. _ => Err(anyhow!("Error parsing VCFSource")),
  1024. }
  1025. }
  1026. }
  1027. impl ToString for VCFSource {
  1028. fn to_string(&self) -> String {
  1029. let s = match self {
  1030. VCFSource::DeepVariant => "DeepVariant",
  1031. VCFSource::ClairS => "ClairS",
  1032. VCFSource::Sniffles => "Sniffles",
  1033. VCFSource::Nanomonsv => "Nanomonsv",
  1034. };
  1035. s.to_string()
  1036. }
  1037. }
  1038. impl Variant {
  1039. pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
  1040. let callers_data = vec![CallerData {
  1041. qual: row.qual.parse::<f32>().ok(),
  1042. info: parse_info(&row.info, &source).context(anyhow!(
  1043. "Can't parse {:?} info for {}",
  1044. source,
  1045. row.info
  1046. ))?,
  1047. format: parse_format(&source, &row.value).context(anyhow!(
  1048. "Can't parse {:?} format for {}",
  1049. source,
  1050. row.value
  1051. ))?,
  1052. }];
  1053. Ok(Variant {
  1054. contig: row.chr.to_string(),
  1055. position: row.pos,
  1056. reference: row
  1057. .reference
  1058. .parse()
  1059. .context(anyhow!("Error while parsing {}", row.reference))?,
  1060. alternative: row
  1061. .alt
  1062. .parse()
  1063. .context(anyhow!("Error while parsing {}", row.alt))?,
  1064. n_ref: None,
  1065. n_alt: None,
  1066. vaf: None,
  1067. depth: None,
  1068. callers_data,
  1069. source: vec![source],
  1070. variant_type,
  1071. annotations: Vec::new(),
  1072. })
  1073. }
  1074. pub fn get_depth(&mut self) -> u32 {
  1075. if let Some(depth) = self.depth {
  1076. return depth;
  1077. } else {
  1078. let depth = self
  1079. .callers_data
  1080. .iter_mut()
  1081. .map(|v| v.get_depth())
  1082. .max()
  1083. .unwrap();
  1084. self.depth = Some(depth);
  1085. return depth;
  1086. }
  1087. }
  1088. pub fn get_n_alt(&mut self) -> u32 {
  1089. if let Some(n_alt) = self.n_alt {
  1090. return n_alt;
  1091. } else {
  1092. let n_alt = self
  1093. .callers_data
  1094. .iter_mut()
  1095. .map(|v| v.get_n_alt())
  1096. .max()
  1097. .unwrap();
  1098. self.n_alt = Some(n_alt);
  1099. return n_alt;
  1100. }
  1101. }
  1102. pub fn vaf(&mut self) -> f32 {
  1103. let n_alt = self.get_n_alt() as f32;
  1104. let depth = self.get_depth() as f32;
  1105. self.vaf = Some(n_alt / depth);
  1106. self.vaf.unwrap()
  1107. }
  1108. fn is_ins(&self) -> bool {
  1109. match (&self.reference, &self.alternative) {
  1110. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
  1111. _ => false,
  1112. }
  1113. }
  1114. fn alt_cat(&self) -> AlterationCategory {
  1115. match (&self.reference, &self.alternative) {
  1116. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  1117. AlterationCategory::SNV
  1118. }
  1119. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  1120. AlterationCategory::INS
  1121. }
  1122. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  1123. AlterationCategory::Other
  1124. }
  1125. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  1126. AlterationCategory::DEL
  1127. }
  1128. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
  1129. let a = a.len();
  1130. let b = b.len();
  1131. if a < b {
  1132. AlterationCategory::INS
  1133. } else if a > b {
  1134. AlterationCategory::DEL
  1135. } else {
  1136. AlterationCategory::REP
  1137. }
  1138. }
  1139. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
  1140. AlterationCategory::Other
  1141. }
  1142. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
  1143. AlterationCategory::Other
  1144. }
  1145. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
  1146. AlterationCategory::Other
  1147. }
  1148. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
  1149. AlterationCategory::Other
  1150. }
  1151. }
  1152. }
  1153. pub fn to_min_string(&mut self) -> String {
  1154. let depth = self.get_depth();
  1155. let n_alt = self.get_n_alt();
  1156. format!(
  1157. "DP:AD\t{}:{}",
  1158. depth,
  1159. vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
  1160. )
  1161. }
  1162. pub fn get_veps(&self) -> Vec<VEP> {
  1163. self.annotations
  1164. .iter()
  1165. .flat_map(|e| {
  1166. if let AnnotationType::VEP(e) = e {
  1167. e.clone()
  1168. } else {
  1169. vec![]
  1170. }
  1171. })
  1172. .collect()
  1173. }
  1174. pub fn get_best_vep(&self) -> Result<VEP> {
  1175. get_best_vep(&self.get_veps())
  1176. }
  1177. pub fn is_from_category(&self, and_categories: &Vec<Category>) -> bool {
  1178. let mut vec_bools = Vec::new();
  1179. for category in and_categories.iter() {
  1180. match category {
  1181. Category::VariantCategory(vc) => {
  1182. for annotations in self.annotations.iter() {
  1183. match annotations {
  1184. AnnotationType::VariantCategory(vvc) => {
  1185. if vc == vvc {
  1186. vec_bools.push(true);
  1187. break;
  1188. }
  1189. }
  1190. _ => (),
  1191. }
  1192. }
  1193. }
  1194. Category::PositionRange { contig, from, to } => {
  1195. if self.contig == *contig {
  1196. match (from, to) {
  1197. (None, None) => vec_bools.push(true),
  1198. (None, Some(to)) => vec_bools.push(self.position <= *to),
  1199. (Some(from), None) => vec_bools.push(self.position >= *from),
  1200. (Some(from), Some(to)) => {
  1201. vec_bools.push(self.position >= *from && self.position <= *to)
  1202. }
  1203. }
  1204. } else {
  1205. vec_bools.push(false);
  1206. }
  1207. }
  1208. Category::VCFSource(_) => (),
  1209. Category::NCosmic(n) => {
  1210. let mut bools = Vec::new();
  1211. for annotations in self.annotations.iter() {
  1212. match annotations {
  1213. AnnotationType::Cosmic(c) => {
  1214. bools.push(c.cosmic_cnt >= *n);
  1215. break;
  1216. }
  1217. _ => (),
  1218. }
  1219. }
  1220. vec_bools.push(bools.iter().any(|&b| b));
  1221. }
  1222. Category::NCBIFeature(ncbi_feature) => {
  1223. let mut bools = Vec::new();
  1224. for annotations in self.annotations.iter() {
  1225. match annotations {
  1226. AnnotationType::NCBIGFF(v) => {
  1227. bools.push(v.feature == *ncbi_feature);
  1228. }
  1229. _ => (),
  1230. }
  1231. }
  1232. vec_bools.push(bools.iter().any(|&b| b));
  1233. }
  1234. Category::VAF { min, max } => {
  1235. let v = if self.vaf.is_none() {
  1236. let mut s = self.clone();
  1237. s.vaf()
  1238. } else {
  1239. self.vaf.unwrap()
  1240. };
  1241. vec_bools.push(v >= *min && v <= *max);
  1242. }
  1243. Category::Pangolin => {
  1244. vec_bools.push(self.annotations.iter().filter(|a| match a {
  1245. AnnotationType::Pangolin(_) => true,
  1246. _ => false
  1247. }).count() > 0);
  1248. }
  1249. }
  1250. }
  1251. vec_bools.iter().all(|&x| x)
  1252. }
  1253. pub fn callers(&self) -> Vec<String> {
  1254. self.source
  1255. .iter()
  1256. .map(|source| source.to_string())
  1257. .collect()
  1258. }
  1259. }
  1260. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1261. enum AlterationCategory {
  1262. SNV,
  1263. INS,
  1264. DEL,
  1265. REP,
  1266. Other,
  1267. }
  1268. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1269. pub enum AnnotationType {
  1270. VariantCategory(VariantCategory),
  1271. VEP(Vec<VEP>),
  1272. Cluster(i32),
  1273. Cosmic(Cosmic),
  1274. GnomAD(GnomAD),
  1275. NCBIGFF(NCBIGFF),
  1276. Pangolin(Pangolin)
  1277. }
  1278. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1279. pub enum VariantCategory {
  1280. Somatic,
  1281. LowMRDDepth,
  1282. LOH,
  1283. Constit,
  1284. LowDiversity,
  1285. }
  1286. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
  1287. pub enum ReferenceAlternative {
  1288. Nucleotide(Base),
  1289. Nucleotides(Vec<Base>),
  1290. Unstructured(String),
  1291. }
  1292. impl FromStr for ReferenceAlternative {
  1293. type Err = anyhow::Error;
  1294. fn from_str(s: &str) -> Result<Self> {
  1295. let mut possible_bases = s.as_bytes().iter();
  1296. let mut res: Vec<Base> = Vec::new();
  1297. while let Some(&base) = possible_bases.next() {
  1298. match base.try_into() {
  1299. std::result::Result::Ok(b) => res.push(b),
  1300. Err(_) => {
  1301. return Ok(Self::Unstructured(s.to_string()));
  1302. }
  1303. }
  1304. }
  1305. if res.len() == 1 {
  1306. return Ok(Self::Nucleotide(res.pop().unwrap()));
  1307. } else {
  1308. return Ok(Self::Nucleotides(res));
  1309. }
  1310. }
  1311. }
  1312. impl fmt::Display for ReferenceAlternative {
  1313. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1314. let string = match self {
  1315. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  1316. ReferenceAlternative::Nucleotides(bases) => bases
  1317. .iter()
  1318. .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
  1319. ReferenceAlternative::Unstructured(s) => s.to_string(),
  1320. };
  1321. write!(f, "{}", string)
  1322. }
  1323. }
  1324. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
  1325. pub enum Base {
  1326. A,
  1327. T,
  1328. C,
  1329. G,
  1330. N,
  1331. }
  1332. impl TryFrom<u8> for Base {
  1333. type Error = anyhow::Error;
  1334. fn try_from(base: u8) -> Result<Self> {
  1335. match base {
  1336. b'A' => Ok(Base::A),
  1337. b'T' => Ok(Base::T),
  1338. b'C' => Ok(Base::C),
  1339. b'G' => Ok(Base::G),
  1340. b'N' => Ok(Base::N),
  1341. _ => Err(anyhow!(
  1342. "Unknown base: {}",
  1343. String::from_utf8_lossy(&vec![base])
  1344. )),
  1345. }
  1346. }
  1347. }
  1348. impl Base {
  1349. pub fn into_u8(self) -> u8 {
  1350. return match self {
  1351. Base::A => b'A',
  1352. Base::T => b'T',
  1353. Base::C => b'C',
  1354. Base::G => b'G',
  1355. Base::N => b'N',
  1356. };
  1357. }
  1358. }
  1359. impl fmt::Display for Base {
  1360. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1361. // Use `self.number` to refer to each positional data point.
  1362. let str = match self {
  1363. Base::A => "A",
  1364. Base::T => "T",
  1365. Base::C => "C",
  1366. Base::G => "G",
  1367. Base::N => "N",
  1368. };
  1369. write!(f, "{}", str)
  1370. }
  1371. }
  1372. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
  1373. pub enum Format {
  1374. DeepVariant(DeepVariantFormat),
  1375. ClairS(ClairSFormat),
  1376. Sniffles(SnifflesFormat),
  1377. Nanomonsv(NanomonsvFormat),
  1378. }
  1379. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
  1380. pub enum Info {
  1381. #[schema(value_type=String)]
  1382. DeepVariant(DeepVariantInfo),
  1383. #[schema(value_type=String)]
  1384. ClairS(ClairSInfo),
  1385. #[schema(value_type=String)]
  1386. Sniffles(SnifflesInfo),
  1387. #[schema(value_type=String)]
  1388. Nanomonsv(NanomonsvInfo),
  1389. }
  1390. fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
  1391. match source {
  1392. VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
  1393. VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
  1394. VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
  1395. VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
  1396. }
  1397. }
  1398. fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
  1399. let res = match vcf_source {
  1400. VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
  1401. VCFSource::ClairS => Format::ClairS(data.parse()?),
  1402. VCFSource::Sniffles => Format::Sniffles(data.parse()?),
  1403. VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
  1404. };
  1405. Ok(res)
  1406. }
  1407. pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
  1408. info!("Sorting {} entries", d.len());
  1409. let dict = read_dict(dict_path)?;
  1410. let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
  1411. // add to store
  1412. d.iter().for_each(|e| {
  1413. if let Some(vec) = store.get_mut(&e.contig) {
  1414. vec.push(e.clone());
  1415. } else {
  1416. store.insert(e.contig.to_string(), vec![e.clone()]);
  1417. }
  1418. });
  1419. // sort in each contig
  1420. store
  1421. .iter_mut()
  1422. .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
  1423. // return contig in the order of dict file
  1424. Ok(dict
  1425. .iter()
  1426. .flat_map(|(chr, _)| {
  1427. if let Some((_, vec)) = store.remove_entry(chr) {
  1428. vec
  1429. } else {
  1430. vec![]
  1431. }
  1432. })
  1433. .collect())
  1434. }
  1435. #[derive(Debug)]
  1436. pub enum Category {
  1437. VariantCategory(VariantCategory),
  1438. PositionRange {
  1439. contig: String,
  1440. from: Option<u32>,
  1441. to: Option<u32>,
  1442. },
  1443. VCFSource(VCFSource),
  1444. NCosmic(u64),
  1445. NCBIFeature(String),
  1446. VAF {
  1447. min: f32,
  1448. max: f32,
  1449. },
  1450. Pangolin
  1451. }
  1452. pub fn run_pipe(name: &str, multi: &MultiProgress) -> Result<()> {
  1453. let cfg = config::Config::get()?;
  1454. let deepvariant_diag_vcf = format!(
  1455. "{}/{name}/diag/DeepVariant/{name}_diag_DeepVariant_PASSED.vcf.gz",
  1456. cfg.longreads_results_dir
  1457. );
  1458. if !std::path::Path::new(&deepvariant_diag_vcf).exists() {
  1459. return Err(anyhow!("{deepvariant_diag_vcf} is required"));
  1460. // panic!("{deepvariant_diag_vcf} is required")
  1461. }
  1462. let deepvariant_mrd_vcf = format!(
  1463. "{}/{name}/mrd/DeepVariant/{name}_mrd_DeepVariant_PASSED.vcf.gz",
  1464. cfg.longreads_results_dir
  1465. );
  1466. if !std::path::Path::new(&deepvariant_mrd_vcf).exists() {
  1467. return Err(anyhow!("{deepvariant_mrd_vcf} is required"));
  1468. }
  1469. let mrd_bam = format!(
  1470. "{}/{name}/mrd/{name}_mrd_hs1.bam",
  1471. cfg.longreads_results_dir
  1472. );
  1473. if !std::path::Path::new(&mrd_bam).exists() {
  1474. return Err(anyhow!("{mrd_bam} is required"));
  1475. }
  1476. let clairs_vcf = format!(
  1477. "{}/{name}/diag/ClairS/{name}_diag_clairs_PASSED.vcf.gz",
  1478. cfg.longreads_results_dir
  1479. );
  1480. if !std::path::Path::new(&clairs_vcf).exists() {
  1481. return Err(anyhow!("{clairs_vcf} is required"));
  1482. }
  1483. let clairs_indels_vcf = format!(
  1484. "{}/{name}/diag/ClairS/{name}_diag_clairs_indel_PASSED.vcf.gz",
  1485. cfg.longreads_results_dir
  1486. );
  1487. if !std::path::Path::new(&clairs_indels_vcf).exists() {
  1488. return Err(anyhow!("{clairs_indels_vcf} is required"));
  1489. }
  1490. let sniffles_vcf = format!(
  1491. "{}/{name}/diag/Sniffles/{name}_diag_sniffles.vcf",
  1492. cfg.longreads_results_dir
  1493. );
  1494. let sniffles_mrd_vcf = format!(
  1495. "{}/{name}/mrd/Sniffles/{name}_mrd_sniffles.vcf",
  1496. cfg.longreads_results_dir
  1497. );
  1498. if !std::path::Path::new(&sniffles_vcf).exists() {
  1499. return Err(anyhow!("{sniffles_vcf} is required"));
  1500. }
  1501. let nanomonsv_vcf = format!(
  1502. "{}/{name}/diag/nanomonsv/{name}_diag_nanomonsv_PASSED.vcf.gz",
  1503. cfg.longreads_results_dir
  1504. );
  1505. if !std::path::Path::new(&nanomonsv_vcf).exists() {
  1506. return Err(anyhow!("{nanomonsv_vcf} is required"));
  1507. }
  1508. // let db_path = "/data/db_results.sqlite".to_string();
  1509. // `${data_dir}/${name}/diag/${name}_variants.sqlite`
  1510. let db_path = format!(
  1511. "{}/{name}/diag/{name}_variants.sqlite",
  1512. cfg.longreads_results_dir
  1513. );
  1514. let bytes_path = format!(
  1515. "{}/{name}/diag/{name}_variants.bytes.gz",
  1516. cfg.longreads_results_dir
  1517. );
  1518. let loh_path = format!(
  1519. "{}/{name}/diag/{name}_loh.vcf.gz",
  1520. cfg.longreads_results_dir
  1521. );
  1522. // let db_constit_path = format!(
  1523. // "{}/{name}/diag/{name}_constit.sqlite",
  1524. // cfg.longreads_results_dir
  1525. // );
  1526. let bytes_constit_path = format!(
  1527. "{}/{name}/diag/{name}_constit.bytes.gz",
  1528. cfg.longreads_results_dir
  1529. );
  1530. let sources = vec![
  1531. (
  1532. deepvariant_diag_vcf.as_str(),
  1533. &VCFSource::DeepVariant,
  1534. &VariantType::Somatic,
  1535. ),
  1536. (
  1537. deepvariant_mrd_vcf.as_str(),
  1538. &VCFSource::DeepVariant,
  1539. &VariantType::Constitutionnal,
  1540. ),
  1541. (
  1542. clairs_vcf.as_str(),
  1543. &VCFSource::ClairS,
  1544. &VariantType::Somatic,
  1545. ),
  1546. (
  1547. sniffles_vcf.as_str(),
  1548. &VCFSource::Sniffles,
  1549. &VariantType::Somatic,
  1550. ),
  1551. (
  1552. sniffles_mrd_vcf.as_str(),
  1553. &VCFSource::Sniffles,
  1554. &VariantType::Constitutionnal,
  1555. ),
  1556. (
  1557. nanomonsv_vcf.as_str(),
  1558. &VCFSource::Nanomonsv,
  1559. &VariantType::Somatic,
  1560. ),
  1561. ];
  1562. let mut variants = Variants::from_vcfs(name.to_string(), sources, &cfg, multi.clone())?;
  1563. variants.vcf_filters();
  1564. variants.write_vcf_cat(&loh_path, &VariantCategory::LOH)?;
  1565. variants.bam_filters(&mrd_bam);
  1566. let constits = variants.get_cat(&VariantCategory::Constit);
  1567. let constits = Variants::from_vec(name.to_string(), &multi, constits);
  1568. constits.save_bytes(&bytes_constit_path)?;
  1569. variants.keep_somatics_un();
  1570. info!("Variants retained: {}", variants.len());
  1571. // TODO check if SNP are matching
  1572. if variants.len() > 100_000 {
  1573. return Err(anyhow!(
  1574. "Too many variants, verify if somatic and tumoral samples match."
  1575. ));
  1576. }
  1577. variants.merge();
  1578. variants.sort()?;
  1579. info!("Variants retained: {}", variants.len());
  1580. variants.vep();
  1581. variants.pangolin()?;
  1582. variants.annotate_gff_feature(&cfg.gff_path)?;
  1583. variants.echtvar_annotate(&deepvariant_mrd_vcf)?;
  1584. variants.filter_snp()?;
  1585. variants.save_bytes(&bytes_path)?;
  1586. // variants.stats()?;
  1587. //
  1588. // if std::path::Path::new(&db_path).exists() {
  1589. // crate::sql::variants_sql::remove_variants_names(&db_path, &name)?;
  1590. // }
  1591. //
  1592. variants.save_sql(&db_path)?;
  1593. variants.stats_sql(&db_path)?;
  1594. info!("Variants : {}", variants.len());
  1595. Ok(())
  1596. }
  1597. // pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
  1598. // let mut cluster_id = 0;
  1599. // let first = d.get(0).unwrap();
  1600. // let mut last_pos = first.position;
  1601. // let mut last_contig = first.contig.to_string();
  1602. //
  1603. // d.iter_mut().for_each(|e| {
  1604. // if e.contig != last_contig {
  1605. // cluster_id += 1;
  1606. // last_contig = e.contig.to_string();
  1607. // } else if e.position - last_pos > max_dist {
  1608. // cluster_id += 1;
  1609. // }
  1610. // e.annotations.push(AnnotationType::Cluster(cluster_id));
  1611. // last_pos = e.position;
  1612. // });
  1613. //
  1614. // cluster_id
  1615. // }