: 75 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182
  1. use crate::{
  2. annotations::{
  3. cosmic::Cosmic,
  4. echtvar::{parse_echtvar_val, run_echtvar},
  5. gnomad::GnomAD,
  6. ncbi_gff::NCBIGFF,
  7. pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin},
  8. phase::{Phase, PhaseAnnotation},
  9. vep::{get_best_vep, vep_chunk, VEP},
  10. }, callers::{
  11. clairs::{ClairSFormat, ClairSInfo},
  12. deepvariant::{DeepVariantFormat, DeepVariantInfo},
  13. nanomonsv::{NanomonsvFormat, NanomonsvInfo},
  14. sniffles::{SnifflesFormat, SnifflesInfo},
  15. }, config::{self, Config}, in_out::{
  16. self,
  17. dict_reader::read_dict,
  18. get_reader,
  19. vcf_reader::{read_vcf, VCFRow},
  20. vcf_writer::{vcf_header_from, VariantWritter},
  21. }, rna_seq::find_monoallelics, sql::{
  22. stats_sql::insert_stats,
  23. variants_sql::{insert_variants, update_annotations, write_annotaded},
  24. }, utils::{
  25. chi_square_test_for_proportions, count_repetitions, estimate_shannon_entropy,
  26. get_hts_nt_pileup, new_pg, new_pg_speed, print_stat_cat,
  27. }
  28. };
  29. use anyhow::{anyhow, bail, Context, Ok, Result};
  30. use charming::{
  31. component::{Axis, Grid},
  32. element::{AxisLabel, AxisType, ItemStyle, Label, LabelPosition},
  33. series::Bar,
  34. Chart,
  35. };
  36. use csv::ReaderBuilder;
  37. use dashmap::DashMap;
  38. use hashbrown::HashMap;
  39. use indicatif::{MultiProgress, ParallelProgressIterator};
  40. use log::{info, warn};
  41. use noodles_core::{region::Region, Position};
  42. use noodles_fasta::indexed_reader::Builder as FastaBuilder;
  43. use noodles_gff as gff;
  44. use rayon::prelude::*;
  45. use rust_htslib::bam::IndexedReader;
  46. use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
  47. use std::{
  48. env::temp_dir,
  49. fmt,
  50. fs::File,
  51. io::Read,
  52. str::FromStr,
  53. sync::{
  54. atomic::{AtomicI32, Ordering},
  55. Arc,
  56. },
  57. };
  58. use std::{fs, io::Write};
  59. use utoipa::ToSchema;
  60. // chr12:25116542|G>T KRAS
  61. #[derive(Debug, Clone)]
  62. pub struct Variants {
  63. pub name: String,
  64. pub data: Vec<Variant>,
  65. pub constit: DashMap<String, Variant>,
  66. pub stats_vcf: StatsVCF,
  67. pub stats_bam: StatsBAM,
  68. pub cfg: Config,
  69. pub mp: MultiProgress,
  70. }
  71. impl Serialize for Variants {
  72. fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  73. where
  74. S: Serializer,
  75. {
  76. // 3 is the number of fields in the struct.
  77. let mut state = serializer.serialize_struct("Variants", 5)?;
  78. state.serialize_field("name", &self.name)?;
  79. state.serialize_field("data", &self.data)?;
  80. state.serialize_field("constit", &self.constit)?;
  81. state.serialize_field("stats_vcf", &self.stats_vcf)?;
  82. state.serialize_field("stats_bam", &self.stats_bam)?;
  83. state.end()
  84. }
  85. }
  86. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  87. pub struct StatsVCF {
  88. n_tumoral_init: usize,
  89. n_constit_init: usize,
  90. n_constit: i32,
  91. n_loh: i32,
  92. n_low_mrd_depth: i32,
  93. }
  94. impl fmt::Display for StatsVCF {
  95. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  96. let k = 100.0 / self.n_tumoral_init as f64;
  97. let string = format!(
  98. "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
  99. self.n_constit, self.n_constit as f64 * k,
  100. self.n_loh, self.n_loh as f64 * k,
  101. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
  102. );
  103. write!(f, "{}", string)
  104. }
  105. }
  106. #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  107. pub struct StatsBAM {
  108. n_lasting: i32,
  109. n_constit: i32,
  110. n_low_mrd_depth: i32,
  111. n_low_diversity: i32,
  112. n_somatic: i32,
  113. }
  114. impl fmt::Display for StatsBAM {
  115. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  116. let k = 100.0 / self.n_lasting as f64;
  117. let string = format!(
  118. "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
  119. self.n_constit, self.n_constit as f64 * k,
  120. self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
  121. self.n_low_diversity, self.n_low_diversity as f64 * k,
  122. self.n_somatic, self.n_somatic as f64 * k
  123. );
  124. write!(f, "{}", string)
  125. }
  126. }
  127. impl Variants {
  128. pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
  129. Self {
  130. name,
  131. data,
  132. constit: DashMap::new(),
  133. stats_vcf: StatsVCF::default(),
  134. stats_bam: StatsBAM::default(),
  135. cfg: Config::get().unwrap(),
  136. mp: mp.clone(),
  137. }
  138. }
  139. pub fn from_vcfs(
  140. name: String,
  141. v: Vec<(&str, &VCFSource, &VariantType)>,
  142. cfg: &Config,
  143. mp: MultiProgress,
  144. ) -> Result<Self> {
  145. let pg = mp.add(new_pg(v.len() as u64));
  146. pg.set_message("Reading VCF");
  147. let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
  148. let n_constit = AtomicI32::new(0);
  149. let data: Vec<Variant> = v
  150. .par_iter()
  151. // .progress_count(v.len() as u64)
  152. .flat_map(|(path, source, variant_type)| {
  153. let r = match variant_type {
  154. VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
  155. VariantType::Constitutionnal => {
  156. read_vcf(path, source, variant_type)
  157. .unwrap()
  158. .par_iter()
  159. .for_each(|e| {
  160. n_constit.fetch_add(1, Ordering::SeqCst);
  161. constit.insert(
  162. format!(
  163. "{}:{}|{}>{}",
  164. e.contig, e.position, e.reference, e.alternative
  165. ),
  166. e.clone(),
  167. );
  168. });
  169. vec![]
  170. }
  171. };
  172. pg.inc(1);
  173. r
  174. })
  175. .collect();
  176. let stats_vcf = StatsVCF::default();
  177. let stats_bam = StatsBAM::default();
  178. let constit = Arc::try_unwrap(constit).unwrap();
  179. let elapsed = pg.elapsed();
  180. pg.finish();
  181. info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
  182. let cfg = cfg.clone();
  183. Ok(Self {
  184. name,
  185. data,
  186. constit,
  187. stats_vcf,
  188. stats_bam,
  189. cfg,
  190. mp: mp.clone(),
  191. })
  192. }
  193. pub fn vcf_filters(&mut self) {
  194. let cfg = &self.cfg;
  195. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  196. pg.set_message("VCF filtering");
  197. let n_tumoral_init = self.len();
  198. let n_constit_init = self.constit_len();
  199. let min_loh_diff = cfg.deepvariant_loh_pval;
  200. let min_mrd_depth = cfg.min_mrd_depth;
  201. info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
  202. let n_constit = AtomicI32::new(0);
  203. let n_loh = AtomicI32::new(0);
  204. let n_low_mrd_depth = AtomicI32::new(0);
  205. self.data = self
  206. .data
  207. .par_iter()
  208. .map(|e| {
  209. let mut tumoral = e.clone();
  210. let k = format!(
  211. "{}:{}|{}>{}",
  212. tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
  213. );
  214. if let Some(mut constit) = self.constit.get_mut(&k) {
  215. if constit.get_depth() < min_mrd_depth {
  216. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  217. tumoral.annotations.push(AnnotationType::VariantCategory(
  218. VariantCategory::LowMRDDepth,
  219. ));
  220. } else if constit.get_n_alt() == constit.get_depth()
  221. && tumoral.get_n_alt() == tumoral.get_depth()
  222. {
  223. n_constit.fetch_add(1, Ordering::SeqCst);
  224. tumoral
  225. .annotations
  226. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  227. } else {
  228. let pval = chi_square_test_for_proportions(
  229. tumoral.get_n_alt() as f64,
  230. tumoral.get_depth() as f64,
  231. constit.get_n_alt() as f64,
  232. constit.get_depth() as f64,
  233. )
  234. .unwrap();
  235. if pval != 0.0 && pval <= min_loh_diff {
  236. n_loh.fetch_add(1, Ordering::SeqCst);
  237. tumoral
  238. .annotations
  239. .push(AnnotationType::VariantCategory(VariantCategory::LOH));
  240. } else {
  241. n_constit.fetch_add(1, Ordering::SeqCst);
  242. tumoral
  243. .annotations
  244. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  245. }
  246. }
  247. // If not un Constit registry, ClairS look for VCF constit depth and n_alt
  248. } else if let Format::ClairS(format) = &tumoral.callers_data.first().unwrap().format
  249. {
  250. if format.ndp < min_mrd_depth {
  251. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  252. tumoral.annotations.push(AnnotationType::VariantCategory(
  253. VariantCategory::LowMRDDepth,
  254. ));
  255. } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
  256. {
  257. let mrd_n_alt = match alt_base {
  258. Base::A => format.nau,
  259. Base::T => format.ntu,
  260. Base::C => format.ncu,
  261. Base::G => format.ngu,
  262. _ => 0,
  263. };
  264. if mrd_n_alt != 0 {
  265. n_constit.fetch_add(1, Ordering::SeqCst);
  266. tumoral
  267. .annotations
  268. .push(AnnotationType::VariantCategory(VariantCategory::Constit));
  269. }
  270. }
  271. }
  272. pg.inc(1);
  273. tumoral
  274. })
  275. .collect();
  276. let n_constit = n_constit.load(Ordering::SeqCst);
  277. let n_loh = n_loh.load(Ordering::SeqCst);
  278. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  279. self.stats_vcf = StatsVCF {
  280. n_tumoral_init,
  281. n_constit_init,
  282. n_constit,
  283. n_loh,
  284. n_low_mrd_depth,
  285. };
  286. // let elapsed = start.elapsed();
  287. let elapsed = pg.elapsed();
  288. pg.finish();
  289. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  290. }
  291. /// Filter variants by reading informations from constist BAM.
  292. pub fn bam_filters(&mut self, mrd_bam: &str) {
  293. let cfg = &self.cfg;
  294. // let start = Instant::now();
  295. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  296. pg.set_message("BAM filtering");
  297. let min_mrd_depth = cfg.min_mrd_depth;
  298. info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
  299. let n_already = AtomicI32::new(0);
  300. let n_constit = AtomicI32::new(0);
  301. let n_low_mrd_depth = AtomicI32::new(0);
  302. let n_low_diversity = AtomicI32::new(0);
  303. let n_somatic = AtomicI32::new(0);
  304. self.data.par_chunks_mut(10_000).for_each(|chunk| {
  305. let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
  306. .context(anyhow!("Reading {}", mrd_bam))
  307. .unwrap();
  308. let mut genome_reader = FastaBuilder::default()
  309. .build_from_path(&cfg.reference_fa)
  310. .unwrap();
  311. for tumoral in chunk.iter_mut() {
  312. pg.inc(1);
  313. if !tumoral.annotations.is_empty() {
  314. n_already.fetch_add(1, Ordering::SeqCst);
  315. continue;
  316. }
  317. let (pos, is_ins) = match tumoral.alteration_category() {
  318. AlterationCategory::Ins => (tumoral.position, true),
  319. AlterationCategory::Del => (tumoral.position, false),
  320. _ => (tumoral.position, false),
  321. };
  322. match get_hts_nt_pileup(
  323. &mut bam,
  324. &tumoral.contig,
  325. pos as i32,
  326. is_ins, // tumoral.position as i32,
  327. ) {
  328. std::result::Result::Ok(bases) => {
  329. let depth = bases.len() as u32;
  330. if depth < min_mrd_depth {
  331. n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
  332. tumoral.annotations.push(AnnotationType::VariantCategory(
  333. VariantCategory::LowMRDDepth,
  334. ));
  335. } else {
  336. // Check local diversity
  337. let start =
  338. Position::try_from((tumoral.position - 20) as usize).unwrap();
  339. let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
  340. let r = Region::new(tumoral.contig.to_string(), start..=end);
  341. if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
  342. let s = reg.sequence();
  343. let u = s.as_ref();
  344. let s = String::from_utf8(u.to_vec()).unwrap();
  345. let ent = estimate_shannon_entropy(&s.to_lowercase());
  346. if ent < cfg.min_diversity {
  347. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  348. tumoral.annotations.push(AnnotationType::VariantCategory(
  349. VariantCategory::LowDiversity,
  350. ));
  351. continue;
  352. }
  353. // Check triplets or doublets if DeepVariant
  354. let callers = tumoral.callers();
  355. if callers.len() == 1 && callers[0] == *"DeepVariant" {
  356. let seq_left = &s[0..20];
  357. let seq_right = &s[21..s.len() - 1];
  358. // Triplet right
  359. if count_repetitions(seq_right, 3) >= 3 {
  360. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  361. tumoral.annotations.push(AnnotationType::VariantCategory(
  362. VariantCategory::LowDiversity,
  363. ));
  364. continue;
  365. }
  366. // Doublet right
  367. if count_repetitions(seq_right, 2) >= 4 {
  368. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  369. tumoral.annotations.push(AnnotationType::VariantCategory(
  370. VariantCategory::LowDiversity,
  371. ));
  372. continue;
  373. }
  374. // Triplet left
  375. if count_repetitions(seq_left, 3) >= 3 {
  376. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  377. tumoral.annotations.push(AnnotationType::VariantCategory(
  378. VariantCategory::LowDiversity,
  379. ));
  380. continue;
  381. }
  382. // Doublet left
  383. if count_repetitions(seq_left, 2) >= 4 {
  384. n_low_diversity.fetch_add(1, Ordering::SeqCst);
  385. tumoral.annotations.push(AnnotationType::VariantCategory(
  386. VariantCategory::LowDiversity,
  387. ));
  388. continue;
  389. }
  390. }
  391. }
  392. // Check if the base is in constitutionnal pileup
  393. if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
  394. let alt_b = alt_b.clone().into_u8();
  395. let n_alt_mrd = bases
  396. .clone()
  397. .into_iter()
  398. .filter(|e| *e == alt_b)
  399. .collect::<Vec<_>>()
  400. .len();
  401. if n_alt_mrd > 0 {
  402. n_constit.fetch_add(1, Ordering::SeqCst);
  403. tumoral.annotations.push(AnnotationType::VariantCategory(
  404. VariantCategory::Constit,
  405. ));
  406. } else {
  407. n_somatic.fetch_add(1, Ordering::SeqCst);
  408. tumoral.annotations.push(AnnotationType::VariantCategory(
  409. VariantCategory::Somatic,
  410. ));
  411. }
  412. } else if tumoral.is_ins() {
  413. let n_alt_mrd =
  414. bases.clone().into_iter().filter(|e| *e == b'I').count();
  415. if n_alt_mrd > 0 {
  416. n_constit.fetch_add(1, Ordering::SeqCst);
  417. tumoral.annotations.push(AnnotationType::VariantCategory(
  418. VariantCategory::Constit,
  419. ));
  420. } else {
  421. n_somatic.fetch_add(1, Ordering::SeqCst);
  422. tumoral.annotations.push(AnnotationType::VariantCategory(
  423. VariantCategory::Somatic,
  424. ));
  425. }
  426. } else if tumoral.alteration_category() == AlterationCategory::Del {
  427. let n_alt_mrd =
  428. bases.clone().into_iter().filter(|e| *e == b'D').count();
  429. if n_alt_mrd > 0 {
  430. n_constit.fetch_add(1, Ordering::SeqCst);
  431. tumoral.annotations.push(AnnotationType::VariantCategory(
  432. VariantCategory::Constit,
  433. ));
  434. } else {
  435. n_somatic.fetch_add(1, Ordering::SeqCst);
  436. tumoral.annotations.push(AnnotationType::VariantCategory(
  437. VariantCategory::Somatic,
  438. ));
  439. }
  440. }
  441. }
  442. }
  443. Err(r) => panic!("{}", r),
  444. }
  445. }
  446. });
  447. let n_constit = n_constit.load(Ordering::SeqCst);
  448. let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
  449. let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
  450. let n_somatic = n_somatic.load(Ordering::SeqCst);
  451. let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
  452. self.stats_bam = StatsBAM {
  453. n_lasting,
  454. n_constit,
  455. n_low_mrd_depth,
  456. n_low_diversity,
  457. n_somatic,
  458. };
  459. let elapsed = pg.elapsed();
  460. pg.finish();
  461. info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
  462. }
  463. pub fn get_cat(&mut self, cat: &VariantCategory) -> Vec<Variant> {
  464. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  465. pg.set_message(format!("Get cat {:?}", cat));
  466. self.data
  467. .par_iter()
  468. .progress_with(pg)
  469. .flat_map(|e| {
  470. if e.annotations
  471. .iter()
  472. .filter(|e| match e {
  473. AnnotationType::VariantCategory(vc) => vc == cat,
  474. _ => false,
  475. })
  476. .count()
  477. > 0
  478. {
  479. vec![e.clone()]
  480. } else {
  481. vec![]
  482. }
  483. })
  484. .collect::<Vec<Variant>>()
  485. }
  486. pub fn with_annotation(&self, annotation: &AnnotationType) -> Vec<Variant> {
  487. self.data
  488. .clone()
  489. .into_iter()
  490. .filter(|v| {
  491. v.annotations.iter().any(|a| {
  492. matches!(
  493. (annotation, a),
  494. (
  495. AnnotationType::VariantCategory(_),
  496. AnnotationType::VariantCategory(_)
  497. ) | (AnnotationType::VEP(_), AnnotationType::VEP(_))
  498. | (AnnotationType::Cluster(_), AnnotationType::Cluster(_))
  499. | (AnnotationType::Cosmic(_), AnnotationType::Cosmic(_))
  500. | (AnnotationType::GnomAD(_), AnnotationType::GnomAD(_))
  501. | (AnnotationType::NCBIGFF(_), AnnotationType::NCBIGFF(_))
  502. | (AnnotationType::Pangolin(_), AnnotationType::Pangolin(_))
  503. | (AnnotationType::Phase(_), AnnotationType::Phase(_))
  504. )
  505. })
  506. })
  507. .collect()
  508. }
  509. pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
  510. info!("Writing VCF {}", path);
  511. let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
  512. if to_write.is_empty() {
  513. warn!("No variants to write");
  514. return Ok(());
  515. }
  516. let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
  517. pg.set_message("Writing VCF");
  518. let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
  519. for row in to_write.iter_mut() {
  520. w.write_variant(row)
  521. .context(format!("Error writing VCF row {:#?}", row))?;
  522. pg.inc(1);
  523. }
  524. w.write_index_finish()
  525. .context(format!("Can't write index for {}", path))?;
  526. Ok(())
  527. }
  528. /// Keep variants annotated Somatic
  529. pub fn keep_somatics_un(&mut self) {
  530. let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
  531. pg.set_message("Filtering Variants");
  532. self.data = self
  533. .data
  534. .par_iter_mut()
  535. .progress_with(pg)
  536. .flat_map(|e| {
  537. // keep unannotated and somatic
  538. if e.annotations
  539. .iter()
  540. .filter(|a| match a {
  541. AnnotationType::VariantCategory(vc) => {
  542. !matches!(vc, VariantCategory::Somatic)
  543. }
  544. _ => false,
  545. })
  546. .count()
  547. == 0
  548. {
  549. vec![e]
  550. } else {
  551. vec![]
  552. }
  553. })
  554. .map(|e| e.clone())
  555. .collect();
  556. }
  557. /// Annotate with VEP
  558. pub fn vep(&mut self) {
  559. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  560. pg.set_message("VEP");
  561. self.data
  562. .par_chunks_mut(self.cfg.vep_chunk_size)
  563. .progress_with(pg)
  564. .for_each(|chunks| {
  565. if let Err(err) = vep_chunk(chunks) {
  566. panic!("{err}");
  567. }
  568. });
  569. }
  570. /// sort_variants TODO
  571. pub fn sort(&mut self) -> Result<()> {
  572. let cfg = &self.cfg;
  573. self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
  574. Ok(())
  575. }
  576. pub fn merge(&mut self) {
  577. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  578. pg.set_message("Merging Variants by contig, positions, ref, alt");
  579. let hm: DashMap<String, Variant> = DashMap::new();
  580. self.data.par_iter().progress_with(pg).for_each(|e| {
  581. let k = format!(
  582. "{}:{}|{}>{}",
  583. e.contig, e.position, e.reference, e.alternative
  584. );
  585. if let Some(mut v) = hm.get_mut(&k) {
  586. let v = v.value_mut();
  587. e.callers_data.iter().for_each(|cd| {
  588. v.callers_data.push(cd.clone());
  589. v.callers_data.dedup();
  590. });
  591. v.source.extend(e.source.clone());
  592. v.source.dedup();
  593. } else {
  594. hm.insert(k, e.clone());
  595. }
  596. });
  597. self.data = hm.iter().map(|e| e.value().clone()).collect();
  598. }
  599. pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
  600. let gff_path = gff_path.to_string();
  601. let len = self.data.len();
  602. let pg = self.mp.add(new_pg_speed(self.len() as u64));
  603. pg.set_message("GFF Annotate");
  604. self.data
  605. .par_chunks_mut(len / 33)
  606. .progress_with(pg)
  607. .for_each(|chunk| {
  608. let mut reader = File::open(&gff_path)
  609. .map(noodles_bgzf::Reader::new)
  610. .map(gff::Reader::new)
  611. .unwrap();
  612. let index = noodles_csi::read(format!("{}.csi", gff_path))
  613. .context("Can't load csi index")
  614. .unwrap();
  615. for v in chunk.iter_mut() {
  616. let start = Position::try_from(v.position as usize).unwrap();
  617. let r = Region::new(v.contig.to_string(), start..=start);
  618. if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
  619. for row in rows {
  620. let ncbi = NCBIGFF::from(row.unwrap());
  621. v.annotations.push(AnnotationType::NCBIGFF(ncbi));
  622. }
  623. }
  624. }
  625. });
  626. Ok(())
  627. }
  628. pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
  629. let len = self.len();
  630. let header = vcf_header_from(header_path)?;
  631. let pg = self.mp.add(new_pg_speed(len as u64));
  632. pg.set_message("Echtvar Annotate");
  633. self.data
  634. .par_chunks_mut(len / 33)
  635. .progress_with(pg)
  636. .for_each(|chunk| {
  637. let in_tmp = format!(
  638. "{}/echtvar_in_{}.vcf",
  639. temp_dir().to_str().unwrap(),
  640. uuid::Uuid::new_v4()
  641. );
  642. let out_tmp = format!(
  643. "{}/echtvar_in_{}.vcf.gz",
  644. temp_dir().to_str().unwrap(),
  645. uuid::Uuid::new_v4()
  646. );
  647. let mut vcf = File::create(&in_tmp).unwrap();
  648. let _ = writeln!(vcf, "{}", header);
  649. for (i, row) in chunk.iter().enumerate() {
  650. let _ = writeln!(
  651. vcf,
  652. "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
  653. row.contig,
  654. row.position,
  655. i + 1,
  656. row.reference,
  657. row.alternative
  658. );
  659. }
  660. run_echtvar(&in_tmp, &out_tmp).unwrap();
  661. let mut reader = ReaderBuilder::new()
  662. .delimiter(b'\t')
  663. .has_headers(false)
  664. .comment(Some(b'#'))
  665. .flexible(true)
  666. .from_reader(get_reader(&out_tmp).unwrap());
  667. // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
  668. let mut last: usize = 1;
  669. for row in reader.deserialize::<VCFRow>().flatten() {
  670. let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
  671. let id: usize = row.id.parse().unwrap();
  672. if id != last {
  673. panic!("Echtvar output not in input order!");
  674. }
  675. if let Some(c) = cosmic {
  676. chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
  677. }
  678. if let Some(g) = gnomad {
  679. chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
  680. }
  681. last += 1;
  682. }
  683. });
  684. Ok(())
  685. }
  686. pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
  687. self.data
  688. .par_iter()
  689. .filter(|v| {
  690. for annotation in v.annotations.iter() {
  691. if let AnnotationType::VariantCategory(cat) = annotation {
  692. if cat == category {
  693. return true;
  694. }
  695. }
  696. }
  697. false
  698. })
  699. .collect::<Vec<&Variant>>()
  700. }
  701. /// Filter based on GnomAD if gnomad_af < max_gnomad_af
  702. pub fn filter_snp(&mut self) -> Result<i32> {
  703. let n_snp = AtomicI32::new(0);
  704. self.data = self
  705. .data
  706. .clone()
  707. .into_par_iter()
  708. .filter(|e| {
  709. let mut res = true;
  710. e.annotations.iter().for_each(|a| {
  711. if let AnnotationType::GnomAD(g) = a {
  712. res = g.gnomad_af < self.cfg.max_gnomad_af;
  713. };
  714. });
  715. if !res {
  716. n_snp.fetch_add(1, Ordering::SeqCst);
  717. }
  718. res
  719. })
  720. .collect();
  721. let n = n_snp.load(Ordering::SeqCst);
  722. Ok(n)
  723. }
  724. pub fn pangolin(&mut self) -> Result<()> {
  725. let tmp_file = pangolin_save_variants(self)?;
  726. let res_file = run_pangolin(&tmp_file)?;
  727. fs::remove_file(tmp_file)?;
  728. let res = pangolin_parse_results(&res_file)?;
  729. let mut res = res.iter();
  730. fs::remove_file(res_file)?;
  731. info!("Adding pangolin results for {} variants.", res.len());
  732. let mut n_added = 0;
  733. if let Some(r) = res.next() {
  734. let mut curr = r.clone();
  735. for variant in self.data.iter_mut() {
  736. if variant.contig == curr.0
  737. && variant.position == curr.1
  738. && variant.reference == curr.2
  739. && variant.alternative == curr.3
  740. {
  741. variant.annotations.push(AnnotationType::Pangolin(curr.4));
  742. n_added += 1;
  743. if let Some(r) = res.next() {
  744. curr = r.clone();
  745. } else {
  746. break;
  747. }
  748. }
  749. }
  750. }
  751. info!("Pangolin annotations {}", n_added);
  752. assert_eq!(res.len(), 0);
  753. Ok(())
  754. }
  755. pub fn find_monoallelics(&self, rna_bam: &str, out_tsv: &str) -> anyhow::Result<()> {
  756. let mut variants = self.clone();
  757. variants.data = variants
  758. .data
  759. .into_iter()
  760. .filter_map(|mut v| {
  761. if v.vaf() < 0.75 && v.vaf() > 0.25 && v.get_depth() > 6 {
  762. Some(v)
  763. } else {
  764. None
  765. }
  766. })
  767. .collect();
  768. // variants.save_bytes("/data/test_var.bytes_chr1.gz")?;
  769. let monoall = find_monoallelics(
  770. &variants,
  771. &rna_bam,
  772. )?;
  773. monoall
  774. .iter()
  775. .filter(|(_k, v)| v.iter().any(|v| v.0))
  776. // can be used as a CTRL for women
  777. .filter(|(_, v)| v[0].2 != "chrX") // can
  778. .for_each(|(k, v)| {
  779. println!(
  780. "{k}\t{}/{}\t{}",
  781. v.iter().filter(|v| v.0).count(),
  782. v.len(),
  783. v.iter()
  784. .map(|(_, _, c, p)| format!("{c}:{p}"))
  785. .collect::<Vec<String>>()
  786. .join("|")
  787. )
  788. });
  789. Ok(())
  790. }
  791. pub fn len(&self) -> usize {
  792. self.data.len()
  793. }
  794. pub fn is_empty(&self) -> bool {
  795. self.data.is_empty()
  796. }
  797. pub fn constit_len(&self) -> usize {
  798. self.constit.len()
  799. }
  800. pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
  801. self.data
  802. .par_iter()
  803. .filter(|v| v.contig == contig && v.position == pos)
  804. .map(|v| v.clone())
  805. .collect()
  806. }
  807. pub fn phase_contig(&mut self, phases: &[Phase], bam_path: &str, contig: &str) {
  808. type Iv = rust_lapper::Interval<u64, Phase>;
  809. let data: Vec<Iv> = phases
  810. .iter()
  811. .map(|p| Iv {
  812. start: p.range.unwrap().0,
  813. stop: p.range.unwrap().3 + 1, // TODO verif if inclusif
  814. val: p.clone(),
  815. })
  816. .collect();
  817. let lapper = rust_lapper::Lapper::new(data);
  818. let mut bam = IndexedReader::from_path(bam_path).unwrap();
  819. let mut annotations = 0;
  820. for v in self.data.iter_mut().filter(|v| v.contig == contig) {
  821. let over_phases: Vec<&Phase> = lapper
  822. .find(v.position as u64, v.position as u64)
  823. .map(|iv| &iv.val)
  824. .collect();
  825. if !over_phases.is_empty() {
  826. let reads: Vec<String> = match v.alteration_category() {
  827. AlterationCategory::Snv => {
  828. let base = v.alternative.to_string().as_bytes()[0];
  829. pandora_lib_pileup::qnames_at_base(
  830. &mut bam,
  831. &v.contig,
  832. v.position as i32,
  833. false,
  834. 50,
  835. )
  836. .unwrap()
  837. .iter()
  838. .filter(|(_, b)| *b == base)
  839. .map(|(r, _)| r.to_string())
  840. .collect()
  841. }
  842. AlterationCategory::Ins => pandora_lib_pileup::qnames_at_base(
  843. &mut bam,
  844. &v.contig,
  845. v.position as i32,
  846. true,
  847. 50,
  848. )
  849. .unwrap()
  850. .iter()
  851. .filter(|(_, b)| *b == b'I')
  852. .map(|(r, _)| r.to_string())
  853. .collect(),
  854. AlterationCategory::Del => pandora_lib_pileup::qnames_at_base(
  855. &mut bam,
  856. &v.contig,
  857. v.position as i32,
  858. false,
  859. 50,
  860. )
  861. .unwrap()
  862. .iter()
  863. .filter(|(_, b)| *b == b'D')
  864. .map(|(r, _)| r.to_string())
  865. .collect(),
  866. AlterationCategory::Rep => Vec::new(),
  867. AlterationCategory::Other => Vec::new(),
  868. };
  869. over_phases
  870. .iter()
  871. .filter(|p| p.contains(&reads))
  872. .for_each(|p| {
  873. if let Some(id) = p.id() {
  874. annotations += 1;
  875. v.annotations
  876. .push(AnnotationType::Phase(PhaseAnnotation { id }));
  877. };
  878. });
  879. }
  880. }
  881. }
  882. pub fn stats(&self) -> Result<Vec<Stat>> {
  883. let mut callers_cat = HashMap::new();
  884. let mut n_caller_data = 0;
  885. let mut variants_cat = HashMap::new();
  886. let mut n_variants_wcat = 0;
  887. let mut ncbi_feature = HashMap::new();
  888. let mut n_ncbi_feature = 0;
  889. let mut cosmic_sup_1 = HashMap::new();
  890. let mut n_cosmic_sup_1 = 0;
  891. let mut cons_cat = HashMap::new();
  892. let mut n_csq = 0;
  893. let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
  894. let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
  895. *v += 1;
  896. };
  897. for ele in self.data.iter() {
  898. // Callers
  899. let mut callers = Vec::new();
  900. for cd in &ele.callers_data {
  901. callers.push(
  902. match cd.format {
  903. Format::DeepVariant(_) => "DeepVariant",
  904. Format::ClairS(_) => "ClairS",
  905. Format::Sniffles(_) => "Sniffles",
  906. Format::Nanomonsv(_) => "Nanomonsv",
  907. }
  908. .to_string(),
  909. );
  910. }
  911. if !callers.is_empty() {
  912. n_caller_data += 1;
  913. callers.sort();
  914. let k = callers.join(",");
  915. let (_, v) = callers_cat
  916. .raw_entry_mut()
  917. .from_key(&k)
  918. .or_insert(k.clone(), 1);
  919. *v += 1;
  920. }
  921. // Var cat
  922. // Annotations
  923. for annot in ele.annotations.iter() {
  924. let mut features = Vec::new();
  925. let mut variant_cat = Vec::new();
  926. let mut cosmic_m1 = false;
  927. match annot {
  928. AnnotationType::NCBIGFF(ncbi) => {
  929. features.push(ncbi.feature.to_string());
  930. }
  931. AnnotationType::Cosmic(c) => {
  932. if c.cosmic_cnt > 1 {
  933. cosmic_m1 = true;
  934. }
  935. }
  936. AnnotationType::VariantCategory(vc) => {
  937. let s = serde_json::to_string(vc)?;
  938. variant_cat.push(s);
  939. }
  940. _ => (),
  941. };
  942. if !features.is_empty() {
  943. features.sort();
  944. add_hm(&mut ncbi_feature, &features.join(","));
  945. n_ncbi_feature += 1;
  946. }
  947. if !variant_cat.is_empty() {
  948. add_hm(&mut variants_cat, &variant_cat.join(","));
  949. n_variants_wcat += 1;
  950. }
  951. if cosmic_m1 {
  952. add_hm(&mut cosmic_sup_1, "Cosmic > 1");
  953. n_cosmic_sup_1 += 1;
  954. }
  955. }
  956. // VEP
  957. let d: Vec<VEP> = ele
  958. .annotations
  959. .iter()
  960. .flat_map(|e| {
  961. if let AnnotationType::VEP(e) = e {
  962. e.clone()
  963. } else {
  964. vec![]
  965. }
  966. })
  967. .collect();
  968. if let std::result::Result::Ok(vep) = get_best_vep(&d) {
  969. if let Some(csq) = vep.consequence {
  970. n_csq += 1;
  971. let csq = csq.join(",");
  972. let (_, v) = cons_cat
  973. .raw_entry_mut()
  974. .from_key(&csq)
  975. .or_insert(csq.clone(), 1);
  976. *v += 1;
  977. }
  978. }
  979. }
  980. print_stat_cat(&cons_cat, n_csq as u32);
  981. print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
  982. print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
  983. print_stat_cat(&callers_cat, n_caller_data as u32);
  984. // let file = File::create(path)?;
  985. // let mut writer = BufWriter::new(file);
  986. let results = vec![
  987. Stat::new("consequences".to_string(), cons_cat, n_csq as u32),
  988. Stat::new(
  989. "variants_cat".to_string(),
  990. variants_cat,
  991. n_variants_wcat as u32,
  992. ),
  993. Stat::new(
  994. "ncbi_feature".to_string(),
  995. ncbi_feature,
  996. n_ncbi_feature as u32,
  997. ),
  998. Stat::new("callers_cat".to_string(), callers_cat, n_caller_data as u32),
  999. ];
  1000. Ok(results)
  1001. }
  1002. pub fn save_sql(&self, path: &str) -> Result<()> {
  1003. insert_variants(self, path)
  1004. }
  1005. pub fn stats_sql(&self, path: &str) -> Result<()> {
  1006. insert_stats(
  1007. "VCF".to_string(),
  1008. serde_json::to_string(&self.stats_vcf)?,
  1009. path,
  1010. )?;
  1011. insert_stats(
  1012. "BAM".to_string(),
  1013. serde_json::to_string(&self.stats_bam)?,
  1014. path,
  1015. )?;
  1016. Ok(())
  1017. }
  1018. pub fn stats_json(&self, path: &str) -> Result<AllStats> {
  1019. let variants_stats = self.stats()?;
  1020. let all_stats = AllStats {
  1021. variants_stats,
  1022. vcf_stats: self.stats_vcf.clone(),
  1023. bam_stats: self.stats_bam.clone(),
  1024. n_variants: self.data.len(),
  1025. };
  1026. let s = serde_json::to_string(&all_stats)?;
  1027. fs::write(path, s)?;
  1028. Ok(all_stats)
  1029. }
  1030. pub fn save_bytes(&self, path: &str) -> Result<()> {
  1031. let serialized = pot::to_vec(&self.data)?;
  1032. let mut w = noodles_bgzf::writer::Builder::default().build_from_writer(File::create(path)?);
  1033. w.write_all(&serialized)?;
  1034. Ok(())
  1035. }
  1036. pub fn new_from_bytes(name: &str, path: &str, mp: MultiProgress) -> Result<Self> {
  1037. info!("Loading variants from: {path}");
  1038. let r = in_out::get_reader_progress(path, &mp)?;
  1039. let data: Vec<Variant> = pot::from_reader(r)?;
  1040. Ok(Self {
  1041. name: name.to_string(),
  1042. data,
  1043. constit: DashMap::new(),
  1044. stats_vcf: StatsVCF::default(),
  1045. stats_bam: StatsBAM::default(),
  1046. cfg: Config::get()?,
  1047. mp,
  1048. })
  1049. }
  1050. pub fn filter_category(&self, and_categories: &[Category]) -> Vec<&Variant> {
  1051. self.data
  1052. .par_iter()
  1053. .flat_map(|v| {
  1054. if v.is_from_category(and_categories) {
  1055. vec![v]
  1056. } else {
  1057. vec![]
  1058. }
  1059. })
  1060. .collect()
  1061. }
  1062. }
  1063. #[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
  1064. pub struct Stat {
  1065. name: String,
  1066. counts: HashMap<String, u32>,
  1067. n_with_annotation: u32,
  1068. }
  1069. impl Stat {
  1070. pub fn new(name: String, counts: HashMap<String, u32>, n_with_annotation: u32) -> Self {
  1071. Stat {
  1072. counts,
  1073. n_with_annotation,
  1074. name,
  1075. }
  1076. }
  1077. }
  1078. #[derive(Debug, Clone, Serialize, Deserialize)]
  1079. pub struct AllStats {
  1080. pub variants_stats: Vec<Stat>,
  1081. pub vcf_stats: StatsVCF,
  1082. pub bam_stats: StatsBAM,
  1083. pub n_variants: usize,
  1084. }
  1085. impl AllStats {
  1086. pub fn load_json(path: &str) -> anyhow::Result<AllStats> {
  1087. File::open(path)
  1088. .context(format!("Failed to open file: {path}"))
  1089. .and_then(|mut file| {
  1090. let mut contents = String::new();
  1091. file.read_to_string(&mut contents)
  1092. .context(format!("Failed to read file: {path}"))
  1093. .map(|_| contents)
  1094. })
  1095. .and_then(|json_str| {
  1096. serde_json::from_str::<AllStats>(&json_str).with_context(|| "Failed to parse JSON")
  1097. })
  1098. }
  1099. pub fn generate_graph(&self, prefix: &str) -> anyhow::Result<()> {
  1100. let bar_interval = 20_f64;
  1101. // 1. Consequences
  1102. let consequences_data: Vec<&Stat> = self
  1103. .variants_stats
  1104. .iter()
  1105. .filter(|v| v.name == "consequences")
  1106. .collect();
  1107. if consequences_data.is_empty() {
  1108. bail!("No consequences data");
  1109. }
  1110. let mut consequences_data: Vec<(String, i32)> = consequences_data
  1111. .first()
  1112. .unwrap()
  1113. .counts
  1114. .iter()
  1115. .map(|(k, v)| {
  1116. let k = k.replace("_variant", "");
  1117. let k = k.replace("_", " ");
  1118. let k = if k == "intron,non coding transcript" {
  1119. "non coding transcript intron".to_string()
  1120. } else {
  1121. k
  1122. };
  1123. let k = if let Some((before_comma, _)) = k.split_once(',') {
  1124. before_comma.to_string()
  1125. } else {
  1126. k
  1127. };
  1128. (k, *v as i32)
  1129. })
  1130. .collect();
  1131. consequences_data.sort_by(|a, b| a.1.cmp(&b.1));
  1132. let n = consequences_data.len() as f64;
  1133. let height = (n * bar_interval) + 30.0;
  1134. let (y_data, x_data): (Vec<String>, Vec<i32>) = consequences_data.into_iter().unzip();
  1135. let chart = Chart::new()
  1136. .grid(Grid::new().left("18%").right("5%").top("0%").bottom(30))
  1137. .x_axis(Axis::new().type_(AxisType::Log))
  1138. .y_axis(
  1139. Axis::new()
  1140. .type_(AxisType::Category)
  1141. .data(y_data)
  1142. .axis_label(AxisLabel::new().font_size(12)),
  1143. )
  1144. .series(
  1145. Bar::new()
  1146. .show_background(true)
  1147. .data(x_data)
  1148. .bar_width(12)
  1149. .item_style(ItemStyle::new().border_radius(3))
  1150. .label(Label::new().show(true).position(LabelPosition::Right)),
  1151. );
  1152. let mut renderer = charming::ImageRenderer::new(1000, height as u32);
  1153. renderer
  1154. .save(&chart, format!("{prefix}_consequences.svg"))
  1155. .unwrap();
  1156. // 2. NCBI features
  1157. let ncbi_data: Vec<&Stat> = self
  1158. .variants_stats
  1159. .iter()
  1160. .filter(|v| v.name == "ncbi_feature")
  1161. .collect();
  1162. if ncbi_data.is_empty() {
  1163. bail!("No NCBI features data");
  1164. }
  1165. let mut ncbi_data: Vec<(String, i32)> = ncbi_data
  1166. .first()
  1167. .unwrap()
  1168. .counts
  1169. .iter()
  1170. .map(|(k, v)| {
  1171. let k = if k == "non_allelic_homologous_recombination_region" {
  1172. "non_allelic_homologous\n_recombination_region".to_string()
  1173. } else {
  1174. k.to_string()
  1175. };
  1176. (k.replace("_", " "), *v as i32)
  1177. })
  1178. .collect();
  1179. ncbi_data.sort_by(|a, b| a.1.cmp(&b.1));
  1180. let n = ncbi_data.len() as f64;
  1181. let height = (n * bar_interval) + 30.0;
  1182. let (y_data, x_data): (Vec<String>, Vec<i32>) = ncbi_data.into_iter().unzip();
  1183. let chart = Chart::new()
  1184. .grid(Grid::new().left("18%").right("5%").top("0%").bottom(30))
  1185. .x_axis(Axis::new().type_(AxisType::Log))
  1186. .y_axis(
  1187. Axis::new()
  1188. .type_(AxisType::Category)
  1189. .data(y_data)
  1190. .axis_label(AxisLabel::new().font_size(12)),
  1191. )
  1192. .series(
  1193. Bar::new()
  1194. .show_background(true)
  1195. .data(x_data)
  1196. .bar_width(12)
  1197. .item_style(ItemStyle::new().border_radius(3))
  1198. .label(Label::new().show(true).position(LabelPosition::Right)),
  1199. );
  1200. let mut renderer = charming::ImageRenderer::new(1000, height as u32);
  1201. renderer.save(&chart, format!("{prefix}_ncbi.svg")).unwrap();
  1202. // 3. Callers
  1203. let callers_data: Vec<&Stat> = self
  1204. .variants_stats
  1205. .iter()
  1206. .filter(|v| v.name == "callers_cat")
  1207. .collect();
  1208. if callers_data.is_empty() {
  1209. bail!("No callers data");
  1210. }
  1211. let mut callers_data: Vec<(String, i32)> = callers_data
  1212. .first()
  1213. .unwrap()
  1214. .counts
  1215. .iter()
  1216. .map(|(k, v)| (k.replace(",", " & "), *v as i32))
  1217. .collect();
  1218. let n = callers_data.len() as f64;
  1219. let height = (n * bar_interval) + 30.0;
  1220. callers_data.sort_by(|a, b| a.1.cmp(&b.1));
  1221. let (y_data, x_data): (Vec<String>, Vec<i32>) = callers_data.into_iter().unzip();
  1222. let chart = Chart::new()
  1223. .grid(Grid::new().left("18%").right("5%").top("0%").bottom(30))
  1224. .x_axis(Axis::new().type_(AxisType::Log))
  1225. .y_axis(
  1226. Axis::new()
  1227. .type_(AxisType::Category)
  1228. .data(y_data)
  1229. .axis_label(AxisLabel::new().font_size(12)),
  1230. )
  1231. .series(
  1232. Bar::new()
  1233. .show_background(true)
  1234. .data(x_data)
  1235. .bar_width(12)
  1236. .item_style(ItemStyle::new().border_radius(3))
  1237. .label(Label::new().show(true).position(LabelPosition::Right)),
  1238. );
  1239. let mut renderer = charming::ImageRenderer::new(1000, height as u32);
  1240. renderer
  1241. .save(&chart, format!("{prefix}_callers.svg"))
  1242. .unwrap();
  1243. Ok(())
  1244. }
  1245. }
  1246. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1247. pub struct Variant {
  1248. pub contig: String,
  1249. pub position: u32,
  1250. pub reference: ReferenceAlternative,
  1251. pub alternative: ReferenceAlternative,
  1252. pub callers_data: Vec<CallerData>,
  1253. pub n_alt: Option<u32>,
  1254. pub n_ref: Option<u32>,
  1255. pub vaf: Option<f32>,
  1256. pub depth: Option<u32>,
  1257. pub variant_type: VariantType,
  1258. pub source: Vec<VCFSource>,
  1259. pub annotations: Vec<AnnotationType>,
  1260. }
  1261. #[derive(Debug, PartialEq, Serialize, Deserialize, Clone, ToSchema)]
  1262. pub struct CallerData {
  1263. pub qual: Option<f32>,
  1264. pub format: Format,
  1265. pub info: Info,
  1266. }
  1267. impl CallerData {
  1268. pub fn get_vaf(&self) -> f64 {
  1269. match &self.format {
  1270. Format::DeepVariant(v) => v.vaf as f64,
  1271. Format::ClairS(v) => v.af,
  1272. Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
  1273. Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
  1274. }
  1275. }
  1276. pub fn get_depth(&mut self) -> u32 {
  1277. match &self.format {
  1278. Format::DeepVariant(v) => v.dp,
  1279. Format::ClairS(v) => v.dp,
  1280. Format::Sniffles(v) => v.dv + v.dr,
  1281. Format::Nanomonsv(v) => v.tr,
  1282. }
  1283. }
  1284. pub fn get_n_alt(&mut self) -> u32 {
  1285. match &self.format {
  1286. Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
  1287. Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
  1288. Format::Sniffles(v) => v.dv,
  1289. Format::Nanomonsv(v) => v.tr - v.vr,
  1290. }
  1291. }
  1292. /// Variants filter rules
  1293. pub fn should_filter(&self) -> bool {
  1294. if let Info::Sniffles(info) = &self.info {
  1295. let imprecise = info
  1296. .tags
  1297. .iter()
  1298. .filter(|s| s.to_string() == *"IMPRECISE")
  1299. .count();
  1300. let mut n_alt = 0;
  1301. if let Format::Sniffles(f) = &self.format {
  1302. n_alt = f.dv;
  1303. }
  1304. !(imprecise == 0 && n_alt >= 3)
  1305. } else {
  1306. false
  1307. }
  1308. }
  1309. }
  1310. #[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize, ToSchema)]
  1311. pub enum VariantType {
  1312. Somatic,
  1313. Constitutionnal,
  1314. }
  1315. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1316. pub enum VCFSource {
  1317. DeepVariant,
  1318. ClairS,
  1319. Sniffles,
  1320. Nanomonsv,
  1321. }
  1322. impl FromStr for VCFSource {
  1323. type Err = anyhow::Error;
  1324. fn from_str(s: &str) -> Result<Self> {
  1325. match s {
  1326. "DeepVariant" => Ok(VCFSource::DeepVariant),
  1327. "ClairS" => Ok(VCFSource::ClairS),
  1328. "Sniffles" => Ok(VCFSource::Sniffles),
  1329. "Nanomonsv" => Ok(VCFSource::Nanomonsv),
  1330. _ => Err(anyhow!("Error parsing VCFSource")),
  1331. }
  1332. }
  1333. }
  1334. impl fmt::Display for VCFSource {
  1335. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  1336. let s = match self {
  1337. VCFSource::DeepVariant => "DeepVariant",
  1338. VCFSource::ClairS => "ClairS",
  1339. VCFSource::Sniffles => "Sniffles",
  1340. VCFSource::Nanomonsv => "Nanomonsv",
  1341. };
  1342. write!(f, "{}", s)
  1343. }
  1344. }
  1345. impl Variant {
  1346. pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
  1347. let callers_data = vec![CallerData {
  1348. qual: row.qual.parse::<f32>().ok(),
  1349. info: parse_info(&row.info, &source).context(anyhow!(
  1350. "Can't parse {:?} info for {}",
  1351. source,
  1352. row.info
  1353. ))?,
  1354. format: parse_format(&source, &row.value).context(anyhow!(
  1355. "Can't parse {:?} format for {}",
  1356. source,
  1357. row.value
  1358. ))?,
  1359. }];
  1360. Ok(Variant {
  1361. contig: row.chr.to_string(),
  1362. position: row.pos,
  1363. reference: row
  1364. .reference
  1365. .parse()
  1366. .context(anyhow!("Error while parsing {}", row.reference))?,
  1367. alternative: row
  1368. .alt
  1369. .parse()
  1370. .context(anyhow!("Error while parsing {}", row.alt))?,
  1371. n_ref: None,
  1372. n_alt: None,
  1373. vaf: None,
  1374. depth: None,
  1375. callers_data,
  1376. source: vec![source],
  1377. variant_type,
  1378. annotations: Vec::new(),
  1379. })
  1380. }
  1381. pub fn get_depth(&mut self) -> u32 {
  1382. if let Some(depth) = self.depth {
  1383. depth
  1384. } else {
  1385. let depth = self
  1386. .callers_data
  1387. .iter_mut()
  1388. .map(|v| v.get_depth())
  1389. .max()
  1390. .unwrap();
  1391. self.depth = Some(depth);
  1392. depth
  1393. }
  1394. }
  1395. pub fn get_n_alt(&mut self) -> u32 {
  1396. if let Some(n_alt) = self.n_alt {
  1397. n_alt
  1398. } else {
  1399. let n_alt = self
  1400. .callers_data
  1401. .iter_mut()
  1402. .map(|v| v.get_n_alt())
  1403. .max()
  1404. .unwrap();
  1405. self.n_alt = Some(n_alt);
  1406. n_alt
  1407. }
  1408. }
  1409. pub fn vaf(&mut self) -> f32 {
  1410. let n_alt = self.get_n_alt() as f32;
  1411. let depth = self.get_depth() as f32;
  1412. self.vaf = Some(n_alt / depth);
  1413. self.vaf.unwrap()
  1414. }
  1415. pub fn is_ins(&self) -> bool {
  1416. matches!(
  1417. (&self.reference, &self.alternative),
  1418. (
  1419. ReferenceAlternative::Nucleotide(_),
  1420. ReferenceAlternative::Nucleotides(_)
  1421. )
  1422. )
  1423. }
  1424. pub fn alteration_category(&self) -> AlterationCategory {
  1425. match (&self.reference, &self.alternative) {
  1426. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
  1427. AlterationCategory::Snv
  1428. }
  1429. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
  1430. AlterationCategory::Ins
  1431. }
  1432. (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
  1433. AlterationCategory::Other
  1434. }
  1435. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
  1436. AlterationCategory::Del
  1437. }
  1438. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  1439. if a.len() < b.len() =>
  1440. {
  1441. AlterationCategory::Ins
  1442. }
  1443. (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
  1444. if a.len() > b.len() =>
  1445. {
  1446. AlterationCategory::Del
  1447. }
  1448. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotides(_)) => {
  1449. AlterationCategory::Rep
  1450. }
  1451. (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
  1452. AlterationCategory::Other
  1453. }
  1454. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
  1455. AlterationCategory::Other
  1456. }
  1457. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
  1458. AlterationCategory::Other
  1459. }
  1460. (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
  1461. AlterationCategory::Other
  1462. }
  1463. }
  1464. }
  1465. pub fn to_min_string(&mut self) -> String {
  1466. let depth = self.get_depth();
  1467. let n_alt = self.get_n_alt();
  1468. format!(
  1469. "DP:AD\t{}:{}",
  1470. depth,
  1471. [(depth - n_alt).to_string(), n_alt.to_string()].join(",")
  1472. )
  1473. }
  1474. pub fn get_veps(&self) -> Vec<VEP> {
  1475. self.annotations
  1476. .iter()
  1477. .flat_map(|e| {
  1478. if let AnnotationType::VEP(e) = e {
  1479. e.clone()
  1480. } else {
  1481. vec![]
  1482. }
  1483. })
  1484. .collect()
  1485. }
  1486. pub fn get_best_vep(&self) -> Result<VEP> {
  1487. get_best_vep(&self.get_veps())
  1488. }
  1489. pub fn is_from_category(&self, and_categories: &[Category]) -> bool {
  1490. let mut vec_bools = Vec::new();
  1491. for category in and_categories.iter() {
  1492. match category {
  1493. Category::VariantCategory(vc) => {
  1494. for annotations in self.annotations.iter() {
  1495. if let AnnotationType::VariantCategory(vvc) = annotations {
  1496. if vc == vvc {
  1497. vec_bools.push(true);
  1498. break;
  1499. }
  1500. }
  1501. }
  1502. }
  1503. Category::PositionRange { contig, from, to } => {
  1504. if self.contig == *contig {
  1505. match (from, to) {
  1506. (None, None) => vec_bools.push(true),
  1507. (None, Some(to)) => vec_bools.push(self.position <= *to),
  1508. (Some(from), None) => vec_bools.push(self.position >= *from),
  1509. (Some(from), Some(to)) => {
  1510. vec_bools.push(self.position >= *from && self.position <= *to)
  1511. }
  1512. }
  1513. } else {
  1514. vec_bools.push(false);
  1515. }
  1516. }
  1517. Category::VCFSource(_) => (),
  1518. Category::NCosmic(n) => {
  1519. let mut bools = Vec::new();
  1520. for annotations in self.annotations.iter() {
  1521. if let AnnotationType::Cosmic(c) = annotations {
  1522. bools.push(c.cosmic_cnt >= *n);
  1523. break;
  1524. }
  1525. }
  1526. vec_bools.push(bools.iter().any(|&b| b));
  1527. }
  1528. Category::NCBIFeature(ncbi_feature) => {
  1529. let mut bools = Vec::new();
  1530. for annotations in self.annotations.iter() {
  1531. if let AnnotationType::NCBIGFF(v) = annotations {
  1532. bools.push(v.feature == *ncbi_feature);
  1533. }
  1534. }
  1535. vec_bools.push(bools.iter().any(|&b| b));
  1536. }
  1537. Category::VAF { min, max } => {
  1538. let v = if self.vaf.is_none() {
  1539. let mut s = self.clone();
  1540. s.vaf()
  1541. } else {
  1542. self.vaf.unwrap()
  1543. };
  1544. vec_bools.push(v >= *min && v <= *max);
  1545. }
  1546. Category::Pangolin => {
  1547. vec_bools.push(
  1548. self.annotations
  1549. .iter()
  1550. .filter(|a| matches!(a, AnnotationType::Pangolin(_)))
  1551. .count()
  1552. > 0,
  1553. );
  1554. }
  1555. }
  1556. }
  1557. vec_bools.iter().all(|&x| x)
  1558. }
  1559. pub fn callers(&self) -> Vec<String> {
  1560. self.source
  1561. .iter()
  1562. .map(|source| source.to_string())
  1563. .collect()
  1564. }
  1565. }
  1566. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
  1567. pub enum AlterationCategory {
  1568. Snv,
  1569. Ins,
  1570. Del,
  1571. Rep,
  1572. Other,
  1573. }
  1574. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1575. pub enum AnnotationType {
  1576. VariantCategory(VariantCategory),
  1577. VEP(Vec<VEP>),
  1578. Cluster(i32),
  1579. Cosmic(Cosmic),
  1580. GnomAD(GnomAD),
  1581. NCBIGFF(NCBIGFF),
  1582. Pangolin(Pangolin),
  1583. Phase(PhaseAnnotation),
  1584. }
  1585. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
  1586. pub enum VariantCategory {
  1587. Somatic,
  1588. LowMRDDepth,
  1589. LOH,
  1590. Constit,
  1591. LowDiversity,
  1592. }
  1593. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
  1594. pub enum ReferenceAlternative {
  1595. Nucleotide(Base),
  1596. Nucleotides(Vec<Base>),
  1597. Unstructured(String),
  1598. }
  1599. impl FromStr for ReferenceAlternative {
  1600. type Err = anyhow::Error;
  1601. fn from_str(s: &str) -> Result<Self> {
  1602. let possible_bases = s.as_bytes().iter();
  1603. let mut res: Vec<Base> = Vec::new();
  1604. for &base in possible_bases {
  1605. match base.try_into() {
  1606. std::result::Result::Ok(b) => res.push(b),
  1607. Err(_) => {
  1608. return Ok(Self::Unstructured(s.to_string()));
  1609. }
  1610. }
  1611. }
  1612. if res.len() == 1 {
  1613. Ok(Self::Nucleotide(res.pop().unwrap()))
  1614. } else {
  1615. Ok(Self::Nucleotides(res))
  1616. }
  1617. }
  1618. }
  1619. impl fmt::Display for ReferenceAlternative {
  1620. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1621. let string = match self {
  1622. ReferenceAlternative::Nucleotide(b) => b.to_string(),
  1623. ReferenceAlternative::Nucleotides(bases) => bases
  1624. .iter()
  1625. .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
  1626. ReferenceAlternative::Unstructured(s) => s.to_string(),
  1627. };
  1628. write!(f, "{}", string)
  1629. }
  1630. }
  1631. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
  1632. pub enum Base {
  1633. A,
  1634. T,
  1635. C,
  1636. G,
  1637. N,
  1638. }
  1639. impl TryFrom<u8> for Base {
  1640. type Error = anyhow::Error;
  1641. fn try_from(base: u8) -> Result<Self> {
  1642. match base {
  1643. b'A' => Ok(Base::A),
  1644. b'T' => Ok(Base::T),
  1645. b'C' => Ok(Base::C),
  1646. b'G' => Ok(Base::G),
  1647. b'N' => Ok(Base::N),
  1648. _ => Err(anyhow!(
  1649. "Unknown base: {}",
  1650. String::from_utf8_lossy(&[base])
  1651. )),
  1652. }
  1653. }
  1654. }
  1655. impl Base {
  1656. pub fn into_u8(self) -> u8 {
  1657. match self {
  1658. Base::A => b'A',
  1659. Base::T => b'T',
  1660. Base::C => b'C',
  1661. Base::G => b'G',
  1662. Base::N => b'N',
  1663. }
  1664. }
  1665. }
  1666. impl fmt::Display for Base {
  1667. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  1668. // Use `self.number` to refer to each positional data point.
  1669. let str = match self {
  1670. Base::A => "A",
  1671. Base::T => "T",
  1672. Base::C => "C",
  1673. Base::G => "G",
  1674. Base::N => "N",
  1675. };
  1676. write!(f, "{}", str)
  1677. }
  1678. }
  1679. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
  1680. pub enum Format {
  1681. DeepVariant(DeepVariantFormat),
  1682. ClairS(ClairSFormat),
  1683. Sniffles(SnifflesFormat),
  1684. Nanomonsv(NanomonsvFormat),
  1685. }
  1686. #[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
  1687. pub enum Info {
  1688. #[schema(value_type=String)]
  1689. DeepVariant(DeepVariantInfo),
  1690. #[schema(value_type=String)]
  1691. ClairS(ClairSInfo),
  1692. #[schema(value_type=String)]
  1693. Sniffles(SnifflesInfo),
  1694. #[schema(value_type=String)]
  1695. Nanomonsv(NanomonsvInfo),
  1696. }
  1697. fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
  1698. match source {
  1699. VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
  1700. VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
  1701. VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
  1702. VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
  1703. }
  1704. }
  1705. fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
  1706. let res = match vcf_source {
  1707. VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
  1708. VCFSource::ClairS => Format::ClairS(data.parse()?),
  1709. VCFSource::Sniffles => Format::Sniffles(data.parse()?),
  1710. VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
  1711. };
  1712. Ok(res)
  1713. }
  1714. pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
  1715. info!("Sorting {} entries", d.len());
  1716. let dict = read_dict(dict_path)?;
  1717. let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
  1718. // add to store
  1719. d.iter().for_each(|e| {
  1720. if let Some(vec) = store.get_mut(&e.contig) {
  1721. vec.push(e.clone());
  1722. } else {
  1723. store.insert(e.contig.to_string(), vec![e.clone()]);
  1724. }
  1725. });
  1726. // sort in each contig
  1727. store
  1728. .iter_mut()
  1729. .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
  1730. // return contig in the order of dict file
  1731. Ok(dict
  1732. .iter()
  1733. .flat_map(|(chr, _)| {
  1734. if let Some((_, vec)) = store.remove_entry(chr) {
  1735. vec
  1736. } else {
  1737. vec![]
  1738. }
  1739. })
  1740. .collect())
  1741. }
  1742. #[derive(Debug)]
  1743. pub enum Category {
  1744. VariantCategory(VariantCategory),
  1745. PositionRange {
  1746. contig: String,
  1747. from: Option<u32>,
  1748. to: Option<u32>,
  1749. },
  1750. VCFSource(VCFSource),
  1751. NCosmic(u64),
  1752. NCBIFeature(String),
  1753. VAF {
  1754. min: f32,
  1755. max: f32,
  1756. },
  1757. Pangolin,
  1758. }
  1759. pub fn run_pipe(id: &str, force: bool, multi: &MultiProgress) -> Result<()> {
  1760. let cfg = config::Config::get()?;
  1761. let deepvariant_diag_vcf = format!(
  1762. "{}/{id}/diag/DeepVariant/{id}_diag_DeepVariant_PASSED.vcf.gz",
  1763. cfg.longreads_results_dir
  1764. );
  1765. if !std::path::Path::new(&deepvariant_diag_vcf).exists() {
  1766. return Err(anyhow!("{deepvariant_diag_vcf} is required"));
  1767. // panic!("{deepvariant_diag_vcf} is required")
  1768. }
  1769. let deepvariant_mrd_vcf = format!(
  1770. "{}/{id}/mrd/DeepVariant/{id}_mrd_DeepVariant_PASSED.vcf.gz",
  1771. cfg.longreads_results_dir
  1772. );
  1773. if !std::path::Path::new(&deepvariant_mrd_vcf).exists() {
  1774. return Err(anyhow!("{deepvariant_mrd_vcf} is required"));
  1775. }
  1776. let mrd_bam = format!("{}/{id}/mrd/{id}_mrd_hs1.bam", cfg.longreads_results_dir);
  1777. if !std::path::Path::new(&mrd_bam).exists() {
  1778. return Err(anyhow!("{mrd_bam} is required"));
  1779. }
  1780. let clairs_vcf = format!(
  1781. "{}/{id}/diag/ClairS/{id}_diag_clairs_PASSED.vcf.gz",
  1782. cfg.longreads_results_dir
  1783. );
  1784. if !std::path::Path::new(&clairs_vcf).exists() {
  1785. return Err(anyhow!("{clairs_vcf} is required"));
  1786. }
  1787. let clairs_indels_vcf = format!(
  1788. "{}/{id}/diag/ClairS/{id}_diag_clairs_indel_PASSED.vcf.gz",
  1789. cfg.longreads_results_dir
  1790. );
  1791. if !std::path::Path::new(&clairs_indels_vcf).exists() {
  1792. return Err(anyhow!("{clairs_indels_vcf} is required"));
  1793. }
  1794. // let sniffles_vcf = format!(
  1795. // "{}/{name}/diag/Sniffles/{name}_diag_sniffles.vcf",
  1796. // cfg.longreads_results_dir
  1797. // );
  1798. // let sniffles_mrd_vcf = format!(
  1799. // "{}/{name}/mrd/Sniffles/{name}_mrd_sniffles.vcf",
  1800. // cfg.longreads_results_dir
  1801. // );
  1802. // if !std::path::Path::new(&sniffles_vcf).exists() {
  1803. // return Err(anyhow!("{sniffles_vcf} is required"));
  1804. // }
  1805. let nanomonsv_vcf = format!(
  1806. "{}/{id}/diag/nanomonsv/{id}_diag_nanomonsv_PASSED.vcf.gz",
  1807. cfg.longreads_results_dir
  1808. );
  1809. if !std::path::Path::new(&nanomonsv_vcf).exists() {
  1810. return Err(anyhow!("{nanomonsv_vcf} is required"));
  1811. }
  1812. // let db_path = "/data/db_results.sqlite".to_string();
  1813. // `${data_dir}/${name}/diag/${name}_variants.sqlite`
  1814. let db_path = format!(
  1815. "{}/{id}/diag/{id}_variants.sqlite",
  1816. cfg.longreads_results_dir
  1817. );
  1818. let bytes_path = format!(
  1819. "{}/{id}/diag/{id}_variants.bytes.gz",
  1820. cfg.longreads_results_dir
  1821. );
  1822. let loh_path = format!("{}/{id}/diag/{id}_loh.vcf.gz", cfg.longreads_results_dir);
  1823. // let db_constit_path = format!(
  1824. // "{}/{name}/diag/{name}_constit.sqlite",
  1825. // cfg.longreads_results_dir
  1826. // );
  1827. let bytes_constit_path = format!(
  1828. "{}/{id}/diag/{id}_constit.bytes.gz",
  1829. cfg.longreads_results_dir
  1830. );
  1831. let report_data_dir = format!("{}/{id}/diag/report/data", cfg.longreads_results_dir);
  1832. if !std::path::Path::new(&report_data_dir).exists() {
  1833. fs::create_dir_all(&report_data_dir)?;
  1834. }
  1835. let stats_path = format!("{}/{id}_variants_stats.json", report_data_dir);
  1836. let af_init_path = format!("{}/{id}_variants_af_init.csv", report_data_dir);
  1837. let af_final_path = format!("{}/{id}_variants_af_final.csv", report_data_dir);
  1838. let graphs_prefix = format!("{}/{id}_barcharts", report_data_dir);
  1839. let annot_json = format!("{report_data_dir}/{id}_annot_variants.json");
  1840. let sources = vec![
  1841. (
  1842. deepvariant_diag_vcf.as_str(),
  1843. &VCFSource::DeepVariant,
  1844. &VariantType::Somatic,
  1845. ),
  1846. (
  1847. deepvariant_mrd_vcf.as_str(),
  1848. &VCFSource::DeepVariant,
  1849. &VariantType::Constitutionnal,
  1850. ),
  1851. (
  1852. clairs_vcf.as_str(),
  1853. &VCFSource::ClairS,
  1854. &VariantType::Somatic,
  1855. ),
  1856. // (
  1857. // sniffles_vcf.as_str(),
  1858. // &VCFSource::Sniffles,
  1859. // &VariantType::Somatic,
  1860. // ),
  1861. // (
  1862. // sniffles_mrd_vcf.as_str(),
  1863. // &VCFSource::Sniffles,
  1864. // &VariantType::Constitutionnal,
  1865. // ),
  1866. (
  1867. nanomonsv_vcf.as_str(),
  1868. &VCFSource::Nanomonsv,
  1869. &VariantType::Somatic,
  1870. ),
  1871. ];
  1872. let mut variants = Variants::from_vcfs(id.to_string(), sources, &cfg, multi.clone())?;
  1873. write_af_data(&variants, &af_init_path).context("Can't write initial AF data")?;
  1874. variants.vcf_filters();
  1875. variants
  1876. .write_vcf_cat(&loh_path, &VariantCategory::LOH)
  1877. .context("Can't write LOH")?;
  1878. variants.bam_filters(&mrd_bam);
  1879. let constits = variants.get_cat(&VariantCategory::Constit);
  1880. let constits = Variants::from_vec(id.to_string(), multi, constits);
  1881. constits.save_bytes(&bytes_constit_path)?;
  1882. variants.keep_somatics_un();
  1883. info!("Variants retained: {}", variants.len());
  1884. // TODO check if SNP are matching
  1885. if variants.len() > 100_000 {
  1886. return Err(anyhow!(
  1887. "Too many variants, verify if somatic and tumoral samples match."
  1888. ));
  1889. }
  1890. variants.merge();
  1891. variants.sort()?;
  1892. info!("Variants retained: {}", variants.len());
  1893. variants.vep();
  1894. // variants.pangolin()?;
  1895. variants.annotate_gff_feature(&cfg.gff_path)?;
  1896. variants.echtvar_annotate(&deepvariant_mrd_vcf)?;
  1897. variants.filter_snp()?;
  1898. variants.save_bytes(&bytes_path)?;
  1899. let all_stats = variants
  1900. .stats_json(&stats_path)
  1901. .context("Can't write stats")?;
  1902. all_stats.generate_graph(&graphs_prefix)?;
  1903. write_af_data(&variants, &af_final_path).context("Can't write final AF data")?;
  1904. if std::path::Path::new(&db_path).exists() {
  1905. if force {
  1906. write_annotaded(&db_path, &annot_json)?;
  1907. fs::remove_file(&db_path)?;
  1908. variants.save_sql(&db_path)?;
  1909. }
  1910. } else {
  1911. variants.save_sql(&db_path)?;
  1912. }
  1913. if std::path::Path::new(&annot_json).exists() {
  1914. update_annotations(&db_path, &annot_json)?;
  1915. }
  1916. info!("Variants : {}", variants.len());
  1917. Ok(())
  1918. }
  1919. // pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
  1920. // let mut cluster_id = 0;
  1921. // let first = d.get(0).unwrap();
  1922. // let mut last_pos = first.position;
  1923. // let mut last_contig = first.contig.to_string();
  1924. //
  1925. // d.iter_mut().for_each(|e| {
  1926. // if e.contig != last_contig {
  1927. // cluster_id += 1;
  1928. // last_contig = e.contig.to_string();
  1929. // } else if e.position - last_pos > max_dist {
  1930. // cluster_id += 1;
  1931. // }
  1932. // e.annotations.push(AnnotationType::Cluster(cluster_id));
  1933. // last_pos = e.position;
  1934. // });
  1935. //
  1936. // cluster_id
  1937. // }
  1938. fn write_af_data(variants: &Variants, path: &str) -> anyhow::Result<()> {
  1939. info!("Writing AF data into {path}");
  1940. let af_data: DashMap<u32, DashMap<u32, u32>> = DashMap::new();
  1941. variants.data.par_iter().for_each(|v| {
  1942. af_data
  1943. .entry(v.depth.unwrap())
  1944. .or_default()
  1945. .entry(v.n_alt.unwrap())
  1946. .and_modify(|count| *count += 1)
  1947. .or_insert(1);
  1948. });
  1949. write_dashmap_to_tsv(&af_data, path)?;
  1950. Ok(())
  1951. }
  1952. fn write_dashmap_to_tsv(
  1953. af_data: &DashMap<u32, DashMap<u32, u32>>,
  1954. filename: &str,
  1955. ) -> anyhow::Result<()> {
  1956. let mut writer = csv::Writer::from_path(filename)?;
  1957. // Find the maximum second key to determine the number of columns
  1958. let max_second_key = af_data
  1959. .iter()
  1960. .flat_map(|entry| {
  1961. entry
  1962. .value()
  1963. .iter()
  1964. .map(|inner_entry| *inner_entry.key())
  1965. .collect::<Vec<_>>()
  1966. })
  1967. .max()
  1968. .unwrap_or(0);
  1969. // Write header
  1970. let mut header = vec!["depth".to_string()];
  1971. header.extend((0..=max_second_key).map(|i| format!("n_alt_{}", i)));
  1972. writer.write_record(&header)?;
  1973. // Write data rows
  1974. for entry in af_data.iter() {
  1975. let depth = *entry.key();
  1976. let inner_map = entry.value();
  1977. let mut row = vec![depth.to_string()];
  1978. for i in 0..=max_second_key {
  1979. let count = inner_map.get(&i).map(|v| *v).unwrap_or(0).to_string();
  1980. row.push(count);
  1981. }
  1982. writer.write_record(&row)?;
  1983. }
  1984. writer.flush()?;
  1985. Ok(())
  1986. }