| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295 |
- use crate::{
- annotations::{
- cosmic::Cosmic,
- echtvar::{parse_echtvar_val, run_echtvar},
- gnomad::GnomAD,
- ncbi_gff::NCBIGFF,
- vep::{get_best_vep, vep_chunk, VEP},
- },
- callers::{
- clairs::{ClairSFormat, ClairSInfo},
- deepvariant::{DeepVariantFormat, DeepVariantInfo},
- nanomonsv::{NanomonsvFormat, NanomonsvInfo},
- sniffles::{SnifflesFormat, SnifflesInfo},
- },
- config::Config,
- in_out::{
- dict_reader::read_dict,
- get_reader,
- vcf_reader::{read_vcf, read_vcf_progress, VCFRow},
- vcf_writer::{vcf_header_from, VariantWritter},
- },
- sql::{stats_sql::insert_stats, variants_sql::insert_variants},
- utils::{
- chi_square_test_for_proportions, estimate_shannon_entropy, get_hts_nt_pileup, new_pg,
- new_pg_speed, print_stat_cat,
- },
- };
- use anyhow::{anyhow, Context, Error, Ok, Result};
- use csv::ReaderBuilder;
- use dashmap::DashMap;
- use hashbrown::HashMap;
- use indicatif::{MultiProgress, ParallelProgressIterator};
- use log::{info, warn};
- use noodles_core::{region::Region, Position};
- use noodles_fasta::indexed_reader::Builder as FastaBuilder;
- use noodles_gff as gff;
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- use std::io::Write;
- use std::{
- env::temp_dir,
- fmt,
- fs::File,
- str::FromStr,
- sync::{
- atomic::{AtomicI32, Ordering},
- Arc,
- },
- };
- // chr12:25116542|G>T KRAS
- #[derive(Debug, Clone)]
- pub struct Variants {
- pub name: String,
- pub data: Vec<Variant>,
- constit: DashMap<String, Variant>,
- stats_vcf: StatsVCF,
- stats_bam: StatsBAM,
- cfg: Config,
- mp: MultiProgress,
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
- pub struct StatsVCF {
- n_tumoral_init: usize,
- n_constit_init: usize,
- n_constit: i32,
- n_loh: i32,
- n_low_mrd_depth: i32,
- }
- impl fmt::Display for StatsVCF {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let k = 100.0 / self.n_tumoral_init as f64;
- let string = format!(
- "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
- self.n_constit, self.n_constit as f64 * k,
- self.n_loh, self.n_loh as f64 * k,
- self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
- );
- write!(f, "{}", string)
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
- pub struct StatsBAM {
- n_lasting: i32,
- n_constit: i32,
- n_low_mrd_depth: i32,
- n_low_diversity: i32,
- n_somatic: i32,
- }
- impl fmt::Display for StatsBAM {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let k = 100.0 / self.n_lasting as f64;
- let string = format!(
- "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
- self.n_constit, self.n_constit as f64 * k,
- self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
- self.n_low_diversity, self.n_low_diversity as f64 * k,
- self.n_somatic, self.n_somatic as f64 * k
- );
- write!(f, "{}", string)
- }
- }
- impl Variants {
- pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
- Self {
- name,
- data,
- constit: DashMap::new(),
- stats_vcf: StatsVCF::default(),
- stats_bam: StatsBAM::default(),
- cfg: Config::get().unwrap(),
- mp: mp.clone(),
- }
- }
- pub fn from_vcfs(
- name: String,
- v: Vec<(&str, &VCFSource, &VariantType)>,
- cfg: &Config,
- mp: MultiProgress,
- ) -> Result<Self> {
- let pg = mp.add(new_pg(v.len() as u64));
- pg.set_message("Reading VCF");
- let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
- let n_constit = AtomicI32::new(0);
- let data: Vec<Variant> = v
- .par_iter()
- // .progress_count(v.len() as u64)
- .flat_map(|(path, source, variant_type)| {
- let r = match variant_type {
- VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
- VariantType::Constitutionnal => {
- read_vcf(path, source, variant_type)
- .unwrap()
- .par_iter()
- .for_each(|e| {
- n_constit.fetch_add(1, Ordering::SeqCst);
- constit.insert(
- format!(
- "{}:{}|{}>{}",
- e.contig, e.position, e.reference, e.alternative
- ),
- e.clone(),
- );
- });
- vec![]
- }
- };
- pg.inc(1);
- r
- })
- .collect();
- let stats_vcf = StatsVCF::default();
- let stats_bam = StatsBAM::default();
- let constit = Arc::try_unwrap(constit).unwrap();
- let elapsed = pg.elapsed();
- pg.finish();
- info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
- let cfg = cfg.clone();
- return Ok(Self {
- name,
- data,
- constit,
- stats_vcf,
- stats_bam,
- cfg,
- mp: mp.clone(),
- });
- }
- pub fn vcf_filters(&mut self) {
- let cfg = &self.cfg;
- let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
- pg.set_message("VCF filtering");
- let n_tumoral_init = self.len();
- let n_constit_init = self.constit_len();
- let min_loh_diff = cfg.deepvariant_loh_pval as f64;
- let min_mrd_depth = cfg.min_mrd_depth;
- info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
- let n_constit = AtomicI32::new(0);
- let n_loh = AtomicI32::new(0);
- let n_low_mrd_depth = AtomicI32::new(0);
- self.data = self
- .data
- .par_iter()
- .map(|e| {
- let mut tumoral = e.clone();
- let k = format!(
- "{}:{}|{}>{}",
- tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
- );
- if let Some(mut constit) = self.constit.get_mut(&k) {
- if constit.get_depth() < min_mrd_depth {
- n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::LowMRDDepth,
- ));
- } else if constit.get_n_alt() == constit.get_depth()
- && tumoral.get_n_alt() == tumoral.get_depth()
- {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral
- .annotations
- .push(AnnotationType::VariantCategory(VariantCategory::Constit));
- } else {
- let pval = chi_square_test_for_proportions(
- tumoral.get_n_alt() as f64,
- tumoral.get_depth() as f64,
- constit.get_n_alt() as f64,
- constit.get_depth() as f64,
- )
- .unwrap();
- if pval != 0.0 && pval <= min_loh_diff {
- n_loh.fetch_add(1, Ordering::SeqCst);
- tumoral
- .annotations
- .push(AnnotationType::VariantCategory(VariantCategory::LOH));
- } else {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral
- .annotations
- .push(AnnotationType::VariantCategory(VariantCategory::Constit));
- }
- }
- // If not un Constit registry, ClairS look for VCF constit depth and n_alt
- } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
- {
- if format.ndp < min_mrd_depth {
- n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::LowMRDDepth,
- ));
- } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
- {
- let mrd_n_alt = match alt_base {
- Base::A => format.nau,
- Base::T => format.ntu,
- Base::C => format.ncu,
- Base::G => format.ngu,
- _ => 0,
- };
- if mrd_n_alt != 0 {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral
- .annotations
- .push(AnnotationType::VariantCategory(VariantCategory::Constit));
- }
- }
- }
- pg.inc(1);
- tumoral
- })
- .collect();
- let n_constit = n_constit.load(Ordering::SeqCst);
- let n_loh = n_loh.load(Ordering::SeqCst);
- let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
- self.stats_vcf = StatsVCF {
- n_tumoral_init,
- n_constit_init,
- n_constit,
- n_loh,
- n_low_mrd_depth,
- };
- // let elapsed = start.elapsed();
- let elapsed = pg.elapsed();
- pg.finish();
- info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
- }
- /// Filter variants by reading informations from constist BAM.
- pub fn bam_filters(&mut self, mrd_bam: &str) {
- let cfg = &self.cfg;
- // let start = Instant::now();
- let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
- pg.set_message("BAM filtering");
- let min_mrd_depth = cfg.min_mrd_depth;
- info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
- let n_already = AtomicI32::new(0);
- let n_constit = AtomicI32::new(0);
- let n_low_mrd_depth = AtomicI32::new(0);
- let n_low_diversity = AtomicI32::new(0);
- let n_somatic = AtomicI32::new(0);
- self.data.par_chunks_mut(10_000).for_each(|chunk| {
- let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
- .context(anyhow!("Reading {}", mrd_bam))
- .unwrap();
- let mut genome_reader = FastaBuilder::default()
- .build_from_path(&cfg.reference_fa)
- .unwrap();
- for tumoral in chunk.iter_mut() {
- pg.inc(1);
- if tumoral.annotations.len() > 0 {
- n_already.fetch_add(1, Ordering::SeqCst);
- continue;
- }
- let (pos, is_ins) = match tumoral.alt_cat() {
- AlterationCategory::INS => (tumoral.position, true),
- AlterationCategory::DEL => (tumoral.position, false),
- _ => (tumoral.position, false),
- };
- match get_hts_nt_pileup(
- &mut bam,
- &tumoral.contig,
- pos as i32,
- is_ins, // tumoral.position as i32,
- ) {
- std::result::Result::Ok(bases) => {
- let depth = bases.len() as u32;
- if depth < min_mrd_depth {
- n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::LowMRDDepth,
- ));
- } else {
- // Check local diversity
- let start =
- Position::try_from((tumoral.position - 20) as usize).unwrap();
- let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
- let r = Region::new(tumoral.contig.to_string(), start..=end);
- if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
- let s = reg.sequence();
- let u = s.as_ref();
- let s = String::from_utf8(u.to_vec()).unwrap();
- let ent = estimate_shannon_entropy(&s.to_lowercase());
- if ent < cfg.min_diversity {
- if tumoral.position == 148725437 {
- warn!("POS {}", ent);
- }
- n_low_diversity.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::LowDiversity,
- ));
- continue;
- }
- }
- // Check if the base is in constitutionnal pileup
- if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
- let alt_b = alt_b.clone().into_u8();
- let n_alt_mrd = bases
- .clone()
- .into_iter()
- .filter(|e| *e == alt_b)
- .collect::<Vec<_>>()
- .len();
- if n_alt_mrd > 0 {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Constit,
- ));
- } else {
- n_somatic.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Somatic,
- ));
- }
- } else if tumoral.is_ins() {
- let n_alt_mrd =
- bases.clone().into_iter().filter(|e| *e == b'I').count();
- if n_alt_mrd > 0 {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Constit,
- ));
- } else {
- n_somatic.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Somatic,
- ));
- }
- } else if tumoral.alt_cat() == AlterationCategory::DEL {
- let n_alt_mrd =
- bases.clone().into_iter().filter(|e| *e == b'D').count();
- if n_alt_mrd > 0 {
- n_constit.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Constit,
- ));
- } else {
- n_somatic.fetch_add(1, Ordering::SeqCst);
- tumoral.annotations.push(AnnotationType::VariantCategory(
- VariantCategory::Somatic,
- ));
- }
- }
- }
- }
- Err(r) => panic!("{}", r),
- }
- }
- });
- let n_constit = n_constit.load(Ordering::SeqCst);
- let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
- let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
- let n_somatic = n_somatic.load(Ordering::SeqCst);
- let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
- self.stats_bam = StatsBAM {
- n_lasting,
- n_constit,
- n_low_mrd_depth,
- n_low_diversity,
- n_somatic,
- };
- let elapsed = pg.elapsed();
- pg.finish();
- info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
- }
- pub fn get_cat(&mut self, cat: VariantCategory) -> Vec<Variant> {
- let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
- pg.set_message(format!("Get cat {:?}", cat));
- self.data
- .par_iter()
- .progress_with(pg)
- .flat_map(|e| {
- if e.annotations
- .iter()
- .filter(|e| match e {
- AnnotationType::VariantCategory(vc) => *vc == cat,
- _ => false,
- })
- .count()
- > 0
- {
- vec![e.clone()]
- } else {
- vec![]
- }
- })
- .collect::<Vec<Variant>>()
- }
- pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
- info!("Writing VCF {}", path);
- let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
- let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
- pg.set_message("Writing VCF");
- let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
- for row in to_write.iter_mut() {
- w.write_variant(row)?;
- pg.inc(1);
- }
- w.write_index_finish()?;
- Ok(())
- }
- pub fn keep_somatics_un(&mut self) {
- let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
- pg.set_message("Filtering Variants");
- self.data = self
- .data
- .par_iter_mut()
- .progress_with(pg)
- .flat_map(|e| {
- // keep unannotated and somatic
- if e.annotations
- .iter()
- .filter(|a| match a {
- AnnotationType::VariantCategory(vc) => match vc {
- VariantCategory::Somatic => false,
- _ => true,
- },
- _ => false,
- })
- .count()
- == 0
- {
- vec![e]
- } else {
- vec![]
- }
- })
- .map(|e| e.clone())
- .collect();
- }
- pub fn vep(&mut self) {
- // let steps = num_integer::div_ceil(self.data.len(), self.cfg.vep_chunk_size);
- let pg = self.mp.add(new_pg_speed(self.len() as u64));
- pg.set_message("VEP");
- self.data
- .par_chunks_mut(self.cfg.vep_chunk_size)
- .progress_with(pg)
- .for_each(|chunks| vep_chunk(chunks).unwrap());
- }
- pub fn sort(&mut self) -> Result<()> {
- let cfg = &self.cfg;
- self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
- Ok(())
- }
- pub fn merge(&mut self) {
- let pg = self.mp.add(new_pg_speed(self.len() as u64));
- pg.set_message("Merging Variants by contig, positions, ref, alt");
- let hm: DashMap<String, Variant> = DashMap::new();
- self.data.par_iter().progress_with(pg).for_each(|e| {
- let k = format!(
- "{}:{}|{}>{}",
- e.contig, e.position, e.reference, e.alternative
- );
- if let Some(mut v) = hm.get_mut(&k) {
- let v = v.value_mut();
- e.callers_data.iter().for_each(|cd| {
- v.callers_data.push(cd.clone());
- v.callers_data.dedup();
- });
- v.source.extend(e.source.clone());
- v.source.dedup();
- } else {
- hm.insert(k, e.clone());
- }
- });
- self.data = hm.iter().map(|e| e.value().clone()).collect();
- }
- pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
- let gff_path = gff_path.to_string();
- let len = self.data.len();
- let pg = self.mp.add(new_pg_speed(self.len() as u64));
- pg.set_message("GFF Annotate");
- self.data
- .par_chunks_mut(len / 33)
- .progress_with(pg)
- .for_each(|chunk| {
- let mut reader = File::open(gff_path.to_string())
- .map(noodles_bgzf::Reader::new)
- .map(gff::Reader::new)
- .unwrap();
- let index = noodles_csi::read(format!("{}.csi", gff_path)).unwrap();
- for v in chunk.iter_mut() {
- let start = Position::try_from(v.position as usize).unwrap();
- let r = Region::new(v.contig.to_string(), start..=start);
- if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
- for row in rows {
- let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
- v.annotations.push(AnnotationType::NCBIGFF(ncbi));
- }
- }
- }
- });
- Ok(())
- }
- pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
- let len = self.len();
- let header = vcf_header_from(header_path)?;
- let pg = self.mp.add(new_pg_speed(len as u64));
- pg.set_message("Echtvar Annotate");
- self.data
- .par_chunks_mut(len / 33)
- .progress_with(pg)
- .for_each(|chunk| {
- let in_tmp = format!(
- "{}/echtvar_in_{}.vcf",
- temp_dir().to_str().unwrap(),
- uuid::Uuid::new_v4()
- );
- let out_tmp = format!(
- "{}/echtvar_in_{}.vcf.gz",
- temp_dir().to_str().unwrap(),
- uuid::Uuid::new_v4()
- );
- let mut vcf = File::create(&in_tmp).unwrap();
- let _ = writeln!(vcf, "{}", header);
- for (i, row) in chunk.iter().enumerate() {
- let _ = writeln!(
- vcf,
- "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
- row.contig,
- row.position,
- i + 1,
- row.reference,
- row.alternative,
- ".",
- ".",
- "."
- );
- }
- run_echtvar(&in_tmp, &out_tmp).unwrap();
- let mut reader = ReaderBuilder::new()
- .delimiter(b'\t')
- .has_headers(false)
- .comment(Some(b'#'))
- .flexible(true)
- .from_reader(get_reader(&out_tmp).unwrap());
- // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
- let mut last: usize = 1;
- for line in reader.deserialize::<VCFRow>() {
- if let std::result::Result::Ok(row) = line {
- let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
- let id: usize = row.id.parse().unwrap();
- if id != last {
- panic!("Echtvar output not in input order!");
- }
- if let Some(c) = cosmic {
- chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
- }
- if let Some(g) = gnomad {
- chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
- }
- last += 1;
- }
- }
- });
- Ok(())
- }
- pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
- self.data.par_iter().filter(|v| {
- for annotation in v.annotations.iter() {
- match annotation {
- AnnotationType::VariantCategory(cat) => {
- if cat == category {
- return true;
- }
- }
- _ => (),
- }
- }
- return false;
- }).collect::<Vec<&Variant>>()
- }
- pub fn filter_snp(&mut self) -> Result<i32> {
- let n_snp = AtomicI32::new(0);
- self.data = self
- .data
- .clone()
- .into_par_iter()
- .filter(|e| {
- let mut res = true;
- e.annotations.iter().for_each(|a| {
- match a {
- AnnotationType::GnomAD(g) => {
- res = g.gnomad_af < 0.01;
- }
- _ => (),
- };
- });
- if !res {
- n_snp.fetch_add(1, Ordering::SeqCst);
- }
- res
- })
- .collect();
- let n = n_snp.load(Ordering::SeqCst);
- Ok(n)
- }
- pub fn len(&self) -> usize {
- self.data.len()
- }
- pub fn constit_len(&self) -> usize {
- self.constit.len()
- }
- pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
- self.data
- .par_iter()
- .filter(|v| v.contig == contig && v.position == pos)
- .map(|v| v.clone())
- .collect()
- }
- pub fn stats(&self) -> Result<()> {
- let mut callers_cat = HashMap::new();
- let mut n_caller_data = 0;
- let mut ncbi_feature = HashMap::new();
- let mut n_ncbi_feature = 0;
- let mut cosmic_sup_1 = HashMap::new();
- let mut n_cosmic_sup_1 = 0;
- let mut cons_cat = HashMap::new();
- let mut n_csq = 0;
- let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
- let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
- *v += 1;
- };
- for ele in self.data.iter() {
- // Callers
- let mut callers = Vec::new();
- for cd in &ele.callers_data {
- callers.push(
- match cd.format {
- Format::DeepVariant(_) => "DeepVariant",
- Format::ClairS(_) => "ClairS",
- Format::Sniffles(_) => "Sniffles",
- Format::Nanomonsv(_) => "Nanomonsv",
- }
- .to_string(),
- );
- }
- if callers.len() > 0 {
- n_caller_data += 1;
- callers.sort();
- let k = callers.join(",");
- let (_, v) = callers_cat
- .raw_entry_mut()
- .from_key(&k)
- .or_insert(k.clone(), 1);
- *v += 1;
- }
- // Annotations
- for annot in ele.annotations.iter() {
- let mut features = Vec::new();
- let mut cosmic_m1 = false;
- match annot {
- AnnotationType::NCBIGFF(ncbi) => {
- features.push(ncbi.feature.to_string());
- }
- AnnotationType::Cosmic(c) => {
- if c.cosmic_cnt > 1 {
- cosmic_m1 = true;
- }
- }
- _ => (),
- };
- if features.len() > 0 {
- features.sort();
- add_hm(&mut ncbi_feature, &features.join(","));
- n_ncbi_feature += 1;
- }
- if cosmic_m1 {
- add_hm(&mut cosmic_sup_1, "Cosmic > 1");
- n_cosmic_sup_1 += 1;
- }
- }
- // VEP
- let d: Vec<VEP> = ele
- .annotations
- .iter()
- .flat_map(|e| {
- if let AnnotationType::VEP(e) = e {
- e.clone()
- } else {
- vec![]
- }
- })
- .collect();
- if let std::result::Result::Ok(vep) = get_best_vep(&d) {
- if let Some(csq) = vep.consequence {
- n_csq += 1;
- let csq = csq.join(",");
- let (_, v) = cons_cat
- .raw_entry_mut()
- .from_key(&csq)
- .or_insert(csq.clone(), 1);
- *v += 1;
- }
- }
- }
- print_stat_cat(&cons_cat, n_csq as u32);
- print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
- print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
- print_stat_cat(&callers_cat, n_caller_data as u32);
- // let file = File::create(path)?;
- // let mut writer = BufWriter::new(file);
- // let tow = Stats::new(
- // (n_csq, cons_cat),
- // (n_ncbi_feature, ncbi_feature),
- // (n_caller_data, callers_cat),
- // n_cosmic_sup_1,
- // n_total,
- // n_constit,
- // n_tumoral,
- // n_constit_first,
- // n_loh_first,
- // n_low_mrd_depth_first,
- // n_constit_sec,
- // n_low_diversity_sec,
- // n_low_mrd_depth_sec,
- // n_somatic_sec,
- // );
- // serde_json::to_writer(&mut writer, &tow)?;
- Ok(())
- }
- pub fn save_sql(&self, path: &str) -> Result<()> {
- insert_variants(&self, path)
- }
- pub fn stats_sql(&self, path: &str) -> Result<()> {
- insert_stats(
- "VCF".to_string(),
- serde_json::to_string(&self.stats_vcf)?,
- path,
- )?;
- insert_stats(
- "BAM".to_string(),
- serde_json::to_string(&self.stats_bam)?,
- path,
- )?;
- Ok(())
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub struct Variant {
- pub contig: String,
- pub position: u32,
- pub reference: ReferenceAlternative,
- pub alternative: ReferenceAlternative,
- pub callers_data: Vec<CallerData>,
- pub n_alt: Option<u32>,
- pub n_ref: Option<u32>,
- pub vaf: Option<f32>,
- pub depth: Option<u32>,
- pub variant_type: VariantType,
- pub source: Vec<VCFSource>,
- pub annotations: Vec<AnnotationType>,
- }
- #[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
- pub struct CallerData {
- pub qual: Option<f32>,
- pub format: Format,
- pub info: Info,
- }
- impl CallerData {
- pub fn get_vaf(&self) -> f64 {
- match &self.format {
- Format::DeepVariant(v) => v.vaf as f64,
- Format::ClairS(v) => v.af,
- Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
- Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
- }
- }
- pub fn get_depth(&mut self) -> u32 {
- match &self.format {
- Format::DeepVariant(v) => v.dp,
- Format::ClairS(v) => v.dp,
- Format::Sniffles(v) => v.dv + v.dr,
- Format::Nanomonsv(v) => v.tr,
- }
- }
- pub fn get_n_alt(&mut self) -> u32 {
- match &self.format {
- Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
- Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
- Format::Sniffles(v) => v.dv,
- Format::Nanomonsv(v) => v.tr - v.vr,
- }
- }
- /// Variants filter rules
- pub fn should_filter(&self) -> bool {
- if let Info::Sniffles(info) = &self.info {
- let imprecise = info
- .tags
- .iter()
- .filter(|s| s.to_string() == "IMPRECISE".to_string())
- .count();
- let mut n_alt = 0;
- if let Format::Sniffles(f) = &self.format {
- n_alt = f.dv;
- }
- if imprecise == 0 && n_alt >= 3 {
- return false;
- } else {
- return true;
- }
- } else {
- return false;
- }
- }
- }
- #[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize)]
- pub enum VariantType {
- Somatic,
- Constitutionnal,
- }
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub enum VCFSource {
- DeepVariant,
- ClairS,
- Sniffles,
- Nanomonsv,
- }
- impl FromStr for VCFSource {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> Result<Self> {
- match s {
- "DeepVariant" => Ok(VCFSource::DeepVariant),
- "ClairS" => Ok(VCFSource::ClairS),
- "Sniffles" => Ok(VCFSource::Sniffles),
- "Nanomonsv" => Ok(VCFSource::Nanomonsv),
- _ => Err(anyhow!("Error parsing VCFSource")),
- }
- }
- }
- impl Variant {
- pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
- let callers_data = vec![CallerData {
- qual: row.qual.parse::<f32>().ok(),
- info: parse_info(&row.info, &source).context(anyhow!(
- "Can't parse {:?} info for {}",
- source,
- row.info
- ))?,
- format: parse_format(&source, &row.value).context(anyhow!(
- "Can't parse {:?} format for {}",
- source,
- row.value
- ))?,
- }];
- Ok(Variant {
- contig: row.chr.to_string(),
- position: row.pos,
- reference: row
- .reference
- .parse()
- .context(anyhow!("Error while parsing {}", row.reference))?,
- alternative: row
- .alt
- .parse()
- .context(anyhow!("Error while parsing {}", row.alt))?,
- n_ref: None,
- n_alt: None,
- vaf: None,
- depth: None,
- callers_data,
- source: vec![source],
- variant_type,
- annotations: Vec::new(),
- })
- }
- pub fn get_depth(&mut self) -> u32 {
- if let Some(depth) = self.depth {
- return depth;
- } else {
- let depth = self
- .callers_data
- .iter_mut()
- .map(|v| v.get_depth())
- .max()
- .unwrap();
- self.depth = Some(depth);
- return depth;
- }
- }
- pub fn get_n_alt(&mut self) -> u32 {
- if let Some(n_alt) = self.n_alt {
- return n_alt;
- } else {
- let n_alt = self
- .callers_data
- .iter_mut()
- .map(|v| v.get_n_alt())
- .max()
- .unwrap();
- self.n_alt = Some(n_alt);
- return n_alt;
- }
- }
- fn is_ins(&self) -> bool {
- match (&self.reference, &self.alternative) {
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
- _ => false,
- }
- }
- fn alt_cat(&self) -> AlterationCategory {
- match (&self.reference, &self.alternative) {
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::SNV
- }
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
- AlterationCategory::INS
- }
- (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
- AlterationCategory::Other
- }
- (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::DEL
- }
- (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
- let a = a.len();
- let b = b.len();
- if a < b {
- AlterationCategory::INS
- } else if a > b {
- AlterationCategory::DEL
- } else {
- AlterationCategory::REP
- }
- }
- (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
- AlterationCategory::Other
- }
- (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
- AlterationCategory::Other
- }
- (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
- AlterationCategory::Other
- }
- (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
- AlterationCategory::Other
- }
- }
- }
- pub fn to_min_string(&mut self) -> String {
- let depth = self.get_depth();
- let n_alt = self.get_n_alt();
- format!(
- "DP:AD\t{}:{}",
- depth,
- vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
- )
- }
- pub fn get_veps(&self) -> Vec<VEP> {
- self.annotations
- .iter()
- .flat_map(|e| {
- if let AnnotationType::VEP(e) = e {
- e.clone()
- } else {
- vec![]
- }
- })
- .collect()
- }
- pub fn get_best_vep(&self) -> Result<VEP> {
- get_best_vep(&self.get_veps())
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
- enum AlterationCategory {
- SNV,
- INS,
- DEL,
- REP,
- Other,
- }
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub enum AnnotationType {
- VariantCategory(VariantCategory),
- VEP(Vec<VEP>),
- Cluster(i32),
- Cosmic(Cosmic),
- GnomAD(GnomAD),
- NCBIGFF(NCBIGFF),
- }
- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
- pub enum VariantCategory {
- Somatic,
- LowMRDDepth,
- LOH,
- Constit,
- LowDiversity,
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
- pub enum ReferenceAlternative {
- Nucleotide(Base),
- Nucleotides(Vec<Base>),
- Unstructured(String),
- }
- impl FromStr for ReferenceAlternative {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> Result<Self> {
- let mut possible_bases = s.as_bytes().iter();
- let mut res: Vec<Base> = Vec::new();
- while let Some(&base) = possible_bases.next() {
- match base.try_into() {
- std::result::Result::Ok(b) => res.push(b),
- Err(_) => {
- return Ok(Self::Unstructured(s.to_string()));
- }
- }
- }
- if res.len() == 1 {
- return Ok(Self::Nucleotide(res.pop().unwrap()));
- } else {
- return Ok(Self::Nucleotides(res));
- }
- }
- }
- impl fmt::Display for ReferenceAlternative {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let string = match self {
- ReferenceAlternative::Nucleotide(b) => b.to_string(),
- ReferenceAlternative::Nucleotides(bases) => bases
- .iter()
- .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
- ReferenceAlternative::Unstructured(s) => s.to_string(),
- };
- write!(f, "{}", string)
- }
- }
- #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
- pub enum Base {
- A,
- T,
- C,
- G,
- N,
- }
- impl TryFrom<u8> for Base {
- type Error = anyhow::Error;
- fn try_from(base: u8) -> Result<Self> {
- match base {
- b'A' => Ok(Base::A),
- b'T' => Ok(Base::T),
- b'C' => Ok(Base::C),
- b'G' => Ok(Base::G),
- b'N' => Ok(Base::N),
- _ => Err(anyhow!(
- "Unknown base: {}",
- String::from_utf8_lossy(&vec![base])
- )),
- }
- }
- }
- impl Base {
- pub fn into_u8(self) -> u8 {
- return match self {
- Base::A => b'A',
- Base::T => b'T',
- Base::C => b'C',
- Base::G => b'G',
- Base::N => b'N',
- };
- }
- }
- impl fmt::Display for Base {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- // Use `self.number` to refer to each positional data point.
- let str = match self {
- Base::A => "A",
- Base::T => "T",
- Base::C => "C",
- Base::G => "G",
- Base::N => "N",
- };
- write!(f, "{}", str)
- }
- }
- #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
- pub enum Format {
- DeepVariant(DeepVariantFormat),
- ClairS(ClairSFormat),
- Sniffles(SnifflesFormat),
- Nanomonsv(NanomonsvFormat),
- }
- #[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
- pub enum Info {
- DeepVariant(DeepVariantInfo),
- ClairS(ClairSInfo),
- Sniffles(SnifflesInfo),
- Nanomonsv(NanomonsvInfo),
- }
- fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
- match source {
- VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
- VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
- VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
- VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
- }
- }
- fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
- let res = match vcf_source {
- VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
- VCFSource::ClairS => Format::ClairS(data.parse()?),
- VCFSource::Sniffles => Format::Sniffles(data.parse()?),
- VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
- };
- Ok(res)
- }
- pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
- info!("Sorting {} entries", d.len());
- let dict = read_dict(dict_path)?;
- let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
- // add to store
- d.iter().for_each(|e| {
- if let Some(vec) = store.get_mut(&e.contig) {
- vec.push(e.clone());
- } else {
- store.insert(e.contig.to_string(), vec![e.clone()]);
- }
- });
- // sort in each contig
- store
- .iter_mut()
- .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
- // return contig in the order of dict file
- Ok(dict
- .iter()
- .flat_map(|(chr, _)| {
- if let Some((_, vec)) = store.remove_entry(chr) {
- vec
- } else {
- vec![]
- }
- })
- .collect())
- }
- // pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
- // let mut cluster_id = 0;
- // let first = d.get(0).unwrap();
- // let mut last_pos = first.position;
- // let mut last_contig = first.contig.to_string();
- //
- // d.iter_mut().for_each(|e| {
- // if e.contig != last_contig {
- // cluster_id += 1;
- // last_contig = e.contig.to_string();
- // } else if e.position - last_pos > max_dist {
- // cluster_id += 1;
- // }
- // e.annotations.push(AnnotationType::Cluster(cluster_id));
- // last_pos = e.position;
- // });
- //
- // cluster_id
- // }
|