| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- use log::info;
- use rayon::prelude::*;
- use std::{collections::HashSet, fs::File, sync::Arc};
- use crate::{
- annotation::{Annotation, Annotations},
- callers::{clairs::ClairS, deep_variant::DeepVariant},
- collection::{Initialize, InitializeSolo},
- config::Config,
- io::vcf::write_vcf,
- runners::Run,
- variant::variant::{load_variants, parallel_intersection, RunnerVariants, VcfVariant},
- };
- pub struct Somatic {
- pub id: String,
- pub config: Config,
- pub annotations: Annotations,
- }
- impl Initialize for Somatic {
- fn initialize(id: &str, config: crate::config::Config) -> anyhow::Result<Self> {
- let id = id.to_string();
- Ok(Self {
- id,
- config,
- annotations: Annotations::default(),
- })
- }
- }
- impl Run for Somatic {
- fn run(&mut self) -> anyhow::Result<()> {
- info!("Running somatic pipe for {}.", self.id);
- let id = self.id.clone();
- info!("Initialization...");
- let mut v: Vec<Box<dyn RunnerVariants + Send + Sync>> = vec![
- Box::new(ClairS::initialize(&id, self.config.clone())?),
- Box::new(DeepVariant::initialize(&id, "diag", self.config.clone())?),
- Box::new(DeepVariant::initialize(&id, "mrd", self.config.clone())?),
- ];
- let annotations = Arc::new(self.annotations.clone());
- let mut variants_collection = load_variants(&mut v, &annotations)?;
- let clairs_germline =
- ClairS::initialize(&id, self.config.clone())?.germline(&annotations)?;
- variants_collection.push(clairs_germline);
- info!("Variants sources loaded: {}", variants_collection.len());
- // Annotations Stats
- let mut annotations = Arc::unwrap_or_clone(annotations);
- annotations.callers_stat();
- // TODO: look at variants: ClairS + DeepVariant + SoloConstit + SoloDiag + Somatic (error
- // in ClairS somatic)
- // Filtering Somatic variants
- info!("Filtering somatic variants (variants not in salo callers on constit sample or germline).");
- let germline_or_somatic_keys = annotations.get_keys_filter(|anns| {
- !anns.contains(&Annotation::Germline) && !anns.contains(&Annotation::SoloConstit)
- });
- let (somatic_keys, germline_keys, remains) = parallel_intersection(
- &variants_collection
- .iter()
- .flat_map(|e| e.keys())
- .collect::<Vec<u128>>(),
- &germline_or_somatic_keys,
- );
- assert_eq!(0, remains.len());
- info!("Somatic variants positions {}.", somatic_keys.len());
- info!("Germline variants positions {}.", germline_keys.len());
- let somatic_keys: HashSet<u128> = somatic_keys.into_iter().collect();
- annotations.retain_keys(&somatic_keys);
- annotations.callers_stat();
- variants_collection.par_iter_mut().for_each(|c| {
- let before = c.variants.len();
- c.retain_keys(&somatic_keys);
- let after = c.variants.len();
- info!(
- "Variants removed from {}: {}",
- c.vcf.path.display(),
- before - after
- );
- });
- variants_collection.retain(|e| !e.variants.is_empty());
- info!("Constit Bam annotation...");
- variants_collection.iter().try_for_each(|c| {
- c.annotate_with_constit_bam(&annotations, &self.config.normal_bam(&id), 150)
- })?;
- annotations.solo_constit_boundaries(
- self.config.solo_max_alt_constit,
- self.config.solo_min_constit_depth,
- );
- info!("Entropy annotation...");
- variants_collection.iter().for_each(|c| {
- c.annotate_with_sequence_entropy(&annotations, &self.config.reference, 10, 150);
- });
- annotations.callers_stat();
- let prob_keys: HashSet<u128> = annotations
- .get_keys_filter(|anns| {
- // let contains = anns.iter().any(|item| matches!(item, Annotation::SoloDiag));
- // let contains_not = anns.iter().all(|item| !matches!(item, Annotation::Somatic));
- //
- // contains && contains_not
- anns.contains(&Annotation::SoloDiag)
- && !anns.contains(&Annotation::Somatic)
- && !anns.contains(&Annotation::LowConstitDepth)
- && !anns.contains(&Annotation::HighConstitAlt)
- })
- .into_iter()
- .collect();
- info!("Problematic variants {}", prob_keys.len());
- let mut problematic_variants = variants_collection.clone();
- let problematic_variants: Vec<VcfVariant> = problematic_variants
- .iter_mut()
- .flat_map(|e| {
- e.retain_keys(&prob_keys);
- e.variants.clone()
- })
- .collect();
- write_vcf(&problematic_variants, "prob.vcf.gz")?;
- Ok(())
- }
- }
- // 0-based position
- pub fn sequence_at(
- fasta_reader: &mut noodles_fasta::IndexedReader<noodles_fasta::io::BufReader<File>>,
- contig: &str,
- position: usize,
- len: usize,
- ) -> anyhow::Result<String> {
- // convert to 1-based
- let position = position + 1;
- let start = position.saturating_sub(len / 2).max(1);
- let end = start + len - 1;
- // debug!("Region {contig}:{start}-{end} (1-based inclusive)");
- let start = noodles_core::Position::try_from(start)?;
- let end = noodles_core::Position::try_from(end)?;
- let interval = noodles_core::region::interval::Interval::from(start..=end);
- let r = noodles_core::Region::new(contig.to_string(), interval);
- let record = fasta_reader.query(&r)?;
- let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
- Ok(s)
- }
|