|
|
@@ -1,14 +1,15 @@
|
|
|
use log::info;
|
|
|
use rayon::prelude::*;
|
|
|
-use std::{collections::HashSet, sync::Arc};
|
|
|
+use std::{collections::HashSet, fs::File, sync::Arc};
|
|
|
|
|
|
use crate::{
|
|
|
annotation::{Annotation, Annotations},
|
|
|
callers::{clairs::ClairS, deep_variant::DeepVariant},
|
|
|
collection::{Initialize, InitializeSolo},
|
|
|
config::Config,
|
|
|
+ io::vcf::write_vcf,
|
|
|
runners::Run,
|
|
|
- variant::variant::{load_variants, parallel_intersection, RunnerVariants},
|
|
|
+ variant::variant::{load_variants, parallel_intersection, RunnerVariants, VcfVariant},
|
|
|
};
|
|
|
|
|
|
pub struct Somatic {
|
|
|
@@ -51,22 +52,20 @@ impl Run for Somatic {
|
|
|
// Annotations Stats
|
|
|
let mut annotations = Arc::unwrap_or_clone(annotations);
|
|
|
annotations.callers_stat();
|
|
|
+ // TODO: look at variants: ClairS + DeepVariant + SoloConstit + SoloDiag + Somatic (error
|
|
|
+ // in ClairS somatic)
|
|
|
|
|
|
// Filtering Somatic variants
|
|
|
- info!("Filtering somatic variants (variants from somatic callers or not found in germline or in a constit sample).");
|
|
|
+ info!("Filtering somatic variants (variants not in salo callers on constit sample or germline).");
|
|
|
let germline_or_somatic_keys = annotations.get_keys_filter(|anns| {
|
|
|
- anns.contains(&Annotation::Somatic)
|
|
|
- | (anns.contains(&Annotation::SoloDiag)
|
|
|
- && !anns
|
|
|
- .iter()
|
|
|
- .any(|ann| matches!(ann, Annotation::Germline | Annotation::SoloConstit)))
|
|
|
+ !anns.contains(&Annotation::Germline) && !anns.contains(&Annotation::SoloConstit)
|
|
|
});
|
|
|
|
|
|
let (somatic_keys, germline_keys, remains) = parallel_intersection(
|
|
|
&variants_collection
|
|
|
.iter()
|
|
|
.flat_map(|e| e.keys())
|
|
|
- .collect::<Vec<u64>>(),
|
|
|
+ .collect::<Vec<u128>>(),
|
|
|
&germline_or_somatic_keys,
|
|
|
);
|
|
|
assert_eq!(0, remains.len());
|
|
|
@@ -74,7 +73,7 @@ impl Run for Somatic {
|
|
|
info!("Somatic variants positions {}.", somatic_keys.len());
|
|
|
info!("Germline variants positions {}.", germline_keys.len());
|
|
|
|
|
|
- let somatic_keys: HashSet<u64> = somatic_keys.into_iter().collect();
|
|
|
+ let somatic_keys: HashSet<u128> = somatic_keys.into_iter().collect();
|
|
|
annotations.retain_keys(&somatic_keys);
|
|
|
annotations.callers_stat();
|
|
|
|
|
|
@@ -82,13 +81,67 @@ impl Run for Somatic {
|
|
|
let before = c.variants.len();
|
|
|
c.retain_keys(&somatic_keys);
|
|
|
let after = c.variants.len();
|
|
|
- info!("Variants removed from {}: {}", c.vcf.path.display(), before - after);
|
|
|
+ info!(
|
|
|
+ "Variants removed from {}: {}",
|
|
|
+ c.vcf.path.display(),
|
|
|
+ before - after
|
|
|
+ );
|
|
|
});
|
|
|
|
|
|
+ variants_collection.retain(|e| !e.variants.is_empty());
|
|
|
+
|
|
|
+ info!("Entropy annotation...");
|
|
|
+ variants_collection.iter().for_each(|c| {
|
|
|
+ c.annotate_with_sequence_entropy(&annotations, &self.config.reference);
|
|
|
+ });
|
|
|
+ annotations.callers_stat();
|
|
|
+
|
|
|
+ let prob_keys: HashSet<u128> = annotations
|
|
|
+ .get_keys_filter(|anns| {
|
|
|
+ anns.contains(&Annotation::Somatic) && anns.contains(&Annotation::Germline)
|
|
|
+ })
|
|
|
+ .into_iter()
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ info!("Problematic variants {}", prob_keys.len());
|
|
|
+
|
|
|
+ // let mut problematic_variants = variants_collection.clone();
|
|
|
+ //
|
|
|
+ // let problematic_variants: Vec<VcfVariant> = problematic_variants
|
|
|
+ // .iter_mut()
|
|
|
+ // .flat_map(|e| {
|
|
|
+ // e.retain_keys(&prob_keys);
|
|
|
+ // e.variants.clone()
|
|
|
+ // })
|
|
|
+ // .collect();
|
|
|
+
|
|
|
+ // write_vcf(&problematic_variants, "prob.vcf.gz")?;
|
|
|
+
|
|
|
Ok(())
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-pub fn filter_entropy() {
|
|
|
-
|
|
|
+// 0-based position
|
|
|
+pub fn sequence_at(
|
|
|
+ fasta_reader: &mut noodles_fasta::IndexedReader<noodles_fasta::io::BufReader<File>>,
|
|
|
+ contig: &str,
|
|
|
+ position: usize,
|
|
|
+ len: usize,
|
|
|
+) -> anyhow::Result<String> {
|
|
|
+ // convert to 1-based
|
|
|
+ let position = position + 1;
|
|
|
+
|
|
|
+ let start = position.saturating_sub(len / 2).max(1);
|
|
|
+ let end = start + len - 1;
|
|
|
+ // debug!("Region {contig}:{start}-{end} (1-based inclusive)");
|
|
|
+
|
|
|
+ let start = noodles_core::Position::try_from(start)?;
|
|
|
+ let end = noodles_core::Position::try_from(end)?;
|
|
|
+ let interval = noodles_core::region::interval::Interval::from(start..=end);
|
|
|
+
|
|
|
+ let r = noodles_core::Region::new(contig.to_string(), interval);
|
|
|
+ let record = fasta_reader.query(&r)?;
|
|
|
+ let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
|
|
|
+
|
|
|
+ Ok(s)
|
|
|
}
|