|
|
@@ -7,17 +7,18 @@ use crate::{
|
|
|
scan::scan::SomaticScan,
|
|
|
variant::{
|
|
|
variants_stats::somatic_depth_quality_ranges,
|
|
|
- vcf_variant::{ShouldRunBox, run_if_required},
|
|
|
+ vcf_variant::{run_if_required, ShouldRunBox},
|
|
|
},
|
|
|
};
|
|
|
use itertools::Itertools;
|
|
|
use log::info;
|
|
|
use rayon::slice::ParallelSliceMut;
|
|
|
+use serde::Serialize;
|
|
|
use std::{
|
|
|
- collections::HashMap,
|
|
|
+ collections::{BTreeMap, HashMap},
|
|
|
fs::{self, File},
|
|
|
io::Write,
|
|
|
- path::Path,
|
|
|
+ path::{Path, PathBuf},
|
|
|
};
|
|
|
|
|
|
use crate::{
|
|
|
@@ -176,11 +177,7 @@ impl ShouldRun for SomaticPipe {
|
|
|
fn should_run(&self) -> bool {
|
|
|
// path to the existing “.bit” file
|
|
|
let tumoral_dir = self.config.tumoral_dir(&self.id);
|
|
|
- let bit_path = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id);
|
|
|
- // let bit_path = self
|
|
|
- // .config
|
|
|
- // .tumoral_dir(&self.id)
|
|
|
- // .join(format!("{}_somatic_variants.bit", self.id));
|
|
|
+ let bit_path = Path::new(&tumoral_dir).join(format!("{}_somatic_variants.bit", self.id));
|
|
|
|
|
|
// if we can’t read its modification time, re-run
|
|
|
let res_meta = match fs::metadata(&bit_path).and_then(|m| m.modified()) {
|
|
|
@@ -188,14 +185,16 @@ impl ShouldRun for SomaticPipe {
|
|
|
Err(_) => return true,
|
|
|
};
|
|
|
|
|
|
- // if *either* BAM is newer than the .bit, or we can’t stat it, re-run
|
|
|
- [
|
|
|
- self.config.normal_bam(&self.id),
|
|
|
- self.config.tumoral_bam(&self.id),
|
|
|
- ]
|
|
|
- .iter()
|
|
|
- .any(|bam| {
|
|
|
- fs::metadata(bam)
|
|
|
+ let deps: [PathBuf; 5] = [
|
|
|
+ Path::new(&self.config.normal_bam(&self.id)).to_path_buf(),
|
|
|
+ Path::new(&self.config.tumoral_bam(&self.id)).to_path_buf(),
|
|
|
+ Path::new(&self.config.reference).to_path_buf(),
|
|
|
+ Path::new(&self.config.vntrs_bed).to_path_buf(),
|
|
|
+ Path::new(&self.config.repeats_bed).to_path_buf(),
|
|
|
+ ];
|
|
|
+
|
|
|
+ deps.iter().any(|p| {
|
|
|
+ fs::metadata(p)
|
|
|
.and_then(|m| m.modified())
|
|
|
.map_or(true, |ts| ts > res_meta)
|
|
|
})
|
|
|
@@ -214,7 +213,8 @@ impl Run for SomaticPipe {
|
|
|
let result_bit = format!("{}/{id}_somatic_variants.bit", config.tumoral_dir(&id));
|
|
|
let result_vcf = format!("{}/{id}_somatic_variants.vcf.gz", config.tumoral_dir(&id));
|
|
|
|
|
|
- if Path::new(&result_bit).exists() && !config.somatic_pipe_force {
|
|
|
+ let outputs = [&result_json, &result_bit, &result_vcf];
|
|
|
+ if !config.somatic_pipe_force && outputs.iter().any(|p| Path::new(p).exists()) {
|
|
|
return Err(anyhow::anyhow!(
|
|
|
"Somatic Pipe output already exists for {id}."
|
|
|
));
|
|
|
@@ -323,13 +323,18 @@ impl Run for SomaticPipe {
|
|
|
// MASK mapq
|
|
|
let (mut high_depth_ranges, mut low_quality_ranges) =
|
|
|
somatic_depth_quality_ranges(&id, &config)?;
|
|
|
+
|
|
|
high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
|
|
|
low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
|
|
|
|
|
|
info!(
|
|
|
- "High-depth ranges: {} {} bp\nLowQ ranges: {} {} bp",
|
|
|
+ "High-depth ranges: n={} bp={}",
|
|
|
high_depth_ranges.len(),
|
|
|
high_depth_ranges.total_len(),
|
|
|
+ );
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "LowQ ranges: n={} bp={}",
|
|
|
low_quality_ranges.len(),
|
|
|
low_quality_ranges.total_len(),
|
|
|
);
|
|
|
@@ -364,19 +369,6 @@ impl Run for SomaticPipe {
|
|
|
})))
|
|
|
.save_to_json(&format!("{stats_dir}/{id}_annotations_03_bam.json"))?;
|
|
|
|
|
|
- // variants_collections.iter().for_each(|col| {
|
|
|
- // col.variants
|
|
|
- // .iter()
|
|
|
- // .filter(|v| v.position.position == 36122735)
|
|
|
- // .for_each(|v| {
|
|
|
- // if let Some(ann) = annotations.store.get(&v.hash()) {
|
|
|
- // println!("before const DEPTH v: {:?}\n\n{:?}", v, ann.value());
|
|
|
- // } else {
|
|
|
- // println!("no ann but present");
|
|
|
- // }
|
|
|
- // });
|
|
|
- // });
|
|
|
-
|
|
|
// Filter based on low constitutional depth
|
|
|
info!(
|
|
|
"Removing variants when depth in constit bam < {}.",
|
|
|
@@ -416,19 +408,20 @@ impl Run for SomaticPipe {
|
|
|
})))
|
|
|
.save_to_json(&format!("{stats_dir}/{id}_annotations_04_bam_filter.json"))?;
|
|
|
|
|
|
- // Annotate variants with sequence entropy
|
|
|
+ // Annotate variants with sequence context (entropy + trinucleotide)
|
|
|
info!(
|
|
|
- "Entropy annotation from {} sequences.",
|
|
|
+ "Annotating variants with sequence context using reference: {}",
|
|
|
self.config.reference
|
|
|
);
|
|
|
- variants_collections.iter().for_each(|c| {
|
|
|
+
|
|
|
+ variants_collections.iter().try_for_each(|c| {
|
|
|
c.annotate_with_sequence_context(
|
|
|
&annotations,
|
|
|
&self.config.reference,
|
|
|
self.config.entropy_seq_len,
|
|
|
self.config.somatic_pipe_threads,
|
|
|
- );
|
|
|
- });
|
|
|
+ )
|
|
|
+ })?;
|
|
|
|
|
|
// Annotate with external databases like COSMIC and GnomAD
|
|
|
info!("Annotation with external databases like COSMIC and GnomAD.");
|
|
|
@@ -510,8 +503,6 @@ impl Run for SomaticPipe {
|
|
|
.callers_stat(Some(Box::new(caller_cat_anns)))
|
|
|
.save_to_json(&format!("{stats_dir}/{id}_annotations_09_vep.json"))?;
|
|
|
|
|
|
- annotations.vep_stats()?;
|
|
|
-
|
|
|
// Merge all variants into a final collection
|
|
|
let mut variants = variants_collections.into_iter().fold(
|
|
|
Variants::default(),
|
|
|
@@ -578,19 +569,26 @@ pub struct SomaticPipeStats {
|
|
|
/// Summary of input variant collections grouped by sample type.
|
|
|
pub input: InputStats,
|
|
|
|
|
|
- /// Number of variants labeled as both constitutional and germline.
|
|
|
+ /// Number of variants removed because they were classified as germline
|
|
|
+ /// by at least one caller, or detected in the normal-only (solo constitutional)
|
|
|
+ /// callset (including variants classified as both).
|
|
|
pub n_constit_germline: usize,
|
|
|
|
|
|
- /// Number of variants in constitutional samples with low allele frequency.
|
|
|
+ /// Number of variants removed due to low depth in the constitutional (normal) BAM
|
|
|
+ /// (i.e. annotated with [`Annotation::LowConstitDepth`]).
|
|
|
pub n_low_constit: usize,
|
|
|
|
|
|
- /// Number of variants in constitutional samples with high alternative allele count.
|
|
|
+ /// Number of variants removed due to excessive alternate-allele support in the
|
|
|
+ /// constitutional (normal) BAM (i.e. annotated with [`Annotation::HighConstitAlt`]).
|
|
|
pub n_high_alt_constit: usize,
|
|
|
|
|
|
- /// Number of high-alt constitutional variants that are also found in gnomAD.
|
|
|
+ /// Number of variants removed because they are present in gnomAD and also show
|
|
|
+ /// alternate-allele support in the constitutional (normal) BAM, according to
|
|
|
+ /// [`is_gnomad_and_constit_alt`].
|
|
|
pub n_high_alt_constit_gnomad: usize,
|
|
|
|
|
|
- /// Number of variants filtered due to low entropy (indicative of low complexity regions).
|
|
|
+ /// Number of variants removed due to low sequence complexity (Shannon entropy),
|
|
|
+ /// i.e. annotated with [`Annotation::LowEntropy`].
|
|
|
pub n_low_entropies: usize,
|
|
|
}
|
|
|
|
|
|
@@ -677,148 +675,130 @@ impl SomaticPipeStats {
|
|
|
/// stats.annot_init(&annotation_stats, "output/matrix.json")?;
|
|
|
/// ```
|
|
|
pub fn annot_init(&self, stats: &AnnotationsStats, json_path: &str) -> anyhow::Result<()> {
|
|
|
- // Parse annotations from stats
|
|
|
- let stats: Vec<(Vec<Annotation>, u64)> = stats
|
|
|
+ #[derive(Serialize)]
|
|
|
+ struct TumorRow {
|
|
|
+ caller_name: String,
|
|
|
+ germline: BTreeMap<String, u64>,
|
|
|
+ }
|
|
|
+
|
|
|
+ // Parse stats keys into Vec<Annotation> once
|
|
|
+ let parsed: Vec<(Vec<Annotation>, u64)> = stats
|
|
|
.categorical
|
|
|
.iter()
|
|
|
.map(|e| {
|
|
|
let anns = e
|
|
|
.key()
|
|
|
.split(" + ")
|
|
|
- .map(|k| k.parse())
|
|
|
- .collect::<anyhow::Result<Vec<Annotation>>>()
|
|
|
+ .map(|k| k.parse::<Annotation>())
|
|
|
+ .collect::<anyhow::Result<Vec<_>>>()
|
|
|
.map_err(|err| {
|
|
|
- anyhow::anyhow!("Error while splitting key in AnnotationsStats.\n{err}")
|
|
|
+ anyhow::anyhow!("Error while parsing AnnotationsStats key: {err}")
|
|
|
})?;
|
|
|
Ok((anns, *e.value()))
|
|
|
})
|
|
|
- .collect::<anyhow::Result<Vec<(Vec<Annotation>, u64)>>>()?;
|
|
|
+ .collect::<anyhow::Result<_>>()?;
|
|
|
|
|
|
- // Collect tumor and germline callers from input stats
|
|
|
- let callers_somatic_solo_tumor = [
|
|
|
- self.input
|
|
|
- .somatic
|
|
|
- .iter()
|
|
|
- .map(|(caller, _)| caller.clone())
|
|
|
- .collect::<Vec<Annotation>>(),
|
|
|
- self.input
|
|
|
- .solo_tumor
|
|
|
- .iter()
|
|
|
- .map(|(caller, _)| caller.clone())
|
|
|
- .collect(),
|
|
|
- ]
|
|
|
- .concat();
|
|
|
-
|
|
|
- let callers_germline_solo_constit = [
|
|
|
- self.input
|
|
|
- .germline
|
|
|
- .iter()
|
|
|
- .map(|(caller, _)| caller.clone())
|
|
|
- .collect::<Vec<Annotation>>(),
|
|
|
- self.input
|
|
|
- .solo_constit
|
|
|
- .iter()
|
|
|
- .map(|(caller, _)| caller.clone())
|
|
|
- .collect(),
|
|
|
- ]
|
|
|
- .concat();
|
|
|
-
|
|
|
- // Build a matrix of tumor vs germline hits
|
|
|
- let mut with_germline: HashMap<String, HashMap<String, u64>> = HashMap::new();
|
|
|
- stats.iter().for_each(|(anns, v)| {
|
|
|
- // Only proceed if this annotation includes a germline/constit sample
|
|
|
- if anns.iter().any(|a| {
|
|
|
+ // Collect caller labels
|
|
|
+ let tumor_callers: Vec<Annotation> = self
|
|
|
+ .input
|
|
|
+ .somatic
|
|
|
+ .iter()
|
|
|
+ .map(|(a, _)| a.clone())
|
|
|
+ .chain(self.input.solo_tumor.iter().map(|(a, _)| a.clone()))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ let germ_callers: Vec<Annotation> = self
|
|
|
+ .input
|
|
|
+ .germline
|
|
|
+ .iter()
|
|
|
+ .map(|(a, _)| a.clone())
|
|
|
+ .chain(self.input.solo_constit.iter().map(|(a, _)| a.clone()))
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ // Matrix: tumor -> (germline label -> count), deterministic by construction
|
|
|
+ let mut matrix: BTreeMap<String, BTreeMap<String, u64>> = BTreeMap::new();
|
|
|
+
|
|
|
+ for (anns, v) in parsed.iter() {
|
|
|
+ // only rows that include any germline/constit label
|
|
|
+ if !anns.iter().any(|a| {
|
|
|
matches!(
|
|
|
a,
|
|
|
Annotation::Callers(_, Sample::SoloConstit)
|
|
|
| Annotation::Callers(_, Sample::Germline)
|
|
|
)
|
|
|
}) {
|
|
|
- // Find all tumor callers present in this annotation set
|
|
|
- let n_by_tumor: Vec<(String, u64)> = callers_somatic_solo_tumor
|
|
|
- .iter()
|
|
|
- .flat_map(|tumor| {
|
|
|
- if anns.contains(tumor) {
|
|
|
- vec![(tumor.to_string(), *v)]
|
|
|
- } else {
|
|
|
- vec![]
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- // Build a normalized germline key
|
|
|
- let mut germline_caller: Vec<String> = callers_germline_solo_constit
|
|
|
- .iter()
|
|
|
- .flat_map(|germ| {
|
|
|
- if anns.contains(germ) {
|
|
|
- vec![germ.to_string()]
|
|
|
- } else {
|
|
|
- vec![]
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
- germline_caller.sort();
|
|
|
- let germline_caller = germline_caller.join(" + ");
|
|
|
-
|
|
|
- // Update matrix: tumor -> germline -> count
|
|
|
- n_by_tumor.iter().for_each(|(tumoral_caller, n)| {
|
|
|
- if let Some(row) = with_germline.get_mut(tumoral_caller) {
|
|
|
- if let Some(col) = row.get_mut(&germline_caller) {
|
|
|
- *col += *n;
|
|
|
- } else {
|
|
|
- row.insert(germline_caller.to_string(), *n);
|
|
|
- }
|
|
|
- } else {
|
|
|
- let mut row = HashMap::new();
|
|
|
- row.insert(germline_caller.to_string(), *n);
|
|
|
- with_germline.insert(tumoral_caller.to_string(), row);
|
|
|
- }
|
|
|
- });
|
|
|
+ continue;
|
|
|
}
|
|
|
- });
|
|
|
|
|
|
- // Extract all unique germline caller labels
|
|
|
- let mut germlines_callers: Vec<String> = with_germline
|
|
|
- .iter()
|
|
|
- .flat_map(|(_, r)| r.keys().map(|k| k.to_string()).collect::<Vec<String>>())
|
|
|
+ // Which tumor callers are present in this set?
|
|
|
+ let present_tumors: Vec<String> = tumor_callers
|
|
|
+ .iter()
|
|
|
+ .filter(|t| anns.contains(t))
|
|
|
+ .map(|t| t.to_string())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ if present_tumors.is_empty() {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Canonical germline key: sorted labels joined by " + "
|
|
|
+ let mut present_germs: Vec<String> = germ_callers
|
|
|
+ .iter()
|
|
|
+ .filter(|g| anns.contains(g))
|
|
|
+ .map(|g| g.to_string())
|
|
|
+ .collect();
|
|
|
+ present_germs.sort();
|
|
|
+ let germ_key = present_germs.join(" + ");
|
|
|
+
|
|
|
+ for tumor in present_tumors {
|
|
|
+ *matrix
|
|
|
+ .entry(tumor)
|
|
|
+ .or_default()
|
|
|
+ .entry(germ_key.clone())
|
|
|
+ .or_default() += *v;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Collect all germline columns (deterministic)
|
|
|
+ let mut germ_cols: Vec<String> = matrix
|
|
|
+ .values()
|
|
|
+ .flat_map(|row| row.keys().cloned())
|
|
|
.collect();
|
|
|
- germlines_callers.sort();
|
|
|
- germlines_callers.dedup();
|
|
|
+ germ_cols.sort();
|
|
|
+ germ_cols.dedup();
|
|
|
+
|
|
|
+ // TSV output (deterministic ordering)
|
|
|
+ let mut lines: Vec<String> = Vec::with_capacity(matrix.len());
|
|
|
+ for (tumor, row) in matrix.iter() {
|
|
|
+ let line = format!(
|
|
|
+ "{tumor}\t{}",
|
|
|
+ germ_cols
|
|
|
+ .iter()
|
|
|
+ .map(|g| format!("{g}: {}", row.get(g).copied().unwrap_or(0)))
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ .join("\t")
|
|
|
+ );
|
|
|
+ lines.push(line);
|
|
|
+ }
|
|
|
+ println!("{}", lines.join("\n"));
|
|
|
|
|
|
- // Print a readable tab-separated matrix
|
|
|
- let mut json = Vec::new();
|
|
|
- let mut lines: Vec<String> = with_germline
|
|
|
+ // JSON output: list of rows with all columns filled (0 if absent)
|
|
|
+ let json_rows: Vec<TumorRow> = matrix
|
|
|
.iter()
|
|
|
.map(|(tumor, row)| {
|
|
|
- json.push(format!(
|
|
|
- "{{\"caller_name\": \"{tumor}\", \"germline\": [{}] }}",
|
|
|
- germlines_callers
|
|
|
- .iter()
|
|
|
- .map(|g| {
|
|
|
- let v = row.get(g).unwrap_or(&0);
|
|
|
- format!("{{\"{g}\": {v}}}")
|
|
|
- })
|
|
|
- .join(", ")
|
|
|
- ));
|
|
|
- format!(
|
|
|
- "{tumor}\t{}",
|
|
|
- germlines_callers
|
|
|
- .iter()
|
|
|
- .map(|g| {
|
|
|
- let v = row.get(g).unwrap_or(&0);
|
|
|
- format!("{g}: {v}")
|
|
|
- })
|
|
|
- .join("\t")
|
|
|
- )
|
|
|
+ let mut full = BTreeMap::new();
|
|
|
+ for g in germ_cols.iter() {
|
|
|
+ full.insert(g.clone(), row.get(g).copied().unwrap_or(0));
|
|
|
+ }
|
|
|
+ TumorRow {
|
|
|
+ caller_name: tumor.clone(),
|
|
|
+ germline: full,
|
|
|
+ }
|
|
|
})
|
|
|
.collect();
|
|
|
- lines.sort();
|
|
|
- println!("{}", lines.join("\n"));
|
|
|
|
|
|
- // Write JSON to file
|
|
|
- let json = format!("[{}]", json.join(", "));
|
|
|
- let mut file = File::create(json_path)?;
|
|
|
- file.write_all(json.as_bytes())?;
|
|
|
+ let file = File::create(json_path)?;
|
|
|
+ serde_json::to_writer_pretty(file, &json_rows)?;
|
|
|
|
|
|
Ok(())
|
|
|
}
|