|
|
@@ -1,4 +1,4 @@
|
|
|
-use std::{collections::{BTreeMap, HashMap}, io::BufRead};
|
|
|
+use std::{collections::BTreeMap, io::BufRead, sync::Arc};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use dashmap::DashMap;
|
|
|
@@ -11,8 +11,8 @@ use crate::{
|
|
|
annotation::{vep::VepImpact, Annotation},
|
|
|
config::Config,
|
|
|
helpers::bin_data,
|
|
|
- io::{dict::read_dict, readers::get_gz_reader, writers::get_gz_writer},
|
|
|
- positions::{contig_to_num, par_overlaps, GenomeRange},
|
|
|
+ io::{dict::read_dict, gff::features_ranges, readers::get_gz_reader, writers::get_gz_writer},
|
|
|
+ positions::{contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par, GenomeRange},
|
|
|
scan::scan::BinCount,
|
|
|
};
|
|
|
|
|
|
@@ -37,9 +37,16 @@ pub struct VariantsStats {
|
|
|
pub consequences: DashMap<String, u32>,
|
|
|
#[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
pub genes: DashMap<String, u32>,
|
|
|
+ #[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
+ pub genes_nonsynonymus: DashMap<String, u32>,
|
|
|
+
|
|
|
pub n_gnomad: usize,
|
|
|
pub gnomad: Vec<(String, Vec<(f64, usize)>)>,
|
|
|
+
|
|
|
+ pub somatic_rates: SomaticVariantRates,
|
|
|
+ pub high_depth_somatic_rates: SomaticVariantRates,
|
|
|
}
|
|
|
+
|
|
|
pub fn serialize_dashmap_sort<S, T>(
|
|
|
data: &DashMap<T, u32>,
|
|
|
serializer: S,
|
|
|
@@ -57,11 +64,12 @@ where
|
|
|
}
|
|
|
|
|
|
impl VariantsStats {
|
|
|
- pub fn new(variants: &Variants) -> Self {
|
|
|
+ pub fn new(variants: &Variants, id:&str, config: &Config) -> anyhow::Result<Self> {
|
|
|
let n = variants.data.len() as u32;
|
|
|
let alteration_categories: DashMap<String, u32> = DashMap::new();
|
|
|
let vep_impact: DashMap<String, u32> = DashMap::new();
|
|
|
let genes: DashMap<String, u32> = DashMap::new();
|
|
|
+ let genes_nonsynonymus: DashMap<String, u32> = DashMap::new();
|
|
|
let consequences: DashMap<String, u32> = DashMap::new();
|
|
|
let n_alts: DashMap<u32, u32> = DashMap::new();
|
|
|
let depths: DashMap<u32, u32> = DashMap::new();
|
|
|
@@ -71,12 +79,18 @@ impl VariantsStats {
|
|
|
|
|
|
variants.data.par_iter().for_each(|v| {
|
|
|
if let Ok(best_vep) = v.best_vep() {
|
|
|
- if let Some(impact) = best_vep.extra.impact {
|
|
|
+ if let Some(ref impact) = best_vep.extra.impact {
|
|
|
*vep_impact.entry(impact.to_string()).or_default() += 1;
|
|
|
}
|
|
|
- if let Some(gene) = best_vep.extra.symbol {
|
|
|
- *genes.entry(gene).or_default() += 1;
|
|
|
+ if let Some(ref gene) = best_vep.extra.symbol {
|
|
|
+ *genes.entry(gene.to_string()).or_default() += 1;
|
|
|
+ }
|
|
|
+ if let (Some(impact), Some(gene)) = (best_vep.extra.impact, best_vep.extra.symbol) {
|
|
|
+ if impact <= VepImpact::MODERATE {
|
|
|
+ *genes_nonsynonymus.entry(gene).or_default() += 1;
|
|
|
+ }
|
|
|
}
|
|
|
+
|
|
|
if let Some(csqs) = best_vep.consequence {
|
|
|
let mut csq: Vec<String> = csqs.into_iter().map(String::from).collect();
|
|
|
csq.sort();
|
|
|
@@ -131,7 +145,21 @@ impl VariantsStats {
|
|
|
})
|
|
|
.collect();
|
|
|
|
|
|
- Self {
|
|
|
+ let exon_ranges = features_ranges("exon", &config)?;
|
|
|
+ let exon_ranges = merge_overlapping_genome_ranges(&exon_ranges);
|
|
|
+
|
|
|
+ let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, &config)?;
|
|
|
+
|
|
|
+ let mut high_depth_ranges = high_depth_somatic(id, &config)?;
|
|
|
+ high_depth_ranges.par_sort_by_key(|r| ( r.contig, r.range.start ));
|
|
|
+
|
|
|
+ let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
|
|
|
+ let exons_high_depth = range_intersection_par(&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(), &exon_ranges_ref);
|
|
|
+
|
|
|
+ let high_depth = somatic_rates(&variants.data, &exons_high_depth, &config)?;
|
|
|
+
|
|
|
+
|
|
|
+ Ok(Self {
|
|
|
n,
|
|
|
alteration_categories,
|
|
|
cosmic,
|
|
|
@@ -142,8 +170,11 @@ impl VariantsStats {
|
|
|
vep_impact,
|
|
|
consequences,
|
|
|
genes,
|
|
|
+ genes_nonsynonymus,
|
|
|
n_gnomad,
|
|
|
- }
|
|
|
+ somatic_rates: all_somatic_rates,
|
|
|
+ high_depth_somatic_rates: high_depth
|
|
|
+ })
|
|
|
}
|
|
|
|
|
|
pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
|
|
|
@@ -160,6 +191,7 @@ impl VariantsStats {
|
|
|
debug!("Successfully saved variants to {}", filename);
|
|
|
Ok(())
|
|
|
}
|
|
|
+
|
|
|
pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
|
|
|
let mut reader = get_gz_reader(filename)
|
|
|
.with_context(|| format!("Failed to create BGZF reader for file: {}", filename))?;
|
|
|
@@ -172,25 +204,25 @@ impl VariantsStats {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-#[derive(Debug)]
|
|
|
+#[derive(Debug, Serialize, Deserialize, Clone)]
|
|
|
pub struct SomaticVariantRates {
|
|
|
pub wgs_length: u32,
|
|
|
pub total_variants: usize,
|
|
|
pub somatic_mutation_rate_wgs: f64,
|
|
|
pub exon_count: usize,
|
|
|
- pub variants_in_coding: usize,
|
|
|
- pub coding_variants: usize,
|
|
|
pub total_exon_bases: u32,
|
|
|
+ pub variants_in_coding: usize,
|
|
|
pub somatic_mutation_rate_coding: f64,
|
|
|
+ pub nonsynonymous_variants: usize,
|
|
|
pub somatic_nonsynonymous_rate_coding: f64,
|
|
|
}
|
|
|
|
|
|
pub fn somatic_rates(
|
|
|
variants: &[Variant],
|
|
|
- feature_ranges: &Vec<GenomeRange>,
|
|
|
+ exon_ranges: &Vec<GenomeRange>,
|
|
|
config: &Config,
|
|
|
) -> anyhow::Result<SomaticVariantRates> {
|
|
|
- let ol = par_overlaps(variants, feature_ranges);
|
|
|
+ let ol = par_overlaps(variants, exon_ranges);
|
|
|
|
|
|
let n_coding = ol
|
|
|
.iter()
|
|
|
@@ -199,7 +231,7 @@ pub fn somatic_rates(
|
|
|
.filter(|impact| *impact <= VepImpact::MODERATE)
|
|
|
.count();
|
|
|
|
|
|
- let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
|
|
|
+ let n_bases_m: u32 = exon_ranges.par_iter().map(|gr| gr.length()).sum();
|
|
|
let mega_base_m = n_bases_m as f64 / 10.0e6;
|
|
|
|
|
|
let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
|
|
|
@@ -210,9 +242,9 @@ pub fn somatic_rates(
|
|
|
|
|
|
Ok(SomaticVariantRates {
|
|
|
total_variants: variants.len(),
|
|
|
- exon_count: feature_ranges.len(),
|
|
|
+ exon_count: exon_ranges.len(),
|
|
|
variants_in_coding: ol.len(),
|
|
|
- coding_variants: n_coding,
|
|
|
+ nonsynonymous_variants: n_coding,
|
|
|
total_exon_bases: n_bases_m,
|
|
|
wgs_length: wgs_len,
|
|
|
somatic_mutation_rate_wgs: rate_wgs,
|
|
|
@@ -221,52 +253,136 @@ pub fn somatic_rates(
|
|
|
})
|
|
|
}
|
|
|
|
|
|
-pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<DashMap<String, Vec<GenomeRange>>> {
|
|
|
- let mut contigs: Vec<String> = (1..22).map(|i| format!("chr{i}")).collect();
|
|
|
- contigs.extend(["chrX", "chrY", "chrM"].into_iter().map(String::from));
|
|
|
-
|
|
|
- let results: DashMap<String, Vec<GenomeRange>> = DashMap::new();
|
|
|
- contigs.into_par_iter().for_each(|contig| {
|
|
|
- let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
|
|
|
- let mrd_reader =
|
|
|
- get_gz_reader(&mrd_path).with_context(|| format!("Failed to open: {mrd_path}"));
|
|
|
- let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
|
|
|
- let diag_reader =
|
|
|
- get_gz_reader(&diag_path).with_context(|| format!("Failed to open: {diag_path}"));
|
|
|
-
|
|
|
- if let (Ok(mrd_reader), Ok(diag_reader)) = (mrd_reader, diag_reader) {
|
|
|
- let ranges: Vec<GenomeRange> = mrd_reader
|
|
|
+/// Computes high-depth somatic regions across all chromosomes for a given sample.
|
|
|
+///
|
|
|
+/// This function reads count files (compressed TSVs) for both normal and tumoral samples
|
|
|
+/// for each contig (`chr1` to `chr22`, `chrX`, `chrY`, and `chrM`). It identifies genomic regions
|
|
|
+/// where both normal and tumoral depths are at least `config.min_high_quality_depth`, and extracts
|
|
|
+/// consecutive high-quality bins as genomic ranges.
|
|
|
+///
|
|
|
+/// The function performs the computation in parallel across contigs for better performance,
|
|
|
+/// and includes contextual error handling to help trace issues related to I/O or data parsing.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `id` - The identifier of the sample.
|
|
|
+/// * `config` - A reference to a `Config` struct containing paths and thresholds.
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// A `Result` containing a vector of `GenomeRange` objects representing high-depth somatic regions,
|
|
|
+/// or an error if any file reading, parsing, or logical check fails.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if:
|
|
|
+/// - Any of the input files (normal or tumoral) can't be opened
|
|
|
+/// - Any line in the files fails to read or parse
|
|
|
+/// - The `BinCount` objects parsed from corresponding lines differ in contig or start position
|
|
|
+///
|
|
|
+/// # Parallelism
|
|
|
+///
|
|
|
+/// This function leverages `rayon` to parallelize processing of contigs.
|
|
|
+///
|
|
|
+/// # Example
|
|
|
+///
|
|
|
+/// ```rust,ignore
|
|
|
+/// let config = Config::load_from("config_path.toml")?;
|
|
|
+/// let regions = high_depth_somatic("sample_001", &config)?;
|
|
|
+/// for region in regions {
|
|
|
+/// println!("{:?}", region);
|
|
|
+/// }
|
|
|
+/// ```
|
|
|
+///
|
|
|
+/// # Requirements
|
|
|
+///
|
|
|
+/// - The file names must follow the pattern `{contig}_count.tsv.gz`
|
|
|
+/// - Lines of the normal and tumoral files are paired positionally (`zip`) and must each parse with `BinCount::from_tsv_row`; surplus lines in the longer file are not compared
|
|
|
+pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
|
|
|
+ // Generate contigs from chr1 to chr22, then chrX, Y, M
|
|
|
+ let contigs = (1..=22)
|
|
|
+ .map(|i| format!("chr{i}"))
|
|
|
+ .chain(["chrX", "chrY", "chrM"].iter().map(|s| s.to_string()))
|
|
|
+ .collect::<Vec<_>>();
|
|
|
+
|
|
|
+ let config = Arc::new(config); // Arc over the borrowed `&Config` (an `Arc<&Config>`), cloned per worker; the `Config` itself is not owned here
|
|
|
+
|
|
|
+ // Process contigs in parallel with proper error propagation
|
|
|
+ let results: Vec<Vec<GenomeRange>> = contigs
|
|
|
+ .into_par_iter()
|
|
|
+ .map(|contig| {
|
|
|
+ let config = Arc::clone(&config);
|
|
|
+ // Build file paths
|
|
|
+ let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
|
|
|
+ let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
|
|
|
+
|
|
|
+ // Open readers with proper error context
|
|
|
+ let mrd_reader = get_gz_reader(&mrd_path)
|
|
|
+ .with_context(|| format!("Failed to open MRD file: {mrd_path}"))?;
|
|
|
+ let diag_reader = get_gz_reader(&diag_path)
|
|
|
+ .with_context(|| format!("Failed to open Diag file: {diag_path}"))?;
|
|
|
+
|
|
|
+ // Process lines in pairs
|
|
|
+ let ranges = mrd_reader
|
|
|
.lines()
|
|
|
.zip(diag_reader.lines())
|
|
|
- .filter_map(|(mrd, diag)| {
|
|
|
- if let (Ok(mrd), Ok(diag)) = (mrd, diag) {
|
|
|
- if let (Ok(mrd), Ok(diag)) =
|
|
|
- (BinCount::from_tsv_row(&mrd), BinCount::from_tsv_row(&diag))
|
|
|
- {
|
|
|
- assert_eq!(mrd.contig, diag.contig);
|
|
|
- assert_eq!(mrd.start, diag.start);
|
|
|
- let bools: Vec<bool> = mrd
|
|
|
- .depths
|
|
|
- .into_iter()
|
|
|
- .zip(diag.depths)
|
|
|
- .map(|(depth_mrd, depth_diag)| {
|
|
|
- depth_mrd >= config.min_high_quality_depth
|
|
|
- && depth_diag >= config.min_high_quality_depth
|
|
|
- })
|
|
|
- .collect();
|
|
|
- let ranges = ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig);
|
|
|
- return Some(ranges);
|
|
|
- }
|
|
|
+ .enumerate()
|
|
|
+ .map(|(line_num, (mrd_line, diag_line))| {
|
|
|
+ let line_num = line_num + 1; // Convert to 1-based indexing
|
|
|
+
|
|
|
+ // Read lines with context
|
|
|
+ let mrd_line = mrd_line.with_context(|| {
|
|
|
+ format!("MRD file {mrd_path} line {line_num} read error")
|
|
|
+ })?;
|
|
|
+ let diag_line = diag_line.with_context(|| {
|
|
|
+ format!("Diag file {diag_path} line {line_num} read error")
|
|
|
+ })?;
|
|
|
+
|
|
|
+ // Parse both lines
|
|
|
+ let mrd = BinCount::from_tsv_row(&mrd_line).with_context(|| {
|
|
|
+ format!("Failed to parse MRD line {line_num}: {mrd_line}")
|
|
|
+ })?;
|
|
|
+ let diag = BinCount::from_tsv_row(&diag_line).with_context(|| {
|
|
|
+ format!("Failed to parse Diag line {line_num}: {diag_line}")
|
|
|
+ })?;
|
|
|
+
|
|
|
+ // Validate matching positions
|
|
|
+ if mrd.contig != diag.contig {
|
|
|
+ anyhow::bail!(
|
|
|
+ "Contig mismatch at line {line_num}: {} vs {}",
|
|
|
+ mrd.contig,
|
|
|
+ diag.contig
|
|
|
+ );
|
|
|
+ }
|
|
|
+ if mrd.start != diag.start {
|
|
|
+ anyhow::bail!(
|
|
|
+ "Start position mismatch at line {line_num}: {} vs {}",
|
|
|
+ mrd.start,
|
|
|
+ diag.start
|
|
|
+ );
|
|
|
}
|
|
|
- None
|
|
|
+
|
|
|
+ // Calculate high-depth regions
|
|
|
+ let bools: Vec<bool> = mrd
|
|
|
+ .depths
|
|
|
+ .iter()
|
|
|
+ .zip(diag.depths.iter())
|
|
|
+ .map(|(&m, &d)| {
|
|
|
+ m >= config.min_high_quality_depth && d >= config.min_high_quality_depth
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Ok(ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig))
|
|
|
})
|
|
|
- .flatten()
|
|
|
- .collect();
|
|
|
- results.insert(contig, ranges);
|
|
|
- }
|
|
|
- });
|
|
|
+ .collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
+
|
|
|
+ // Flatten nested ranges and return contig's ranges
|
|
|
+ Ok(ranges.into_iter().flatten().collect::<Vec<GenomeRange>>())
|
|
|
+ })
|
|
|
+ .collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
|
|
- Ok(results)
|
|
|
+ // Flatten the results from all contigs into a single vector
|
|
|
+ Ok(results.into_iter().flatten().collect())
|
|
|
}
|
|
|
|
|
|
fn ranges_from_consecutive_true(vec: &[bool], start: u32, contig: &str) -> Vec<GenomeRange> {
|