|
@@ -62,7 +62,6 @@ pub struct VariantsStats {
|
|
|
|
|
|
|
|
#[serde(serialize_with = "serialize_dashmap_sort")]
|
|
#[serde(serialize_with = "serialize_dashmap_sort")]
|
|
|
pub deletions_len: DashMap<u32, u32>,
|
|
pub deletions_len: DashMap<u32, u32>,
|
|
|
-
|
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
pub fn serialize_dashmap_sort<S, T>(
|
|
pub fn serialize_dashmap_sort<S, T>(
|
|
@@ -82,7 +81,7 @@ where
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
impl VariantsStats {
|
|
impl VariantsStats {
|
|
|
- pub fn new(variants: &mut Variants, id: &str, config: &Config) -> anyhow::Result<Self> {
|
|
|
|
|
|
|
+ pub fn new(variants: &mut Variants, id: &str, config: &Config, high_depth_ranges: &[GenomeRange]) -> anyhow::Result<Self> {
|
|
|
let n = variants.data.len() as u32;
|
|
let n = variants.data.len() as u32;
|
|
|
let alteration_categories: DashMap<String, u32> = DashMap::new();
|
|
let alteration_categories: DashMap<String, u32> = DashMap::new();
|
|
|
let vep_impact: DashMap<String, u32> = DashMap::new();
|
|
let vep_impact: DashMap<String, u32> = DashMap::new();
|
|
@@ -173,7 +172,6 @@ impl VariantsStats {
|
|
|
if let Some(len) = v.deletion_length() {
|
|
if let Some(len) = v.deletion_length() {
|
|
|
*deletions_len.entry(len).or_default() += 1;
|
|
*deletions_len.entry(len).or_default() += 1;
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
let mut n_gnomad = 0;
|
|
let mut n_gnomad = 0;
|
|
@@ -191,8 +189,8 @@ impl VariantsStats {
|
|
|
|
|
|
|
|
let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, config)?;
|
|
let all_somatic_rates = somatic_rates(&variants.data, &exon_ranges, config)?;
|
|
|
|
|
|
|
|
- let mut high_depth_ranges = high_depth_somatic(id, config)?;
|
|
|
|
|
- high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
|
|
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
|
|
let exon_ranges_ref: Vec<&GenomeRange> = exon_ranges.iter().collect();
|
|
|
let exons_high_depth = range_intersection_par(
|
|
let exons_high_depth = range_intersection_par(
|
|
|
&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
|
|
&high_depth_ranges.iter().collect::<Vec<&GenomeRange>>(),
|
|
@@ -452,136 +450,118 @@ pub fn somatic_rates(
|
|
|
})
|
|
})
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/// Computes high-depth somatic regions across all chromosomes for a given sample.
|
|
|
|
|
|
|
+/// Computes high‑depth and low‑quality genomic regions for a given sample.
|
|
|
///
|
|
///
|
|
|
-/// This function reads count files (compressed TSVs) for both normal and tumoral samples
|
|
|
|
|
-/// for each contig (`chr1` to `chr22`, `chrX`, `chrY`, and `chrM`). It identifies genomic regions
|
|
|
|
|
-/// where both normal and tumoral depths exceed a configured quality threshold, and extracts
|
|
|
|
|
-/// consecutive high-quality bins as genomic ranges.
|
|
|
|
|
-///
|
|
|
|
|
-/// The function performs the computation in parallel across contigs for better performance,
|
|
|
|
|
-/// and includes contextual error handling to help trace issues related to I/O or data parsing.
|
|
|
|
|
|
|
+/// For each chromosome (`chr1`…`chr22`, `chrX`, `chrY`, `chrM`), this function
|
|
|
|
|
+/// reads paired “normal” (MRD) and “tumoral” (Diag) count files, then:
|
|
|
|
|
+/// 1. Marks positions where both depths ≥ `config.min_high_quality_depth` as high‑depth.
|
|
|
|
|
+///   2. Marks positions where both `low_qualities` values < `config.max_depth_low_quality` as low‑quality.
|
|
|
|
|
+/// Consecutive runs of true values are merged into `GenomeRange`s.
|
|
|
///
|
|
///
|
|
|
/// # Arguments
|
|
/// # Arguments
|
|
|
///
|
|
///
|
|
|
-/// * `id` - The identifier of the sample.
|
|
|
|
|
-/// * `config` - A reference to a `Config` struct containing paths and thresholds.
|
|
|
|
|
|
|
+/// * `id` — Sample identifier (used to locate per‑contig files).
|
|
|
|
|
+/// * `config` — Analysis settings.
|
|
|
///
|
|
///
|
|
|
/// # Returns
|
|
/// # Returns
|
|
|
///
|
|
///
|
|
|
-/// A `Result` containing a vector of `GenomeRange` objects representing high-depth somatic regions,
|
|
|
|
|
-/// or an error if any file reading, parsing, or logical check fails.
|
|
|
|
|
|
|
+/// On success, returns `Ok((high_depth_ranges, low_quality_ranges))`, where each is a
|
|
|
|
|
+/// flattened `Vec<GenomeRange>` across all contigs. Returns an error if any I/O,
|
|
|
|
|
+/// parsing, or contig/position mismatch occurs.
|
|
|
///
|
|
///
|
|
|
/// # Errors
|
|
/// # Errors
|
|
|
///
|
|
///
|
|
|
-/// Returns an error if:
|
|
|
|
|
-/// - Any of the input files (normal or tumoral) can't be opened
|
|
|
|
|
-/// - Any line in the files fails to read or parse
|
|
|
|
|
-/// - The `BinCount` objects from corresponding lines don't match in position
|
|
|
|
|
-///
|
|
|
|
|
-/// # Parallelism
|
|
|
|
|
-///
|
|
|
|
|
-/// This function leverages `rayon` to parallelize processing of contigs.
|
|
|
|
|
-///
|
|
|
|
|
-/// # Example
|
|
|
|
|
-///
|
|
|
|
|
-/// ```rust
|
|
|
|
|
-/// let config = Config::load_from("config_path.toml")?;
|
|
|
|
|
-/// let regions = high_depth_somatic("sample_001", &config)?;
|
|
|
|
|
-/// for region in regions {
|
|
|
|
|
-/// println!("{:?}", region);
|
|
|
|
|
-/// }
|
|
|
|
|
-/// ```
|
|
|
|
|
-///
|
|
|
|
|
-/// # Requirements
|
|
|
|
|
-///
|
|
|
|
|
-/// - The file names must follow the pattern `{contig}_count.tsv.gz`
|
|
|
|
|
-/// - The structure of lines must match what `BinCount::from_tsv_row` expects
|
|
|
|
|
-pub fn high_depth_somatic(id: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
|
|
|
|
|
- // Generate contigs from chr1 to chr22, then chrX, Y, M
|
|
|
|
|
|
|
+/// * File open/read failures (with path & line number in context).
|
|
|
|
|
+/// * TSV parsing errors (with path & line number in context).
|
|
|
|
|
+/// * Contig or start‑position mismatches between paired files.
|
|
|
|
|
+pub fn somatic_depth_quality_ranges(
|
|
|
|
|
+ id: &str,
|
|
|
|
|
+ config: &Config,
|
|
|
|
|
+) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
|
|
|
|
|
+ // List of contigs: chr1..chr22, then X, Y, M
|
|
|
let contigs = (1..=22)
|
|
let contigs = (1..=22)
|
|
|
.map(|i| format!("chr{i}"))
|
|
.map(|i| format!("chr{i}"))
|
|
|
- .chain(["chrX", "chrY", "chrM"].iter().map(|s| s.to_string()))
|
|
|
|
|
|
|
+ .chain(["chrX", "chrY", "chrM"].iter().map(ToString::to_string))
|
|
|
.collect::<Vec<_>>();
|
|
.collect::<Vec<_>>();
|
|
|
|
|
|
|
|
- let config = Arc::new(config); // Wrap the config in an Arc for shared ownership
|
|
|
|
|
|
|
+ let cfg = Arc::new(config);
|
|
|
|
|
|
|
|
- // Process contigs in parallel with proper error propagation
|
|
|
|
|
- let results: Vec<Vec<GenomeRange>> = contigs
|
|
|
|
|
|
|
+ // For each contig, produce (high_ranges, lowq_ranges)
|
|
|
|
|
+ let per_contig = contigs
|
|
|
.into_par_iter()
|
|
.into_par_iter()
|
|
|
.map(|contig| {
|
|
.map(|contig| {
|
|
|
- let config = Arc::clone(&config);
|
|
|
|
|
- // Build file paths
|
|
|
|
|
- let mrd_path = format!("{}/{contig}_count.tsv.gz", config.normal_dir_count(id));
|
|
|
|
|
- let diag_path = format!("{}/{contig}_count.tsv.gz", config.tumoral_dir_count(id));
|
|
|
|
|
-
|
|
|
|
|
- // Open readers with proper error context
|
|
|
|
|
- let mrd_reader = get_gz_reader(&mrd_path)
|
|
|
|
|
- .with_context(|| format!("Failed to open MRD file: {mrd_path}"))?;
|
|
|
|
|
- let diag_reader = get_gz_reader(&diag_path)
|
|
|
|
|
- .with_context(|| format!("Failed to open Diag file: {diag_path}"))?;
|
|
|
|
|
-
|
|
|
|
|
- // Process lines in pairs
|
|
|
|
|
- let ranges = mrd_reader
|
|
|
|
|
- .lines()
|
|
|
|
|
- .zip(diag_reader.lines())
|
|
|
|
|
- .enumerate()
|
|
|
|
|
- .map(|(line_num, (mrd_line, diag_line))| {
|
|
|
|
|
- let line_num = line_num + 1; // Convert to 1-based indexing
|
|
|
|
|
-
|
|
|
|
|
- // Read lines with context
|
|
|
|
|
- let mrd_line = mrd_line.with_context(|| {
|
|
|
|
|
- format!("MRD file {mrd_path} line {line_num} read error")
|
|
|
|
|
- })?;
|
|
|
|
|
- let diag_line = diag_line.with_context(|| {
|
|
|
|
|
- format!("Diag file {diag_path} line {line_num} read error")
|
|
|
|
|
- })?;
|
|
|
|
|
-
|
|
|
|
|
- // Parse both lines
|
|
|
|
|
- let mrd = BinCount::from_tsv_row(&mrd_line).with_context(|| {
|
|
|
|
|
- format!("Failed to parse MRD line {line_num}: {mrd_line}")
|
|
|
|
|
- })?;
|
|
|
|
|
- let diag = BinCount::from_tsv_row(&diag_line).with_context(|| {
|
|
|
|
|
- format!("Failed to parse Diag line {line_num}: {diag_line}")
|
|
|
|
|
- })?;
|
|
|
|
|
-
|
|
|
|
|
- // Validate matching positions
|
|
|
|
|
- if mrd.contig != diag.contig {
|
|
|
|
|
- anyhow::bail!(
|
|
|
|
|
- "Contig mismatch at line {line_num}: {} vs {}",
|
|
|
|
|
- mrd.contig,
|
|
|
|
|
- diag.contig
|
|
|
|
|
- );
|
|
|
|
|
- }
|
|
|
|
|
- if mrd.start != diag.start {
|
|
|
|
|
- anyhow::bail!(
|
|
|
|
|
- "Start position mismatch at line {line_num}: {} vs {}",
|
|
|
|
|
- mrd.start,
|
|
|
|
|
- diag.start
|
|
|
|
|
- );
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- // Calculate high-depth regions
|
|
|
|
|
- let bools: Vec<bool> = mrd
|
|
|
|
|
- .depths
|
|
|
|
|
- .iter()
|
|
|
|
|
- .zip(diag.depths.iter())
|
|
|
|
|
- .map(|(&m, &d)| {
|
|
|
|
|
- m >= config.min_high_quality_depth && d >= config.min_high_quality_depth
|
|
|
|
|
- })
|
|
|
|
|
- .collect();
|
|
|
|
|
|
|
+ let cfg = Arc::clone(&cfg);
|
|
|
|
|
+ let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
|
|
|
|
|
+ let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
|
|
|
|
|
+
|
|
|
|
|
+ let normal_rdr = get_gz_reader(&normal_path)
|
|
|
|
|
+ .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
|
|
|
|
|
+ let tumor_rdr = get_gz_reader(&tumor_path)
|
|
|
|
|
+ .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
|
|
|
|
|
+
|
|
|
|
|
+            // Collect per-line high-depth & low-quality ranges
|
|
|
|
|
+ let mut high_runs = Vec::new();
|
|
|
|
|
+ let mut low_runs = Vec::new();
|
|
|
|
|
+
|
|
|
|
|
+ for (idx, (n_line, t_line)) in normal_rdr.lines().zip(tumor_rdr.lines()).enumerate() {
|
|
|
|
|
+ let line_no = idx + 1;
|
|
|
|
|
+ let n_line = n_line.with_context(|| format!("{} line {}", normal_path, line_no))?;
|
|
|
|
|
+ let t_line = t_line.with_context(|| format!("{} line {}", tumor_path, line_no))?;
|
|
|
|
|
+
|
|
|
|
|
+ let n = BinCount::from_tsv_row(&n_line)
|
|
|
|
|
+ .with_context(|| format!("Parse error at {}: {}", normal_path, line_no))?;
|
|
|
|
|
+ let t = BinCount::from_tsv_row(&t_line)
|
|
|
|
|
+ .with_context(|| format!("Parse error at {}: {}", tumor_path, line_no))?;
|
|
|
|
|
+
|
|
|
|
|
+ if n.contig != t.contig {
|
|
|
|
|
+ anyhow::bail!(
|
|
|
|
|
+ "Contig mismatch at line {}: {} vs {}",
|
|
|
|
|
+ line_no,
|
|
|
|
|
+ n.contig,
|
|
|
|
|
+ t.contig
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ if n.start != t.start {
|
|
|
|
|
+ anyhow::bail!(
|
|
|
|
|
+ "Position mismatch at line {}: {} vs {}",
|
|
|
|
|
+ line_no,
|
|
|
|
|
+ n.start,
|
|
|
|
|
+ t.start
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- Ok(ranges_from_consecutive_true(&bools, mrd.start, &mrd.contig))
|
|
|
|
|
- })
|
|
|
|
|
- .collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
|
|
|
+ let high_mask: Vec<bool> = n
|
|
|
|
|
+ .depths
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .zip(&t.depths)
|
|
|
|
|
+ .map(|(&nd, &td)| {
|
|
|
|
|
+ nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect();
|
|
|
|
|
+
|
|
|
|
|
+ let lowq_mask: Vec<bool> = n
|
|
|
|
|
+ .low_qualities
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .zip(&t.low_qualities)
|
|
|
|
|
+ .map(|(&nq, &tq)| {
|
|
|
|
|
+ nq < cfg.max_depth_low_quality && tq < cfg.max_depth_low_quality
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect();
|
|
|
|
|
+
|
|
|
|
|
+ high_runs.extend(ranges_from_consecutive_true(&high_mask, n.start, &n.contig));
|
|
|
|
|
+ low_runs.extend(ranges_from_consecutive_true(&lowq_mask, n.start, &n.contig));
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- // Flatten nested ranges and return contig's ranges
|
|
|
|
|
- Ok(ranges.into_iter().flatten().collect::<Vec<GenomeRange>>())
|
|
|
|
|
|
|
+ Ok((high_runs, low_runs))
|
|
|
})
|
|
})
|
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
|
|
|
|
- // Flatten the results from all contigs into a single vector
|
|
|
|
|
- Ok(results.into_iter().flatten().collect())
|
|
|
|
|
|
|
+ // Flatten across all contigs
|
|
|
|
|
+ let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
|
|
|
|
|
+ Ok((
|
|
|
|
|
+ high_all.into_iter().flatten().collect(),
|
|
|
|
|
+ low_all.into_iter().flatten().collect(),
|
|
|
|
|
+ ))
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Converts a slice of booleans into a list of `GenomeRange`s representing
|
|
/// Converts a slice of booleans into a list of `GenomeRange`s representing
|