|
|
@@ -153,12 +153,10 @@
|
|
|
//! - `somatic_depth_quality_ranges()` processes contigs in parallel
|
|
|
//! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
|
|
|
|
|
|
-use std::{
|
|
|
- collections::{BTreeMap, BTreeSet},
|
|
|
- io::BufRead,
|
|
|
-};
|
|
|
+use std::collections::{BTreeMap, BTreeSet};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
+use csv::ByteRecord;
|
|
|
use dashmap::DashMap;
|
|
|
use log::debug;
|
|
|
use ordered_float::OrderedFloat;
|
|
|
@@ -171,13 +169,13 @@ use crate::{
|
|
|
helpers::bin_data,
|
|
|
io::{
|
|
|
bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
|
|
|
- writers::get_gz_writer,
|
|
|
+ tsv::tsv_reader, writers::get_gz_writer,
|
|
|
},
|
|
|
positions::{
|
|
|
contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
|
|
|
GenomeRange,
|
|
|
},
|
|
|
- scan::scan::BinCount,
|
|
|
+ scan::bin::{parse_bin_record_into, BinRowBuf},
|
|
|
};
|
|
|
|
|
|
use super::variant_collection::{Variant, Variants};
|
|
|
@@ -526,41 +524,59 @@ pub fn somatic_rates(
|
|
|
})
|
|
|
}
|
|
|
|
|
|
-/// Computes high‑depth and low‑quality genomic regions for a given sample.
|
|
|
+/// Computes genomic ranges with sufficient depth and with excessive low-quality reads
|
|
|
+/// by jointly scanning normal and tumoral per-contig depth TSV files.
|
|
|
///
|
|
|
-/// For each chromosome (`chr1`…`chr22`, `chrX`, `chrY`, `chrM`), this function
|
|
|
-/// reads paired “normal” (MRD) and “tumoral” (Diag) count files, then:
|
|
|
-/// 1. Marks positions where both depths ≥ `config.min_high_quality_depth` as high‑depth.
|
|
|
-/// 2. Marks positions where both depths < `config.max_depth_low_quality` as low‑quality.
|
|
|
-/// Consecutive runs of true values are merged into `GenomeRange`s.
|
|
|
+/// For each contig, the function reads the corresponding gzipped TSV files from the
|
|
|
+/// normal and tumoral directories, iterates over bins in lockstep, and validates that
|
|
|
+/// both files are perfectly aligned (same contig, same start position, same bin layout).
|
|
|
///
|
|
|
-/// # Arguments
|
|
|
+/// Two kinds of ranges are produced:
|
|
|
+/// - **High-quality depth ranges**: consecutive bins where *both* normal and tumoral
|
|
|
+/// depths are greater than or equal to `config.min_high_quality_depth`.
|
|
|
+/// - **Low-quality excess ranges**: consecutive bins where *both* normal and tumoral
|
|
|
+/// bins contain more low-quality reads than `config.max_depth_low_quality`.
|
|
|
+///
|
|
|
+/// Contigs are processed in parallel. Within each contig, adjacent or overlapping
|
|
|
+/// ranges are merged before being returned.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
///
|
|
|
-/// * `id` — Sample identifier (used to locate per‑contig files).
|
|
|
-/// * `config` — Analysis settings.
|
|
|
+/// Returns an error if:
|
|
|
+/// - A normal or tumoral TSV file cannot be opened or decompressed.
|
|
|
+/// - A TSV record cannot be parsed.
|
|
|
+/// - The normal and tumoral files differ in number of lines.
|
|
|
+/// - Corresponding records disagree on contig name, start position, or bin structure
|
|
|
+/// (depth or low-quality vector length).
|
|
|
///
|
|
|
/// # Returns
|
|
|
///
|
|
|
-/// On success, returns `Ok((high_depth_ranges, low_quality_ranges))`, where each is a
|
|
|
-/// flattened `Vec<GenomeRange>` across all contigs. Returns an error if any I/O,
|
|
|
-/// parsing, or contig/position mismatch occurs.
|
|
|
+/// A tuple `(high_depth_ranges, lowq_excess_ranges)` where each element is a `Vec<GenomeRange>`
|
|
|
+/// spanning all contigs.
|
|
|
///
|
|
|
-/// # Errors
|
|
|
+/// # Notes
|
|
|
+///
|
|
|
+/// - Input TSV files are expected to be tab-delimited, headerless, and sorted by
|
|
|
+/// genomic position.
|
|
|
+/// - Low-quality ranges represent **excess low-quality signal** (bins exceeding the
|
|
|
+/// configured threshold), not acceptable regions.
|
|
|
+/// - The implementation reuses parsing buffers and avoids per-line allocations for
|
|
|
+/// performance.
|
|
|
+///
|
|
|
+/// # Panics
|
|
|
///
|
|
|
-/// * File open/read failures (with path & line number in context).
|
|
|
-/// * TSV parsing errors (with line content in context).
|
|
|
-/// * Contig or start‑position mismatches between paired files.
|
|
|
+/// This function does not intentionally panic.
|
|
|
pub fn somatic_depth_quality_ranges(
|
|
|
id: &str,
|
|
|
config: &Config,
|
|
|
) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
|
|
|
// Contig names are taken from the sequence dictionary (`config.dict_file`)
|
|
|
- let contigs: Vec<String> = (1..=22)
|
|
|
- .map(|i| format!("chr{i}"))
|
|
|
- .chain(["chrX", "chrY", "chrM"].into_iter().map(String::from))
|
|
|
+ let contigs: Vec<String> = read_dict(&config.dict_file)?
|
|
|
+ .into_iter()
|
|
|
+ .map(|(sn, _ln)| sn)
|
|
|
.collect();
|
|
|
|
|
|
- let cfg = config; // no Arc<&Config>
|
|
|
+ let cfg = config;
|
|
|
|
|
|
let per_contig = contigs
|
|
|
.into_par_iter()
|
|
|
@@ -576,114 +592,71 @@ pub fn somatic_depth_quality_ranges(
|
|
|
let mut high_runs: Vec<GenomeRange> = Vec::new();
|
|
|
let mut lowq_runs: Vec<GenomeRange> = Vec::new();
|
|
|
|
|
|
- let mut nl = normal_rdr.lines();
|
|
|
- let mut tl = tumor_rdr.lines();
|
|
|
+ let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
|
|
|
+ let mut t_tsv = tsv_reader(tumor_rdr);
|
|
|
+
|
|
|
+ let mut n_rec = ByteRecord::new();
|
|
|
+ let mut t_rec = ByteRecord::new();
|
|
|
+
|
|
|
+ let mut n_buf = BinRowBuf::default();
|
|
|
+ let mut t_buf = BinRowBuf::default();
|
|
|
+
|
|
|
let mut line_no = 0usize;
|
|
|
|
|
|
loop {
|
|
|
- let n_next = nl.next();
|
|
|
- let t_next = tl.next();
|
|
|
- match (n_next, t_next) {
|
|
|
- (None, None) => break,
|
|
|
- (Some(Err(e)), _) => {
|
|
|
- return Err(anyhow::anyhow!(
|
|
|
- "{} line {}: {}",
|
|
|
- normal_path,
|
|
|
- line_no + 1,
|
|
|
- e
|
|
|
- ))
|
|
|
- }
|
|
|
- (_, Some(Err(e))) => {
|
|
|
- return Err(anyhow::anyhow!(
|
|
|
- "{} line {}: {}",
|
|
|
- tumor_path,
|
|
|
- line_no + 1,
|
|
|
- e
|
|
|
- ))
|
|
|
+ let n_ok = n_tsv.read_byte_record(&mut n_rec)?;
|
|
|
+ let t_ok = t_tsv.read_byte_record(&mut t_rec)?;
|
|
|
+
|
|
|
+ if n_ok || t_ok {
|
|
|
+ line_no += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ match (n_ok, t_ok) {
|
|
|
+ (false, false) => break,
|
|
|
+ (true, false) => {
|
|
|
+ anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
|
|
|
}
|
|
|
- (Some(Ok(n_line)), Some(Ok(t_line))) => {
|
|
|
- line_no += 1;
|
|
|
-
|
|
|
- let n = BinCount::from_tsv_row(&n_line).with_context(|| {
|
|
|
- format!("Parse error at {}: {}", normal_path, line_no)
|
|
|
- })?;
|
|
|
- let t = BinCount::from_tsv_row(&t_line).with_context(|| {
|
|
|
- format!("Parse error at {}: {}", tumor_path, line_no)
|
|
|
- })?;
|
|
|
-
|
|
|
- if n.contig != t.contig {
|
|
|
- anyhow::bail!(
|
|
|
- "Contig mismatch at line {}: {} vs {}",
|
|
|
- line_no,
|
|
|
- n.contig,
|
|
|
- t.contig
|
|
|
- );
|
|
|
- }
|
|
|
- if n.start != t.start {
|
|
|
- anyhow::bail!(
|
|
|
- "Position mismatch at line {}: {} vs {}",
|
|
|
- line_no,
|
|
|
- n.start,
|
|
|
- t.start
|
|
|
- );
|
|
|
- }
|
|
|
- // Ensure equal bin widths
|
|
|
- if n.depths.len() != t.depths.len() {
|
|
|
- anyhow::bail!(
|
|
|
- "Depth vector length mismatch at line {}: {} vs {}",
|
|
|
- line_no,
|
|
|
- n.depths.len(),
|
|
|
- t.depths.len()
|
|
|
- );
|
|
|
- }
|
|
|
- if n.low_qualities.len() != t.low_qualities.len() {
|
|
|
- anyhow::bail!(
|
|
|
- "LowQ vector length mismatch at line {}: {} vs {}",
|
|
|
- line_no,
|
|
|
- n.low_qualities.len(),
|
|
|
- t.low_qualities.len()
|
|
|
- );
|
|
|
- }
|
|
|
+ (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
|
|
|
+ (true, true) => {
|
|
|
+ let (n_start, n_depths, n_lowq) =
|
|
|
+ parse_bin_record_into(&n_rec, &mut n_buf, &contig)
|
|
|
+ .with_context(|| format!("{} line {}", normal_path, line_no))?;
|
|
|
+
|
|
|
+ let (t_start, t_depths, t_lowq) =
|
|
|
+ parse_bin_record_into(&t_rec, &mut t_buf, &contig)
|
|
|
+ .with_context(|| format!("{} line {}", tumor_path, line_no))?;
|
|
|
+
|
|
|
+ anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);
|
|
|
+ anyhow::ensure!(
|
|
|
+ n_depths.len() == t_depths.len(),
|
|
|
+ "depth len mismatch at line {}",
|
|
|
+ line_no
|
|
|
+ );
|
|
|
+ anyhow::ensure!(
|
|
|
+ n_lowq.len() == t_lowq.len(),
|
|
|
+ "lowq len mismatch at line {}",
|
|
|
+ line_no
|
|
|
+ );
|
|
|
|
|
|
- // High-quality depth in BOTH samples
|
|
|
- let high_mask_iter = n.depths.iter().zip(&t.depths).map(|(&nd, &td)| {
|
|
|
+ let high_mask_iter = n_depths.iter().zip(t_depths).map(|(&nd, &td)| {
|
|
|
nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
|
|
|
});
|
|
|
|
|
|
- // NOTE: if you intended "low-quality regions" (bad), invert predicate.
|
|
|
- let lowq_mask_iter =
|
|
|
- n.low_qualities
|
|
|
- .iter()
|
|
|
- .zip(&t.low_qualities)
|
|
|
- .map(|(&nq, &tq)| {
|
|
|
- nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
|
|
|
- });
|
|
|
+ let lowq_mask_iter = n_lowq.iter().zip(t_lowq).map(|(&nq, &tq)| {
|
|
|
+ nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
|
|
|
+ });
|
|
|
|
|
|
high_runs.extend(ranges_from_consecutive_true_iter(
|
|
|
high_mask_iter,
|
|
|
- n.start,
|
|
|
- &n.contig,
|
|
|
+ n_start,
|
|
|
+ &contig,
|
|
|
));
|
|
|
lowq_runs.extend(ranges_from_consecutive_true_iter(
|
|
|
lowq_mask_iter,
|
|
|
- n.start,
|
|
|
- &n.contig,
|
|
|
+ n_start,
|
|
|
+ &contig,
|
|
|
));
|
|
|
}
|
|
|
- (Some(_), None) => {
|
|
|
- anyhow::bail!(
|
|
|
- "Line count mismatch: {} has extra lines after {}",
|
|
|
- normal_path,
|
|
|
- line_no
|
|
|
- );
|
|
|
- }
|
|
|
- (None, Some(_)) => {
|
|
|
- anyhow::bail!(
|
|
|
- "Line count mismatch: {} has extra lines after {}",
|
|
|
- tumor_path,
|
|
|
- line_no
|
|
|
- );
|
|
|
- }
|
|
|
}
|
|
|
}
|
|
|
|