|
@@ -478,85 +478,98 @@ pub fn somatic_depth_quality_ranges(
|
|
|
id: &str,
|
|
id: &str,
|
|
|
config: &Config,
|
|
config: &Config,
|
|
|
) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
|
|
) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
|
|
|
- // List of contigs: chr1..chr22, then X, Y, M
|
|
|
|
|
- let contigs = (1..=22)
|
|
|
|
|
|
|
+ // chr1..chr22 + X,Y,M
|
|
|
|
|
+ let contigs: Vec<String> = (1..=22)
|
|
|
.map(|i| format!("chr{i}"))
|
|
.map(|i| format!("chr{i}"))
|
|
|
- .chain(["chrX", "chrY", "chrM"].iter().map(ToString::to_string))
|
|
|
|
|
- .collect::<Vec<_>>();
|
|
|
|
|
|
|
+ .chain(["chrX", "chrY", "chrM"].into_iter().map(String::from))
|
|
|
|
|
+ .collect();
|
|
|
|
|
|
|
|
- let cfg = Arc::new(config);
|
|
|
|
|
|
|
+ let cfg = config; // no Arc<&Config>
|
|
|
|
|
|
|
|
- // For each contig, produce (high_ranges, lowq_ranges)
|
|
|
|
|
let per_contig = contigs
|
|
let per_contig = contigs
|
|
|
.into_par_iter()
|
|
.into_par_iter()
|
|
|
.map(|contig| {
|
|
.map(|contig| {
|
|
|
- let cfg = Arc::clone(&cfg);
|
|
|
|
|
let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
|
|
let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
|
|
|
- let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
|
|
|
|
|
|
|
+ let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
|
|
|
|
|
|
|
|
let normal_rdr = get_gz_reader(&normal_path)
|
|
let normal_rdr = get_gz_reader(&normal_path)
|
|
|
.with_context(|| format!("Failed to open normal file: {}", normal_path))?;
|
|
.with_context(|| format!("Failed to open normal file: {}", normal_path))?;
|
|
|
let tumor_rdr = get_gz_reader(&tumor_path)
|
|
let tumor_rdr = get_gz_reader(&tumor_path)
|
|
|
.with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
|
|
.with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
|
|
|
|
|
|
|
|
- // Collect per-line high & low masks
|
|
|
|
|
- let mut high_runs = Vec::new();
|
|
|
|
|
- let mut low_runs = Vec::new();
|
|
|
|
|
-
|
|
|
|
|
- for (idx, (n_line, t_line)) in normal_rdr.lines().zip(tumor_rdr.lines()).enumerate() {
|
|
|
|
|
- let line_no = idx + 1;
|
|
|
|
|
- let n_line = n_line.with_context(|| format!("{} line {}", normal_path, line_no))?;
|
|
|
|
|
- let t_line = t_line.with_context(|| format!("{} line {}", tumor_path, line_no))?;
|
|
|
|
|
-
|
|
|
|
|
- let n = BinCount::from_tsv_row(&n_line)
|
|
|
|
|
- .with_context(|| format!("Parse error at {}: {}", normal_path, line_no))?;
|
|
|
|
|
- let t = BinCount::from_tsv_row(&t_line)
|
|
|
|
|
- .with_context(|| format!("Parse error at {}: {}", tumor_path, line_no))?;
|
|
|
|
|
-
|
|
|
|
|
- if n.contig != t.contig {
|
|
|
|
|
- anyhow::bail!(
|
|
|
|
|
- "Contig mismatch at line {}: {} vs {}",
|
|
|
|
|
- line_no,
|
|
|
|
|
- n.contig,
|
|
|
|
|
- t.contig
|
|
|
|
|
- );
|
|
|
|
|
- }
|
|
|
|
|
- if n.start != t.start {
|
|
|
|
|
- anyhow::bail!(
|
|
|
|
|
- "Position mismatch at line {}: {} vs {}",
|
|
|
|
|
- line_no,
|
|
|
|
|
- n.start,
|
|
|
|
|
- t.start
|
|
|
|
|
- );
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ let mut high_runs: Vec<GenomeRange> = Vec::new();
|
|
|
|
|
+ let mut lowq_runs: Vec<GenomeRange> = Vec::new();
|
|
|
|
|
+
|
|
|
|
|
+ let mut nl = normal_rdr.lines();
|
|
|
|
|
+ let mut tl = tumor_rdr.lines();
|
|
|
|
|
+ let mut line_no = 0usize;
|
|
|
|
|
+
|
|
|
|
|
+ loop {
|
|
|
|
|
+ let n_next = nl.next();
|
|
|
|
|
+ let t_next = tl.next();
|
|
|
|
|
+ match (n_next, t_next) {
|
|
|
|
|
+ (None, None) => break,
|
|
|
|
|
+ (Some(Err(e)), _) => return Err(anyhow::anyhow!("{} line {}: {}", normal_path, line_no + 1, e)),
|
|
|
|
|
+ (_, Some(Err(e))) => return Err(anyhow::anyhow!("{} line {}: {}", tumor_path, line_no + 1, e)),
|
|
|
|
|
+ (Some(Ok(n_line)), Some(Ok(t_line))) => {
|
|
|
|
|
+ line_no += 1;
|
|
|
|
|
+
|
|
|
|
|
+ let n = BinCount::from_tsv_row(&n_line)
|
|
|
|
|
+ .with_context(|| format!("Parse error at {}: {}", normal_path, line_no))?;
|
|
|
|
|
+ let t = BinCount::from_tsv_row(&t_line)
|
|
|
|
|
+ .with_context(|| format!("Parse error at {}: {}", tumor_path, line_no))?;
|
|
|
|
|
+
|
|
|
|
|
+ if n.contig != t.contig {
|
|
|
|
|
+ anyhow::bail!("Contig mismatch at line {}: {} vs {}", line_no, n.contig, t.contig);
|
|
|
|
|
+ }
|
|
|
|
|
+ if n.start != t.start {
|
|
|
|
|
+ anyhow::bail!("Position mismatch at line {}: {} vs {}", line_no, n.start, t.start);
|
|
|
|
|
+ }
|
|
|
|
|
+ // Ensure equal bin widths
|
|
|
|
|
+ if n.depths.len() != t.depths.len() {
|
|
|
|
|
+ anyhow::bail!(
|
|
|
|
|
+ "Depth vector length mismatch at line {}: {} vs {}",
|
|
|
|
|
+ line_no, n.depths.len(), t.depths.len()
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ if n.low_qualities.len() != t.low_qualities.len() {
|
|
|
|
|
+ anyhow::bail!(
|
|
|
|
|
+ "LowQ vector length mismatch at line {}: {} vs {}",
|
|
|
|
|
+ line_no, n.low_qualities.len(), t.low_qualities.len()
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // High-quality depth in BOTH samples
|
|
|
|
|
+ let high_mask_iter = n.depths.iter().zip(&t.depths).map(|(&nd, &td)| {
|
|
|
|
|
+ nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
|
|
|
|
|
+ });
|
|
|
|
|
|
|
|
- let high_mask: Vec<bool> = n
|
|
|
|
|
- .depths
|
|
|
|
|
- .iter()
|
|
|
|
|
- .zip(&t.depths)
|
|
|
|
|
- .map(|(&nd, &td)| {
|
|
|
|
|
- nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
|
|
|
|
|
- })
|
|
|
|
|
- .collect();
|
|
|
|
|
-
|
|
|
|
|
- let lowq_mask: Vec<bool> = n
|
|
|
|
|
- .low_qualities
|
|
|
|
|
- .iter()
|
|
|
|
|
- .zip(&t.low_qualities)
|
|
|
|
|
- .map(|(&nq, &tq)| {
|
|
|
|
|
- nq < cfg.max_depth_low_quality && tq < cfg.max_depth_low_quality
|
|
|
|
|
- })
|
|
|
|
|
- .collect();
|
|
|
|
|
-
|
|
|
|
|
- high_runs.extend(ranges_from_consecutive_true(&high_mask, n.start, &n.contig));
|
|
|
|
|
- low_runs.extend(ranges_from_consecutive_true(&lowq_mask, n.start, &n.contig));
|
|
|
|
|
|
|
+ // NOTE: if you intended "low-quality regions" (bad), invert predicate.
|
|
|
|
|
+ let lowq_mask_iter = n.low_qualities.iter().zip(&t.low_qualities).map(|(&nq, &tq)| {
|
|
|
|
|
+ nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ high_runs.extend(ranges_from_consecutive_true_iter(high_mask_iter, n.start, &n.contig));
|
|
|
|
|
+ lowq_runs.extend(ranges_from_consecutive_true_iter(lowq_mask_iter, n.start, &n.contig));
|
|
|
|
|
+ }
|
|
|
|
|
+ (Some(_), None) => {
|
|
|
|
|
+ anyhow::bail!("Line count mismatch: {} has extra lines after {}", normal_path, line_no);
|
|
|
|
|
+ }
|
|
|
|
|
+ (None, Some(_)) => {
|
|
|
|
|
+ anyhow::bail!("Line count mismatch: {} has extra lines after {}", tumor_path, line_no);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- Ok((high_runs, low_runs))
|
|
|
|
|
|
|
+ // Merge adjacent/overlapping ranges within this contig
|
|
|
|
|
+ high_runs = merge_adjacent_ranges(high_runs);
|
|
|
|
|
+ lowq_runs = merge_adjacent_ranges(lowq_runs);
|
|
|
|
|
+
|
|
|
|
|
+ Ok((high_runs, lowq_runs))
|
|
|
})
|
|
})
|
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
|
|
|
|
- // Flatten across all contigs
|
|
|
|
|
|
|
+ // Flatten
|
|
|
let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
|
|
let (high_all, low_all): (Vec<_>, Vec<_>) = per_contig.into_iter().unzip();
|
|
|
Ok((
|
|
Ok((
|
|
|
high_all.into_iter().flatten().collect(),
|
|
high_all.into_iter().flatten().collect(),
|
|
@@ -564,6 +577,67 @@ pub fn somatic_depth_quality_ranges(
|
|
|
))
|
|
))
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
+/// Iterator-based version (no temporary Vec<bool>).
|
|
|
|
|
+/// Produces end-exclusive ranges in the same shape as your old function.
|
|
|
|
|
+pub fn ranges_from_consecutive_true_iter<I>(
|
|
|
|
|
+ mask: I,
|
|
|
|
|
+ start0: u32,
|
|
|
|
|
+ contig: &str,
|
|
|
|
|
+) -> Vec<GenomeRange>
|
|
|
|
|
+where
|
|
|
|
|
+ I: IntoIterator<Item = bool>,
|
|
|
|
|
+{
|
|
|
|
|
+ let contig = contig_to_num(contig);
|
|
|
|
|
+ let mut ranges = Vec::new();
|
|
|
|
|
+ let mut current_start: Option<u32> = None;
|
|
|
|
|
+ let mut i: u32 = 0;
|
|
|
|
|
+
|
|
|
|
|
+ for v in mask {
|
|
|
|
|
+ if v {
|
|
|
|
|
+ if current_start.is_none() {
|
|
|
|
|
+ current_start = Some(start0 + i);
|
|
|
|
|
+ }
|
|
|
|
|
+ } else if let Some(s) = current_start.take() {
|
|
|
|
|
+ ranges.push(GenomeRange { contig, range: s..(start0 + i) });
|
|
|
|
|
+ }
|
|
|
|
|
+ i += 1;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if let Some(s) = current_start {
|
|
|
|
|
+ ranges.push(GenomeRange { contig, range: s..(start0 + i) });
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ ranges
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+/// Merge overlapping or touching ranges per contig (end-exclusive).
|
|
|
|
|
+pub fn merge_adjacent_ranges(mut ranges: Vec<GenomeRange>) -> Vec<GenomeRange> {
|
|
|
|
|
+ if ranges.is_empty() {
|
|
|
|
|
+ return ranges;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ ranges.sort_by(|a, b| {
|
|
|
|
|
+ (a.contig, a.range.start).cmp(&(b.contig, b.range.start))
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ let mut merged = Vec::with_capacity(ranges.len());
|
|
|
|
|
+ let mut cur = ranges[0].clone();
|
|
|
|
|
+
|
|
|
|
|
+ for r in ranges.into_iter().skip(1) {
|
|
|
|
|
+ if r.contig == cur.contig && r.range.start <= cur.range.end {
|
|
|
|
|
+ if r.range.end > cur.range.end {
|
|
|
|
|
+ cur.range.end = r.range.end;
|
|
|
|
|
+ }
|
|
|
|
|
+ } else {
|
|
|
|
|
+ merged.push(cur);
|
|
|
|
|
+ cur = r;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ merged.push(cur);
|
|
|
|
|
+ merged
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
/// Converts a slice of booleans into a list of `GenomeRange`s representing
|
|
/// Converts a slice of booleans into a list of `GenomeRange`s representing
|
|
|
/// consecutive `true` values, offset by a `start` position and tagged with a contig ID.
|
|
/// consecutive `true` values, offset by a `start` position and tagged with a contig ID.
|
|
|
///
|
|
///
|