Thomas 1 kuukausi sitten
vanhempi
commit
4cfdd15a5d
9 muutettua tiedostoa jossa 250 lisäystä ja 152 poistoa
  1. 1 1
      src/functions/assembler.rs
  2. 1 1
      src/functions/whole_scan.rs
  3. 53 17
      src/io/dict.rs
  4. 1 0
      src/io/mod.rs
  5. 33 0
      src/io/tsv.rs
  6. 14 14
      src/pipes/somatic.rs
  7. 13 0
      src/positions.rs
  8. 43 1
      src/scan/bin.rs
  9. 91 118
      src/variant/variants_stats.rs

+ 1 - 1
src/functions/assembler.rs

@@ -16,7 +16,7 @@ impl Default for AssemblerConfig {
             result_dir: "/data/longreads_basic_pipe".to_string(),
             scan_dir_name: "scan".to_string(),
             output_dir_name: "assemblies".to_string(),
-            dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
+            dict_file: "".to_string(),
         }
     }
 }

+ 1 - 1
src/functions/whole_scan.rs

@@ -12,7 +12,7 @@ impl Default for WholeScanConfig {
         Self {
             result_dir: "/data/longreads_basic_pipe".to_string(),
             scan_dir: "scan".to_string(),
-            dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
+            dict_file: "".to_string(),
         }
     }
 }

+ 53 - 17
src/io/dict.rs

@@ -1,32 +1,68 @@
-use std::fs;
 use anyhow::{Context, Ok, Result};
 use csv::ReaderBuilder;
 use log::debug;
+use std::fs;
 
-pub fn read_dict(path: &str) -> Result<Vec<(String, u32)>> {
+// pub fn read_dict(path: &str) -> Result<Vec<(String, u32)>> {
+//     debug!("Parsing {path}.");
+//
+//     let mut reader = ReaderBuilder::new()
+//         .delimiter(b'\t')
+//         .flexible(true)
+//         .has_headers(false)
+//         .from_reader(fs::File::open(path)?);
+//
+//     let mut res = Vec::new();
+//     for line in reader.records() {
+//         let line = line.context("Can't parse dict file.")?;
+//         if line.get(0).context("Can't parse dict file.")? == "@SQ" {
+//             res.push((
+//                 line.get(1)
+//                     .map(|s| s.replace("SN:", ""))
+//                     .context("Can't parse dict file.")?
+//                     .parse()?,
+//                 line.get(2)
+//                     .map(|s| s.replace("LN:", ""))
+//                     .context("Can't parse dict file.")?
+//                     .parse()?,
+//             ));
+//         }
+//     }
+//
+//     Ok(res)
+// }
+
+pub fn read_dict(path: &str) -> anyhow::Result<Vec<(String, u32)>> {
     debug!("Parsing {path}.");
 
-    let mut reader = ReaderBuilder::new()
+    let mut reader = csv::ReaderBuilder::new()
         .delimiter(b'\t')
         .flexible(true)
         .has_headers(false)
-        .from_reader(fs::File::open(path)?);
+        .from_reader(std::fs::File::open(path)?);
 
     let mut res = Vec::new();
-    for line in reader.records() {
-        let line = line.context("Can't parse dict file.")?;
-        if line.get(0).context("Can't parse dict file.")? == "@SQ" {
-            res.push((
-                line.get(1)
-                    .map(|s| s.replace("SN:", ""))
-                    .context("Can't parse dict file.")?
-                    .parse()?,
-                line.get(2)
-                    .map(|s| s.replace("LN:", ""))
-                    .context("Can't parse dict file.")?
-                    .parse()?,
-            ));
+
+    for rec in reader.records() {
+        let rec = rec.context("Can't parse dict file")?;
+        if rec.get(0) != Some("@SQ") {
+            continue;
         }
+
+        let sn = rec
+            .iter()
+            .find_map(|f| f.strip_prefix("SN:"))
+            .context("Missing SN: in @SQ line")?
+            .to_string();
+
+        let ln: u32 = rec
+            .iter()
+            .find_map(|f| f.strip_prefix("LN:"))
+            .context("Missing LN: in @SQ line")?
+            .parse()
+            .context("Invalid LN: value")?;
+
+        res.push((sn, ln));
     }
 
     Ok(res)

+ 1 - 0
src/io/mod.rs

@@ -9,3 +9,4 @@ pub mod gff;
 pub mod bam;
 pub mod writers;
 pub mod straglr;
+pub mod tsv;

+ 33 - 0
src/io/tsv.rs

@@ -0,0 +1,33 @@
+use std::io::Read;
+
+use csv::ReaderBuilder;
+use anyhow::Context;
+
+/// Build a TSV csv::Reader from *any* Read (including gz decoder)
+pub fn tsv_reader<R: Read>(r: R) -> csv::Reader<R> {
+    ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(false)
+        .flexible(true)
+        .from_reader(r)
+}
+
+pub fn parse_u32(field: &[u8]) -> anyhow::Result<u32> {
+    // fast-ish, no String allocation
+    let s = std::str::from_utf8(field).context("non-utf8 integer field")?;
+    Ok(s.parse::<u32>().context("bad integer")?)
+}
+
+pub fn parse_csv_u32_into(dst: &mut Vec<u32>, field: &[u8]) -> anyhow::Result<()> {
+    dst.clear();
+    if field.is_empty() {
+        return Ok(());
+    }
+    for part in field.split(|&b| b == b',') {
+        if part.is_empty() {
+            continue;
+        }
+        dst.push(parse_u32(part)?);
+    }
+    Ok(())
+}

+ 14 - 14
src/pipes/somatic.rs

@@ -3,11 +3,11 @@ use crate::{
     create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
     io::bed::read_bed,
     pipes::{Initialize, InitializeSolo, ShouldRun},
-    positions::GenomeRange,
+    positions::{GenomeRange, GenomeRangeExt},
     scan::scan::SomaticScan,
     variant::{
         variants_stats::somatic_depth_quality_ranges,
-        vcf_variant::{run_if_required, ShouldRunBox},
+        vcf_variant::{ShouldRunBox, run_if_required},
     },
 };
 use itertools::Itertools;
@@ -326,7 +326,13 @@ impl Run for SomaticPipe {
         high_depth_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
         low_quality_ranges.par_sort_by_key(|r| (r.contig, r.range.start));
 
-        info!("ranges: {}", low_quality_ranges.len());
+        info!(
+            "High-depth ranges: {} {} bp\nLowQ ranges: {} {} bp",
+            high_depth_ranges.len(),
+            high_depth_ranges.total_len(),
+            low_quality_ranges.len(),
+            low_quality_ranges.total_len(),
+        );
 
         variants_collections.iter_mut().for_each(|e| {
             let _ = e.annotate_with_ranges(&low_quality_ranges, &annotations, Annotation::LowMAPQ);
@@ -532,17 +538,11 @@ impl Run for SomaticPipe {
             .collect();
         variants.annotate_with_ranges(&vntrs, Some(Annotation::VNTR), 0, Vec::new());
 
-        let repeat_masker: Vec<GenomeRange> =
-            read_bed(&config.repeats_bed)?
-                .iter()
-                .map(|r| r.range.clone())
-                .collect();
-        variants.annotate_with_ranges(
-            &repeat_masker,
-            Some(Annotation::Repeat),
-            0,
-            Vec::new(),
-        );
+        let repeat_masker: Vec<GenomeRange> = read_bed(&config.repeats_bed)?
+            .iter()
+            .map(|r| r.range.clone())
+            .collect();
+        variants.annotate_with_ranges(&repeat_masker, Some(Annotation::Repeat), 0, Vec::new());
 
         variants.save_to_json(&result_json)?;
         variants.save_to_file(&result_bit)?;

+ 13 - 0
src/positions.rs

@@ -510,6 +510,19 @@ impl GenomeRange {
     }
 }
 
+pub trait GenomeRangeExt {
+    /// Total length in bp for 0-based, half-open ranges [start, end)
+    fn total_len(&self) -> u64;
+}
+
+impl GenomeRangeExt for [GenomeRange] {
+    fn total_len(&self) -> u64 {
+        self.iter()
+            .map(|r| (r.end - r.start) as u64)
+            .sum()
+    }
+}
+
 impl GetGenomePosition for GenomePosition {
     fn position(&self) -> &GenomePosition {
         self

+ 43 - 1
src/scan/bin.rs

@@ -1,10 +1,11 @@
 use std::collections::HashMap;
 
 use anyhow::Context;
+use csv::ByteRecord;
 use log::error;
 use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
 
-use crate::io::bam::{primary_record, primary_records};
+use crate::io::{bam::{primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};
 
 /// A genomic bin containing reads from a specific region.
 ///
@@ -363,3 +364,44 @@ fn read_range(record: &Record) -> Option<(u32, u32)> {
         None
     }
 }
+
+/// Reused per-reader parse buffers
+#[derive(Default)]
+pub struct BinRowBuf {
+    pub depths: Vec<u32>,
+    pub lowq: Vec<u32>,
+}
+
+/// Example: parse one TSV record into buffers.
+/// Returns (start, depths_slice, lowq_slice)
+pub fn parse_bin_record_into<'a>(
+    rec: &'a ByteRecord,
+    buf: &'a mut BinRowBuf,
+    contig_expected: &str,
+) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
+    // Adjust these indices to your actual TSV schema:
+    let i_contig = 0usize;
+    let i_start = 1usize;
+    let i_depths = 2usize;
+    let i_lowq = 3usize;
+
+    let contig = rec.get(i_contig).context("missing contig")?;
+    let contig = std::str::from_utf8(contig).context("non-utf8 contig")?;
+    anyhow::ensure!(
+        contig == contig_expected,
+        "unexpected contig {} (expected {})",
+        contig,
+        contig_expected
+    );
+
+    let start = parse_u32(rec.get(i_start).context("missing start")?)?;
+
+    let depths_field = rec.get(i_depths).context("missing depths")?;
+    parse_csv_u32_into(&mut buf.depths, depths_field).context("parse depths")?;
+
+    let lowq_field = rec.get(i_lowq).context("missing lowq")?;
+    parse_csv_u32_into(&mut buf.lowq, lowq_field).context("parse lowq")?;
+
+    Ok((start, &buf.depths, &buf.lowq))
+}
+

+ 91 - 118
src/variant/variants_stats.rs

@@ -153,12 +153,10 @@
 //! - `somatic_depth_quality_ranges()` processes contigs in parallel
 //! - `make_glm_rows_from_regions_par()` parallelizes region processing with efficient binary search
 
-use std::{
-    collections::{BTreeMap, BTreeSet},
-    io::BufRead,
-};
+use std::collections::{BTreeMap, BTreeSet};
 
 use anyhow::Context;
+use csv::ByteRecord;
 use dashmap::DashMap;
 use log::debug;
 use ordered_float::OrderedFloat;
@@ -171,13 +169,13 @@ use crate::{
     helpers::bin_data,
     io::{
         bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
-        writers::get_gz_writer,
+        tsv::tsv_reader, writers::get_gz_writer,
     },
     positions::{
         contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
         GenomeRange,
     },
-    scan::scan::BinCount,
+    scan::bin::{parse_bin_record_into, BinRowBuf},
 };
 
 use super::variant_collection::{Variant, Variants};
@@ -526,41 +524,59 @@ pub fn somatic_rates(
     })
 }
 
-/// Computes high‑depth and low‑quality genomic regions for a given sample.
+/// Computes genomic ranges with sufficient depth and with excessive low-quality reads
+/// by jointly scanning normal and tumoral per-contig depth TSV files.
 ///
-/// For each chromosome (`chr1`…`chr22`, `chrX`, `chrY`, `chrM`), this function
-/// reads paired “normal” (MRD) and “tumoral” (Diag) count files, then:
-/// 1. Marks positions where both depths ≥ `config.min_high_quality_depth` as high‑depth.
-/// 2. Marks positions where both depths < `config.max_depth_low_quality` as low‑quality.
-///    Consecutive runs of true values are merged into `GenomeRange`s.
+/// For each contig, the function reads the corresponding gzipped TSV files from the
+/// normal and tumoral directories, iterates over bins in lockstep, and validates that
+/// both files are perfectly aligned (same contig, same start position, same bin layout).
 ///
-/// # Arguments
+/// Two kinds of ranges are produced:
+/// - **High-quality depth ranges**: consecutive bins where *both* normal and tumoral
+///   depths are greater than or equal to `config.min_high_quality_depth`.
+/// - **Low-quality excess ranges**: consecutive bins where *both* normal and tumoral
+///   bins contain more low-quality reads than `config.max_depth_low_quality`.
+///
+/// Contigs are processed in parallel. Within each contig, adjacent or overlapping
+/// ranges are merged before being returned.
+///
+/// # Errors
 ///
-/// * `id` — Sample identifier (used to locate per‑contig files).
-/// * `config` — Analysis settings.
+/// Returns an error if:
+/// - A normal or tumoral TSV file cannot be opened or decompressed.
+/// - A TSV record cannot be parsed.
+/// - The normal and tumoral files differ in number of lines.
+/// - Corresponding records disagree on contig name, start position, or bin structure
+///   (depth or low-quality vector length).
 ///
 /// # Returns
 ///
-/// On success, returns `Ok((high_depth_ranges, low_quality_ranges))`, where each is a
-/// flattened `Vec<GenomeRange>` across all contigs. Returns an error if any I/O,
-/// parsing, or contig/position mismatch occurs.
+/// A tuple `(high_depth_ranges, lowq_excess_ranges)` where each element is a `Vec<GenomeRange>`
+/// spanning all contigs.
 ///
-/// # Errors
+/// # Notes
+///
+/// - Input TSV files are expected to be tab-delimited, headerless, and sorted by
+///   genomic position.
+/// - Low-quality ranges represent **excess low-quality signal** (bins exceeding the
+///   configured threshold), not acceptable regions.
+/// - The implementation reuses parsing buffers and avoids per-line allocations for
+///   performance.
+///
+/// # Panics
 ///
-/// * File open/read failures (with path & line number in context).
-/// * TSV parsing errors (with line content in context).
-/// * Contig or start‑position mismatches between paired files.
+/// This function does not intentionally panic.
 pub fn somatic_depth_quality_ranges(
     id: &str,
     config: &Config,
 ) -> anyhow::Result<(Vec<GenomeRange>, Vec<GenomeRange>)> {
     // chr1..chr22 + X,Y,M
-    let contigs: Vec<String> = (1..=22)
-        .map(|i| format!("chr{i}"))
-        .chain(["chrX", "chrY", "chrM"].into_iter().map(String::from))
+    let contigs: Vec<String> = read_dict(&config.dict_file)?
+        .into_iter()
+        .map(|(sn, _ln)| sn)
         .collect();
 
-    let cfg = config; // no Arc<&Config>
+    let cfg = config;
 
     let per_contig = contigs
         .into_par_iter()
@@ -576,114 +592,71 @@ pub fn somatic_depth_quality_ranges(
             let mut high_runs: Vec<GenomeRange> = Vec::new();
             let mut lowq_runs: Vec<GenomeRange> = Vec::new();
 
-            let mut nl = normal_rdr.lines();
-            let mut tl = tumor_rdr.lines();
+            let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
+            let mut t_tsv = tsv_reader(tumor_rdr);
+
+            let mut n_rec = ByteRecord::new();
+            let mut t_rec = ByteRecord::new();
+
+            let mut n_buf = BinRowBuf::default();
+            let mut t_buf = BinRowBuf::default();
+
             let mut line_no = 0usize;
 
             loop {
-                let n_next = nl.next();
-                let t_next = tl.next();
-                match (n_next, t_next) {
-                    (None, None) => break,
-                    (Some(Err(e)), _) => {
-                        return Err(anyhow::anyhow!(
-                            "{} line {}: {}",
-                            normal_path,
-                            line_no + 1,
-                            e
-                        ))
-                    }
-                    (_, Some(Err(e))) => {
-                        return Err(anyhow::anyhow!(
-                            "{} line {}: {}",
-                            tumor_path,
-                            line_no + 1,
-                            e
-                        ))
+                let n_ok = n_tsv.read_byte_record(&mut n_rec)?;
+                let t_ok = t_tsv.read_byte_record(&mut t_rec)?;
+
+                if n_ok || t_ok {
+                    line_no += 1;
+                }
+
+                match (n_ok, t_ok) {
+                    (false, false) => break,
+                    (true, false) => {
+                        anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
                     }
-                    (Some(Ok(n_line)), Some(Ok(t_line))) => {
-                        line_no += 1;
-
-                        let n = BinCount::from_tsv_row(&n_line).with_context(|| {
-                            format!("Parse error at {}: {}", normal_path, line_no)
-                        })?;
-                        let t = BinCount::from_tsv_row(&t_line).with_context(|| {
-                            format!("Parse error at {}: {}", tumor_path, line_no)
-                        })?;
-
-                        if n.contig != t.contig {
-                            anyhow::bail!(
-                                "Contig mismatch at line {}: {} vs {}",
-                                line_no,
-                                n.contig,
-                                t.contig
-                            );
-                        }
-                        if n.start != t.start {
-                            anyhow::bail!(
-                                "Position mismatch at line {}: {} vs {}",
-                                line_no,
-                                n.start,
-                                t.start
-                            );
-                        }
-                        // Ensure equal bin widths
-                        if n.depths.len() != t.depths.len() {
-                            anyhow::bail!(
-                                "Depth vector length mismatch at line {}: {} vs {}",
-                                line_no,
-                                n.depths.len(),
-                                t.depths.len()
-                            );
-                        }
-                        if n.low_qualities.len() != t.low_qualities.len() {
-                            anyhow::bail!(
-                                "LowQ vector length mismatch at line {}: {} vs {}",
-                                line_no,
-                                n.low_qualities.len(),
-                                t.low_qualities.len()
-                            );
-                        }
+                    (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
+                    (true, true) => {
+                        let (n_start, n_depths, n_lowq) =
+                            parse_bin_record_into(&n_rec, &mut n_buf, &contig)
+                                .with_context(|| format!("{} line {}", normal_path, line_no))?;
+
+                        let (t_start, t_depths, t_lowq) =
+                            parse_bin_record_into(&t_rec, &mut t_buf, &contig)
+                                .with_context(|| format!("{} line {}", tumor_path, line_no))?;
+
+                        anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);
+                        anyhow::ensure!(
+                            n_depths.len() == t_depths.len(),
+                            "depth len mismatch at line {}",
+                            line_no
+                        );
+                        anyhow::ensure!(
+                            n_lowq.len() == t_lowq.len(),
+                            "lowq len mismatch at line {}",
+                            line_no
+                        );
 
-                        // High-quality depth in BOTH samples
-                        let high_mask_iter = n.depths.iter().zip(&t.depths).map(|(&nd, &td)| {
+                        let high_mask_iter = n_depths.iter().zip(t_depths).map(|(&nd, &td)| {
                             nd >= cfg.min_high_quality_depth && td >= cfg.min_high_quality_depth
                         });
 
-                        // NOTE: if you intended "low-quality regions" (bad), invert predicate.
-                        let lowq_mask_iter =
-                            n.low_qualities
-                                .iter()
-                                .zip(&t.low_qualities)
-                                .map(|(&nq, &tq)| {
-                                    nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
-                                });
+                        let lowq_mask_iter = n_lowq.iter().zip(t_lowq).map(|(&nq, &tq)| {
+                            nq > cfg.max_depth_low_quality && tq > cfg.max_depth_low_quality
+                        });
 
                         high_runs.extend(ranges_from_consecutive_true_iter(
                             high_mask_iter,
-                            n.start,
-                            &n.contig,
+                            n_start,
+                            &contig,
                         ));
                         lowq_runs.extend(ranges_from_consecutive_true_iter(
                             lowq_mask_iter,
-                            n.start,
-                            &n.contig,
+                            n_start,
+                            &contig,
                         ));
                     }
-                    (Some(_), None) => {
-                        anyhow::bail!(
-                            "Line count mismatch: {} has extra lines after {}",
-                            normal_path,
-                            line_no
-                        );
-                    }
-                    (None, Some(_)) => {
-                        anyhow::bail!(
-                            "Line count mismatch: {} has extra lines after {}",
-                            tumor_path,
-                            line_no
-                        );
-                    }
                 }
             }