فهرست منبع

replace csv::ByteRecord + tsv_reader with TsvLine in scan and variants

Introduce TsvLine in io/tsv.rs: a reused String buffer that reads one
tab-separated line at a time. split_fields() returns Vec<&str> slices
into the buffer — one small pointer-array allocation per line, no per-field
copies, no csv crate dependency in these paths.

Changes:
- io/tsv.rs: add TsvLine; remove tsv_reader (returned csv::Reader<R>);
  update parse_u32 and parse_csv_u32_into from &[u8] to &str signatures
- scan/bin.rs: parse_bin_record_into now takes &[&str] instead of &ByteRecord;
  remove csv::ByteRecord import
- scan/scan.rs: validate_count_file uses TsvLine instead of tsv_reader+ByteRecord
- variant/variants_stats.rs: twin-reader loop uses TsvLine; remove ByteRecord

variant/variant_collection.rs left for a separate pass — it uses
reader.deserialize::<VepLine>() (serde-driven) which needs more changes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 ماه پیش
والد
کامیت
8198808748
4 فایلهای تغییر یافته به همراه 150 افزوده شده و 91 حذف شده
  1. 93 21
      src/io/tsv.rs
  2. 23 22
      src/scan/bin.rs
  3. 11 15
      src/scan/scan.rs
  4. 23 33
      src/variant/variants_stats.rs

+ 93 - 21
src/io/tsv.rs

@@ -1,33 +1,105 @@
-use std::io::Read;
+//! TSV reading utilities — no `csv` crate dependency.
+//!
+//! The core abstraction is [`TsvLine`]: a reusable `String` buffer that reads
+//! one tab-separated line at a time. Fields are accessed as `&str` slices via
+//! [`TsvLine::split_fields`], which collects into a `Vec<&str>` (one small
+//! allocation per line, no per-field allocation).
+//!
+//! # Typical usage
+//!
+//! ```ignore
+//! use std::io::BufRead;
+//! use pandora_lib_promethion::io::tsv::TsvLine;
+//!
+//! let mut reader = /* any BufRead */;
+//! let mut line = TsvLine::new();
+//! while line.read(&mut reader)? {
+//!     let fields = line.split_fields();
+//!     let chrom = fields.get(0).copied().unwrap_or("");
+//!     // ...
+//! }
+//! ```
 
-use csv::ReaderBuilder;
+use std::io::{self, BufRead};
 use anyhow::Context;
 
-/// Build a TSV csv::Reader from *any* Read (including gz decoder)
-pub fn tsv_reader<R: Read>(r: R) -> csv::Reader<R> {
-    ReaderBuilder::new()
-        .delimiter(b'\t')
-        .has_headers(false)
-        .flexible(true)
-        .from_reader(r)
+/// A reusable TSV line buffer.
+///
+/// Reads one line at a time into an internal `String` that is cleared and
+/// reused on each call to [`read`](TsvLine::read). Fields are returned as
+/// `&str` slices via [`split_fields`](TsvLine::split_fields).
+///
+/// This replaces the `csv::ByteRecord` + `tsv_reader` pattern: the line buffer
+/// is reused (zero per-line allocation), and field splitting produces a
+/// `Vec<&str>` (one small allocation per record, no per-field copies).
+#[derive(Default)]
+pub struct TsvLine {
+    buf: String,
 }
 
-pub fn parse_u32(field: &[u8]) -> anyhow::Result<u32> {
-    // fast-ish, no String allocation
-    let s = std::str::from_utf8(field).context("non-utf8 integer field")?;
-    Ok(s.parse::<u32>().context("bad integer")?)
+impl TsvLine {
+    pub fn new() -> Self {
+        Self { buf: String::new() }
+    }
+
+    /// Read the next line from `reader` into the internal buffer.
+    ///
+    /// Returns `Ok(true)` on success, `Ok(false)` at EOF. The trailing
+    /// newline(s) (`\n`, `\r\n`) are stripped before returning.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the underlying read fails.
+    pub fn read<R: BufRead>(&mut self, reader: &mut R) -> io::Result<bool> {
+        self.buf.clear();
+        let n = reader.read_line(&mut self.buf)?;
+        if n == 0 {
+            return Ok(false);
+        }
+        while self.buf.ends_with(['\n', '\r']) {
+            self.buf.pop();
+        }
+        Ok(true)
+    }
+
+    /// Split the current line on `\t` and collect into a `Vec<&str>`.
+    ///
+    /// Each `&str` borrows from the internal buffer. Call once per record and
+    /// use the returned `Vec` for all field access to avoid repeated scanning.
+    pub fn split_fields(&self) -> Vec<&str> {
+        self.buf.split('\t').collect()
+    }
+
+    /// Return the raw line content (newline already stripped).
+    pub fn as_str(&self) -> &str {
+        &self.buf
+    }
+}
+
+// ─── Field parsing helpers ────────────────────────────────────────────────────
+
+/// Parse a `u32` from a `&str` field.
+///
+/// # Errors
+///
+/// Returns an error if the field is not a valid `u32`.
+pub fn parse_u32(field: &str) -> anyhow::Result<u32> {
+    field.parse::<u32>().context("bad integer")
 }
 
-pub fn parse_csv_u32_into(dst: &mut Vec<u32>, field: &[u8]) -> anyhow::Result<()> {
+/// Parse a comma-separated list of `u32` values into `dst`, reusing its allocation.
+///
+/// `dst` is cleared before filling. Empty parts are skipped.
+///
+/// # Errors
+///
+/// Returns an error if any part fails to parse as `u32`.
+pub fn parse_csv_u32_into(dst: &mut Vec<u32>, field: &str) -> anyhow::Result<()> {
     dst.clear();
-    if field.is_empty() {
-        return Ok(());
-    }
-    for part in field.split(|&b| b == b',') {
-        if part.is_empty() {
-            continue;
+    for part in field.split(',') {
+        if !part.is_empty() {
+            dst.push(part.parse::<u32>().context("bad integer in csv field")?);
         }
-        dst.push(parse_u32(part)?);
     }
     Ok(())
 }

+ 23 - 22
src/scan/bin.rs

@@ -1,7 +1,6 @@
 use std::collections::HashMap;
 
 use anyhow::Context;
-use csv::ByteRecord;
 use log::{error, warn};
 use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
 
@@ -398,38 +397,40 @@ pub struct BinRowBuf {
     pub lowq: Vec<u32>,
 }
 
-/// Example: parse one TSV record into buffers.
-/// Returns (start, depths_slice, lowq_slice)
+/// Parse one TSV record (as `&[&str]` fields) into the depth/lowq buffers.
+///
+/// # Arguments
+///
+/// * `fields` - Fields from [`TsvLine::split_fields`]; expects columns 0 (contig),
+///   1 (start), 2 (end), 9 (depths csv), 10 (lowq csv)
+/// * `buf` - Reused output buffers; cleared and refilled on each call
+/// * `contig_expected` - Expected contig name; returns an error if it doesn't match
+///
+/// # Returns
+///
+/// `(start, depths, lowq)` where `depths` and `lowq` are slices into `buf`.
 pub fn parse_bin_record_into<'a>(
-    rec: &'a ByteRecord,
+    fields: &[&str],
     buf: &'a mut BinRowBuf,
     contig_expected: &str,
 ) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
-    let i_contig = 0usize;
-    let i_start  = 1usize;
-    let i_end    = 2usize;
-
-    // adjust if your file has more/less scalar columns, but for your pasted line:
-    let i_depths = 9usize;   // big CSV list
-    let i_lowq   = 10usize;  // big CSV list
+    let get = |i: usize, name: &str| -> anyhow::Result<&str> {
+        fields.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing field {name} (col {i})"))
+    };
 
-    let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?)
-        .context("non-utf8 contig")?;
-    anyhow::ensure!(contig == contig_expected, "unexpected contig");
+    let contig = get(0, "contig")?;
+    anyhow::ensure!(contig == contig_expected, "unexpected contig {contig:?}, expected {contig_expected:?}");
 
-    let start = parse_u32(rec.get(i_start).context("missing start")?)?;
-    let end = parse_u32(rec.get(i_end).context("missing end")?)?;
+    let start = parse_u32(get(1, "start")?).context("bad start")?;
+    let end   = parse_u32(get(2, "end")?).context("bad end")?;
     anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");
 
-    parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
-        .context("parse depths")?;
-    parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?)
-        .context("parse lowq")?;
+    parse_csv_u32_into(&mut buf.depths, get(9, "depths")?).context("parse depths")?;
+    parse_csv_u32_into(&mut buf.lowq,   get(10, "lowq")?).context("parse lowq")?;
 
-    // critical sanity check: end-start+1 should match vector length for per-base bins
     anyhow::ensure!(
         (end - start + 1) as usize == buf.depths.len(),
-        "bin width mismatch: {}..{} has width {}, depths has len {}",
+        "bin width mismatch: {}..{} → width {}, depths len {}",
         start, end, end - start + 1, buf.depths.len()
     );
     anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch");

+ 11 - 15
src/scan/scan.rs

@@ -60,7 +60,7 @@ use rust_htslib::bam::{self, IndexedReader, Read, Record};
 use crate::helpers::{bam_contigs, get_genome_sizes, is_file_older};
 use crate::io::bam::fb_inv_from_record;
 use crate::io::readers::get_gz_reader;
-use crate::io::tsv::tsv_reader;
+use crate::io::tsv::TsvLine;
 use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
 use crate::math::filter_outliers_modified_z_score_with_indices;
 
@@ -839,29 +839,25 @@ fn validate_count_file(
     expected_contig: &str,
     expected_rows: usize,
 ) -> anyhow::Result<()> {
-    let rdr = get_gz_reader(path)?;
-    let mut tsv = tsv_reader(rdr);
-    let mut rec = csv::ByteRecord::new();
-
+    let mut rdr = get_gz_reader(path)?;
+    let mut line = TsvLine::new();
     let mut n = 0usize;
-    while tsv.read_byte_record(&mut rec).with_context(|| {
-        format!(
-            "failed reading validation record in {path} around line {}",
-            n + 1
-        )
+
+    while line.read(&mut rdr).with_context(|| {
+        format!("failed reading {path} around line {}", n + 1)
     })? {
         n += 1;
+        let fields = line.split_fields();
 
         anyhow::ensure!(
-            rec.len() == 12,
+            fields.len() == 12,
             "{path} line {n}: expected 12 fields, got {}",
-            rec.len()
+            fields.len()
         );
-
         anyhow::ensure!(
-            rec.get(0) == Some(expected_contig.as_bytes()),
+            fields[0] == expected_contig,
             "{path} line {n}: unexpected contig {:?}, expected {expected_contig}",
-            rec.get(0).map(String::from_utf8_lossy)
+            fields[0]
         );
     }
 

+ 23 - 33
src/variant/variants_stats.rs

@@ -156,7 +156,6 @@
 use std::collections::{BTreeMap, BTreeSet};
 
 use anyhow::Context;
-use csv::ByteRecord;
 use dashmap::DashMap;
 use log::debug;
 use ordered_float::OrderedFloat;
@@ -169,7 +168,7 @@ use crate::{
     helpers::bin_data,
     io::{
         bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
-        tsv::tsv_reader, writers::{finalize_bgzf_file, get_gz_writer},
+        tsv::TsvLine, writers::{finalize_bgzf_file, get_gz_writer},
     },
     positions::{
         GenomeRange, contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par
@@ -585,19 +584,16 @@ pub fn somatic_depth_quality_ranges(
             let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
             let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
 
-            let normal_rdr = get_gz_reader(&normal_path)
+            let mut normal_rdr = get_gz_reader(&normal_path)
                 .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
-            let tumor_rdr = get_gz_reader(&tumor_path)
+            let mut tumor_rdr = get_gz_reader(&tumor_path)
                 .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
 
             let mut high_runs: Vec<GenomeRange> = Vec::new();
             let mut lowq_runs: Vec<GenomeRange> = Vec::new();
 
-            let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
-            let mut t_tsv = tsv_reader(tumor_rdr);
-
-            let mut n_rec = ByteRecord::new();
-            let mut t_rec = ByteRecord::new();
+            let mut n_line = TsvLine::new();
+            let mut t_line = TsvLine::new();
 
             let mut n_buf = BinRowBuf::default();
             let mut t_buf = BinRowBuf::default();
@@ -605,39 +601,33 @@ pub fn somatic_depth_quality_ranges(
             let mut line_no = 0usize;
 
             loop {
-                let n_ok = n_tsv.read_byte_record(&mut n_rec).with_context(|| {
-                    format!("reading normal TSV: {} line {}", normal_path, line_no + 1)
-                })?;
-
-                let t_ok = t_tsv.read_byte_record(&mut t_rec).with_context(|| {
-                    format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1)
-                })?;
+                let n_ok = n_line.read(&mut normal_rdr)
+                    .with_context(|| format!("reading normal TSV: {} line {}", normal_path, line_no + 1))?;
+                let t_ok = t_line.read(&mut tumor_rdr)
+                    .with_context(|| format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1))?;
 
-                if n_ok || t_ok {
-                    line_no += 1;
-                }
+                if n_ok || t_ok { line_no += 1; }
 
                 match (n_ok, t_ok) {
                     (false, false) => break,
-                    (true, false) => {
-                        anyhow::bail!(
-                            "{normal_path} has extra lines at {line_no}; last normal record = {:?}",
-                            String::from_utf8_lossy(n_rec.as_slice())
-                        )
-                    }
-                    (false, true) => {
-                        anyhow::bail!(
-                            "{tumor_path} has extra lines at {line_no}; last tumor record = {:?}",
-                            String::from_utf8_lossy(t_rec.as_slice())
-                        )
-                    }
+                    (true, false) => anyhow::bail!(
+                        "{normal_path} has extra lines at {line_no}; last record = {:?}",
+                        n_line.as_str()
+                    ),
+                    (false, true) => anyhow::bail!(
+                        "{tumor_path} has extra lines at {line_no}; last record = {:?}",
+                        t_line.as_str()
+                    ),
                     (true, true) => {
+                        let n_fields = n_line.split_fields();
+                        let t_fields = t_line.split_fields();
+
                         let (n_start, n_depths, n_lowq) =
-                            parse_bin_record_into(&n_rec, &mut n_buf, &contig)
+                            parse_bin_record_into(&n_fields, &mut n_buf, &contig)
                                 .with_context(|| format!("{} line {}", normal_path, line_no))?;
 
                         let (t_start, t_depths, t_lowq) =
-                            parse_bin_record_into(&t_rec, &mut t_buf, &contig)
+                            parse_bin_record_into(&t_fields, &mut t_buf, &contig)
                                 .with_context(|| format!("{} line {}", tumor_path, line_no))?;
 
                         anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);