2 ماه پیش · 8198808748
--- a/src/io/tsv.rs
+++ b/src/io/tsv.rs
@@ -1,33 +1,105 @@
 
				-use std::io::Read;
			
 
				+//! TSV reading utilities — no `csv` crate dependency.
			
 
				+//!
			
 
				+//! The core abstraction is [`TsvLine`]: a reusable `String` buffer that reads
			
 
				+//! one tab-separated line at a time. Fields are accessed as `&str` slices via
			
 
				+//! [`TsvLine::split_fields`], which collects into a `Vec<&str>` (one small
			
 
				+//! allocation per line, no per-field allocation).
			
 
				+//!
			
 
				+//! # Typical usage
			
 
				+//!
			
 
				+//! ```ignore
			
 
				+//! use std::io::BufRead;
			
 
				+//! use pandora_lib_promethion::io::tsv::TsvLine;
			
 
				+//!
			
 
				+//! let mut reader = /* any BufRead */;
			
 
				+//! let mut line = TsvLine::new();
			
 
				+//! while line.read(&mut reader)? {
			
 
				+//!     let fields = line.split_fields();
			
 
				+//!     let chrom = fields.get(0).copied().unwrap_or("");
			
 
				+//!     // ...
			
 
				+//! }
			
 
				+//! ```
			
 
				 
			
 
				-use csv::ReaderBuilder;
			
 
				+use std::io::{self, BufRead};
			
 
				 use anyhow::Context;
			
 
				 
			
 
				-/// Build a TSV csv::Reader from *any* Read (including gz decoder)
			
 
				-pub fn tsv_reader<R: Read>(r: R) -> csv::Reader<R> {
			
 
				-    ReaderBuilder::new()
			
 
				-        .delimiter(b'\t')
			
 
				-        .has_headers(false)
			
 
				-        .flexible(true)
			
 
				-        .from_reader(r)
			
 
				+/// A reusable TSV line buffer.
			
 
				+///
			
 
				+/// Reads one line at a time into an internal `String` that is cleared and
			
 
				+/// reused on each call to [`read`](TsvLine::read). Fields are returned as
			
 
				+/// `&str` slices via [`split_fields`](TsvLine::split_fields).
			
 
				+///
			
 
				+/// This replaces the `csv::ByteRecord` + `tsv_reader` pattern: the line buffer
			
 
				+/// is reused (zero per-line allocation), and field splitting produces a
			
 
				+/// `Vec<&str>` (one small allocation per record, no per-field copies).
			
 
				+#[derive(Default)]
			
 
				+pub struct TsvLine {
			
 
				+    buf: String,
			
 
				 }
			
 
				 
			
 
				-pub fn parse_u32(field: &[u8]) -> anyhow::Result<u32> {
			
 
				-    // fast-ish, no String allocation
			
 
				-    let s = std::str::from_utf8(field).context("non-utf8 integer field")?;
			
 
				-    Ok(s.parse::<u32>().context("bad integer")?)
			
 
				+impl TsvLine {
			
 
				+    pub fn new() -> Self {
			
 
				+        Self { buf: String::new() }
			
 
				+    }
			
 
				+
			
 
				+    /// Read the next line from `reader` into the internal buffer.
			
 
				+    ///
			
 
				+    /// Returns `Ok(true)` on success, `Ok(false)` at EOF. The trailing
			
 
				+    /// line ending is stripped before returning: every trailing `\n` or `\r` is removed.
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    ///
			
 
				+    /// Returns an error if the underlying read fails.
			
 
				+    pub fn read<R: BufRead>(&mut self, reader: &mut R) -> io::Result<bool> {
			
 
				+        self.buf.clear();
			
 
				+        let n = reader.read_line(&mut self.buf)?;
			
 
				+        if n == 0 {
			
 
				+            return Ok(false);
			
 
				+        }
			
 
				+        while self.buf.ends_with(['\n', '\r']) {
			
 
				+            self.buf.pop();
			
 
				+        }
			
 
				+        Ok(true)
			
 
				+    }
			
 
				+
			
 
				+    /// Split the current line on `\t` and collect into a `Vec<&str>`.
			
 
				+    ///
			
 
				+    /// Each `&str` borrows from the internal buffer. Call once per record and
			
 
				+    /// use the returned `Vec` for all field access to avoid repeated scanning.
			
 
				+    pub fn split_fields(&self) -> Vec<&str> {
			
 
				+        self.buf.split('\t').collect()
			
 
				+    }
			
 
				+
			
 
				+    /// Return the raw line content (newline already stripped).
			
 
				+    pub fn as_str(&self) -> &str {
			
 
				+        &self.buf
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// ─── Field parsing helpers ────────────────────────────────────────────────────
			
 
				+
			
 
				+/// Parse a `u32` from a `&str` field.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if the field is not a valid `u32`.
			
 
				+pub fn parse_u32(field: &str) -> anyhow::Result<u32> {
			
 
				+    field.parse::<u32>().context("bad integer")
			
 
				 }
			
 
				 
			
 
				-pub fn parse_csv_u32_into(dst: &mut Vec<u32>, field: &[u8]) -> anyhow::Result<()> {
			
 
				+/// Parse a comma-separated list of `u32` values into `dst`, reusing its allocation.
			
 
				+///
			
 
				+/// `dst` is cleared before filling. Empty parts are skipped.
			
 
				+///
			
 
				+/// # Errors
			
 
				+///
			
 
				+/// Returns an error if any part fails to parse as `u32`.
			
 
				+pub fn parse_csv_u32_into(dst: &mut Vec<u32>, field: &str) -> anyhow::Result<()> {
			
 
				     dst.clear();
			
 
				-    if field.is_empty() {
			
 
				-        return Ok(());
			
 
				-    }
			
 
				-    for part in field.split(|&b| b == b',') {
			
 
				-        if part.is_empty() {
			
 
				-            continue;
			
 
				+    for part in field.split(',') {
			
 
				+        if !part.is_empty() {
			
 
				+            dst.push(part.parse::<u32>().context("bad integer in csv field")?);
			
 
				         }
			
 
				-        dst.push(parse_u32(part)?);
			
 
				     }
			
 
				     Ok(())
			
 
				 }
			
--- a/src/scan/bin.rs
+++ b/src/scan/bin.rs
@@ -1,7 +1,6 @@
 
				 use std::collections::HashMap;
			
 
				 
			
 
				 use anyhow::Context;
			
 
				-use csv::ByteRecord;
			
 
				 use log::{error, warn};
			
 
				 use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};
			
 
				 
			
@@ -398,38 +397,40 @@ pub struct BinRowBuf {
 
				     pub lowq: Vec<u32>,
			
 
				 }
			
 
				 
			
 
				-/// Example: parse one TSV record into buffers.
			
 
				-/// Returns (start, depths_slice, lowq_slice)
			
 
				+/// Parse one TSV record (as `&[&str]` fields) into the depth/lowq buffers.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `fields` - Fields from [`TsvLine::split_fields`]; expects columns 0 (contig),
			
 
				+///   1 (start), 2 (end), 9 (depths csv), 10 (lowq csv)
			
 
				+/// * `buf` - Reused output buffers; cleared and refilled on each call
			
 
				+/// * `contig_expected` - Expected contig name; returns an error if it doesn't match
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// `(start, depths, lowq)` where `depths` and `lowq` are slices into `buf`.
			
 
				 pub fn parse_bin_record_into<'a>(
			
 
				-    rec: &'a ByteRecord,
			
 
				+    fields: &[&str],
			
 
				     buf: &'a mut BinRowBuf,
			
 
				     contig_expected: &str,
			
 
				 ) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
			
 
				-    let i_contig = 0usize;
			
 
				-    let i_start  = 1usize;
			
 
				-    let i_end    = 2usize;
			
 
				-
			
 
				-    // adjust if your file has more/less scalar columns, but for your pasted line:
			
 
				-    let i_depths = 9usize;   // big CSV list
			
 
				-    let i_lowq   = 10usize;  // big CSV list
			
 
				+    let get = |i: usize, name: &str| -> anyhow::Result<&str> {
			
 
				+        fields.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing field {name} (col {i})"))
			
 
				+    };
			
 
				 
			
 
				-    let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?)
			
 
				-        .context("non-utf8 contig")?;
			
 
				-    anyhow::ensure!(contig == contig_expected, "unexpected contig");
			
 
				+    let contig = get(0, "contig")?;
			
 
				+    anyhow::ensure!(contig == contig_expected, "unexpected contig {contig:?}, expected {contig_expected:?}");
			
 
				 
			
 
				-    let start = parse_u32(rec.get(i_start).context("missing start")?)?;
			
 
				-    let end = parse_u32(rec.get(i_end).context("missing end")?)?;
			
 
				+    let start = parse_u32(get(1, "start")?).context("bad start")?;
			
 
				+    let end   = parse_u32(get(2, "end")?).context("bad end")?;
			
 
				     anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");
			
 
				 
			
 
				-    parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
			
 
				-        .context("parse depths")?;
			
 
				-    parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?)
			
 
				-        .context("parse lowq")?;
			
 
				+    parse_csv_u32_into(&mut buf.depths, get(9, "depths")?).context("parse depths")?;
			
 
				+    parse_csv_u32_into(&mut buf.lowq,   get(10, "lowq")?).context("parse lowq")?;
			
 
				 
			
 
				-    // critical sanity check: end-start+1 should match vector length for per-base bins
			
 
				     anyhow::ensure!(
			
 
				         (end - start + 1) as usize == buf.depths.len(),
			
 
				-        "bin width mismatch: {}..{} has width {}, depths has len {}",
			
 
				+        "bin width mismatch: {}..{} → width {}, depths len {}",
			
 
				         start, end, end - start + 1, buf.depths.len()
			
 
				     );
			
 
				     anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch");
			
--- a/src/scan/scan.rs
+++ b/src/scan/scan.rs
@@ -60,7 +60,7 @@ use rust_htslib::bam::{self, IndexedReader, Read, Record};
 
				 use crate::helpers::{bam_contigs, get_genome_sizes, is_file_older};
			
 
				 use crate::io::bam::fb_inv_from_record;
			
 
				 use crate::io::readers::get_gz_reader;
			
 
				-use crate::io::tsv::tsv_reader;
			
 
				+use crate::io::tsv::TsvLine;
			
 
				 use crate::io::writers::{finalize_bgzf_file, get_gz_writer};
			
 
				 use crate::math::filter_outliers_modified_z_score_with_indices;
			
 
				 
			
@@ -839,29 +839,25 @@ fn validate_count_file(
 
				     expected_contig: &str,
			
 
				     expected_rows: usize,
			
 
				 ) -> anyhow::Result<()> {
			
 
				-    let rdr = get_gz_reader(path)?;
			
 
				-    let mut tsv = tsv_reader(rdr);
			
 
				-    let mut rec = csv::ByteRecord::new();
			
 
				-
			
 
				+    let mut rdr = get_gz_reader(path)?;
			
 
				+    let mut line = TsvLine::new();
			
 
				     let mut n = 0usize;
			
 
				-    while tsv.read_byte_record(&mut rec).with_context(|| {
			
 
				-        format!(
			
 
				-            "failed reading validation record in {path} around line {}",
			
 
				-            n + 1
			
 
				-        )
			
 
				+
			
 
				+    while line.read(&mut rdr).with_context(|| {
			
 
				+        format!("failed reading {path} around line {}", n + 1)
			
 
				     })? {
			
 
				         n += 1;
			
 
				+        let fields = line.split_fields();
			
 
				 
			
 
				         anyhow::ensure!(
			
 
				-            rec.len() == 12,
			
 
				+            fields.len() == 12,
			
 
				             "{path} line {n}: expected 12 fields, got {}",
			
 
				-            rec.len()
			
 
				+            fields.len()
			
 
				         );
			
 
				-
			
 
				         anyhow::ensure!(
			
 
				-            rec.get(0) == Some(expected_contig.as_bytes()),
			
 
				+            fields[0] == expected_contig,
			
 
				             "{path} line {n}: unexpected contig {:?}, expected {expected_contig}",
			
 
				-            rec.get(0).map(String::from_utf8_lossy)
			
 
				+            fields[0]
			
 
				         );
			
 
				     }
			
 
				 
			
--- a/src/variant/variants_stats.rs
+++ b/src/variant/variants_stats.rs
@@ -156,7 +156,6 @@
 
				 use std::collections::{BTreeMap, BTreeSet};
			
 
				 
			
 
				 use anyhow::Context;
			
 
				-use csv::ByteRecord;
			
 
				 use dashmap::DashMap;
			
 
				 use log::debug;
			
 
				 use ordered_float::OrderedFloat;
			
@@ -169,7 +168,7 @@ use crate::{
 
				     helpers::bin_data,
			
 
				     io::{
			
 
				         bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
			
 
				-        tsv::tsv_reader, writers::{finalize_bgzf_file, get_gz_writer},
			
 
				+        tsv::TsvLine, writers::{finalize_bgzf_file, get_gz_writer},
			
 
				     },
			
 
				     positions::{
			
 
				         GenomeRange, contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par
			
@@ -585,19 +584,16 @@ pub fn somatic_depth_quality_ranges(
 
				             let normal_path = format!("{}/{}_count.tsv.gz", cfg.normal_dir_count(id), contig);
			
 
				             let tumor_path = format!("{}/{}_count.tsv.gz", cfg.tumoral_dir_count(id), contig);
			
 
				 
			
 
				-            let normal_rdr = get_gz_reader(&normal_path)
			
 
				+            let mut normal_rdr = get_gz_reader(&normal_path)
			
 
				                 .with_context(|| format!("Failed to open normal file: {}", normal_path))?;
			
 
				-            let tumor_rdr = get_gz_reader(&tumor_path)
			
 
				+            let mut tumor_rdr = get_gz_reader(&tumor_path)
			
 
				                 .with_context(|| format!("Failed to open tumor file: {}", tumor_path))?;
			
 
				 
			
 
				             let mut high_runs: Vec<GenomeRange> = Vec::new();
			
 
				             let mut lowq_runs: Vec<GenomeRange> = Vec::new();
			
 
				 
			
 
				-            let mut n_tsv = tsv_reader(normal_rdr); // normal_rdr: impl Read
			
 
				-            let mut t_tsv = tsv_reader(tumor_rdr);
			
 
				-
			
 
				-            let mut n_rec = ByteRecord::new();
			
 
				-            let mut t_rec = ByteRecord::new();
			
 
				+            let mut n_line = TsvLine::new();
			
 
				+            let mut t_line = TsvLine::new();
			
 
				 
			
 
				             let mut n_buf = BinRowBuf::default();
			
 
				             let mut t_buf = BinRowBuf::default();
			
@@ -605,39 +601,33 @@ pub fn somatic_depth_quality_ranges(
 
				             let mut line_no = 0usize;
			
 
				 
			
 
				             loop {
			
 
				-                let n_ok = n_tsv.read_byte_record(&mut n_rec).with_context(|| {
			
 
				-                    format!("reading normal TSV: {} line {}", normal_path, line_no + 1)
			
 
				-                })?;
			
 
				-
			
 
				-                let t_ok = t_tsv.read_byte_record(&mut t_rec).with_context(|| {
			
 
				-                    format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1)
			
 
				-                })?;
			
 
				+                let n_ok = n_line.read(&mut normal_rdr)
			
 
				+                    .with_context(|| format!("reading normal TSV: {} line {}", normal_path, line_no + 1))?;
			
 
				+                let t_ok = t_line.read(&mut tumor_rdr)
			
 
				+                    .with_context(|| format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1))?;
			
 
				 
			
 
				-                if n_ok || t_ok {
			
 
				-                    line_no += 1;
			
 
				-                }
			
 
				+                if n_ok || t_ok { line_no += 1; }
			
 
				 
			
 
				                 match (n_ok, t_ok) {
			
 
				                     (false, false) => break,
			
 
				-                    (true, false) => {
			
 
				-                        anyhow::bail!(
			
 
				-                            "{normal_path} has extra lines at {line_no}; last normal record = {:?}",
			
 
				-                            String::from_utf8_lossy(n_rec.as_slice())
			
 
				-                        )
			
 
				-                    }
			
 
				-                    (false, true) => {
			
 
				-                        anyhow::bail!(
			
 
				-                            "{tumor_path} has extra lines at {line_no}; last tumor record = {:?}",
			
 
				-                            String::from_utf8_lossy(t_rec.as_slice())
			
 
				-                        )
			
 
				-                    }
			
 
				+                    (true, false) => anyhow::bail!(
			
 
				+                        "{normal_path} has extra lines at {line_no}; last record = {:?}",
			
 
				+                        n_line.as_str()
			
 
				+                    ),
			
 
				+                    (false, true) => anyhow::bail!(
			
 
				+                        "{tumor_path} has extra lines at {line_no}; last record = {:?}",
			
 
				+                        t_line.as_str()
			
 
				+                    ),
			
 
				                     (true, true) => {
			
 
				+                        let n_fields = n_line.split_fields();
			
 
				+                        let t_fields = t_line.split_fields();
			
 
				+
			
 
				                         let (n_start, n_depths, n_lowq) =
			
 
				-                            parse_bin_record_into(&n_rec, &mut n_buf, &contig)
			
 
				+                            parse_bin_record_into(&n_fields, &mut n_buf, &contig)
			
 
				                                 .with_context(|| format!("{} line {}", normal_path, line_no))?;
			
 
				 
			
 
				                         let (t_start, t_depths, t_lowq) =
			
 
				-                            parse_bin_record_into(&t_rec, &mut t_buf, &contig)
			
 
				+                            parse_bin_record_into(&t_fields, &mut t_buf, &contig)
			
 
				                                 .with_context(|| format!("{} line {}", tumor_path, line_no))?;
			
 
				 
			
 
				                         anyhow::ensure!(n_start == t_start, "start mismatch at line {}", line_no);