Browse Source

ditch csv crate from all non-serde paths

TsvLine gains a delimiter parameter (TsvLine::with_delimiter(',')), making it
usable for both TSV and simple CSV reading (fields are split on the delimiter
only; quoted fields are not interpreted). All remaining csv::ReaderBuilder
usages that do not require serde header-based deserialization are replaced.

Changes per file:
- io/tsv.rs: add delimiter field + with_delimiter constructor; update doc
- io/dict.rs: rewrite with TsvLine, remove csv dependency
- io/vcf.rs: add FromStr for VCFRow (10 positional cols, no header)
- vcf_reader.rs: replace ReaderBuilder + serde with TsvLine + VCFRow::from_str;
  drop serde::Deserialize derive (no longer needed)
- commands/modkit.rs: replace read_dmr_tsv ReaderBuilder with TsvLine
- variant/variant_collection.rs: replace all three ReaderBuilder usages —
  echtvar output (VCFRow::from_str) and two VEP output sites (VepLine
  already had FromStr)
- variant/variants_stats.rs: replace csv::Writer in write_glm_rows with
  BufWriter + writeln!

Kept csv for: collection/minknow.rs and collection/pod5_old.rs — both use
serde header-based deserialization (has_headers(true) with field rename
attributes and custom deserializers) that would require disproportionate
manual reimplementation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 tháng trước
mục cha
commit
e55bc1d745
7 tập tin đã thay đổi với 181 bổ sung và 129 xóa
  1. 20 45
      src/commands/modkit.rs
  2. 21 14
      src/io/dict.rs
  3. 27 10
      src/io/tsv.rs
  4. 28 2
      src/io/vcf.rs
  5. 33 33
      src/variant/variant_collection.rs
  6. 13 11
      src/variant/variants_stats.rs
  7. 39 14
      src/vcf_reader.rs

+ 20 - 45
src/commands/modkit.rs

@@ -12,7 +12,7 @@ use crate::{
     commands::{Command, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner},
     config::Config,
     helpers::diverging_rgb,
-    io::bed::convert_bgz_with_tabix,
+    io::{bed::convert_bgz_with_tabix, tsv::TsvLine},
     pipes::{Initialize, InitializeSolo},
     run,
     runners::Run,
@@ -489,52 +489,27 @@ pub fn read_dmr_tsv(path: &str, value_col_1based: usize) -> anyhow::Result<Vec<D
     }
     let value_idx = value_col_1based - 1;
 
-    let mut rdr = csv::ReaderBuilder::new()
-        .delimiter(b'\t')
-        .has_headers(false)
-        .flexible(true)
-        .from_path(path)
-        .with_context(|| format!("Failed to open input TSV: {path}"))?;
-
+    let mut reader = BufReader::new(
+        File::open(path).with_context(|| format!("Failed to open input TSV: {path}"))?,
+    );
+    let mut line = TsvLine::new();
     let mut intervals = Vec::new();
+    let mut i = 0usize;
+
+    while line.read(&mut reader).with_context(|| format!("I/O error in {path} around line {i}"))? {
+        if line.as_str().starts_with('#') || line.as_str().is_empty() { continue; }
+        i += 1;
+        let f = line.split_fields();
+        let get = |idx: usize, name: &str| -> anyhow::Result<&str> {
+            f.get(idx).copied().ok_or_else(|| anyhow::anyhow!("Missing {name} at line {i}"))
+        };
 
-    for (i, row) in rdr.records().enumerate() {
-        let row = row.with_context(|| format!("CSV parse error at line {}", i + 1))?;
-
-        let chrom = row
-            .get(0)
-            .ok_or_else(|| anyhow::anyhow!("Missing chrom at line {}", i + 1))?
-            .to_string();
-
-        let start: u64 = row
-            .get(1)
-            .ok_or_else(|| anyhow::anyhow!("Missing start at line {}", i + 1))?
-            .parse()
-            .with_context(|| format!("Invalid start at line {}", i + 1))?;
-
-        let end: u64 = row
-            .get(2)
-            .ok_or_else(|| anyhow::anyhow!("Missing end at line {}", i + 1))?
-            .parse()
-            .with_context(|| format!("Invalid end at line {}", i + 1))?;
-
-        let name = row.get(3).unwrap_or(".").to_string();
-
-        let value_str = row.get(value_idx).ok_or_else(|| {
-            anyhow::anyhow!(
-                "Missing value column {} at line {}",
-                value_col_1based,
-                i + 1
-            )
-        })?;
-
-        let value: f64 = value_str.parse().with_context(|| {
-            format!(
-                "Invalid float in value col {} at line {}",
-                value_col_1based,
-                i + 1
-            )
-        })?;
+        let chrom = get(0, "chrom")?.to_string();
+        let start: u64 = get(1, "start")?.parse().with_context(|| format!("Invalid start at line {i}"))?;
+        let end: u64 = get(2, "end")?.parse().with_context(|| format!("Invalid end at line {i}"))?;
+        let name = f.get(3).copied().unwrap_or(".").to_string();
+        let value: f64 = get(value_idx, &format!("value col {value_col_1based}"))?
+            .parse().with_context(|| format!("Invalid float in value col {value_col_1based} at line {i}"))?;
 
         intervals.push(DmrInterval {
             chrom,

+ 21 - 14
src/io/dict.rs

@@ -1,31 +1,38 @@
-use anyhow::{Context, Ok};
+use std::{fs::File, io::BufReader};
+
+use anyhow::Context;
 use log::debug;
 
+use crate::io::tsv::TsvLine;
+
+/// Read a sequence dictionary (`.dict`) file and return `(name, length)` pairs.
+///
+/// Only `@SQ` lines are processed; other lines are skipped. Each `@SQ` line
+/// must contain `SN:<name>` and `LN:<length>` tab-separated fields.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, an `@SQ` line is missing
+/// `SN:` or `LN:`, or `LN:` cannot be parsed as `u32`.
 pub fn read_dict(path: &str) -> anyhow::Result<Vec<(String, u32)>> {
     debug!("Parsing {path}.");
 
-    let mut reader = csv::ReaderBuilder::new()
-        .delimiter(b'\t')
-        .flexible(true)
-        .has_headers(false)
-        .from_reader(std::fs::File::open(path)?);
-
+    let mut reader = BufReader::new(File::open(path).with_context(|| format!("cannot open dict: {path}"))?);
+    let mut line = TsvLine::new();
     let mut res = Vec::new();
 
-    for rec in reader.records() {
-        let rec = rec.context("Can't parse dict file")?;
-        if rec.get(0) != Some("@SQ") {
+    while line.read(&mut reader)? {
+        let fields = line.split_fields();
+        if fields.first().copied() != Some("@SQ") {
             continue;
         }
 
-        let sn = rec
-            .iter()
+        let sn = fields.iter()
             .find_map(|f| f.strip_prefix("SN:"))
             .context("Missing SN: in @SQ line")?
             .to_string();
 
-        let ln: u32 = rec
-            .iter()
+        let ln: u32 = fields.iter()
             .find_map(|f| f.strip_prefix("LN:"))
             .context("Missing LN: in @SQ line")?
             .parse()

+ 27 - 10
src/io/tsv.rs

@@ -23,29 +23,46 @@
 use std::io::{self, BufRead};
 use anyhow::Context;
 
-/// A reusable TSV line buffer.
+/// A reusable delimited line buffer.
 ///
 /// Reads one line at a time into an internal `String` that is cleared and
 /// reused on each call to [`read`](TsvLine::read). Fields are returned as
 /// `&str` slices via [`split_fields`](TsvLine::split_fields).
 ///
-/// This replaces the `csv::ByteRecord` + `tsv_reader` pattern: the line buffer
-/// is reused (zero per-line allocation), and field splitting produces a
-/// `Vec<&str>` (one small allocation per record, no per-field copies).
-#[derive(Default)]
+/// The default delimiter is `\t` (TSV). Use [`TsvLine::with_delimiter`] for
+/// other formats (e.g. `','` for CSV).
+///
+/// This replaces the `csv::ByteRecord` + `tsv_reader` pattern without the
+/// `csv` crate: the line buffer is reused (zero per-line allocation), and
+/// field splitting produces a `Vec<&str>` (one small allocation per record,
+/// no per-field copies).
 pub struct TsvLine {
     buf: String,
+    delimiter: char,
+}
+
+impl Default for TsvLine {
+    fn default() -> Self {
+        Self::new()
+    }
 }
 
 impl TsvLine {
+    /// Create a new `TsvLine` with the default `\t` delimiter.
     pub fn new() -> Self {
-        Self { buf: String::new() }
+        Self { buf: String::new(), delimiter: '\t' }
+    }
+
+    /// Create a new `TsvLine` with a custom delimiter (e.g. `','` for CSV).
+    pub fn with_delimiter(delimiter: char) -> Self {
+        Self { buf: String::new(), delimiter }
     }
 
     /// Read the next line from `reader` into the internal buffer.
     ///
-    /// Returns `Ok(true)` on success, `Ok(false)` at EOF. The trailing
-    /// newline(s) (`\n`, `\r\n`) are stripped before returning.
+    /// Returns `Ok(true)` on success, `Ok(false)` at EOF. Trailing
+    /// `\n` / `\r\n` are stripped before returning. Comment lines and blank
+    /// lines are **not** skipped — handle those in the caller if needed.
     ///
     /// # Errors
     ///
@@ -62,12 +79,12 @@ impl TsvLine {
         Ok(true)
     }
 
-    /// Split the current line on `\t` and collect into a `Vec<&str>`.
+    /// Split the current line on the configured delimiter and collect into a `Vec<&str>`.
     ///
     /// Each `&str` borrows from the internal buffer. Call once per record and
     /// use the returned `Vec` for all field access to avoid repeated scanning.
     pub fn split_fields(&self) -> Vec<&str> {
-        self.buf.split('\t').collect()
+        self.buf.split(self.delimiter).collect()
     }
 
     /// Return the raw line content (newline already stripped).

+ 28 - 2
src/io/vcf.rs

@@ -144,12 +144,14 @@ pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<Vcf
     Ok(variants)
 }
 
-/// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
+/// A flat VCF row for tab-separated line parsing.
 ///
 /// Use [`VcfVariant`] for richer parsed access. This struct exists for
 /// simple tabular ingestion where field-by-field access is not needed.
 ///
-/// `pos` is **1-based** as stored in the VCF file.
+/// `pos` is **1-based** as stored in the VCF file. Parse from a tab-separated
+/// line via `FromStr`; the `serde::Deserialize` derive is kept for callers
+/// that still use serde-based deserialization.
 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
 pub struct VCFRow {
     pub chr: String,
@@ -165,6 +167,30 @@ pub struct VCFRow {
     pub value: String,
 }
 
+impl std::str::FromStr for VCFRow {
+    type Err = anyhow::Error;
+
+    /// Parse a tab-separated VCF data line (no header, 10 positional columns).
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        let f: Vec<&str> = s.split('\t').collect();
+        let get = |i: usize, name: &str| -> anyhow::Result<&str> {
+            f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
+        };
+        Ok(Self {
+            chr:       get(0, "chr")?.to_string(),
+            pos:       get(1, "pos")?.parse().context("bad pos")?,
+            id:        get(2, "id")?.to_string(),
+            reference: get(3, "reference")?.to_string(),
+            alt:       get(4, "alt")?.to_string(),
+            qual:      get(5, "qual")?.to_string(),
+            filter:    get(6, "filter")?.to_string(),
+            info:      get(7, "info")?.to_string(),
+            format:    get(8, "format")?.to_string(),
+            value:     get(9, "value")?.to_string(),
+        })
+    }
+}
+
 /// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
 ///
 /// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig

+ 33 - 33
src/variant/variant_collection.rs

@@ -9,7 +9,7 @@ use std::{
 use anyhow::Context;
 // use bgzip::{BGZFReader, BGZFWriter};
 use bitcode::{Decode, Encode};
-use csv::ReaderBuilder;
+use crate::io::tsv::TsvLine;
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
 use rayon::prelude::*;
@@ -1789,30 +1789,32 @@ impl ExternalAnnotation {
                 // fs::remove_file(in_tmp)?;
 
                 // Parse echtvar output
-                let mut reader = ReaderBuilder::new()
-                    .delimiter(b'\t')
-                    .has_headers(false)
-                    .comment(Some(b'#'))
-                    .flexible(true)
-                    .from_reader(get_reader(out_tmp.to_str().unwrap())?);
-
+                let mut echtvar_rdr = std::io::BufReader::new(
+                    get_reader(out_tmp.to_str().unwrap())?
+                );
+                let mut echtvar_line = TsvLine::new();
                 let mut chunk_results = Vec::new();
-                for (i, result) in reader.deserialize::<crate::io::vcf::VCFRow>().enumerate() {
-                    let row = result?;
+                let mut i = 0usize;
+
+                while echtvar_line.read(&mut echtvar_rdr)? {
+                    if echtvar_line.as_str().starts_with('#') || echtvar_line.as_str().is_empty() {
+                        continue;
+                    }
+                    let row: crate::io::vcf::VCFRow = echtvar_line.as_str().parse()
+                        .context("Failed to parse echtvar VCF row")?;
 
                     // Verify that the ID corresponds to the input
                     let id: usize = row.id.parse().context("Failed to parse ID")?;
                     if id != i + 1 {
                         return Err(anyhow::anyhow!(
                             "Echtvar output ID {} does not match expected ID {}",
-                            id,
-                            i + 1
+                            id, i + 1
                         ));
                     }
 
                     let (cosmic, gnomad) = parse_echtvar_val(&row.info)?;
-
                     let hash = chunk[i].hash;
+                    i += 1;
 
                     chunk_results.push((hash, cosmic, gnomad));
                 }
@@ -1978,18 +1980,17 @@ impl ExternalAnnotation {
                         let mut vep_job = VepJob::new(&in_tmp, &out_vep, config);
                         run!(config, &mut vep_job).context("Error while running VEP.")?;
 
-                        let mut reader_vep = ReaderBuilder::new()
-                            .delimiter(b'\t')
-                            .has_headers(false)
-                            .comment(Some(b'#'))
-                            .flexible(true)
-                            .from_reader(
-                                fs::File::open(&out_vep).context("Can't open result file.")?,
-                            );
-
+                        let mut vep_rdr = std::io::BufReader::new(
+                            fs::File::open(&out_vep).context("Can't open result file.")?,
+                        );
+                        let mut vep_line = TsvLine::new();
                         let mut lines: HashMap<u64, Vec<VepLine>> = HashMap::new();
-                        for line in reader_vep.deserialize() {
-                            let line: VepLine = line.context("Failed to deserialize VepLine")?;
+                        while vep_line.read(&mut vep_rdr)? {
+                            if vep_line.as_str().starts_with('#') || vep_line.as_str().is_empty() {
+                                continue;
+                            }
+                            let line: VepLine = vep_line.as_str().parse()
+                                .context("Failed to parse VepLine")?;
                             let key = line
                                 .uploaded_variation
                                 .parse::<u64>()
@@ -2100,16 +2101,15 @@ fn process_vep_chunk(
         return Err(anyhow::anyhow!("VEP execution failed: {}", e));
     }
 
-    let mut reader_vep = ReaderBuilder::new()
-        .delimiter(b'\t')
-        .has_headers(false)
-        .comment(Some(b'#'))
-        .flexible(true)
-        .from_reader(fs::File::open(&out_vep)?);
-
+    let mut vep_rdr = std::io::BufReader::new(fs::File::open(&out_vep)?);
+    let mut vep_line = TsvLine::new();
     let mut lines: HashMap<u64, Vec<VepLine>> = HashMap::new();
-    for line in reader_vep.deserialize() {
-        let line: VepLine = line.context("Failed to deserialize VepLine")?;
+    while vep_line.read(&mut vep_rdr)? {
+        if vep_line.as_str().starts_with('#') || vep_line.as_str().is_empty() {
+            continue;
+        }
+        let line: VepLine = vep_line.as_str().parse()
+            .context("Failed to parse VepLine")?;
         let key = line
             .uploaded_variation
             .parse::<u64>()

+ 13 - 11
src/variant/variants_stats.rs

@@ -961,23 +961,25 @@ fn flatten_glm_rows(
 }
 
 pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
+    use std::{fs::File, io::{BufWriter, Write}};
+
     let (features, flat_rows) = flatten_glm_rows(all_rows);
-    let mut writer = csv::Writer::from_path(csv_path)?;
 
     let mut headers = vec![
-        "contig",
-        "start",
-        "end",
-        "length",
-        "log_length",
-        "mutation_count",
+        "contig", "start", "end", "length", "log_length", "mutation_count",
     ];
     headers.extend(features.iter().map(|s| s.as_str()));
-    writer.write_record(&headers)?;
 
-    for row in flat_rows {
-        let values: Vec<_> = headers.iter().map(|&h| row.get(h).unwrap()).collect();
-        writer.write_record(values)?;
+    let mut writer = BufWriter::new(
+        File::create(csv_path).with_context(|| format!("failed to create {csv_path}"))?,
+    );
+
+    writeln!(writer, "{}", headers.join(","))?;
+    for row in &flat_rows {
+        let values: Vec<&str> = headers.iter()
+            .map(|&h| row.get(h).map(|s| s.as_str()).unwrap_or(""))
+            .collect();
+        writeln!(writer, "{}", values.join(","))?;
     }
     writer.flush()?;
     Ok(())

+ 39 - 14
src/vcf_reader.rs

@@ -1,11 +1,16 @@
-use std::{fs::File, io::BufReader};
+use std::{
+    fs::File,
+    io::BufReader,
+    str::FromStr,
+};
 
-use csv::ReaderBuilder;
+use anyhow::Context;
 use pandora_lib_variants::variants::Variant;
 
-use crate::{callers::Caller, variant::vcf_variant::VariantType};
+use crate::{callers::Caller, io::tsv::TsvLine, variant::vcf_variant::VariantType};
 
-#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
+/// A single row from a VCF file (tab-separated, no header row).
+#[derive(Debug, Eq, PartialEq, Clone)]
 pub struct VCFRow {
     pub chr: String,
     pub pos: u32,
@@ -19,24 +24,44 @@ pub struct VCFRow {
     pub value: String,
 }
 
+impl FromStr for VCFRow {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> {
+        let f: Vec<&str> = s.split('\t').collect();
+        let get = |i: usize, name: &str| -> anyhow::Result<&str> {
+            f.get(i).copied().ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
+        };
+        Ok(Self {
+            chr:       get(0, "chr")?.to_string(),
+            pos:       get(1, "pos")?.parse().context("bad pos")?,
+            id:        get(2, "id")?.to_string(),
+            reference: get(3, "reference")?.to_string(),
+            alt:       get(4, "alt")?.to_string(),
+            qual:      get(5, "qual")?.to_string(),
+            filter:    get(6, "filter")?.to_string(),
+            info:      get(7, "info")?.to_string(),
+            format:    get(8, "format")?.to_string(),
+            value:     get(9, "value")?.to_string(),
+        })
+    }
+}
+
 pub fn read_vcf(
     path: &str,
     caller: &Caller,
     variant_type: &VariantType,
 ) -> anyhow::Result<Vec<Variant>> {
-    let mut reader = ReaderBuilder::new()
-        .delimiter(b'\t')
-        .comment(Some(b'#'))
-        .has_headers(false)
-        .flexible(true)
-        .from_reader(get_reader(path)?);
-    let iter = reader.deserialize();
-
+    let mut reader = BufReader::new(get_reader(path)?);
+    let mut line = TsvLine::new();
     let mut all = Vec::new();
 
     // should be replaced with bcftools
-    for result in iter {
-        let record: VCFRow = result?;
+    while line.read(&mut reader)? {
+        if line.as_str().starts_with('#') || line.as_str().is_empty() {
+            continue;
+        }
+        let record: VCFRow = line.as_str().parse()?;
 
         // Normalize into multirows
         if record.alt.contains(",") {