瀏覽代碼

fix and audit fasta.rs, gff.rs, liftover.rs

liftover.rs — fix silent I/O error swallowing (REAL BUG):
  .map_while(Result::ok) dropped read errors and silently truncated the
  chain file; the liftover machine was then built from incomplete data
  with no indication to the caller. Replaced with .map(|r| r.context(...)?)
  + .collect::<anyhow::Result<_>>()?  so errors propagate correctly.

fasta.rs:
  - Rename sequence_range parameters: end -> end0_inclusive to make the
    0-based inclusive convention explicit (both callers pass inclusive end)
  - Remove dead current_name variable in split_fasta
  - Add //! module header and rustdoc on all public items

gff.rs:
  - Add //! module header documenting coordinate conversion
    (GFF3 1-based inclusive -> 0-based half-open via from_1_inclusive)
  - Rustdoc pass on features_ranges

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 月之前
父節點
當前提交
2d9b97dcea
共有 3 個文件被更改,包括 103 次插入64 次删除
  1. 64 42
      src/io/fasta.rs
  2. 23 11
      src/io/gff.rs
  3. 16 11
      src/io/liftover.rs

+ 64 - 42
src/io/fasta.rs

@@ -1,36 +1,45 @@
+//! Indexed FASTA reader and utilities for reference sequence access.
+//!
+//! All coordinate parameters follow the **0-based** convention used internally
+//! throughout the codebase. Conversion to the 1-based noodles API is done
+//! transparently inside each function.
+
 use std::io::{BufRead, Write};
 use std::{fs::File, io::BufReader, path::{Path, PathBuf}};
 
 use anyhow::Context;
 use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
 
-/// Open FASTA with its .fai index.
+/// Open an indexed FASTA file (requires a `.fai` index alongside the path).
 ///
-/// Note: noodles API differs slightly by version; adjust this function if needed.
-/// Commonly you either:
-/// - use a Builder that reads `{path}.fai`, or
-/// - create an IndexedReader from a reader + index.
+/// # Errors
 ///
-/// This version assumes there is a `indexed_reader::Builder`.
+/// Returns an error if the FASTA or its index cannot be opened.
 pub fn open_indexed_fasta(reference: &Path) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
     noodles_fasta::io::indexed_reader::Builder::default()
         .build_from_path(reference)
         .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
 }
 
-// 0-based position in input
+/// Fetch a fixed-width window of reference sequence centred on `pos0`.
+///
+/// `pos0` is a **0-based** position. The window `[pos0 - len/2, pos0 - len/2 + len)`
+/// is fetched (clamped to the start of the contig). Returns the sequence in
+/// uppercase.
+///
+/// # Errors
+///
+/// Returns an error if the region is out of bounds or the sequence cannot be
+/// decoded as UTF-8.
 pub fn sequence_at(
     fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
     contig: &str,
     pos0: usize,
     len: usize,
 ) -> anyhow::Result<String> {
-    // convert to 1-based
-    let position = pos0 + 1;
-
+    let position = pos0 + 1; // 0-based → 1-based
     let start = position.saturating_sub(len / 2).max(1);
     let end = start + len - 1;
-    // debug!("Region {contig}:{start}-{end} (1-based inclusive)");
 
     let start = noodles_core::Position::try_from(start)?;
     let end = noodles_core::Position::try_from(end)?;
@@ -38,51 +47,67 @@ pub fn sequence_at(
 
     let r = noodles_core::region::Region::new(contig.to_string(), interval);
     let record = fasta_reader.query(&r)?;
-    let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
-
-    Ok(s)
+    Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
 }
 
+/// Fetch reference sequence over `[start0, end0_inclusive]` (both **0-based inclusive**).
+///
+/// Both `start0` and `end0_inclusive` are **0-based inclusive** coordinates.
+/// For example, to fetch bases at 0-based positions 10, 11, 12:
+/// `sequence_range(reader, "chr1", 10, 12)`.
+///
+/// Returns the sequence in uppercase.
+///
+/// # Errors
+///
+/// Returns an error if the region is out of bounds or the sequence cannot be
+/// decoded as UTF-8.
 pub fn sequence_range(
     fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
     contig: &str,
-    start: usize,
-    end: usize,
+    start0: usize,          // 0-based inclusive
+    end0_inclusive: usize,  // 0-based inclusive
 ) -> anyhow::Result<String> {
-    let start = noodles_core::Position::try_from(start + 1)?;
-    let end = noodles_core::Position::try_from(end + 1)?;
+    let start = noodles_core::Position::try_from(start0 + 1)?;
+    let end   = noodles_core::Position::try_from(end0_inclusive + 1)?;
     let interval = noodles_core::region::interval::Interval::from(start..=end);
 
     let r = noodles_core::region::Region::new(contig.to_string(), interval);
     let record = fasta_reader.query(&r)?;
-    let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
-
-    Ok(s)
+    Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
 }
 
+/// A single-contig FASTA file produced by [`split_fasta`].
 pub struct ContigFasta {
     pub name:       String,
     pub fasta_path: PathBuf,
 }
- 
+
 /// Split a multi-contig FASTA into individual single-contig files under `out_dir`.
-/// Returns one entry per contig in the order they appear in the input.
+///
+/// Returns one [`ContigFasta`] per contig in file order. Contig names are
+/// sanitised for use as filenames (only alphanumeric, `_`, `-` are kept;
+/// everything else becomes `_`).
+///
+/// # Errors
+///
+/// Returns an error if `out_dir` cannot be created, the input cannot be read,
+/// or any output file cannot be written.
 pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFasta>> {
     std::fs::create_dir_all(out_dir)
         .with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?;
- 
-    let f = std::fs::File::open(fasta)
-        .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?;
-    let reader = BufReader::new(f);
- 
-    let mut contigs  = Vec::new();
-    let mut current_name: Option<String> = None;
+
+    let reader = BufReader::new(
+        std::fs::File::open(fasta)
+            .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?,
+    );
+
+    let mut contigs: Vec<ContigFasta> = Vec::new();
     let mut current_writer: Option<std::fs::File> = None;
- 
+
     for line in reader.lines() {
         let line = line.context("Error reading FASTA")?;
         if let Some(name) = line.strip_prefix('>') {
-            // Sanitise contig name for use as filename
             let safe_name: String = name
                 .split_whitespace()
                 .next()
@@ -90,21 +115,18 @@ pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFas
                 .chars()
                 .map(|c| if c.is_alphanumeric() || c == '_' || c == '-' { c } else { '_' })
                 .collect();
- 
-            let path = out_dir.join(format!("{}.fasta", safe_name));
+
+            let path = out_dir.join(format!("{safe_name}.fasta"));
             let mut w = std::fs::File::create(&path)
                 .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
-            writeln!(w, ">{}", name)?;
- 
-            contigs.push(ContigFasta { name: safe_name.clone(), fasta_path: path });
-            current_name   = Some(safe_name);
+            writeln!(w, ">{name}")?;
+
+            contigs.push(ContigFasta { name: safe_name, fasta_path: path });
             current_writer = Some(w);
         } else if let Some(ref mut w) = current_writer {
-            writeln!(w, "{}", line)?;
+            writeln!(w, "{line}")?;
         }
     }
- 
-    let _ = current_name; // suppress unused warning
+
     Ok(contigs)
 }
-

+ 23 - 11
src/io/gff.rs

@@ -1,3 +1,10 @@
+//! GFF3 feature range extraction.
+//!
+//! Reads a GFF3 file (BGZF-compressed) and returns genomic ranges for a given
+//! feature type. GFF3 uses **1-based inclusive** coordinates; the ranges are
+//! converted to **0-based half-open** [`GenomeRange`] values via
+//! [`GenomeRange::from_1_inclusive`].
+
 use anyhow::Context;
 use noodles_gff as gff;
 
@@ -5,15 +12,23 @@ use crate::{config::Config, positions::GenomeRange};
 
 use super::readers::get_gz_reader;
 
+/// Return all genomic ranges matching `feature_type` in the configured GFF3 file.
+///
+/// Reads `config.refseq_gff` (BGZF-compressed GFF3). Coordinates are converted
+/// from GFF3 1-based inclusive to 0-based half-open [`GenomeRange`].
+///
 /// # Arguments
-/// * `feature_type` - GFF feature type to filter (e.g., "gene", "exon", "CDS", "transcript")
-/// * `config` - Application configuration containing refseq_gff path
-/// 
+///
+/// * `feature_type` - GFF3 feature type to filter (e.g. `"gene"`, `"exon"`, `"CDS"`)
+/// * `config` - Application configuration supplying the GFF3 path
+///
 /// # Returns
-/// Vec of genomic ranges matching specified feature type
-/// 
+///
+/// All ranges matching `feature_type`, in file order.
+///
 /// # Errors
-/// Returns anyhow::Error for I/O failures and parsing errors
+///
+/// Returns an error if the GFF3 file cannot be opened or a record fails to parse.
 pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
     let path = &config.refseq_gff;
     let mut reader = get_gz_reader(path)
@@ -22,12 +37,9 @@ pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Ve
 
     let mut res = Vec::new();
     for result in reader.record_bufs() {
-        let record = result.with_context(|| {
-            format!("Error reading GFF file {path}")
-        })?;
+        let record = result.with_context(|| format!("Error reading GFF file {path}"))?;
 
-        let ty = record.ty();
-        if ty != feature_type {
+        if record.ty() != feature_type {
             continue;
         }
 

+ 16 - 11
src/io/liftover.rs

@@ -1,3 +1,5 @@
+//! UCSC chain file parser and coordinate liftover machine builder.
+
 use crate::io::readers::get_reader;
 use anyhow::Context;
 use chainfile::liftover::machine::Builder;
@@ -64,22 +66,25 @@ pub fn build_machine_from_chain<P: AsRef<Path>>(
     let path = chain_path.as_ref();
     let p = path.to_str().context("path to string")?;
 
-    let reader = get_reader(p)?;
-    let buf_reader = BufReader::new(reader);
+    // BufReader is needed to satisfy BufRead for .lines(); get_reader returns
+    // Box<dyn Read> not Box<dyn BufRead>.
+    let reader = BufReader::new(get_reader(p)?);
 
-    // Normalize space-delimited alignment blocks to tab-delimited format
-    let normalized: Vec<u8> = buf_reader
+    // Normalize space-delimited alignment blocks to tab-delimited format.
+    // Collect through Result so I/O errors are propagated rather than silently
+    // truncating the chain file (which would corrupt all downstream liftover results).
+    let normalized: Vec<u8> = reader
         .lines()
-        .map_while(Result::ok)
-        .map(|line| {
-            if line.starts_with("chain") || line.is_empty() {
-                line // Keep header/empty lines as-is
+        .map(|result| {
+            let line = result.context("I/O error reading chain file")?;
+            let out = if line.starts_with("chain") || line.is_empty() {
+                line
             } else {
-                // Convert alignment block spaces to tabs
                 line.split_whitespace().collect::<Vec<_>>().join("\t")
-            }
+            };
+            Ok(out)
         })
-        .collect::<Vec<_>>()
+        .collect::<anyhow::Result<Vec<_>>>()?
         .join("\n")
         .into_bytes();