Преглед на файлове

add //! module header to io/mod.rs; fix fastq.rs buffering

io/mod.rs: comprehensive //! header documenting all submodules, the
0-based half-open coordinate convention throughout, and the BGZF assumption.
Submodule declarations sorted alphabetically.

fastq.rs: wrap File::create in BufWriter — previously every writeln! and
write_all issued a separate syscall; now writes are batched. Add //! header
and proper rustdoc on write_fastq.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas преди 1 месец
родител
ревизия
e4ef7537d2
променени са 2 файла, в които са добавени 75 реда и са изтрити 24 реда
  1. 30 15
      src/io/fastq.rs
  2. 45 9
      src/io/mod.rs

+ 30 - 15
src/io/fastq.rs

@@ -1,35 +1,50 @@
-use std::{io::Write, path::Path};
+//! FASTQ writer from BAM/CRAM records.
+
+use std::{
+    io::{BufWriter, Write},
+    path::Path,
+};
 
 use anyhow::Context;
 use rust_htslib::bam;
 
-// ─── FASTQ writer ─────────────────────────────────────────────────────────────
- 
-/// Write htslib records to FASTQ.
+/// Write a slice of BAM records to a FASTQ file.
+///
+/// - Sequence is written exactly as stored in the BAM record (reference forward-strand
+///   orientation); alignment flags are ignored, so no read is reverse-complemented
+/// - Quality scores are converted from phred+0 (HTSlib internal) to phred+33 ASCII
+/// - Records with empty sequences are silently skipped
+///
+/// # Arguments
+///
+/// * `records` - BAM records to write
+/// * `out` - Destination FASTQ file path (created or overwritten)
+///
+/// # Errors
 ///
-/// - htslib always stores the forward-strand sequence regardless of flag
-/// - Converts phred+0 qual to phred+33 ASCII
-/// - Skips records with empty sequence
+/// Returns an error if the file cannot be created, a read name is not valid UTF-8, or a write fails.
 pub fn write_fastq(records: &[bam::Record], out: &Path) -> anyhow::Result<()> {
-    let mut f = std::fs::File::create(out)
-        .with_context(|| format!("Cannot create FASTQ: {}", out.display()))?;
- 
+    let mut f = BufWriter::new(
+        std::fs::File::create(out)
+            .with_context(|| format!("Cannot create FASTQ: {}", out.display()))?,
+    );
+
     for rec in records {
         let seq = rec.seq().as_bytes();
         if seq.is_empty() {
             continue;
         }
- 
-        let name       = std::str::from_utf8(rec.qname()).context("Non-UTF8 read name")?;
+
+        let name = std::str::from_utf8(rec.qname()).context("Non-UTF8 read name")?;
         let qual_ascii: Vec<u8> = rec.qual().iter().map(|&q| q + 33).collect();
- 
-        writeln!(f, "@{}", name)?;
+
+        writeln!(f, "@{name}")?;
         f.write_all(&seq)?;
         writeln!(f)?;
         writeln!(f, "+")?;
         f.write_all(&qual_ascii)?;
         writeln!(f)?;
     }
- 
+
     Ok(())
 }

+ 45 - 9
src/io/mod.rs

@@ -1,15 +1,51 @@
-pub mod pod5_infos;
-pub mod readers;
-pub mod vcf;
+//! File I/O for all genomic formats used by Pandora.
+//!
+//! # Coordinate convention
+//!
+//! All types and functions in this module use **0-based, half-open `[start, end)`**
+//! coordinates unless explicitly documented otherwise. This matches the BED format,
+//! Rust's `Range<u32>`, and the internal [`GenomeRange`](crate::positions::GenomeRange)
+//! representation. Conversions to/from 1-based formats (GFF3, VCF POS, SAM POS,
+//! Tabix positions) are handled internally and noted in each function's documentation.
+//!
+//! # BGZF vs standard gzip
+//!
+//! All `.gz` files are treated as **BGZF** (block gzip), not standard gzip.
+//! BGZF is produced by `bgzip` and used by BAM, VCF.gz, BED.gz, etc.
+//! Plain `gzip` output will not decompress correctly. See [`readers`] for details.
+//!
+//! # Submodules
+//!
+//! | Module | Purpose |
+//! |--------|---------|
+//! | [`bam`] | BAM/CRAM reading, SA-tag parsing, fold-back inversion detection |
+//! | [`bed`] | BED file I/O, overlap queries, gene annotation, tabix compression |
+//! | [`vcf`] | VCF file I/O with BGZF + Tabix index |
+//! | [`fasta`] | Indexed FASTA access, contig splitting |
+//! | [`gff`] | GFF3 feature range extraction |
+//! | [`modkit`] | Modkit bedMethyl pileup parsing, epigenetic activity computation |
+//! | [`straglr`] | Straglr STR genotyper TSV parsing |
+//! | [`liftover`] | UCSC chain file parsing and coordinate liftover |
+//! | [`readers`] | Generic BGZF/plain readers, Tabix region fetch (`fetch_tabix_lines_with`) |
+//! | [`writers`] | BGZF writers, `BgzTabixWriter` for combined BGZF + Tabix output |
+//! | [`tsv`] | `TsvLine` — reusable delimiter-agnostic line buffer (replaces `csv::ByteRecord`) |
+//! | [`dict`] | Sequence dictionary (`.dict`) reader |
+//! | [`fastq`] | FASTQ writer from BAM records |
+//! | [`pod5_infos`] | POD5 run metadata extraction via Arrow IPC + flatbuffers |
+//! | [`pod5_footer_generated`] | Auto-generated flatbuffers types for the POD5 footer |
+
+pub mod bam;
 pub mod bed;
 pub mod dict;
 pub mod fasta;
-pub mod pod5_footer_generated;
+pub mod fastq;
 pub mod gff;
-pub mod bam;
-pub mod writers;
+pub mod liftover;
+pub mod modkit;
+pub mod pod5_footer_generated;
+pub mod pod5_infos;
+pub mod readers;
 pub mod straglr;
 pub mod tsv;
-pub mod modkit;
-pub mod liftover;
-pub mod fastq;
+pub mod vcf;
+pub mod writers;