//! Indexed FASTA reader and utilities for reference sequence access. //! //! All coordinate parameters follow the **0-based** convention used internally //! throughout the codebase. Conversion to the 1-based noodles API is done //! transparently inside each function. use std::io::{BufRead, Write}; use std::{ fs::File, io::BufReader, path::{Path, PathBuf}, }; use anyhow::Context; use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader}; /// Open an indexed FASTA file (requires a `.fai` index alongside the path). /// /// # Errors /// /// Returns an error if the FASTA or its index cannot be opened. pub fn open_indexed_fasta( reference: &Path, ) -> anyhow::Result>> { noodles_fasta::io::indexed_reader::Builder::default() .build_from_path(reference) .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display())) } /// Fetch a fixed-width window of reference sequence centred on `pos0`. /// /// `pos0` is a **0-based** position. The window `[pos0 - len/2, pos0 - len/2 + len)` /// is fetched (clamped to the start of the contig). Returns the sequence in /// uppercase. /// /// # Errors /// /// Returns an error if the region is out of bounds or the sequence cannot be /// decoded as UTF-8. pub fn sequence_at( fasta_reader: &mut IndexedReader>, contig: &str, pos0: usize, len: usize, ) -> anyhow::Result { anyhow::ensure!(len > 0, "FASTA window length must be > 0"); let position = pos0 + 1; // 0-based → 1-based let start = position.saturating_sub(len / 2).max(1); let end = start .checked_add(len - 1) .context("FASTA window end coordinate overflow")?; let start = noodles_core::Position::try_from(start)?; let end = noodles_core::Position::try_from(end)?; let interval = noodles_core::region::interval::Interval::from(start..=end); let r = noodles_core::region::Region::new(contig.to_string(), interval); let record = fasta_reader.query(&r)?; Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase()) } /// Fetch reference sequence over `[start0, end0_inclusive]` (both **0-based inclusive**). /// /// Both `start0` and `end0_inclusive` are **0-based inclusive** coordinates. /// For example, to fetch bases at 0-based positions 10, 11, 12: /// `sequence_range(reader, "chr1", 10, 12)`. /// /// Returns the sequence in uppercase. /// /// # Errors /// /// Returns an error if the region is out of bounds or the sequence cannot be /// decoded as UTF-8. pub fn sequence_range( fasta_reader: &mut IndexedReader>, contig: &str, start0: usize, // 0-based inclusive end0_inclusive: usize, // 0-based inclusive ) -> anyhow::Result { anyhow::ensure!( start0 <= end0_inclusive, "FASTA range start ({start0}) is after end ({end0_inclusive})" ); let start = noodles_core::Position::try_from(start0 + 1)?; let end = noodles_core::Position::try_from(end0_inclusive + 1)?; let interval = noodles_core::region::interval::Interval::from(start..=end); let r = noodles_core::region::Region::new(contig.to_string(), interval); let record = fasta_reader.query(&r)?; Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase()) } /// A single-contig FASTA file produced by [`split_fasta`]. pub struct ContigFasta { pub name: String, pub fasta_path: PathBuf, } /// Split a multi-contig FASTA into individual single-contig files under `out_dir`. /// /// Returns one [`ContigFasta`] per contig in file order. Contig names are /// sanitised for use as filenames (only alphanumeric, `_`, `-` are kept; /// everything else becomes `_`). /// /// # Errors /// /// Returns an error if `out_dir` cannot be created, the input cannot be read, /// or any output file cannot be written. pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result> { std::fs::create_dir_all(out_dir) .with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?; let reader = BufReader::new( std::fs::File::open(fasta) .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?, ); let mut contigs: Vec = Vec::new(); let mut current_writer: Option = None; for line in reader.lines() { let line = line.context("Error reading FASTA")?; if let Some(name) = line.strip_prefix('>') { let safe_name: String = name .split_whitespace() .next() .unwrap_or("contig") .chars() .map(|c| { if c.is_alphanumeric() || c == '_' || c == '-' { c } else { '_' } }) .collect(); let path = out_dir.join(format!("{safe_name}.fasta")); let mut w = std::fs::File::create(&path) .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?; writeln!(w, ">{name}")?; contigs.push(ContigFasta { name: safe_name, fasta_path: path, }); current_writer = Some(w); } else if let Some(ref mut w) = current_writer { writeln!(w, "{line}")?; } } Ok(contigs) } /// Read a single-contig FASTA produced by split_fasta. /// Skips the header line, uppercases, replaces non-ACGT with N. /// Pure std — no noodles needed since we wrote the file ourselves. pub fn read_single_contig_fasta(path: &Path) -> anyhow::Result> { use std::io::BufRead; let f = std::fs::File::open(path)?; let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, f); let mut seq = Vec::new(); for line in reader.lines() { let line = line?; if line.starts_with('>') { continue; } let sanitized = line.bytes().map(|b| match b.to_ascii_uppercase() { b'A' | b'C' | b'G' | b'T' => b.to_ascii_uppercase(), _ => b'N', }); seq.extend(sanitized); } anyhow::ensure!(!seq.is_empty(), "empty contig in {}", path.display()); Ok(seq) } pub struct FaiEntry { pub name: String, pub len: usize, } pub fn read_fai(path: &Path) -> anyhow::Result> { use std::io::BufRead; let f = std::fs::File::open(path)?; std::io::BufReader::new(f) .lines() .map(|l| { let l = l?; let mut cols = l.split('\t'); let name = cols.next().context("fai: missing name")?.to_owned(); let len = cols .next() .context("fai: missing len")? .parse::() .context("fai: bad len")?; Ok(FaiEntry { name, len }) }) .collect() }