| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- //! Indexed FASTA reader and utilities for reference sequence access.
- //!
- //! All coordinate parameters follow the **0-based** convention used internally
- //! throughout the codebase. Conversion to the 1-based noodles API is done
- //! transparently inside each function.
- use std::io::{BufRead, Write};
- use std::{
- fs::File,
- io::BufReader,
- path::{Path, PathBuf},
- };
- use anyhow::Context;
- use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
- /// Open an indexed FASTA file (requires a `.fai` index alongside the path).
- ///
- /// # Errors
- ///
- /// Returns an error if the FASTA or its index cannot be opened.
- pub fn open_indexed_fasta(
- reference: &Path,
- ) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
- noodles_fasta::io::indexed_reader::Builder::default()
- .build_from_path(reference)
- .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
- }
- /// Fetch a fixed-width window of reference sequence centred on `pos0`.
- ///
- /// `pos0` is a **0-based** position. The window `[pos0 - len/2, pos0 - len/2 + len)`
- /// is fetched (clamped to the start of the contig). Returns the sequence in
- /// uppercase.
- ///
- /// # Errors
- ///
- /// Returns an error if the region is out of bounds or the sequence cannot be
- /// decoded as UTF-8.
- pub fn sequence_at(
- fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
- contig: &str,
- pos0: usize,
- len: usize,
- ) -> anyhow::Result<String> {
- anyhow::ensure!(len > 0, "FASTA window length must be > 0");
- let position = pos0 + 1; // 0-based → 1-based
- let start = position.saturating_sub(len / 2).max(1);
- let end = start
- .checked_add(len - 1)
- .context("FASTA window end coordinate overflow")?;
- let start = noodles_core::Position::try_from(start)?;
- let end = noodles_core::Position::try_from(end)?;
- let interval = noodles_core::region::interval::Interval::from(start..=end);
- let r = noodles_core::region::Region::new(contig.to_string(), interval);
- let record = fasta_reader.query(&r)?;
- Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
- }
- /// Fetch reference sequence over `[start0, end0_inclusive]` (both **0-based inclusive**).
- ///
- /// Both `start0` and `end0_inclusive` are **0-based inclusive** coordinates.
- /// For example, to fetch bases at 0-based positions 10, 11, 12:
- /// `sequence_range(reader, "chr1", 10, 12)`.
- ///
- /// Returns the sequence in uppercase.
- ///
- /// # Errors
- ///
- /// Returns an error if the region is out of bounds or the sequence cannot be
- /// decoded as UTF-8.
- pub fn sequence_range(
- fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
- contig: &str,
- start0: usize, // 0-based inclusive
- end0_inclusive: usize, // 0-based inclusive
- ) -> anyhow::Result<String> {
- anyhow::ensure!(
- start0 <= end0_inclusive,
- "FASTA range start ({start0}) is after end ({end0_inclusive})"
- );
- let start = noodles_core::Position::try_from(start0 + 1)?;
- let end = noodles_core::Position::try_from(end0_inclusive + 1)?;
- let interval = noodles_core::region::interval::Interval::from(start..=end);
- let r = noodles_core::region::Region::new(contig.to_string(), interval);
- let record = fasta_reader.query(&r)?;
- Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
- }
- /// A single-contig FASTA file produced by [`split_fasta`].
- pub struct ContigFasta {
- pub name: String,
- pub fasta_path: PathBuf,
- }
- /// Split a multi-contig FASTA into individual single-contig files under `out_dir`.
- ///
- /// Returns one [`ContigFasta`] per contig in file order. Contig names are
- /// sanitised for use as filenames (only alphanumeric, `_`, `-` are kept;
- /// everything else becomes `_`).
- ///
- /// # Errors
- ///
- /// Returns an error if `out_dir` cannot be created, the input cannot be read,
- /// or any output file cannot be written.
- pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFasta>> {
- std::fs::create_dir_all(out_dir)
- .with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?;
- let reader = BufReader::new(
- std::fs::File::open(fasta)
- .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?,
- );
- let mut contigs: Vec<ContigFasta> = Vec::new();
- let mut current_writer: Option<std::fs::File> = None;
- for line in reader.lines() {
- let line = line.context("Error reading FASTA")?;
- if let Some(name) = line.strip_prefix('>') {
- let safe_name: String = name
- .split_whitespace()
- .next()
- .unwrap_or("contig")
- .chars()
- .map(|c| {
- if c.is_alphanumeric() || c == '_' || c == '-' {
- c
- } else {
- '_'
- }
- })
- .collect();
- let path = out_dir.join(format!("{safe_name}.fasta"));
- let mut w = std::fs::File::create(&path)
- .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
- writeln!(w, ">{name}")?;
- contigs.push(ContigFasta {
- name: safe_name,
- fasta_path: path,
- });
- current_writer = Some(w);
- } else if let Some(ref mut w) = current_writer {
- writeln!(w, "{line}")?;
- }
- }
- Ok(contigs)
- }
- /// Read a single-contig FASTA produced by split_fasta.
- /// Skips the header line, uppercases, replaces non-ACGT with N.
- /// Pure std — no noodles needed since we wrote the file ourselves.
- pub fn read_single_contig_fasta(path: &Path) -> anyhow::Result<Vec<u8>> {
- use std::io::BufRead;
- let f = std::fs::File::open(path)?;
- let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, f);
- let mut seq = Vec::new();
- for line in reader.lines() {
- let line = line?;
- if line.starts_with('>') {
- continue;
- }
- let sanitized = line.bytes().map(|b| match b.to_ascii_uppercase() {
- b'A' | b'C' | b'G' | b'T' => b.to_ascii_uppercase(),
- _ => b'N',
- });
- seq.extend(sanitized);
- }
- anyhow::ensure!(!seq.is_empty(), "empty contig in {}", path.display());
- Ok(seq)
- }
- pub struct FaiEntry {
- pub name: String,
- pub len: usize,
- }
- pub fn read_fai(path: &Path) -> anyhow::Result<Vec<FaiEntry>> {
- use std::io::BufRead;
- let f = std::fs::File::open(path)?;
- std::io::BufReader::new(f)
- .lines()
- .map(|l| {
- let l = l?;
- let mut cols = l.split('\t');
- let name = cols.next().context("fai: missing name")?.to_owned();
- let len = cols
- .next()
- .context("fai: missing len")?
- .parse::<usize>()
- .context("fai: bad len")?;
- Ok(FaiEntry { name, len })
- })
- .collect()
- }
|