|
|
@@ -1,36 +1,45 @@
|
|
|
+//! Indexed FASTA reader and utilities for reference sequence access.
|
|
|
+//!
|
|
|
+//! All coordinate parameters follow the **0-based** convention used internally
|
|
|
+//! throughout the codebase. Conversion to the 1-based noodles API is done
|
|
|
+//! transparently inside each function.
|
|
|
+
|
|
|
use std::io::{BufRead, Write};
|
|
|
use std::{fs::File, io::BufReader, path::{Path, PathBuf}};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
|
|
|
|
|
|
-/// Open FASTA with its .fai index.
|
|
|
+/// Open an indexed FASTA file (requires a `.fai` index alongside the path).
|
|
|
///
|
|
|
-/// Note: noodles API differs slightly by version; adjust this function if needed.
|
|
|
-/// Commonly you either:
|
|
|
-/// - use a Builder that reads `{path}.fai`, or
|
|
|
-/// - create an IndexedReader from a reader + index.
|
|
|
+/// # Errors
|
|
|
///
|
|
|
-/// This version assumes there is a `indexed_reader::Builder`.
|
|
|
+/// Returns an error if the FASTA or its index cannot be opened.
|
|
|
pub fn open_indexed_fasta(reference: &Path) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
|
|
|
noodles_fasta::io::indexed_reader::Builder::default()
|
|
|
.build_from_path(reference)
|
|
|
.map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
|
|
|
}
|
|
|
|
|
|
-// 0-based position in input
|
|
|
+/// Fetch a fixed-width window of reference sequence centred on `pos0`.
|
|
|
+///
|
|
|
+/// `pos0` is a **0-based** position. The window `[pos0 - len/2, pos0 - len/2 + len)`
|
|
|
+/// is fetched (clamped to the start of the contig). Returns the sequence in
|
|
|
+/// uppercase.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the region is out of bounds or the sequence cannot be
|
|
|
+/// decoded as UTF-8.
|
|
|
pub fn sequence_at(
|
|
|
fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
|
|
|
contig: &str,
|
|
|
pos0: usize,
|
|
|
len: usize,
|
|
|
) -> anyhow::Result<String> {
|
|
|
- // convert to 1-based
|
|
|
- let position = pos0 + 1;
|
|
|
-
|
|
|
+ let position = pos0 + 1; // 0-based → 1-based
|
|
|
let start = position.saturating_sub(len / 2).max(1);
|
|
|
let end = start + len - 1;
|
|
|
- // debug!("Region {contig}:{start}-{end} (1-based inclusive)");
|
|
|
|
|
|
let start = noodles_core::Position::try_from(start)?;
|
|
|
let end = noodles_core::Position::try_from(end)?;
|
|
|
@@ -38,51 +47,67 @@ pub fn sequence_at(
|
|
|
|
|
|
let r = noodles_core::region::Region::new(contig.to_string(), interval);
|
|
|
let record = fasta_reader.query(&r)?;
|
|
|
- let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
|
|
|
-
|
|
|
- Ok(s)
|
|
|
+ Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
|
|
|
}
|
|
|
|
|
|
+/// Fetch reference sequence over `[start0, end0_inclusive]` (both **0-based inclusive**).
|
|
|
+///
|
|
|
+/// Both `start0` and `end0_inclusive` are **0-based inclusive** coordinates.
|
|
|
+/// For example, to fetch bases at 0-based positions 10, 11, 12:
|
|
|
+/// `sequence_range(reader, "chr1", 10, 12)`.
|
|
|
+///
|
|
|
+/// Returns the sequence in uppercase.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the region is out of bounds or the sequence cannot be
|
|
|
+/// decoded as UTF-8.
|
|
|
pub fn sequence_range(
|
|
|
fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
|
|
|
contig: &str,
|
|
|
- start: usize,
|
|
|
- end: usize,
|
|
|
+ start0: usize, // 0-based inclusive
|
|
|
+ end0_inclusive: usize, // 0-based inclusive
|
|
|
) -> anyhow::Result<String> {
|
|
|
- let start = noodles_core::Position::try_from(start + 1)?;
|
|
|
- let end = noodles_core::Position::try_from(end + 1)?;
|
|
|
+ let start = noodles_core::Position::try_from(start0 + 1)?;
|
|
|
+ let end = noodles_core::Position::try_from(end0_inclusive + 1)?;
|
|
|
let interval = noodles_core::region::interval::Interval::from(start..=end);
|
|
|
|
|
|
let r = noodles_core::region::Region::new(contig.to_string(), interval);
|
|
|
let record = fasta_reader.query(&r)?;
|
|
|
- let s = String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase();
|
|
|
-
|
|
|
- Ok(s)
|
|
|
+ Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
|
|
|
}
|
|
|
|
|
|
+/// A single-contig FASTA file produced by [`split_fasta`].
|
|
|
pub struct ContigFasta {
|
|
|
pub name: String,
|
|
|
pub fasta_path: PathBuf,
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/// Split a multi-contig FASTA into individual single-contig files under `out_dir`.
|
|
|
-/// Returns one entry per contig in the order they appear in the input.
|
|
|
+///
|
|
|
+/// Returns one [`ContigFasta`] per contig in file order. Contig names are
|
|
|
+/// sanitised for use as filenames (only alphanumeric, `_`, `-` are kept;
|
|
|
+/// everything else becomes `_`).
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if `out_dir` cannot be created, the input cannot be read,
|
|
|
+/// or any output file cannot be written.
|
|
|
pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFasta>> {
|
|
|
std::fs::create_dir_all(out_dir)
|
|
|
.with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?;
|
|
|
-
|
|
|
- let f = std::fs::File::open(fasta)
|
|
|
- .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?;
|
|
|
- let reader = BufReader::new(f);
|
|
|
-
|
|
|
- let mut contigs = Vec::new();
|
|
|
- let mut current_name: Option<String> = None;
|
|
|
+
|
|
|
+ let reader = BufReader::new(
|
|
|
+ std::fs::File::open(fasta)
|
|
|
+ .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?,
|
|
|
+ );
|
|
|
+
|
|
|
+ let mut contigs: Vec<ContigFasta> = Vec::new();
|
|
|
let mut current_writer: Option<std::fs::File> = None;
|
|
|
-
|
|
|
+
|
|
|
for line in reader.lines() {
|
|
|
let line = line.context("Error reading FASTA")?;
|
|
|
if let Some(name) = line.strip_prefix('>') {
|
|
|
- // Sanitise contig name for use as filename
|
|
|
let safe_name: String = name
|
|
|
.split_whitespace()
|
|
|
.next()
|
|
|
@@ -90,21 +115,18 @@ pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFas
|
|
|
.chars()
|
|
|
.map(|c| if c.is_alphanumeric() || c == '_' || c == '-' { c } else { '_' })
|
|
|
.collect();
|
|
|
-
|
|
|
- let path = out_dir.join(format!("{}.fasta", safe_name));
|
|
|
+
|
|
|
+ let path = out_dir.join(format!("{safe_name}.fasta"));
|
|
|
let mut w = std::fs::File::create(&path)
|
|
|
.with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
|
|
|
- writeln!(w, ">{}", name)?;
|
|
|
-
|
|
|
- contigs.push(ContigFasta { name: safe_name.clone(), fasta_path: path });
|
|
|
- current_name = Some(safe_name);
|
|
|
+ writeln!(w, ">{name}")?;
|
|
|
+
|
|
|
+ contigs.push(ContigFasta { name: safe_name, fasta_path: path });
|
|
|
current_writer = Some(w);
|
|
|
} else if let Some(ref mut w) = current_writer {
|
|
|
- writeln!(w, "{}", line)?;
|
|
|
+ writeln!(w, "{line}")?;
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- let _ = current_name; // suppress unused warning
|
|
|
+
|
|
|
Ok(contigs)
|
|
|
}
|
|
|
-
|