fasta.rs 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. //! Indexed FASTA reader and utilities for reference sequence access.
  2. //!
  3. //! All coordinate parameters follow the **0-based** convention used internally
  4. //! throughout the codebase. Conversion to the 1-based noodles API is done
  5. //! transparently inside each function.
  6. use std::io::{BufRead, Write};
  7. use std::{
  8. fs::File,
  9. io::BufReader,
  10. path::{Path, PathBuf},
  11. };
  12. use anyhow::Context;
  13. use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
  14. /// Open an indexed FASTA file (requires a `.fai` index alongside the path).
  15. ///
  16. /// # Errors
  17. ///
  18. /// Returns an error if the FASTA or its index cannot be opened.
  19. pub fn open_indexed_fasta(
  20. reference: &Path,
  21. ) -> anyhow::Result<IndexedReader<NoodlesBufReader<File>>> {
  22. noodles_fasta::io::indexed_reader::Builder::default()
  23. .build_from_path(reference)
  24. .map_err(|e| anyhow::anyhow!("Failed to open indexed FASTA {}: {e}", reference.display()))
  25. }
  26. /// Fetch a fixed-width window of reference sequence centred on `pos0`.
  27. ///
  28. /// `pos0` is a **0-based** position. The window `[pos0 - len/2, pos0 - len/2 + len)`
  29. /// is fetched (clamped to the start of the contig). Returns the sequence in
  30. /// uppercase.
  31. ///
  32. /// # Errors
  33. ///
  34. /// Returns an error if the region is out of bounds or the sequence cannot be
  35. /// decoded as UTF-8.
  36. pub fn sequence_at(
  37. fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
  38. contig: &str,
  39. pos0: usize,
  40. len: usize,
  41. ) -> anyhow::Result<String> {
  42. anyhow::ensure!(len > 0, "FASTA window length must be > 0");
  43. let position = pos0 + 1; // 0-based → 1-based
  44. let start = position.saturating_sub(len / 2).max(1);
  45. let end = start
  46. .checked_add(len - 1)
  47. .context("FASTA window end coordinate overflow")?;
  48. let start = noodles_core::Position::try_from(start)?;
  49. let end = noodles_core::Position::try_from(end)?;
  50. let interval = noodles_core::region::interval::Interval::from(start..=end);
  51. let r = noodles_core::region::Region::new(contig.to_string(), interval);
  52. let record = fasta_reader.query(&r)?;
  53. Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
  54. }
  55. /// Fetch reference sequence over `[start0, end0_inclusive]` (both **0-based inclusive**).
  56. ///
  57. /// Both `start0` and `end0_inclusive` are **0-based inclusive** coordinates.
  58. /// For example, to fetch bases at 0-based positions 10, 11, 12:
  59. /// `sequence_range(reader, "chr1", 10, 12)`.
  60. ///
  61. /// Returns the sequence in uppercase.
  62. ///
  63. /// # Errors
  64. ///
  65. /// Returns an error if the region is out of bounds or the sequence cannot be
  66. /// decoded as UTF-8.
  67. pub fn sequence_range(
  68. fasta_reader: &mut IndexedReader<noodles_fasta::io::BufReader<File>>,
  69. contig: &str,
  70. start0: usize, // 0-based inclusive
  71. end0_inclusive: usize, // 0-based inclusive
  72. ) -> anyhow::Result<String> {
  73. anyhow::ensure!(
  74. start0 <= end0_inclusive,
  75. "FASTA range start ({start0}) is after end ({end0_inclusive})"
  76. );
  77. let start = noodles_core::Position::try_from(start0 + 1)?;
  78. let end = noodles_core::Position::try_from(end0_inclusive + 1)?;
  79. let interval = noodles_core::region::interval::Interval::from(start..=end);
  80. let r = noodles_core::region::Region::new(contig.to_string(), interval);
  81. let record = fasta_reader.query(&r)?;
  82. Ok(String::from_utf8(record.sequence().as_ref().to_vec())?.to_uppercase())
  83. }
  84. /// A single-contig FASTA file produced by [`split_fasta`].
  85. pub struct ContigFasta {
  86. pub name: String,
  87. pub fasta_path: PathBuf,
  88. }
  89. /// Split a multi-contig FASTA into individual single-contig files under `out_dir`.
  90. ///
  91. /// Returns one [`ContigFasta`] per contig in file order. Contig names are
  92. /// sanitised for use as filenames (only alphanumeric, `_`, `-` are kept;
  93. /// everything else becomes `_`).
  94. ///
  95. /// # Errors
  96. ///
  97. /// Returns an error if `out_dir` cannot be created, the input cannot be read,
  98. /// or any output file cannot be written.
  99. pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFasta>> {
  100. std::fs::create_dir_all(out_dir)
  101. .with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?;
  102. let reader = BufReader::new(
  103. std::fs::File::open(fasta)
  104. .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?,
  105. );
  106. let mut contigs: Vec<ContigFasta> = Vec::new();
  107. let mut current_writer: Option<std::fs::File> = None;
  108. for line in reader.lines() {
  109. let line = line.context("Error reading FASTA")?;
  110. if let Some(name) = line.strip_prefix('>') {
  111. let safe_name: String = name
  112. .split_whitespace()
  113. .next()
  114. .unwrap_or("contig")
  115. .chars()
  116. .map(|c| {
  117. if c.is_alphanumeric() || c == '_' || c == '-' {
  118. c
  119. } else {
  120. '_'
  121. }
  122. })
  123. .collect();
  124. let path = out_dir.join(format!("{safe_name}.fasta"));
  125. let mut w = std::fs::File::create(&path)
  126. .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
  127. writeln!(w, ">{name}")?;
  128. contigs.push(ContigFasta {
  129. name: safe_name,
  130. fasta_path: path,
  131. });
  132. current_writer = Some(w);
  133. } else if let Some(ref mut w) = current_writer {
  134. writeln!(w, "{line}")?;
  135. }
  136. }
  137. Ok(contigs)
  138. }
  139. /// Read a single-contig FASTA produced by split_fasta.
  140. /// Skips the header line, uppercases, replaces non-ACGT with N.
  141. /// Pure std — no noodles needed since we wrote the file ourselves.
  142. pub fn read_single_contig_fasta(path: &Path) -> anyhow::Result<Vec<u8>> {
  143. use std::io::BufRead;
  144. let f = std::fs::File::open(path)?;
  145. let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, f);
  146. let mut seq = Vec::new();
  147. for line in reader.lines() {
  148. let line = line?;
  149. if line.starts_with('>') {
  150. continue;
  151. }
  152. let sanitized = line.bytes().map(|b| match b.to_ascii_uppercase() {
  153. b'A' | b'C' | b'G' | b'T' => b.to_ascii_uppercase(),
  154. _ => b'N',
  155. });
  156. seq.extend(sanitized);
  157. }
  158. anyhow::ensure!(!seq.is_empty(), "empty contig in {}", path.display());
  159. Ok(seq)
  160. }
  161. pub struct FaiEntry {
  162. pub name: String,
  163. pub len: usize,
  164. }
  165. pub fn read_fai(path: &Path) -> anyhow::Result<Vec<FaiEntry>> {
  166. use std::io::BufRead;
  167. let f = std::fs::File::open(path)?;
  168. std::io::BufReader::new(f)
  169. .lines()
  170. .map(|l| {
  171. let l = l?;
  172. let mut cols = l.split('\t');
  173. let name = cols.next().context("fai: missing name")?.to_owned();
  174. let len = cols
  175. .next()
  176. .context("fai: missing len")?
  177. .parse::<usize>()
  178. .context("fai: bad len")?;
  179. Ok(FaiEntry { name, len })
  180. })
  181. .collect()
  182. }