vcf.rs 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. //! VCF file I/O utilities.
  2. //!
  3. //! Write operations produce BGZF-compressed output with an accompanying Tabix
  4. //! index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV without
  5. //! a separate indexing step. Read operations support both sequential loading and
  6. //! random-access region queries via the Tabix index.
  7. //!
  8. //! | Function | Purpose |
  9. //! |----------|---------|
  10. //! | [`read_vcf`] | Load a full VCF file into memory |
  11. //! | [`write_vcf`] | Write variants to `.vcf.gz` + `.tbi` atomically |
  12. //! | [`fetch_vcf`] | Random-access region fetch from a Tabix-indexed `.vcf.gz` |
  13. //! | [`vcf_header`] | Build a full VCF header with contig lines and INFO definitions |
  14. //!
  15. //! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
  16. //! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
  17. use std::io::{BufRead, BufReader};
  18. use anyhow::Context;
  19. use log::{info, warn};
  20. use noodles_core::Position;
  21. use crate::{
  22. io::writers::{BgzTabixWriter, IndexFormat},
  23. positions::GenomeRange,
  24. variant::vcf_variant::VcfVariant,
  25. };
  26. use super::{
  27. dict::read_dict,
  28. readers::{fetch_tabix_lines_with, get_reader},
  29. };
  30. /// Load a BGZF-compressed or plain VCF file into memory.
  31. ///
  32. /// Header lines (starting with `#`) and blank lines are skipped.
  33. /// I/O errors on individual lines are logged as warnings and skipped.
  34. ///
  35. /// # Arguments
  36. ///
  37. /// * `path` - Path to the VCF file (plain text or BGZF)
  38. ///
  39. /// # Returns
  40. ///
  41. /// A vector of parsed variants in file order.
  42. ///
  43. /// # Errors
  44. ///
  45. /// Returns an error if the file cannot be opened or a data line fails to parse.
  46. pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
  47. let reader = BufReader::new(get_reader(path)?);
  48. let mut res = Vec::new();
  49. for (i, line) in reader.lines().enumerate() {
  50. let line_no = i + 1;
  51. match line {
  52. Ok(line) => {
  53. if line.starts_with('#') || line.is_empty() {
  54. continue;
  55. }
  56. res.push(line.parse().with_context(|| {
  57. format!("failed parsing VCF record at {path}:{line_no}: {line}")
  58. })?);
  59. }
  60. Err(e) => warn!("Can't read {path}:{line_no}: {e}"),
  61. }
  62. }
  63. Ok(res)
  64. }
  65. /// Write variants to a BGZF-compressed VCF with a Tabix index.
  66. ///
  67. /// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
  68. /// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
  69. /// `tabix`, and IGV without a separate indexing step.
  70. ///
  71. /// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
  72. /// `GenomePosition.position` is converted to 1-based VCF POS internally.
  73. /// POS is used as both the tabix start and end (sufficient for all query types;
  74. /// a more precise end would require the REF length or INFO/END field).
  75. ///
  76. /// The output header is minimal (`##fileformat` + column line). Use
  77. /// [`vcf_header`] to build a full header with contig lines and INFO definitions.
  78. ///
  79. /// # Arguments
  80. ///
  81. /// * `variants` - Variants to write, in coordinate-sorted order
  82. /// * `path` - Destination path (must end in `.gz`)
  83. ///
  84. /// # Errors
  85. ///
  86. /// Returns an error if writing fails or if the variants are not coordinate-sorted
  87. /// (required by Tabix).
  88. pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
  89. info!("Writing: {path}");
  90. let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
  91. writer.write_header(b"##fileformat=VCFv4.2\n")?;
  92. writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
  93. for variant in variants {
  94. let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
  95. let rname = variant.position.contig();
  96. // GenomePosition.position is 0-based; VCF POS is 1-based
  97. let vcf_pos = variant.position.position as usize + 1;
  98. let pos = Position::try_from(vcf_pos)
  99. .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
  100. writer.write_record(row.as_bytes(), &rname, pos, pos)?;
  101. }
  102. writer.finish()
  103. }
  104. /// Fetch variants whose POS falls inside `region` from a Tabix-indexed BGZF VCF file.
  105. ///
  106. /// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
  107. /// parses each line as a [`VcfVariant`] and applies a secondary POS filter to
  108. /// discard any false positives from the Tabix bin query.
  109. ///
  110. /// TODO: this is POS-based, not true interval-overlap fetching. Deletions and
  111. /// symbolic SVs with `INFO/END` can overlap a region even when their POS is
  112. /// outside it. Supporting that correctly should be done alongside explicit VCF
  113. /// span parsing, not inside this wrapper.
  114. ///
  115. /// TODO: VCF allows `POS=0` as a telomere sentinel. Parsing preserves that as
  116. /// internal position `0`, but tabix region queries use 1-based coordinates and
  117. /// this wrapper does not currently perform a separate sentinel lookup.
  118. ///
  119. /// # Coordinate system
  120. ///
  121. /// `region` is **0-based half-open** `[start, end)`. The Tabix index was built
  122. /// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally:
  123. /// `tabix_start = region.start + 1`, `tabix_end = region.end`.
  124. ///
  125. /// # Arguments
  126. ///
  127. /// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `<bgz_path>.tbi`
  128. /// * `region` - Genomic region to query (0-based half-open)
  129. ///
  130. /// # Returns
  131. ///
  132. /// Parsed [`VcfVariant`]s whose POS is inside `region`, in file order.
  133. ///
  134. /// # Errors
  135. ///
  136. /// Returns an error if the index or file cannot be read, or if a fetched line
  137. /// fails to parse as a VCF variant.
  138. pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
  139. let mut variants = Vec::new();
  140. fetch_tabix_lines_with(bgz_path, region, |line| {
  141. let v: VcfVariant = line
  142. .parse()
  143. .with_context(|| format!("failed to parse VCF line: {line}"))?;
  144. if v.position.contig == region.contig && region.range.contains(&v.position.position) {
  145. variants.push(v);
  146. }
  147. Ok(())
  148. })?;
  149. Ok(variants)
  150. }
  151. /// A flat VCF row for tab-separated line parsing.
  152. ///
  153. /// Use [`VcfVariant`] for richer parsed access. This struct exists for
  154. /// simple tabular ingestion where field-by-field access is not needed.
  155. ///
  156. /// `pos` is **1-based** as stored in the VCF file. Parse from a tab-separated
  157. /// line via `FromStr`; the `serde::Deserialize` derive is kept for callers
  158. /// that still use serde-based deserialization.
  159. #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
  160. pub struct VCFRow {
  161. pub chr: String,
  162. /// 1-based position (VCF convention)
  163. pub pos: u32,
  164. pub id: String,
  165. pub reference: String,
  166. pub alt: String,
  167. pub qual: String,
  168. pub filter: String,
  169. pub info: String,
  170. pub format: String,
  171. pub value: String,
  172. }
  173. impl std::str::FromStr for VCFRow {
  174. type Err = anyhow::Error;
  175. /// Parse a tab-separated VCF data line (no header, 10 positional columns).
  176. fn from_str(s: &str) -> anyhow::Result<Self> {
  177. let f: Vec<&str> = s.split('\t').collect();
  178. let get = |i: usize, name: &str| -> anyhow::Result<&str> {
  179. f.get(i)
  180. .copied()
  181. .ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
  182. };
  183. Ok(Self {
  184. chr: get(0, "chr")?.to_string(),
  185. pos: get(1, "pos")?.parse().context("bad pos")?,
  186. id: get(2, "id")?.to_string(),
  187. reference: get(3, "reference")?.to_string(),
  188. alt: get(4, "alt")?.to_string(),
  189. qual: get(5, "qual")?.to_string(),
  190. filter: get(6, "filter")?.to_string(),
  191. info: get(7, "info")?.to_string(),
  192. format: get(8, "format")?.to_string(),
  193. value: get(9, "value")?.to_string(),
  194. })
  195. }
  196. }
  197. /// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
  198. ///
  199. /// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
  200. /// lengths from a `.dict` file. The last element is always the column header line
  201. /// (`#CHROM\tPOS\t...`).
  202. ///
  203. /// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
  204. /// Use this when constructing a header separately before writing with a custom writer.
  205. ///
  206. /// # Arguments
  207. ///
  208. /// * `dict` - Path to a sequence dictionary (`.dict`) file
  209. ///
  210. /// # Returns
  211. ///
  212. /// Ordered header lines, each without a trailing newline.
  213. ///
  214. /// # Errors
  215. ///
  216. /// Returns an error if the dictionary file cannot be read.
  217. pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
  218. let mut header = Vec::new();
  219. header.push("##fileformat=VCFv4.2".to_string());
  220. header.push(
  221. "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
  222. .to_string(),
  223. );
  224. header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
  225. header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
  226. header.push(format!(
  227. "##Pandora_lib_version={}",
  228. env!("CARGO_PKG_VERSION")
  229. ));
  230. read_dict(dict)?
  231. .into_iter()
  232. .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));
  233. header.push("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE".to_string());
  234. Ok(header)
  235. }
  236. #[cfg(test)]
  237. mod tests {
  238. use crate::config::Config;
  239. use super::*;
  240. #[test]
  241. fn vcf_dbsnp() {
  242. let config = Config::default();
  243. fetch_vcf(conf.db_snp, &GenomeRange::new("chr10", , end))
  244. }
  245. }