vcf.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. //! VCF file I/O utilities.
  2. //!
  3. //! Write operations produce BGZF-compressed output with an accompanying Tabix
  4. //! index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV without
  5. //! a separate indexing step. Read operations support both sequential loading and
  6. //! random-access region queries via the Tabix index.
  7. //!
  8. //! | Function | Purpose |
  9. //! |----------|---------|
  10. //! | [`read_vcf`] | Load a full VCF file into memory |
  11. //! | [`write_vcf`] | Write variants to `.vcf.gz` + `.tbi` atomically |
  12. //! | [`fetch_vcf`] | Random-access region fetch from a Tabix-indexed `.vcf.gz` |
  13. //! | [`vcf_header`] | Build a full VCF header with contig lines and INFO definitions |
  14. //!
  15. //! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
  16. //! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
  17. use std::{
  18. borrow::Borrow,
  19. io::{BufRead, BufReader},
  20. };
  21. use anyhow::Context;
  22. use log::{info, warn};
  23. use noodles_core::Position;
  24. use crate::{
  25. io::writers::{BgzTabixWriter, IndexFormat},
  26. positions::GenomeRange,
  27. variant::vcf_variant::{from_multiallelic, VcfVariant},
  28. };
  29. use super::{
  30. dict::read_dict,
  31. readers::{fetch_tabix_lines_with, get_reader, TabixReader},
  32. };
  33. /// Load a BGZF-compressed or plain VCF file into memory.
  34. ///
  35. /// Header lines (starting with `#`) and blank lines are skipped.
  36. /// I/O errors on individual lines are logged as warnings and skipped.
  37. ///
  38. /// # Arguments
  39. ///
  40. /// * `path` - Path to the VCF file (plain text or BGZF)
  41. ///
  42. /// # Returns
  43. ///
  44. /// A vector of parsed variants in file order.
  45. ///
  46. /// # Errors
  47. ///
  48. /// Returns an error if the file cannot be opened or a data line fails to parse.
  49. pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
  50. let reader = BufReader::new(get_reader(path)?);
  51. let mut res = Vec::new();
  52. for (i, line) in reader.lines().enumerate() {
  53. let line_no = i + 1;
  54. match line {
  55. Ok(line) => {
  56. if line.starts_with('#') || line.is_empty() {
  57. continue;
  58. }
  59. res.push(line.parse().with_context(|| {
  60. format!("failed parsing VCF record at {path}:{line_no}: {line}")
  61. })?);
  62. }
  63. Err(e) => warn!("Can't read {path}:{line_no}: {e}"),
  64. }
  65. }
  66. Ok(res)
  67. }
  68. /// Write variants to a BGZF-compressed VCF with a Tabix index.
  69. ///
  70. /// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
  71. /// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
  72. /// `tabix`, and IGV without a separate indexing step.
  73. ///
  74. /// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
  75. /// `GenomePosition.position` is converted to 1-based VCF POS internally.
  76. /// POS is used as both the tabix start and end (sufficient for all query types;
  77. /// a more precise end would require the REF length or INFO/END field).
  78. ///
  79. /// The output header is minimal (`##fileformat` + column line). Use
  80. /// [`vcf_header`] to build a full header with contig lines and INFO definitions.
  81. ///
  82. /// # Arguments
  83. ///
  84. /// * `variants` - Variants to write, in coordinate-sorted order
  85. /// * `path` - Destination path (must end in `.gz`)
  86. ///
  87. /// # Errors
  88. ///
  89. /// Returns an error if writing fails or if the variants are not coordinate-sorted
  90. /// (required by Tabix).
  91. pub fn write_vcf<B: Borrow<VcfVariant>>(variants: &[B], path: &str) -> anyhow::Result<()> {
  92. info!("Writing: {path}");
  93. let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
  94. writer.write_header(b"##fileformat=VCFv4.2\n")?;
  95. writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
  96. for variant in variants {
  97. let variant_ref = variant.borrow();
  98. let row = format!(
  99. "{}\n",
  100. variant_ref.commun_deepvariant_clairs().into_vcf_row()
  101. );
  102. let rname = variant_ref.position.contig();
  103. // GenomePosition.position is 0-based; VCF POS is 1-based
  104. let vcf_pos = variant_ref.position.position as usize + 1;
  105. let pos = Position::try_from(vcf_pos)
  106. .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
  107. writer.write_record(row.as_bytes(), &rname, pos, pos)?;
  108. }
  109. writer.finish()
  110. }
  111. /// Fetch variants whose POS falls inside `region` from a Tabix-indexed BGZF VCF file.
  112. ///
  113. /// Delegates BGZF seeking and line extraction to [`fetch_tabix_lines`], then
  114. /// parses each line as a [`VcfVariant`] and applies a secondary POS filter to
  115. /// discard any false positives from the Tabix bin query.
  116. ///
  117. /// TODO: this is POS-based, not true interval-overlap fetching. Deletions and
  118. /// symbolic SVs with `INFO/END` can overlap a region even when their POS is
  119. /// outside it. Supporting that correctly should be done alongside explicit VCF
  120. /// span parsing, not inside this wrapper.
  121. ///
  122. /// TODO: VCF allows `POS=0` as a telomere sentinel. Parsing preserves that as
  123. /// internal position `0`, but tabix region queries use 1-based coordinates and
  124. /// this wrapper does not currently perform a separate sentinel lookup.
  125. ///
  126. /// # Coordinate system
  127. ///
  128. /// `region` is **0-based half-open** `[start, end)`. The Tabix index was built
  129. /// with VCF POS (1-based), so [`fetch_tabix_lines`] converts internally:
  130. /// `tabix_start = region.start + 1`, `tabix_end = region.end`.
  131. ///
  132. /// # Arguments
  133. ///
  134. /// * `bgz_path` - Path to the `.vcf.gz` file; index expected at `<bgz_path>.tbi`
  135. /// * `region` - Genomic region to query (0-based half-open)
  136. ///
  137. /// # Returns
  138. ///
  139. /// Parsed [`VcfVariant`]s whose POS is inside `region`, in file order.
  140. ///
  141. /// # Errors
  142. ///
  143. /// Returns an error if the index or file cannot be read, or if a fetched line
  144. /// fails to parse as a VCF variant.
  145. pub fn fetch_vcf(bgz_path: &str, region: &GenomeRange) -> anyhow::Result<Vec<VcfVariant>> {
  146. let mut variants = Vec::new();
  147. fetch_tabix_lines_with(bgz_path, region, |line| {
  148. let v: VcfVariant = line
  149. .parse()
  150. .with_context(|| format!("failed to parse VCF line: {line}"))?;
  151. if v.position.contig == region.contig && region.range.contains(&v.position.position) {
  152. for split in from_multiallelic(v)? {
  153. variants.push(split);
  154. }
  155. }
  156. Ok(())
  157. })?;
  158. Ok(variants)
  159. }
  160. /// Like [`fetch_vcf`] but reuses a pre-opened [`TabixReader`] instead of
  161. /// reopening the index and BGZF file on every call.
  162. ///
  163. /// Use this inside `par_chunks` / `for_each_init` workers where one reader
  164. /// is initialised per thread and shared across all variants in that chunk.
  165. pub fn fetch_vcf_with_reader(
  166. reader: &mut TabixReader,
  167. region: &GenomeRange,
  168. ) -> anyhow::Result<Vec<VcfVariant>> {
  169. let mut variants = Vec::new();
  170. reader.fetch_with(region, |line| {
  171. let v: VcfVariant = line
  172. .parse()
  173. .with_context(|| format!("failed to parse VCF line: {line}"))?;
  174. if v.position.contig == region.contig && region.range.contains(&v.position.position) {
  175. for split in from_multiallelic(v)? {
  176. variants.push(split);
  177. }
  178. }
  179. Ok(())
  180. })?;
  181. Ok(variants)
  182. }
  183. /// A flat VCF row for tab-separated line parsing.
  184. ///
  185. /// Use [`VcfVariant`] for richer parsed access. This struct exists for
  186. /// simple tabular ingestion where field-by-field access is not needed.
  187. ///
  188. /// `pos` is **1-based** as stored in the VCF file. Parse from a tab-separated
  189. /// line via `FromStr`; the `serde::Deserialize` derive is kept for callers
  190. /// that still use serde-based deserialization.
  191. #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
  192. pub struct VCFRow {
  193. pub chr: String,
  194. /// 1-based position (VCF convention)
  195. pub pos: u32,
  196. pub id: String,
  197. pub reference: String,
  198. pub alt: String,
  199. pub qual: String,
  200. pub filter: String,
  201. pub info: String,
  202. pub format: String,
  203. pub value: String,
  204. }
  205. impl std::str::FromStr for VCFRow {
  206. type Err = anyhow::Error;
  207. /// Parse a tab-separated VCF data line (no header, 10 positional columns).
  208. fn from_str(s: &str) -> anyhow::Result<Self> {
  209. let f: Vec<&str> = s.split('\t').collect();
  210. let get = |i: usize, name: &str| -> anyhow::Result<&str> {
  211. f.get(i)
  212. .copied()
  213. .ok_or_else(|| anyhow::anyhow!("missing {name} (col {i})"))
  214. };
  215. Ok(Self {
  216. chr: get(0, "chr")?.to_string(),
  217. pos: get(1, "pos")?.parse().context("bad pos")?,
  218. id: get(2, "id")?.to_string(),
  219. reference: get(3, "reference")?.to_string(),
  220. alt: get(4, "alt")?.to_string(),
  221. qual: get(5, "qual")?.to_string(),
  222. filter: get(6, "filter")?.to_string(),
  223. info: get(7, "info")?.to_string(),
  224. format: get(8, "format")?.to_string(),
  225. value: get(9, "value")?.to_string(),
  226. })
  227. }
  228. }
  229. /// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
  230. ///
  231. /// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
  232. /// lengths from a `.dict` file. The last element is always the column header line
  233. /// (`#CHROM\tPOS\t...`).
  234. ///
  235. /// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
  236. /// Use this when constructing a header separately before writing with a custom writer.
  237. ///
  238. /// # Arguments
  239. ///
  240. /// * `dict` - Path to a sequence dictionary (`.dict`) file
  241. ///
  242. /// # Returns
  243. ///
  244. /// Ordered header lines, each without a trailing newline.
  245. ///
  246. /// # Errors
  247. ///
  248. /// Returns an error if the dictionary file cannot be read.
  249. pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
  250. let mut header = Vec::new();
  251. header.push("##fileformat=VCFv4.2".to_string());
  252. header.push(
  253. "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
  254. .to_string(),
  255. );
  256. header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
  257. header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
  258. header.push(format!(
  259. "##Pandora_lib_version={}",
  260. env!("CARGO_PKG_VERSION")
  261. ));
  262. read_dict(dict)?
  263. .into_iter()
  264. .for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));
  265. header.push("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE".to_string());
  266. Ok(header)
  267. }
  268. #[cfg(test)]
  269. mod tests {
  270. use crate::config::Config;
  271. use super::*;
  272. #[test]
  273. fn vcf_dbsnp() -> anyhow::Result<()> {
  274. let config = Config::default();
  275. let v = fetch_vcf(&config.db_snp, &GenomeRange::new("chr1", 5656, 5660))?;
  276. println!("{v:#?}");
  277. Ok(())
  278. }
  279. }