|
@@ -1,15 +1,41 @@
|
|
|
-use std::{
|
|
|
|
|
- fs::File,
|
|
|
|
|
- io::{BufRead, BufReader, Write},
|
|
|
|
|
-};
|
|
|
|
|
|
|
+//! VCF file I/O utilities.
|
|
|
|
|
+//!
|
|
|
|
|
+//! All write operations produce BGZF-compressed output with an accompanying
|
|
|
|
|
+//! Tabix index (`.tbi`), enabling direct use with `bcftools`, `tabix`, and IGV
|
|
|
|
|
+//! without a separate indexing step.
|
|
|
|
|
+//!
|
|
|
|
|
+//! **Coordinate note:** [`GenomePosition::position`] is 0-based; VCF POS is
|
|
|
|
|
+//! 1-based. The conversion `vcf_pos = position + 1` is applied internally.
|
|
|
|
|
+
|
|
|
|
|
+use std::io::{BufRead, BufReader};
|
|
|
|
|
|
|
|
use anyhow::Context;
|
|
use anyhow::Context;
|
|
|
use log::{info, warn};
|
|
use log::{info, warn};
|
|
|
|
|
+use noodles_core::Position;
|
|
|
|
|
|
|
|
-use crate::{io::writers::{finalize_bgzf_file, get_gz_writer}, variant::vcf_variant::VcfVariant};
|
|
|
|
|
|
|
+use crate::{
|
|
|
|
|
+ io::writers::{BgzTabixWriter, IndexFormat},
|
|
|
|
|
+ variant::vcf_variant::VcfVariant,
|
|
|
|
|
+};
|
|
|
|
|
|
|
|
use super::{dict::read_dict, readers::get_reader};
|
|
use super::{dict::read_dict, readers::get_reader};
|
|
|
|
|
|
|
|
|
|
+/// Load a BGZF-compressed or plain VCF file into memory.
|
|
|
|
|
+///
|
|
|
|
|
+/// Header lines (starting with `#`) and blank lines are skipped.
|
|
|
|
|
+/// I/O errors on individual lines are logged as warnings and skipped.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `path` - Path to the VCF file (plain text or BGZF)
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// A vector of parsed variants in file order.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the file cannot be opened or a data line fails to parse.
|
|
|
pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
|
|
pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
|
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
|
|
|
|
|
@@ -17,7 +43,7 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
|
|
|
for (i, line) in reader.lines().enumerate() {
|
|
for (i, line) in reader.lines().enumerate() {
|
|
|
match line {
|
|
match line {
|
|
|
Ok(line) => {
|
|
Ok(line) => {
|
|
|
- if line.starts_with("#") {
|
|
|
|
|
|
|
+ if line.starts_with('#') || line.is_empty() {
|
|
|
continue;
|
|
continue;
|
|
|
}
|
|
}
|
|
|
res.push(line.parse().context(format!("Can't parse {line}"))?);
|
|
res.push(line.parse().context(format!("Can't parse {line}"))?);
|
|
@@ -29,26 +55,59 @@ pub fn read_vcf(path: &str) -> anyhow::Result<Vec<VcfVariant>> {
|
|
|
Ok(res)
|
|
Ok(res)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Write variants to a BGZF-compressed VCF with a Tabix index.
|
|
|
|
|
+///
|
|
|
|
|
+/// Produces `path` (`.vcf.gz`) and `path.tbi` in a single atomic pass using
|
|
|
|
|
+/// [`BgzTabixWriter`]. The output is immediately usable with `bcftools view`,
|
|
|
|
|
+/// `tabix`, and IGV without a separate indexing step.
|
|
|
|
|
+///
|
|
|
|
|
+/// Tabix positions are derived from [`VcfVariant::position`]: the 0-based
|
|
|
|
|
+/// `GenomePosition.position` is converted to 1-based VCF POS internally.
|
|
|
|
|
+/// POS is used as both the tabix start and end (sufficient for all query types;
|
|
|
|
|
+/// a more precise end would require the REF length or INFO/END field).
|
|
|
|
|
+///
|
|
|
|
|
+/// The output header is minimal (`##fileformat` + column line). Use
|
|
|
|
|
+/// [`vcf_header`] to build a full header with contig lines and INFO definitions.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `variants` - Variants to write, in coordinate-sorted order
|
|
|
|
|
+/// * `path` - Destination path (must end in `.gz`)
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if writing fails or if the variants are not coordinate-sorted
|
|
|
|
|
+/// (required by Tabix).
|
|
|
pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
|
|
pub fn write_vcf(variants: &[VcfVariant], path: &str) -> anyhow::Result<()> {
|
|
|
info!("Writing: {path}");
|
|
info!("Writing: {path}");
|
|
|
- let mut writer = get_gz_writer(path, true)?;
|
|
|
|
|
- // write!(writer, b"##fileformat=VCFv4.2\n")
|
|
|
|
|
- writer.write_all(b"##fileformat=VCFv4.2\n")?;
|
|
|
|
|
- writer.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
|
|
|
|
|
|
|
+
|
|
|
|
|
+ let mut writer = BgzTabixWriter::new(path, IndexFormat::Tbi, true)?;
|
|
|
|
|
+ writer.write_header(b"##fileformat=VCFv4.2\n")?;
|
|
|
|
|
+ writer.write_header(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")?;
|
|
|
|
|
|
|
|
for variant in variants {
|
|
for variant in variants {
|
|
|
- writer.write_fmt(format_args!(
|
|
|
|
|
- "{}\n",
|
|
|
|
|
- variant.commun_deepvariant_clairs().into_vcf_row()
|
|
|
|
|
- ))?;
|
|
|
|
|
|
|
+ let row = format!("{}\n", variant.commun_deepvariant_clairs().into_vcf_row());
|
|
|
|
|
+ let rname = variant.position.contig();
|
|
|
|
|
+ // GenomePosition.position is 0-based; VCF POS is 1-based
|
|
|
|
|
+ let vcf_pos = variant.position.position as usize + 1;
|
|
|
|
|
+ let pos = Position::try_from(vcf_pos)
|
|
|
|
|
+ .with_context(|| format!("invalid VCF position: {vcf_pos}"))?;
|
|
|
|
|
+ writer.write_record(row.as_bytes(), &rname, pos, pos)?;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- finalize_bgzf_file(writer, path)
|
|
|
|
|
|
|
+ writer.finish()
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// A flat VCF row for CSV/TSV deserialization (e.g. via `serde`).
|
|
|
|
|
+///
|
|
|
|
|
+/// Use [`VcfVariant`] for richer parsed access. This struct exists for
|
|
|
|
|
+/// simple tabular ingestion where field-by-field access is not needed.
|
|
|
|
|
+///
|
|
|
|
|
+/// `pos` is **1-based** as stored in the VCF file.
|
|
|
#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
|
|
#[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
|
|
|
pub struct VCFRow {
|
|
pub struct VCFRow {
|
|
|
pub chr: String,
|
|
pub chr: String,
|
|
|
|
|
+ /// 1-based position (VCF convention)
|
|
|
pub pos: u32,
|
|
pub pos: u32,
|
|
|
pub id: String,
|
|
pub id: String,
|
|
|
pub reference: String,
|
|
pub reference: String,
|
|
@@ -60,24 +119,37 @@ pub struct VCFRow {
|
|
|
pub value: String,
|
|
pub value: String,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Build a full VCF header with INFO definitions, contig lines, and Pandora version.
|
|
|
|
|
+///
|
|
|
|
|
+/// Includes SV-specific INFO fields (`SVTYPE`, `SVLEN`, `END`) and reads contig
|
|
|
|
|
+/// lengths from a `.dict` file. The last element is always the column header line
|
|
|
|
|
+/// (`#CHROM\tPOS\t...`).
|
|
|
|
|
+///
|
|
|
|
|
+/// Note: [`write_vcf`] uses a minimal inline header and does not call this function.
|
|
|
|
|
+/// Use this when constructing a header separately before writing with a custom writer.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `dict` - Path to a sequence dictionary (`.dict`) file
|
|
|
|
|
+///
|
|
|
|
|
+/// # Returns
|
|
|
|
|
+///
|
|
|
|
|
+/// Ordered header lines, each without a trailing newline.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if the dictionary file cannot be read.
|
|
|
pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
|
|
pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
|
|
|
let mut header = Vec::new();
|
|
let mut header = Vec::new();
|
|
|
- // file format
|
|
|
|
|
header.push("##fileformat=VCFv4.2".to_string());
|
|
header.push("##fileformat=VCFv4.2".to_string());
|
|
|
- // Format
|
|
|
|
|
- // Filter
|
|
|
|
|
- // Info
|
|
|
|
|
header.push(
|
|
header.push(
|
|
|
"##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
|
|
"##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
|
|
|
.to_string(),
|
|
.to_string(),
|
|
|
);
|
|
);
|
|
|
header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
|
|
header.push("##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">".to_string());
|
|
|
header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
|
|
header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
|
|
|
-
|
|
|
|
|
- // version
|
|
|
|
|
header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
|
|
header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
|
|
|
|
|
|
|
|
- // contig
|
|
|
|
|
read_dict(dict)?
|
|
read_dict(dict)?
|
|
|
.into_iter()
|
|
.into_iter()
|
|
|
.for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));
|
|
.for_each(|(id, len)| header.push(format!("##contig=<ID={id},length={len}>")));
|