|
|
@@ -11,7 +11,13 @@ use bitcode::{Decode, Encode};
|
|
|
use log::{error, info};
|
|
|
use rayon::prelude::*;
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
-use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
|
|
|
+use std::{
|
|
|
+ cmp::Ordering,
|
|
|
+ collections::{BTreeSet, HashSet},
|
|
|
+ fmt,
|
|
|
+ hash::Hash,
|
|
|
+ str::FromStr,
|
|
|
+};
|
|
|
|
|
|
/// Represents a variant in the Variant Call Format (VCF).
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
|
|
|
@@ -372,7 +378,7 @@ impl VcfVariant {
|
|
|
self.deletion_len().map(|len| DeletionDesc {
|
|
|
contig: self.position.contig(),
|
|
|
start: self.position.position + 1,
|
|
|
- end: self.position.position.checked_add(len).unwrap_or(u32::MAX),
|
|
|
+ end: self.position.position.checked_add(len).unwrap_or(u32::MAX), // TODO
|
|
|
})
|
|
|
}
|
|
|
}
|
|
|
@@ -517,13 +523,25 @@ impl Ord for VcfVariant {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-/// Info
|
|
|
+/// A container for a list of VCF `INFO` fields.
|
|
|
+///
|
|
|
+/// Represents a parsed set of key-value annotations or flags found in the INFO column.
|
|
|
+///
|
|
|
+/// # Example
|
|
|
+/// ```
|
|
|
+/// use your_crate::Infos;
|
|
|
+/// use std::str::FromStr;
|
|
|
+///
|
|
|
+/// let infos = Infos::from_str("SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15").unwrap();
|
|
|
+/// println!("{}", infos); // Displays: SVTYPE=DEL;END=12345;TUMOUR_AF=0.25,0.15
|
|
|
+/// ```
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
|
|
|
pub struct Infos(pub Vec<Info>);
|
|
|
|
|
|
impl FromStr for Infos {
|
|
|
type Err = anyhow::Error;
|
|
|
|
|
|
+ /// Parses a semicolon-separated list of INFO fields from a VCF record.
|
|
|
fn from_str(s: &str) -> anyhow::Result<Self> {
|
|
|
Ok(Self(
|
|
|
s.split(";")
|
|
|
@@ -535,6 +553,7 @@ impl FromStr for Infos {
|
|
|
}
|
|
|
|
|
|
impl fmt::Display for Infos {
|
|
|
+ /// Formats the `Infos` as a semicolon-separated VCF-style INFO string.
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
write!(
|
|
|
f,
|
|
|
@@ -548,6 +567,12 @@ impl fmt::Display for Infos {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Enum representing a single INFO field in a VCF record.
|
|
|
+///
|
|
|
+/// Supports both standard fields and Severus-specific structural variant annotations.
|
|
|
+/// Handles string values, numeric values, vectors, and flags.
|
|
|
+///
|
|
|
+/// Variants with `Vec<_>` represent fields with multiple comma-separated values.
|
|
|
#[allow(non_camel_case_types)]
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
|
|
|
pub enum Info {
|
|
|
@@ -602,11 +627,28 @@ pub enum Info {
|
|
|
END(u32),
|
|
|
SVINSLEN(u32),
|
|
|
SVINSSEQ(String),
|
|
|
+ // Severus
|
|
|
+ PRECISE,
|
|
|
+ IMPRECISE,
|
|
|
+ STRANDS(String),
|
|
|
+ DETAILED_TYPE(String),
|
|
|
+ INSLEN(i32),
|
|
|
+ MAPQ(u32),
|
|
|
+ PHASESETID(String),
|
|
|
+ HP(u32),
|
|
|
+ CLUSTERID(String),
|
|
|
+ INSSEQ(String),
|
|
|
+ MATE_ID(String),
|
|
|
+ INSIDE_VNTR(String),
|
|
|
+ ALINGED_POS(String),
|
|
|
}
|
|
|
|
|
|
impl FromStr for Info {
|
|
|
type Err = anyhow::Error;
|
|
|
|
|
|
+ /// Parses a single `INFO` key or key=value string into a typed `Info` variant.
|
|
|
+ ///
|
|
|
+ /// Handles both presence/absence flags and key-value fields.
|
|
|
fn from_str(s: &str) -> anyhow::Result<Self> {
|
|
|
if s.contains('=') {
|
|
|
let (key, value) = s
|
|
|
@@ -665,6 +707,21 @@ impl FromStr for Info {
|
|
|
"END_EVENT_SIZE_MEDIAN" => Info::END_EVENT_SIZE_MEDIAN(parse_value(value, key)?),
|
|
|
"END_EVENT_SIZE_MEAN" => Info::END_EVENT_SIZE_MEAN(parse_value(value, key)?),
|
|
|
"CLASS" => Info::CLASS(value.to_string()),
|
|
|
+
|
|
|
+ "PRECISE" => Info::PRECISE,
|
|
|
+ "IMPRECISE" => Info::IMPRECISE,
|
|
|
+ "STRANDS" => Info::STRANDS(value.to_string()),
|
|
|
+ "DETAILED_TYPE" => Info::DETAILED_TYPE(value.to_string()),
|
|
|
+ "INSLEN" => Info::INSLEN(parse_value(value, key)?),
|
|
|
+ "MAPQ" => Info::MAPQ(parse_value(value, key)?),
|
|
|
+ "PHASESETID" => Info::PHASESETID(value.to_string()),
|
|
|
+ "HP" => Info::HP(parse_value(value, key)?),
|
|
|
+ "CLUSTERID" => Info::CLUSTERID(value.to_string()),
|
|
|
+ "INSSEQ" => Info::INSSEQ(value.to_string()),
|
|
|
+ "MATE_ID" => Info::MATE_ID(value.to_string()),
|
|
|
+ "INSIDE_VNTR" => Info::INSIDE_VNTR(value.to_string()),
|
|
|
+ "ALINGED_POS" => Info::ALINGED_POS(value.to_string()),
|
|
|
+
|
|
|
_ => Info::Empty,
|
|
|
})
|
|
|
} else {
|
|
|
@@ -672,6 +729,9 @@ impl FromStr for Info {
|
|
|
"H" => Info::H,
|
|
|
"F" => Info::F,
|
|
|
"P" => Info::P,
|
|
|
+ "PRECISE" => Info::PRECISE,
|
|
|
+ "IMPRECISE" => Info::IMPRECISE,
|
|
|
+
|
|
|
_ => Info::Empty,
|
|
|
})
|
|
|
}
|
|
|
@@ -679,12 +739,15 @@ impl FromStr for Info {
|
|
|
}
|
|
|
|
|
|
impl fmt::Display for Info {
|
|
|
+ /// Converts the `Info` enum into a VCF-compliant string (key=value or flag).
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
match self {
|
|
|
Info::Empty => write!(f, "."),
|
|
|
Info::H => write!(f, "H"),
|
|
|
Info::F => write!(f, "F"),
|
|
|
Info::P => write!(f, "P"),
|
|
|
+
|
|
|
+ // ClairS
|
|
|
Info::FAU(v) => write!(f, "FAU={v}"),
|
|
|
Info::FCU(v) => write!(f, "FCU={v}"),
|
|
|
Info::FGU(v) => write!(f, "FGU={v}"),
|
|
|
@@ -693,12 +756,16 @@ impl fmt::Display for Info {
|
|
|
Info::RCU(v) => write!(f, "RCU={v}"),
|
|
|
Info::RGU(v) => write!(f, "RGU={v}"),
|
|
|
Info::RTU(v) => write!(f, "RTU={v}"),
|
|
|
+
|
|
|
+ // Nanomonsv
|
|
|
Info::SVTYPE(v) => write!(f, "SVTYPE={v}"),
|
|
|
Info::SVLEN(v) => write!(f, "SVLEN={v}"),
|
|
|
Info::END(v) => write!(f, "END={v}"),
|
|
|
Info::MATEID(v) => write!(f, "MATEID={v}"),
|
|
|
Info::SVINSLEN(v) => write!(f, "SVINSLEN={v}"),
|
|
|
Info::SVINSSEQ(v) => write!(f, "SVINSSEQ={v}"),
|
|
|
+
|
|
|
+ // SAVANA
|
|
|
Info::NORMAL_READ_SUPPORT(v) => write!(f, "NORMAL_READ_SUPPORT={v}"),
|
|
|
Info::TUMOUR_READ_SUPPORT(v) => write!(f, "TUMOUR_READ_SUPPORT={v}"),
|
|
|
Info::NORMAL_ALN_SUPPORT(v) => write!(f, "NORMAL_ALN_SUPPORT={v}"),
|
|
|
@@ -731,7 +798,23 @@ impl fmt::Display for Info {
|
|
|
Info::END_EVENT_SIZE_STD_DEV(v) => write!(f, "END_EVENT_SIZE_STD_DEV={v}"),
|
|
|
Info::END_EVENT_SIZE_MEDIAN(v) => write!(f, "END_EVENT_SIZE_MEDIAN={v}"),
|
|
|
Info::END_EVENT_SIZE_MEAN(v) => write!(f, "END_EVENT_SIZE_MEAN={v}"),
|
|
|
+
|
|
|
Info::CLASS(v) => write!(f, "CLASS={v}"),
|
|
|
+
|
|
|
+ // Severus
|
|
|
+ Info::PRECISE => write!(f, "PRECISE"),
|
|
|
+ Info::IMPRECISE => write!(f, "IMPRECISE"),
|
|
|
+ Info::STRANDS(v) => write!(f, "STRANDS={v}"),
|
|
|
+ Info::DETAILED_TYPE(v) => write!(f, "DETAILED_TYPE={v}"),
|
|
|
+ Info::INSLEN(v) => write!(f, "INSLEN={v}"),
|
|
|
+ Info::MAPQ(v) => write!(f, "MAPQ={v}"),
|
|
|
+ Info::PHASESETID(v) => write!(f, "PHASESETID={v}"),
|
|
|
+ Info::HP(v) => write!(f, "HP={v}"),
|
|
|
+ Info::CLUSTERID(v) => write!(f, "CLUSTERID={v}"),
|
|
|
+ Info::INSSEQ(v) => write!(f, "INSSEQ={v}"),
|
|
|
+ Info::MATE_ID(v) => write!(f, "MATE_ID={v}"),
|
|
|
+ Info::INSIDE_VNTR(v) => write!(f, "INSIDE_VNTR={v}"),
|
|
|
+ Info::ALINGED_POS(v) => write!(f, "ALINGED_POS={v}"),
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -743,58 +826,398 @@ pub fn concat_numbers<T: ToString>(v: &[T]) -> String {
|
|
|
.join(",")
|
|
|
}
|
|
|
|
|
|
+impl Info {
|
|
|
+ /// Returns the complete set of known VCF `INFO` header definitions used by `Info` variants.
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// let headers = Info::header_definitions();
|
|
|
+ /// for line in headers {
|
|
|
+ /// println!("{line}");
|
|
|
+ /// }
|
|
|
+ /// ```
|
|
|
+ pub fn header_definitions() -> BTreeSet<String> {
|
|
|
+ let mut set = BTreeSet::new();
|
|
|
+
|
|
|
+ macro_rules! push {
|
|
|
+ ($id:expr, $num:expr, $typ:expr, $desc:expr) => {
|
|
|
+ set.insert(format!(
|
|
|
+ r#"##INFO=<ID={},Number={},Type={},Description="{}">"#,
|
|
|
+ $id, $num, $typ, $desc
|
|
|
+ ));
|
|
|
+ };
|
|
|
+ }
|
|
|
+
|
|
|
+ // Flags
|
|
|
+ push!("H", 0, "Flag", "H flag");
|
|
|
+ push!("F", 0, "Flag", "F flag");
|
|
|
+ push!("P", 0, "Flag", "P flag");
|
|
|
+
|
|
|
+ // Allelic support
|
|
|
+ push!("FAU", 1, "Integer", "Forward A support in tumour");
|
|
|
+ push!("FCU", 1, "Integer", "Forward C support in tumour");
|
|
|
+ push!("FGU", 1, "Integer", "Forward G support in tumour");
|
|
|
+ push!("FTU", 1, "Integer", "Forward T support in tumour");
|
|
|
+ push!("RAU", 1, "Integer", "Reverse A support in tumour");
|
|
|
+ push!("RCU", 1, "Integer", "Reverse C support in tumour");
|
|
|
+ push!("RGU", 1, "Integer", "Reverse G support in tumour");
|
|
|
+ push!("RTU", 1, "Integer", "Reverse T support in tumour");
|
|
|
+
|
|
|
+ // Structural variant metadata
|
|
|
+ push!("SVTYPE", 1, "String", "Structural variant type");
|
|
|
+ push!("MATEID", 1, "String", "ID of the mate breakend");
|
|
|
+ push!("SVLEN", 1, "Integer", "Length of structural variant");
|
|
|
+ push!("SVINSLEN", 1, "Integer", "Length of inserted sequence");
|
|
|
+ push!("SVINSSEQ", 1, "String", "Inserted sequence");
|
|
|
+
|
|
|
+ // Positions and read support
|
|
|
+ push!("END", 1, "Integer", "End position of the variant");
|
|
|
+ push!(
|
|
|
+ "NORMAL_READ_SUPPORT",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Supporting reads in normal sample"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "TUMOUR_READ_SUPPORT",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Supporting reads in tumour sample"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_ALN_SUPPORT",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Aligned reads in normal sample"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "TUMOUR_ALN_SUPPORT",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Aligned reads in tumour sample"
|
|
|
+ );
|
|
|
+
|
|
|
+ // Depth profiles
|
|
|
+ push!(
|
|
|
+ "TUMOUR_DP_BEFORE",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth before breakpoint in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "TUMOUR_DP_AT",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth at breakpoint in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "TUMOUR_DP_AFTER",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth after breakpoint in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_DP_BEFORE",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth before breakpoint in normal"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_DP_AT",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth at breakpoint in normal"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_DP_AFTER",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Depth after breakpoint in normal"
|
|
|
+ );
|
|
|
+
|
|
|
+ // Allele frequencies
|
|
|
+ push!(
|
|
|
+ "TUMOUR_AF",
|
|
|
+ ".",
|
|
|
+ "Float",
|
|
|
+ "Variant allele frequencies in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_AF",
|
|
|
+ ".",
|
|
|
+ "Float",
|
|
|
+ "Variant allele frequencies in normal"
|
|
|
+ );
|
|
|
+
|
|
|
+ // Haplotype/phasing
|
|
|
+ push!(
|
|
|
+ "TUMOUR_ALT_HP",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Alternate haplotype support in tumour"
|
|
|
+ );
|
|
|
+ push!("TUMOUR_PS", ".", "String", "Phasing set in tumour");
|
|
|
+ push!(
|
|
|
+ "NORMAL_ALT_HP",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Alternate haplotype support in normal"
|
|
|
+ );
|
|
|
+ push!("NORMAL_PS", ".", "String", "Phasing set in normal");
|
|
|
+ push!(
|
|
|
+ "TUMOUR_TOTAL_HP_AT",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Total haplotype depth at breakpoint in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "NORMAL_TOTAL_HP_AT",
|
|
|
+ ".",
|
|
|
+ "Integer",
|
|
|
+ "Total haplotype depth at breakpoint in normal"
|
|
|
+ );
|
|
|
+
|
|
|
+ // Cluster analysis
|
|
|
+ push!(
|
|
|
+ "CLUSTERED_READS_TUMOUR",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Clustered reads in tumour"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "CLUSTERED_READS_NORMAL",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Clustered reads in normal"
|
|
|
+ );
|
|
|
+
|
|
|
+ // Origin and end-point statistics
|
|
|
+ push!(
|
|
|
+ "ORIGIN_STARTS_STD_DEV",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "STDDEV of read starts at origin"
|
|
|
+ );
|
|
|
+ push!("ORIGIN_MAPQ_MEAN", 1, "Float", "Mean MAPQ at origin");
|
|
|
+ push!(
|
|
|
+ "ORIGIN_EVENT_SIZE_STD_DEV",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "STDDEV of event size at origin"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "ORIGIN_EVENT_SIZE_MEDIAN",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "Median event size at origin"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "ORIGIN_EVENT_SIZE_MEAN",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "Mean event size at origin"
|
|
|
+ );
|
|
|
+
|
|
|
+ push!(
|
|
|
+ "END_STARTS_STD_DEV",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "STDDEV of read starts at end"
|
|
|
+ );
|
|
|
+ push!("END_MAPQ_MEAN", 1, "Float", "Mean MAPQ at end");
|
|
|
+ push!(
|
|
|
+ "END_EVENT_SIZE_STD_DEV",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "STDDEV of event size at end"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "END_EVENT_SIZE_MEDIAN",
|
|
|
+ 1,
|
|
|
+ "Float",
|
|
|
+ "Median event size at end"
|
|
|
+ );
|
|
|
+ push!("END_EVENT_SIZE_MEAN", 1, "Float", "Mean event size at end");
|
|
|
+
|
|
|
+ // Additional
|
|
|
+ push!("BP_NOTATION", 1, "String", "Breakpoint notation");
|
|
|
+ push!("SOURCE", 1, "String", "Caller source name");
|
|
|
+ push!("CLASS", 1, "String", "Variant classification");
|
|
|
+
|
|
|
+ // Severus
|
|
|
+ push!(
|
|
|
+ "PRECISE",
|
|
|
+ 0,
|
|
|
+ "Flag",
|
|
|
+ "SV with precise breakpoints coordinates and length"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "IMPRECISE",
|
|
|
+ 0,
|
|
|
+ "Flag",
|
|
|
+ "SV with imprecise breakpoints coordinates and length"
|
|
|
+ );
|
|
|
+ push!("STRANDS", 1, "String", "Breakpoint strandedness");
|
|
|
+ push!("DETAILED_TYPE", 1, "String", "Detailed type of the SV");
|
|
|
+ push!(
|
|
|
+ "INSLEN",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Length of the unmapped sequence between breakpoint"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "MAPQ",
|
|
|
+ 1,
|
|
|
+ "Integer",
|
|
|
+ "Median mapping quality of supporting reads"
|
|
|
+ );
|
|
|
+ push!(
|
|
|
+ "PHASESETID",
|
|
|
+ 1,
|
|
|
+ "String",
|
|
|
+ "Matching phaseset ID for phased SVs"
|
|
|
+ );
|
|
|
+ push!("HP", 1, "Integer", "Matching haplotype ID for phased SVs");
|
|
|
+ push!("CLUSTERID", 1, "String", "Cluster ID in breakpoint_graph");
|
|
|
+ push!(
|
|
|
+ "INSSEQ",
|
|
|
+ 1,
|
|
|
+ "String",
|
|
|
+ "Insertion sequence between breakpoints"
|
|
|
+ );
|
|
|
+ push!("MATE_ID", 1, "String", "MATE ID for breakends");
|
|
|
+ push!(
|
|
|
+ "INSIDE_VNTR",
|
|
|
+ 1,
|
|
|
+ "String",
|
|
|
+ "True if an indel is inside a VNTR"
|
|
|
+ );
|
|
|
+ push!("ALINGED_POS", 1, "String", "Position in the reference");
|
|
|
+
|
|
|
+ set
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// Format
|
|
|
+/// Enum representing individual FORMAT fields from a VCF record.
|
|
|
+///
|
|
|
+/// This enum supports common fields used by DeepVariant, Clairs, nanomonsv, and Severus,
|
|
|
+/// as well as a generic fallback for other key-value pairs.
|
|
|
+///
|
|
|
+/// # Examples
|
|
|
+///
|
|
|
+/// ```
|
|
|
+/// use your_crate::Format;
|
|
|
+///
|
|
|
+/// let gt = Format::GT("0/1".to_string());
|
|
|
+/// let dp = Format::DP(30);
|
|
|
+/// let ad = Format::AD(vec![10, 20]);
|
|
|
+/// ```
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Encode, Decode)]
|
|
|
pub enum Format {
|
|
|
- // DeepVariant
|
|
|
+ // --- DeepVariant fields ---
|
|
|
+ /// Genotype string, e.g., "0/1", "1/1".
|
|
|
GT(String),
|
|
|
+
|
|
|
+ /// Genotype quality.
|
|
|
GQ(u32),
|
|
|
+
|
|
|
+ /// Read depth (total coverage at the variant position).
|
|
|
DP(u32),
|
|
|
+
|
|
|
+ /// Allelic depths for the ref and alt alleles (e.g., [ref, alt1, alt2...]).
|
|
|
AD(Vec<u32>),
|
|
|
+
|
|
|
+ /// Variant allele frequency (e.g., 0.25 for 25%).
|
|
|
VAF(f32),
|
|
|
+
|
|
|
+ /// Phred-scaled genotype likelihoods.
|
|
|
PL(Vec<u32>),
|
|
|
|
|
|
- // Clairs
|
|
|
- // when format begins with N: normal
|
|
|
- // AF(f32),
|
|
|
- // NAF(f32), // DP(u32),
|
|
|
+ // --- Clairs fields (an `N` prefix marks the normal sample; the unprefixed base counts are for the tumor sample) ---
|
|
|
+ /// Normal sample total depth.
|
|
|
NDP(u32),
|
|
|
+
|
|
|
+ /// Normal sample allelic depths (e.g., [ref, alt1, alt2...]).
|
|
|
NAD(Vec<u32>),
|
|
|
+
|
|
|
+ /// Allele-specific counts for A, C, G, T bases in tumor sample.
|
|
|
AU(u32),
|
|
|
CU(u32),
|
|
|
GU(u32),
|
|
|
TU(u32),
|
|
|
+
|
|
|
+ /// Allele-specific counts for A, C, G, T bases in normal sample.
|
|
|
NAU(u32),
|
|
|
NCU(u32),
|
|
|
NGU(u32),
|
|
|
NTU(u32),
|
|
|
|
|
|
- // nanomonsv
|
|
|
+ // --- nanomonsv fields ---
|
|
|
+ /// Total number of supporting reads in tumor.
|
|
|
TR(u32),
|
|
|
+
|
|
|
+ /// Variant-supporting reads in tumor.
|
|
|
VR(u32),
|
|
|
|
|
|
- Other((String, String)), // (key, value)
|
|
|
+ // --- Severus fields ---
|
|
|
+ DR(u32),
|
|
|
+ DV(u32),
|
|
|
+ HVAF(Vec<f32>),
|
|
|
+
|
|
|
+ /// Fallback for any other key-value pair not explicitly modeled.
|
|
|
+ /// Contains the raw key and value as strings.
|
|
|
+ Other((String, String)),
|
|
|
}
|
|
|
|
|
|
+/// Container for a list of `Format` items.
|
|
|
+/// Represents the full FORMAT field and sample value for one sample.
|
|
|
+///
|
|
|
+/// # Examples
|
|
|
+///
|
|
|
+/// ```
|
|
|
+/// use your_crate::{Formats, Format};
|
|
|
+///
|
|
|
+/// let formats = Formats(vec![
|
|
|
+/// Format::GT("0/1".to_string()),
|
|
|
+/// Format::DP(45),
|
|
|
+/// Format::AD(vec![15, 30]),
|
|
|
+/// ]);
|
|
|
+/// ```
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default, Encode, Decode)]
|
|
|
pub struct Formats(pub Vec<Format>);
|
|
|
|
|
|
impl Formats {
|
|
|
- /// Get the tumoral alternative read depth and total depth as an Option<(u32, u32)>.
|
|
|
+ /// Returns the tumor alternative read depth and total depth if both are available.
|
|
|
+ ///
|
|
|
+ /// This method looks for:
|
|
|
+ /// - `Format::AD`: to compute the sum of alternative allele depths (excluding reference)
|
|
|
+ /// - `Format::DP`: to get total read depth
|
|
|
+ ///
|
|
|
+ /// Returns `Some((alt_depth, total_depth))` if both are present, else `None`.
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// use your_crate::{Formats, Format};
|
|
|
+ ///
|
|
|
+ /// let f = Formats(vec![
|
|
|
+ /// Format::AD(vec![10, 20, 5]),
|
|
|
+ /// Format::DP(40),
|
|
|
+ /// ]);
|
|
|
+ ///
|
|
|
+ /// assert_eq!(f.n_alt_depth(), Some((25, 40)));
|
|
|
+ /// ```
|
|
|
pub fn n_alt_depth(&self) -> Option<(u32, u32)> {
|
|
|
let mut tumor_alt_depth: Option<u32> = None;
|
|
|
let mut tumor_total_depth: Option<u32> = None;
|
|
|
|
|
|
for format in &self.0 {
|
|
|
match format {
|
|
|
- // Tumor Allelic Depth (AD)
|
|
|
Format::AD(values) => {
|
|
|
if values.len() > 1 {
|
|
|
- // Sum all alternative allele depths (excluding reference allele)
|
|
|
tumor_alt_depth = Some(values[1..].iter().sum());
|
|
|
}
|
|
|
}
|
|
|
- // Tumor Total Depth (DP)
|
|
|
Format::DP(value) => {
|
|
|
tumor_total_depth = Some(*value);
|
|
|
}
|
|
|
@@ -802,7 +1225,6 @@ impl Formats {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // Return a tuple (tumor_alt_depth, tumor_total_depth) if both are available
|
|
|
match (tumor_alt_depth, tumor_total_depth) {
|
|
|
(Some(alt), Some(total)) => Some((alt, total)),
|
|
|
_ => None,
|
|
|
@@ -813,6 +1235,22 @@ impl Formats {
|
|
|
impl TryFrom<(&str, &str)> for Formats {
|
|
|
type Error = anyhow::Error;
|
|
|
|
|
|
+ /// Attempts to construct a `Formats` from a pair of colon-separated FORMAT keys and values.
|
|
|
+ ///
|
|
|
+ /// # Arguments
|
|
|
+ /// * `k` - FORMAT field names (e.g., "GT:DP:AD")
|
|
|
+ /// * `v` - Corresponding values (e.g., "0/1:35:10,25")
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if the number of keys and values do not match or if parsing fails.
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// use your_crate::Formats;
|
|
|
+ /// use std::convert::TryFrom;
|
|
|
+ ///
|
|
|
+ /// let f = Formats::try_from(("GT:DP:AD", "0/1:40:12,28")).unwrap();
|
|
|
+ /// ```
|
|
|
fn try_from((k, v): (&str, &str)) -> anyhow::Result<Self> {
|
|
|
let keys: Vec<&str> = k.split(':').collect();
|
|
|
let values: Vec<&str> = v.split(':').collect();
|
|
|
@@ -832,6 +1270,23 @@ impl TryFrom<(&str, &str)> for Formats {
|
|
|
}
|
|
|
|
|
|
impl From<Formats> for (String, String) {
|
|
|
+ /// Converts `Formats` back into a `(keys, values)` tuple of colon-separated strings.
|
|
|
+ ///
|
|
|
+ /// This is the inverse of the `TryFrom<(&str, &str)>` implementation.
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// use your_crate::{Format, Formats};
|
|
|
+ ///
|
|
|
+ /// let formats = Formats(vec![
|
|
|
+ /// Format::GT("0/1".to_string()),
|
|
|
+ /// Format::DP(30),
|
|
|
+ /// ]);
|
|
|
+ ///
|
|
|
+ /// let (k, v): (String, String) = formats.into();
|
|
|
+ /// assert_eq!(k, "GT:DP");
|
|
|
+ /// assert_eq!(v, "0/1:30");
|
|
|
+ /// ```
|
|
|
fn from(formats: Formats) -> Self {
|
|
|
let mut keys = Vec::new();
|
|
|
let mut values = Vec::new();
|
|
|
@@ -849,15 +1304,34 @@ impl From<Formats> for (String, String) {
|
|
|
impl TryFrom<(&str, &str)> for Format {
|
|
|
type Error = anyhow::Error;
|
|
|
|
|
|
+ /// Tries to convert a `(key, value)` pair into a typed `Format` variant.
|
|
|
+ ///
|
|
|
+ /// This parser supports known FORMAT keys from DeepVariant, Clairs, nanomonsv, and Severus.
|
|
|
+ /// Unknown keys are stored as `Format::Other((key, value))`.
|
|
|
+ ///
|
|
|
+ /// # Arguments
|
|
|
+ /// * `key` - FORMAT field name
|
|
|
+ /// * `value` - raw string value associated with the key
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// use your_crate::Format;
|
|
|
+ /// use std::convert::TryFrom;
|
|
|
+ ///
|
|
|
+ /// let dp = Format::try_from(("DP", "42")).unwrap();
|
|
|
+ /// assert!(matches!(dp, Format::DP(42)));
|
|
|
+ /// ```
|
|
|
fn try_from((key, value): (&str, &str)) -> anyhow::Result<Self> {
|
|
|
let format = match key {
|
|
|
+ // DeepVariant
|
|
|
"GT" => Format::GT(value.to_string()),
|
|
|
"GQ" => Format::GQ(parse_value(value, key)?),
|
|
|
"DP" => Format::DP(parse_value(value, key)?),
|
|
|
"AD" => Format::AD(parse_vec_value(value, key)?),
|
|
|
"VAF" => Format::VAF(parse_value(value, key)?),
|
|
|
- // "AF" => Format::AF(parse_value(value, key)?),
|
|
|
- // "NAF" => Format::NAF(parse_value(value, key)?),
|
|
|
+ "PL" => Format::PL(parse_vec_value(value, key)?),
|
|
|
+
|
|
|
+ // Clairs
|
|
|
"NDP" => Format::NDP(parse_value(value, key)?),
|
|
|
"NAD" => Format::NAD(parse_vec_value(value, key)?),
|
|
|
"AU" => Format::AU(parse_value(value, key)?),
|
|
|
@@ -868,9 +1342,17 @@ impl TryFrom<(&str, &str)> for Format {
|
|
|
"NCU" => Format::NCU(parse_value(value, key)?),
|
|
|
"NGU" => Format::NGU(parse_value(value, key)?),
|
|
|
"NTU" => Format::NTU(parse_value(value, key)?),
|
|
|
- "PL" => Format::PL(parse_vec_value(value, key)?),
|
|
|
+
|
|
|
+ // nanomonsv
|
|
|
"TR" => Format::TR(parse_value(value, key)?),
|
|
|
"VR" => Format::VR(parse_value(value, key)?),
|
|
|
+
|
|
|
+ // Severus
|
|
|
+ "DR" => Format::DR(parse_value(value, key)?),
|
|
|
+ "DV" => Format::DV(parse_value(value, key)?),
|
|
|
+ "hVAF" => Format::HVAF(parse_vec_value(value, key)?),
|
|
|
+
|
|
|
+ // fallback
|
|
|
_ => Format::Other((key.to_string(), value.to_string())),
|
|
|
};
|
|
|
Ok(format)
|
|
|
@@ -888,7 +1370,7 @@ where
|
|
|
.context(format!("Can't parse {}: {}", key, value)) // Add context
|
|
|
}
|
|
|
|
|
|
-// Helper function to parse comma-separated values (DeepSeek)
|
|
|
+// Helper function to parse comma-separated values
|
|
|
fn parse_vec_value<T: std::str::FromStr>(value: &str, key: &str) -> anyhow::Result<Vec<T>>
|
|
|
where
|
|
|
T::Err: std::fmt::Debug,
|
|
|
@@ -904,27 +1386,47 @@ where
|
|
|
}
|
|
|
|
|
|
impl From<Format> for (String, String) {
|
|
|
+ /// Converts a `Format` enum into a `(key, value)` pair, as strings.
|
|
|
+ ///
|
|
|
+ /// This is used to serialize the FORMAT field back into VCF-compatible string values.
|
|
|
+ /// The key corresponds to the field ID (e.g., `"DP"`, `"GT"`), and the value is the encoded string representation.
|
|
|
+ ///
|
|
|
+ /// # Examples
|
|
|
+ /// ```
|
|
|
+ /// use your_crate::Format;
|
|
|
+ /// let f = Format::DP(42);
|
|
|
+ /// let (k, v): (String, String) = f.into();
|
|
|
+ /// assert_eq!(k, "DP");
|
|
|
+ /// assert_eq!(v, "42");
|
|
|
+ /// ```
|
|
|
fn from(format: Format) -> Self {
|
|
|
- let concat = |values: Vec<u32>| -> String {
|
|
|
+ let concat_u32 = |values: Vec<u32>| -> String {
|
|
|
values
|
|
|
.iter()
|
|
|
- .map(|v| v.to_string())
|
|
|
+ .map(u32::to_string)
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ .join(",")
|
|
|
+ };
|
|
|
+ let concat_f32 = |values: Vec<f32>| -> String {
|
|
|
+ values
|
|
|
+ .iter()
|
|
|
+ .map(|v| format!("{:.5}", v)) // consistent decimal format
|
|
|
.collect::<Vec<_>>()
|
|
|
.join(",")
|
|
|
};
|
|
|
|
|
|
match format {
|
|
|
+ // DeepVariant
|
|
|
Format::GT(value) => ("GT".to_string(), value),
|
|
|
Format::GQ(value) => ("GQ".to_string(), value.to_string()),
|
|
|
Format::DP(value) => ("DP".to_string(), value.to_string()),
|
|
|
- Format::AD(values) => ("AD".to_string(), concat(values)),
|
|
|
- Format::VAF(value) => ("VAF".to_string(), value.to_string()),
|
|
|
- Format::PL(values) => ("PL".to_string(), concat(values)),
|
|
|
- Format::Other((key, value)) => (key, value),
|
|
|
- // Format::AF(value) => ("AF".to_string(), value.to_string()),
|
|
|
- // Format::NAF(value) => ("NAF".to_string(), value.to_string()),
|
|
|
+ Format::AD(values) => ("AD".to_string(), concat_u32(values)),
|
|
|
+ Format::VAF(value) => ("VAF".to_string(), format!("{:.5}", value)),
|
|
|
+ Format::PL(values) => ("PL".to_string(), concat_u32(values)),
|
|
|
+
|
|
|
+ // Clairs
|
|
|
Format::NDP(value) => ("NDP".to_string(), value.to_string()),
|
|
|
- Format::NAD(values) => ("NAD".to_string(), concat(values)),
|
|
|
+ Format::NAD(values) => ("NAD".to_string(), concat_u32(values)),
|
|
|
Format::AU(value) => ("AU".to_string(), value.to_string()),
|
|
|
Format::CU(value) => ("CU".to_string(), value.to_string()),
|
|
|
Format::GU(value) => ("GU".to_string(), value.to_string()),
|
|
|
@@ -933,8 +1435,18 @@ impl From<Format> for (String, String) {
|
|
|
Format::NCU(value) => ("NCU".to_string(), value.to_string()),
|
|
|
Format::NGU(value) => ("NGU".to_string(), value.to_string()),
|
|
|
Format::NTU(value) => ("NTU".to_string(), value.to_string()),
|
|
|
+
|
|
|
+ // nanomonsv
|
|
|
Format::TR(value) => ("TR".to_string(), value.to_string()),
|
|
|
Format::VR(value) => ("VR".to_string(), value.to_string()),
|
|
|
+
|
|
|
+ // Severus
|
|
|
+ Format::DR(value) => ("DR".to_string(), value.to_string()),
|
|
|
+ Format::DV(value) => ("DV".to_string(), value.to_string()),
|
|
|
+ Format::HVAF(values) => ("hVAF".to_string(), concat_f32(values)),
|
|
|
+
|
|
|
+ // fallback
|
|
|
+ Format::Other((key, value)) => (key, value),
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -963,6 +1475,99 @@ impl Formats {
|
|
|
|
|
|
Formats(filtered_vec)
|
|
|
}
|
|
|
+
|
|
|
+ /// Returns a sorted set of VCF header definitions for all possible `Format` fields.
|
|
|
+ ///
|
|
|
+ /// # Example
|
|
|
+ /// ```
|
|
|
+ /// let headers = Formats::format_headers();
|
|
|
+ /// for h in headers {
|
|
|
+ /// println!("{}", h);
|
|
|
+ /// }
|
|
|
+ /// ```
|
|
|
+ pub fn format_headers() -> BTreeSet<String> {
|
|
|
+ let mut headers = BTreeSet::new();
|
|
|
+
|
|
|
+ headers
|
|
|
+ .insert(r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#.to_string());
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">"#.to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">"#.to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(r#"##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles">"#.to_string());
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(r#"##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods">"#.to_string());
|
|
|
+
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NDP,Number=1,Type=Integer,Description="Normal sample read depth">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NAD,Number=R,Type=Integer,Description="Normal sample allelic depths">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=AU,Number=1,Type=Integer,Description="Tumor A allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=CU,Number=1,Type=Integer,Description="Tumor C allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=GU,Number=1,Type=Integer,Description="Tumor G allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=TU,Number=1,Type=Integer,Description="Tumor T allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NAU,Number=1,Type=Integer,Description="Normal A allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NCU,Number=1,Type=Integer,Description="Normal C allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NGU,Number=1,Type=Integer,Description="Normal G allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=NTU,Number=1,Type=Integer,Description="Normal T allele count">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+
|
|
|
+ headers.insert(r#"##FORMAT=<ID=TR,Number=1,Type=Integer,Description="Total supporting reads (tumor)">"#.to_string());
|
|
|
+ headers.insert(r#"##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Variant-supporting reads (tumor)">"#.to_string());
|
|
|
+
|
|
|
+ // Severus
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=DR,Number=1,Type=Integer,Description="Number of reference reads">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+
|
|
|
+ headers.insert(
|
|
|
+ r#"##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of variant reads">"#
|
|
|
+ .to_string(),
|
|
|
+ );
|
|
|
+ headers.insert(r#"##FORMAT=<ID=hVAF,Number=3,Type=Float,Description="Haplotype specific variant Allele frequency (H0,H1,H2)">"#.to_string());
|
|
|
+
|
|
|
+ // headers.insert(
|
|
|
+ // r#"##FORMAT=<ID=Other,Number=.,Type=String,Description="Unspecified FORMAT field">"#
|
|
|
+ // .to_string(),
|
|
|
+ // );
|
|
|
+
|
|
|
+ headers
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/// Filter
|