|
|
@@ -10,20 +10,34 @@ use rayon::prelude::*;
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
use std::{cmp::Ordering, collections::HashSet, fmt, hash::Hash, str::FromStr};
|
|
|
|
|
|
+/// Represents a variant in the Variant Call Format (VCF).
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
pub struct VcfVariant {
|
|
|
+ /// A 128-bit hash of the variant's key properties for efficient comparison and storage.
|
|
|
pub hash: Hash128,
|
|
|
+ /// The genomic position of the variant.
|
|
|
pub position: GenomePosition,
|
|
|
+ /// The identifier of the variant.
|
|
|
pub id: String,
|
|
|
+ /// The reference allele.
|
|
|
pub reference: ReferenceAlternative,
|
|
|
+ /// The alternative allele.
|
|
|
pub alternative: ReferenceAlternative,
|
|
|
+ /// The quality score of the variant call, if available.
|
|
|
pub quality: Option<f32>,
|
|
|
+ /// The filter status of the variant.
|
|
|
pub filter: Filter,
|
|
|
+ /// Additional information about the variant.
|
|
|
pub infos: Infos,
|
|
|
+ /// Genotype information and other sample-specific data.
|
|
|
pub formats: Formats,
|
|
|
}
|
|
|
|
|
|
impl PartialEq for VcfVariant {
|
|
|
+ /// Compares two VcfVariants for equality.
|
|
|
+ ///
|
|
|
+ /// Note: This comparison only considers position, reference, and alternative.
|
|
|
+ /// It intentionally ignores id, filter, info, format, and quality.
|
|
|
fn eq(&self, other: &Self) -> bool {
|
|
|
// Nota bene: id, filter, info, format and quality is intentionally not compared
|
|
|
self.position == other.position
|
|
|
@@ -37,6 +51,13 @@ impl Eq for VcfVariant {}
|
|
|
impl FromStr for VcfVariant {
|
|
|
type Err = anyhow::Error;
|
|
|
|
|
|
+ /// Parses a VcfVariant from a string representation.
|
|
|
+ ///
|
|
|
+ /// The input string is expected to be a tab-separated VCF line.
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ ///
|
|
|
+ /// Returns an error if parsing fails for any field.
|
|
|
fn from_str(s: &str) -> anyhow::Result<Self> {
|
|
|
let v: Vec<&str> = s.split('\t').collect();
|
|
|
let vcf_position: VcfPosition = (
|
|
|
@@ -108,6 +129,10 @@ impl FromStr for VcfVariant {
|
|
|
|
|
|
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ADJAGBA_diag
|
|
|
impl VcfVariant {
|
|
|
+ /// Converts the VcfVariant into a VCF-formatted row string.
|
|
|
+ ///
|
|
|
+ /// This method creates a tab-separated string representation of the variant,
|
|
|
+ /// suitable for writing to a VCF file.
|
|
|
pub fn into_vcf_row(&self) -> String {
|
|
|
let vcf_position: VcfPosition = self.position.clone().into();
|
|
|
let (contig, position) = vcf_position.into();
|
|
|
@@ -134,10 +159,15 @@ impl VcfVariant {
|
|
|
columns.join("\t")
|
|
|
}
|
|
|
|
|
|
+ /// Returns the hash of the variant.
|
|
|
pub fn hash(&self) -> Hash128 {
|
|
|
self.hash
|
|
|
}
|
|
|
|
|
|
+ /// Creates a new VcfVariant with common attributes from DeepVariant and CLAIRS.
|
|
|
+ ///
|
|
|
+ /// This method generates a new variant with shared properties, resetting some fields
|
|
|
+ /// to default or empty values.
|
|
|
pub fn commun_deepvariant_clairs(&self) -> VcfVariant {
|
|
|
VcfVariant {
|
|
|
hash: self.hash,
|
|
|
@@ -152,10 +182,16 @@ impl VcfVariant {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /// Checks if the variant has an SVTYPE info field.
|
|
|
+ ///
|
|
|
+ /// Returns true if the variant contains structural variation type information.
|
|
|
pub fn has_svtype(&self) -> bool {
|
|
|
self.infos.0.iter().any(|i| matches!(i, Info::SVTYPE(_)))
|
|
|
}
|
|
|
|
|
|
+ /// Retrieves the structural variation type of the variant, if present.
|
|
|
+ ///
|
|
|
+ /// Returns Some(SVType) if the variant has an SVTYPE info field, or None otherwise.
|
|
|
pub fn svtype(&self) -> Option<SVType> {
|
|
|
self.infos.0.iter().find_map(|e| {
|
|
|
if let Info::SVTYPE(sv_type) = e {
|
|
|
@@ -166,6 +202,25 @@ impl VcfVariant {
|
|
|
})
|
|
|
}
|
|
|
|
|
|
+ /// Determines the alteration category of the variant.
|
|
|
+ ///
|
|
|
+ /// This method analyzes the reference and alternative alleles to classify
|
|
|
+ /// the variant into one of several alteration categories:
|
|
|
+ /// - SNV (Single Nucleotide Variant)
|
|
|
+ /// - INS (Insertion)
|
|
|
+ /// - DEL (Deletion)
|
|
|
+ /// - Other (including structural variants and complex alterations)
|
|
|
+ ///
|
|
|
+ /// The classification is based on the following rules:
|
|
|
+ /// 1. If both reference and alternative are single nucleotides, it's an SNV.
|
|
|
+ /// 2. If reference is a single nucleotide and alternative is multiple nucleotides, it's an insertion.
|
|
|
+ /// 3. If reference is multiple nucleotides and alternative is a single nucleotide, it's a deletion.
|
|
|
+ /// 4. For cases where both are multiple nucleotides, the longer one determines if it's an insertion or deletion.
|
|
|
+ /// 5. If none of the above apply, it checks for structural variant types.
|
|
|
+ /// 6. If no structural variant type is found, it's classified as "Other".
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// An `AlterationCategory` enum representing the type of alteration.
|
|
|
pub fn alteration_category(&self) -> AlterationCategory {
|
|
|
match (&self.reference, &self.alternative) {
|
|
|
(ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
|
|
|
@@ -384,7 +439,7 @@ impl Ord for VcfVariant {
|
|
|
|
|
|
/// Info
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
|
|
|
-pub struct Infos(Vec<Info>);
|
|
|
+pub struct Infos(pub Vec<Info>);
|
|
|
|
|
|
impl FromStr for Infos {
|
|
|
type Err = anyhow::Error;
|
|
|
@@ -642,7 +697,7 @@ pub enum Format {
|
|
|
}
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
|
|
|
-pub struct Formats(Vec<Format>);
|
|
|
+pub struct Formats(pub Vec<Format>);
|
|
|
|
|
|
impl TryFrom<(&str, &str)> for Formats {
|
|
|
type Error = anyhow::Error;
|