| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743 |
- use anyhow::anyhow;
- use hashbrown::HashMap;
- use itertools::Itertools;
- use log::{debug, warn};
- use serde::{Deserialize, Serialize};
- use std::{
- cmp::{Ordering, Reverse},
- io::{BufRead, BufReader},
- process::{Command, Stdio},
- str::FromStr,
- };
- use super::ncbi::NCBIAcc;
- /// Represents a single line of output from the Variant Effect Predictor (VEP).
- ///
- /// This struct encapsulates all fields typically found in a VEP output line,
- /// providing a structured way to work with VEP results.
- #[derive(Debug, PartialEq, Serialize, Deserialize)]
- pub struct VepLine {
- /// The identifier of the input variant
- pub uploaded_variation: String,
- /// The genomic location of the variant
- pub location: String,
- /// The variant allele
- pub allele: String,
- /// The gene symbol or identifier
- pub gene: String,
- /// The affected feature (e.g., transcript ID)
- pub feature: String,
- /// The type of feature affected (e.g., Transcript, RegulatoryFeature)
- pub feature_type: String,
- /// The consequence of the variant
- pub consequence: String,
- /// The position of the variant in the cDNA
- pub cdna_position: String,
- /// The position of the variant in the CDS
- pub cds_position: String,
- /// The position of the variant in the protein
- pub protein_position: String,
- /// The amino acid change caused by the variant
- pub amino_acids: String,
- /// The codon change caused by the variant
- pub codons: String,
- /// Known identifiers for this variant
- pub existing_variation: String,
- /// Additional information in key=value format
- pub extra: String,
- }
- impl FromStr for VepLine {
- type Err = anyhow::Error;
- /// Parses a VEP output line into a VepLine struct.
- ///
- /// This method expects the input string to be a tab-separated line
- /// containing exactly 14 fields, as per standard VEP output format.
- ///
- /// # Arguments
- /// * `s` - A string slice representing a single line of VEP output
- ///
- /// # Returns
- /// * `Ok(VepLine)` if parsing is successful
- /// * `Err(anyhow::Error)` if the input doesn't contain the expected number of fields
- ///
- /// # Example
- /// ```
- /// let vep_line = VepLine::from_str("").unwrap();
- /// assert_eq!(vep_line.gene, "ENSG00000135744");
- /// ```
- fn from_str(s: &str) -> Result<Self, Self::Err> {
- let parts: Vec<&str> = s.split('\t').collect();
- if parts.len() != 14 {
- return Err(anyhow!("Invalid number of fields in VEP line"));
- }
- Ok(VepLine {
- uploaded_variation: parts[0].to_string(),
- location: parts[1].to_string(),
- allele: parts[2].to_string(),
- gene: parts[3].to_string(),
- feature: parts[4].to_string(),
- feature_type: parts[5].to_string(),
- consequence: parts[6].to_string(),
- cdna_position: parts[7].to_string(),
- cds_position: parts[8].to_string(),
- protein_position: parts[9].to_string(),
- amino_acids: parts[10].to_string(),
- codons: parts[11].to_string(),
- existing_variation: parts[12].to_string(),
- extra: parts[13].to_string(),
- })
- }
- }
- /// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant.
- ///
- /// This struct encapsulates various fields that describe the predicted effects
- /// of a variant on genes, transcripts, and proteins, as determined by VEP.
- #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
- pub struct VEP {
- /// The gene affected by the variant
- pub gene: Option<String>,
- /// The specific feature (e.g., transcript) affected by the variant
- pub feature: Option<String>,
- /// The type of feature (e.g., Transcript, RegulatoryFeature)
- pub feature_type: Option<String>,
- /// The predicted consequence(s) of the variant
- pub consequence: Option<Vec<VepConsequence>>,
- /// The position of the variant in the cDNA sequence
- pub cdna_position: Option<String>,
- /// The position of the variant in the coding sequence
- pub cds_position: Option<String>,
- /// The position of the variant in the protein sequence
- pub protein_position: Option<String>,
- /// The amino acid change caused by the variant
- pub amino_acids: Option<String>,
- /// The codon change caused by the variant
- pub codons: Option<String>,
- /// Known identifiers for this variant
- pub existing_variation: Option<String>,
- /// Additional VEP information
- pub extra: VEPExtra,
- }
- /// Represents the predicted consequences of a genetic variant as determined by
- /// the Ensembl Variant Effect Predictor (VEP).
- ///
- /// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe
- /// the effects of variants on transcripts, proteins, and regulatory regions.
- ///
- /// The variants are generally ordered from most to least severe in terms of their
- /// potential impact on gene function.
- ///
- /// For more information, see:
- /// <https://ensembl.org/info/genome/variation/prediction/predicted_data.html>
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
- pub enum VepConsequence {
- /// Complete destruction of a transcript
- TranscriptAblation,
- /// Variant at the splice acceptor site
- SpliceAcceptorVariant,
- /// Variant at the splice donor site
- SpliceDonorVariant,
- /// Variant causing a premature stop codon
- StopGained,
- /// Insertion or deletion causing a frameshift
- FrameshiftVariant,
- /// Variant causing loss of stop codon
- StopLost,
- /// Variant causing loss of start codon
- StartLost,
- /// Amplification of a transcript
- TranscriptAmplification,
- /// Insertion not causing a frameshift
- InframeInsertion,
- /// Deletion not causing a frameshift
- InframeDeletion,
- /// Variant causing an amino acid change
- MissenseVariant,
- /// Variant that changes protein structure
- ProteinAlteringVariant,
- /// Variant at the 5th base of a splice donor site
- SpliceDonor5thBaseVariant,
- /// Variant in the splice region
- SpliceRegionVariant,
- /// Variant in the splice donor region
- SpliceDonorRegionVariant,
- /// Variant in the polypyrimidine tract near splice sites
- SplicePolyrimidineTractVariant,
- /// Variant in the final, incomplete codon of a transcript
- IncompleteTerminalCodonVariant,
- /// Variant in start codon resulting in no change
- StartRetainedVariant,
- /// Variant in stop codon resulting in no change
- StopRetainedVariant,
- /// Variant not changing the encoded amino acid
- SynonymousVariant,
- /// Variant in coding sequence with indeterminate effect
- CodingSequenceVariant,
- /// Variant in mature miRNA
- MatureMiRnaVariant,
- /// Variant in 5' UTR
- FivePrimeUtrVariant,
- /// Variant in 3' UTR
- ThreePrimeUtrVariant,
- /// Variant in non-coding exon
- NonCodingTranscriptExonVariant,
- /// Variant in intron
- IntronVariant,
- /// Variant in transcript subject to nonsense-mediated decay
- NmdTranscriptVariant,
- /// Variant in non-coding transcript
- NonCodingTranscriptVariant,
- /// Variant upstream of a gene
- UpstreamGeneVariant,
- /// Variant downstream of a gene
- DownstreamGeneVariant,
- /// Deletion of a transcription factor binding site
- TfbsAblation,
- /// Amplification of a transcription factor binding site
- TfbsAmplification,
- /// Variant in a transcription factor binding site
- TfBindingSiteVariant,
- /// Deletion of a regulatory region
- RegulatoryRegionAblation,
- /// Amplification of a regulatory region
- RegulatoryRegionAmplification,
- /// Variant causing a feature to be extended
- FeatureElongation,
- /// Variant in a regulatory region
- RegulatoryRegionVariant,
- /// Variant causing a feature to be shortened
- FeatureTruncation,
- /// Variant in intergenic region
- IntergenicVariant,
- /// General sequence variant
- SequenceVariant,
- }
- /// Represents the severity of a variant's impact as predicted by the
- /// Variant Effect Predictor (VEP).
- ///
- /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER).
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
- pub enum VepImpact {
- /// High impact variants are expected to have high (disruptive) impact in the protein,
- /// probably causing protein truncation, loss of function or triggering nonsense mediated decay.
- HIGH,
- /// Moderate impact variants are non-disruptive variants that might change protein effectiveness.
- MODERATE,
- /// Low impact variants are mostly harmless or unlikely to change protein behavior.
- LOW,
- /// Modifier variants are usually non-coding variants or variants affecting non-coding genes,
- /// where predictions are difficult or there is no evidence of impact.
- MODIFIER,
- }
- impl From<VepImpact> for String {
- /// Converts a VepImpact enum variant to its string representation.
- fn from(value: VepImpact) -> Self {
- match value {
- VepImpact::HIGH => "High",
- VepImpact::MODERATE => "Moderate",
- VepImpact::LOW => "Low",
- VepImpact::MODIFIER => "Modifier",
- }
- .to_string()
- }
- }
- impl PartialOrd for VepImpact {
- /// Implements partial ordering for VepImpact.
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
- }
- impl Ord for VepImpact {
- /// Implements total ordering for VepImpact.
- ///
- /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER
- /// This allows for easy sorting where the most severe impacts come first.
- fn cmp(&self, other: &Self) -> Ordering {
- match (self, other) {
- (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal,
- (VepImpact::HIGH, _) => Ordering::Less,
- (VepImpact::MODERATE, VepImpact::HIGH) => Ordering::Greater,
- (VepImpact::MODERATE, VepImpact::MODERATE) => Ordering::Equal,
- (VepImpact::MODERATE, _) => Ordering::Less,
- (VepImpact::LOW, VepImpact::HIGH | VepImpact::MODERATE) => Ordering::Greater,
- (VepImpact::LOW, VepImpact::LOW) => Ordering::Equal,
- (VepImpact::LOW, VepImpact::MODIFIER) => Ordering::Less,
- (VepImpact::MODIFIER, VepImpact::MODIFIER) => Ordering::Equal,
- (VepImpact::MODIFIER, _) => Ordering::Greater,
- }
- }
- }
- impl FromStr for VepImpact {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "LOW" => Ok(VepImpact::LOW),
- "MODERATE" => Ok(VepImpact::MODERATE),
- "HIGH" => Ok(VepImpact::HIGH),
- "MODIFIER" => Ok(VepImpact::MODIFIER),
- _ => Err(anyhow!("Unexpected VEP Impact value")),
- }
- }
- }
- impl From<&VepConsequence> for VepImpact {
- fn from(consequence: &VepConsequence) -> Self {
- match consequence {
- VepConsequence::TranscriptAblation
- | VepConsequence::SpliceAcceptorVariant
- | VepConsequence::SpliceDonorVariant
- | VepConsequence::StopGained
- | VepConsequence::FrameshiftVariant
- | VepConsequence::StopLost
- | VepConsequence::StartLost
- | VepConsequence::TranscriptAmplification
- | VepConsequence::FeatureElongation
- | VepConsequence::FeatureTruncation => VepImpact::HIGH,
- VepConsequence::InframeInsertion
- | VepConsequence::InframeDeletion
- | VepConsequence::MissenseVariant
- | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE,
- VepConsequence::SpliceDonor5thBaseVariant
- | VepConsequence::SpliceRegionVariant
- | VepConsequence::SpliceDonorRegionVariant
- | VepConsequence::SplicePolyrimidineTractVariant
- | VepConsequence::IncompleteTerminalCodonVariant
- | VepConsequence::StartRetainedVariant
- | VepConsequence::StopRetainedVariant
- | VepConsequence::SynonymousVariant => VepImpact::LOW,
- VepConsequence::CodingSequenceVariant
- | VepConsequence::MatureMiRnaVariant
- | VepConsequence::FivePrimeUtrVariant
- | VepConsequence::ThreePrimeUtrVariant
- | VepConsequence::NonCodingTranscriptExonVariant
- | VepConsequence::IntronVariant
- | VepConsequence::NmdTranscriptVariant
- | VepConsequence::NonCodingTranscriptVariant
- | VepConsequence::UpstreamGeneVariant
- | VepConsequence::DownstreamGeneVariant
- | VepConsequence::TfbsAblation
- | VepConsequence::TfbsAmplification
- | VepConsequence::TfBindingSiteVariant
- | VepConsequence::RegulatoryRegionAblation
- | VepConsequence::RegulatoryRegionAmplification
- | VepConsequence::RegulatoryRegionVariant
- | VepConsequence::SequenceVariant
- | VepConsequence::IntergenicVariant => VepImpact::MODIFIER,
- }
- }
- }
- impl From<VepConsequence> for String {
- fn from(consequence: VepConsequence) -> Self {
- match consequence {
- VepConsequence::TranscriptAblation => "transcript_ablation".to_string(),
- VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(),
- VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(),
- VepConsequence::StopGained => "stop_gained".to_string(),
- VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(),
- VepConsequence::StopLost => "stop_lost".to_string(),
- VepConsequence::StartLost => "start_lost".to_string(),
- VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(),
- VepConsequence::InframeInsertion => "inframe_insertion".to_string(),
- VepConsequence::InframeDeletion => "inframe_deletion".to_string(),
- VepConsequence::MissenseVariant => "missense_variant".to_string(),
- VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(),
- VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(),
- VepConsequence::IncompleteTerminalCodonVariant => {
- "incomplete_terminal_codon_variant".to_string()
- }
- VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(),
- VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(),
- VepConsequence::SynonymousVariant => "synonymous_variant".to_string(),
- VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(),
- VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(),
- VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(),
- VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(),
- VepConsequence::NonCodingTranscriptExonVariant => {
- "non_coding_transcript_exon_variant".to_string()
- }
- VepConsequence::IntronVariant => "intron_variant".to_string(),
- VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(),
- VepConsequence::NonCodingTranscriptVariant => {
- "non_coding_transcript_variant".to_string()
- }
- VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(),
- VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(),
- VepConsequence::TfbsAblation => "TFBS_ablation".to_string(),
- VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(),
- VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(),
- VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(),
- VepConsequence::RegulatoryRegionAmplification => {
- "regulatory_region_amplification".to_string()
- }
- VepConsequence::FeatureElongation => "feature_elongation".to_string(),
- VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(),
- VepConsequence::FeatureTruncation => "feature_truncation".to_string(),
- VepConsequence::SpliceDonor5thBaseVariant => {
- "splice_donor_5th_base_variant".to_string()
- }
- VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(),
- VepConsequence::SplicePolyrimidineTractVariant => {
- "splice_polyrimidine_tract_variant".to_string()
- }
- VepConsequence::SequenceVariant => "sequence_variant".to_string(),
- VepConsequence::IntergenicVariant => "intergenic_variant".to_string(),
- }
- }
- }
- impl FromStr for VepConsequence {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- match s {
- "transcript_ablation" => Ok(VepConsequence::TranscriptAblation),
- "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant),
- "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant),
- "stop_gained" => Ok(VepConsequence::StopGained),
- "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant),
- "stop_lost" => Ok(VepConsequence::StopLost),
- "start_lost" => Ok(VepConsequence::StartLost),
- "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification),
- "feature_elongation" => Ok(VepConsequence::FeatureElongation),
- "feature_truncation" => Ok(VepConsequence::FeatureTruncation),
- "inframe_insertion" => Ok(VepConsequence::InframeInsertion),
- "inframe_deletion" => Ok(VepConsequence::InframeDeletion),
- "missense_variant" => Ok(VepConsequence::MissenseVariant),
- "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant),
- "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant),
- "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant),
- "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant),
- "splice_polypyrimidine_tract_variant" => {
- Ok(VepConsequence::SplicePolyrimidineTractVariant)
- }
- "incomplete_terminal_codon_variant" => {
- Ok(VepConsequence::IncompleteTerminalCodonVariant)
- }
- "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant),
- "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant),
- "synonymous_variant" => Ok(VepConsequence::SynonymousVariant),
- "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant),
- "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant),
- "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant),
- "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant),
- "non_coding_transcript_exon_variant" => {
- Ok(VepConsequence::NonCodingTranscriptExonVariant)
- }
- "intron_variant" => Ok(VepConsequence::IntronVariant),
- "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant),
- "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant),
- "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant),
- "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant),
- "TFBS_ablation" => Ok(VepConsequence::TfbsAblation),
- "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification),
- "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant),
- "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation),
- "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification),
- "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant),
- "intergenic_variant" => Ok(VepConsequence::IntergenicVariant),
- "sequence_variant" => Ok(VepConsequence::SequenceVariant),
- _ => Err(anyhow!("Unknown VepConsequence: {s}")),
- }
- }
- }
- impl TryFrom<&VepLine> for VEP {
- type Error = anyhow::Error;
- /// Attempts to convert a VepLine reference into a VEP struct.
- ///
- /// This implementation provides a way to transform the raw VEP output line
- /// (represented by VepLine) into a more structured and typed VEP representation.
- ///
- /// # Arguments
- /// * `d` - A reference to a VepLine struct
- ///
- /// # Returns
- /// * `Ok(VEP)` if the conversion is successful
- /// * `Err(anyhow::Error)` if any parsing errors occur
- ///
- /// # Behavior
- /// - Converts "-" strings to None for optional fields
- /// - Parses the consequence field into a `Vec<VepConsequence>`
- /// - Parses the extra field into a VEPExtra struct
- ///
- /// # Example
- /// ```
- /// let vep_line = VepLine { /* ... */ };
- /// match VEP::try_from(&vep_line) {
- /// Ok(vep) => println!("Converted VEP: {:?}", vep),
- /// Err(e) => eprintln!("Conversion failed: {}", e),
- /// }
- /// ```
- fn try_from(d: &VepLine) -> anyhow::Result<Self> {
- let or_opt = |s: &str| match s {
- "-" => None,
- _ => Some(s.to_string()),
- };
- let consequence = or_opt(&d.consequence).map(|c| {
- c.split(',')
- .filter_map(|e| e.parse::<VepConsequence>().ok())
- .collect::<Vec<VepConsequence>>()
- });
- Ok(VEP {
- gene: or_opt(&d.gene),
- feature: or_opt(&d.feature),
- feature_type: or_opt(&d.feature_type),
- consequence,
- cdna_position: or_opt(&d.feature_type),
- cds_position: or_opt(&d.cds_position),
- protein_position: or_opt(&d.protein_position),
- amino_acids: or_opt(&d.amino_acids),
- codons: or_opt(&d.codons),
- existing_variation: or_opt(&d.existing_variation),
- extra: d.extra.parse()?,
- })
- }
- }
- /// Represents additional information from Variant Effect Predictor (VEP) annotations.
- ///
- /// This struct encapsulates various optional fields that provide extra context
- /// about a variant's predicted effect.
- #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
- pub struct VEPExtra {
- /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER)
- pub impact: Option<VepImpact>,
- /// The gene symbol associated with the variant
- pub symbol: Option<String>,
- /// The distance to the feature (e.g., for upstream/downstream variants)
- pub distance: Option<u32>,
- /// HGVS notation describing the variant at the coding sequence level
- pub hgvs_c: Option<String>,
- /// HGVS notation describing the variant at the protein level
- pub hgvs_p: Option<String>,
- }
- impl FromStr for VEPExtra {
- type Err = anyhow::Error;
- /// Parses a VEPExtra instance from a string representation.
- ///
- /// This method interprets a semicolon-separated key-value string typically
- /// found in VEP output and constructs a VEPExtra struct from it.
- ///
- /// # Arguments
- /// * `s` - A string slice containing the VEP extra information
- ///
- /// # Returns
- /// * `Ok(VEPExtra)` if parsing is successful
- /// * `Err(anyhow::Error)` if parsing fails
- ///
- /// # Example
- /// ```
- /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?");
- /// assert!(vep_extra.is_ok());
- /// ```
- ///
- /// # Note
- /// This method is flexible and will not fail if some fields are missing.
- /// It will simply set those fields to None in the resulting struct.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect();
- let impact = kv.get("IMPACT").map(|&v| v.parse()).transpose()?;
- let symbol = kv.get("SYMBOL").map(ToString::to_string);
- let distance = kv.get("DISTANCE").map(|&v| v.parse()).transpose()?;
- let hgvs_c = kv.get("HGVSc").map(ToString::to_string);
- let hgvs_p = kv.get("HGVSp").map(ToString::to_string);
- Ok(VEPExtra {
- impact,
- symbol,
- distance,
- hgvs_c,
- hgvs_p,
- })
- }
- }
- /// Runs the Variant Effect Predictor (VEP) on a given input file and outputs the results.
- ///
- /// This function executes the VEP tool with specific parameters to annotate genetic variants.
- /// It uses a predefined set of reference data and plugins to enhance the annotation process.
- ///
- /// # Arguments
- /// * `in_path` - A string slice that holds the path to the input file
- /// * `out_path` - A string slice that holds the path where the output should be written
- ///
- /// # Returns
- /// * `Ok(())` if VEP runs successfully
- /// * `Err(anyhow::Error)` if there's an error during execution
- ///
- /// # Configuration
- /// The function uses hardcoded paths for:
- /// - VEP binary directory
- /// - VEP cache directory
- /// - Reference FASTA file
- /// - GFF annotation file
- ///
- /// # VEP Parameters
- /// - Uses offline cache
- /// - Includes symbol information
- /// - Applies SpliceRegion and Downstream plugins
- /// - Generates HGVS notations
- ///
- /// # Error Handling
- /// - Logs any lines containing "error" from VEP's stderr output as warnings
- /// - Returns an error if the VEP process fails to complete
- ///
- /// # Example
- /// ```
- /// match run_vep("input.vcf", "output.vcf") {
- /// Ok(()) => println!("VEP annotation completed successfully"),
- /// Err(e) => eprintln!("VEP annotation failed: {}", e),
- /// }
- /// ```
- ///
- /// # Note
- /// Ensure that the VEP tool and all necessary reference data are correctly installed
- /// and accessible at the specified paths before running this function.
- pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
- // VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
- debug!("Run VEP for {in_path} and ouput {out_path}");
- let bin_dir = "/data/tools/ensembl-vep";
- let dir_cache = "/data/ref/hs1/vepcache/";
- let fasta = "/data/ref/hs1/chm13v2.0.fa";
- let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz";
- let mut cmd = Command::new(format!("{}/vep", bin_dir))
- .arg("--dir_cache")
- .arg(dir_cache)
- .arg("--cache")
- .arg("--offline")
- .arg("--fasta")
- .arg(fasta)
- .arg("--gff")
- .arg(gff)
- .arg("--symbol")
- .arg("--plugin")
- .arg("SpliceRegion")
- .arg("--plugin")
- .arg("Downstream")
- .arg("--hgvs")
- .arg("-i")
- .arg(in_path)
- .arg("-o")
- .arg(out_path)
- .stderr(Stdio::piped())
- .spawn()
- .expect("VEP failed to start");
- let stderr = cmd.stderr.take().unwrap();
- let reader = BufReader::new(stderr);
- reader
- .lines()
- .map_while(Result::ok)
- // .inspect(|y| println!("{y}"))
- .filter(|line| line.contains("error"))
- .for_each(|line| warn!("{}", line));
- cmd.wait()?;
- Ok(())
- }
- /// Selects the best Variant Effect Predictor (VEP) annotation from a slice of VEP annotations.
- ///
- /// This function implements a prioritization algorithm to choose the most relevant VEP annotation
- /// based on impact severity and transcript preference.
- ///
- /// # Arguments
- /// * `d` - A slice of VEP annotations
- ///
- /// # Returns
- /// * `Ok(VEP)` containing the best VEP annotation if successful
- /// * `Err(anyhow::Error)` if no valid VEP annotation can be selected
- ///
- /// # Algorithm
- /// 1. Groups VEPs by their impact (HIGH > MODERATE > LOW > MODIFIER)
- /// 2. Selects the group with the highest impact
- /// 3. If there's only one VEP in this group, it's returned
- /// 4. Otherwise, further filtering is done based on transcript identifiers:
- /// - Parses NCBI accessions from the feature field
- /// - Prioritizes 'NM' prefixed accessions over others
- /// - Sorts by accession number (higher numbers are preferred)
- ///
- /// # Errors
- /// This function will return an error if:
- /// * The input slice is empty
- /// * No valid NCBI accession can be parsed from the selected VEPs
- ///
- /// # Example
- /// ```
- /// let veps = vec![VEP { /* ... */ }, VEP { /* ... */ }];
- /// match get_best_vep(&veps) {
- /// Ok(best_vep) => println!("Best VEP: {:?}", best_vep),
- /// Err(e) => eprintln!("Failed to get best VEP: {}", e),
- /// }
- /// ```
- ///
- /// # Note
- /// This function assumes that the VEP annotations are well-formed and contain
- /// the necessary fields for comparison (consequence, feature). It will log warnings
- /// for any parsing errors encountered during the selection process.
- pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
- let best_impact_veps = d
- .iter()
- .chunk_by(|vep| {
- vep.consequence.as_ref().map_or(VepImpact::MODIFIER, |c| {
- c.iter()
- .map(VepImpact::from)
- .min()
- .unwrap_or(VepImpact::MODIFIER)
- })
- })
- .into_iter()
- .min_by_key(|(impact, _)| impact.clone())
- .map(|(_, group)| group.cloned().collect::<Vec<_>>())
- .ok_or_else(|| anyhow!("No element in VEP vector"))?;
- if best_impact_veps.len() == 1 {
- return Ok(best_impact_veps[0].clone());
- }
- let parsed_veps = best_impact_veps
- .iter()
- .enumerate()
- .filter_map(|(i, vep)| {
- vep.feature.as_ref().and_then(|feat| {
- feat.parse::<NCBIAcc>()
- .map(|acc| (i, acc))
- .map_err(|e| warn!("Can't parse {}: {}", feat, e))
- .ok()
- })
- })
- .sorted_by_key(|(_, acc)| (Reverse(acc.prefix == "NM"), acc.number))
- .collect::<Vec<_>>();
- parsed_veps
- .first()
- .map(|(k, _)| best_impact_veps[*k].clone())
- .ok_or_else(|| anyhow!("No valid NCBI accession found"))
- }
|