use anyhow::anyhow; use bitcode::{Decode, Encode}; use hashbrown::HashMap; use itertools::Itertools; use log::warn; use serde::{Deserialize, Serialize}; use std::{ cmp::{Ordering, Reverse}, fmt::Display, path::{Path, PathBuf}, str::FromStr, }; use crate::{ commands::{Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner}, config::Config, helpers::singularity_bind_flags, }; use super::ncbi::NCBIAcc; /// Represents a single line of output from the Variant Effect Predictor (VEP). /// /// This struct encapsulates all fields typically found in a VEP output line, /// providing a structured way to work with VEP results. #[derive(Debug, PartialEq, Serialize, Deserialize)] pub struct VepLine { /// The identifier of the input variant pub uploaded_variation: String, /// The genomic location of the variant pub location: String, /// The variant allele pub allele: String, /// The gene symbol or identifier pub gene: String, /// The affected feature (e.g., transcript ID) pub feature: String, /// The type of feature affected (e.g., Transcript, RegulatoryFeature) pub feature_type: String, /// The consequence of the variant pub consequence: String, /// The position of the variant in the cDNA pub cdna_position: String, /// The position of the variant in the CDS pub cds_position: String, /// The position of the variant in the protein pub protein_position: String, /// The amino acid change caused by the variant pub amino_acids: String, /// The codon change caused by the variant pub codons: String, /// Known identifiers for this variant pub existing_variation: String, /// Additional information in key=value format pub extra: String, } impl FromStr for VepLine { type Err = anyhow::Error; /// Parses a VEP output line into a VepLine struct. /// /// This method expects the input string to be a tab-separated line /// containing exactly 14 fields, as per standard VEP output format. /// /// # Arguments /// * `s` - A string slice representing a single line of VEP output /// /// # Returns /// * `Ok(VepLine)` if parsing is successful /// * `Err(anyhow::Error)` if the input doesn't contain the expected number of fields /// /// # Example /// ``` /// let vep_line = VepLine::from_str("").unwrap(); /// assert_eq!(vep_line.gene, "ENSG00000135744"); /// ``` fn from_str(s: &str) -> Result { let parts: Vec<&str> = s.split('\t').collect(); if parts.len() != 14 { return Err(anyhow!("Invalid number of fields in VEP line")); } Ok(VepLine { uploaded_variation: parts[0].to_string(), location: parts[1].to_string(), allele: parts[2].to_string(), gene: parts[3].to_string(), feature: parts[4].to_string(), feature_type: parts[5].to_string(), consequence: parts[6].to_string(), cdna_position: parts[7].to_string(), cds_position: parts[8].to_string(), protein_position: parts[9].to_string(), amino_acids: parts[10].to_string(), codons: parts[11].to_string(), existing_variation: parts[12].to_string(), extra: parts[13].to_string(), }) } } /// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant. /// /// This struct encapsulates various fields that describe the predicted effects /// of a variant on genes, transcripts, and proteins, as determined by VEP. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)] pub struct VEP { /// The gene affected by the variant pub gene: Option, /// The specific feature (e.g., transcript) affected by the variant pub feature: Option, /// The type of feature (e.g., Transcript, RegulatoryFeature) pub feature_type: Option, /// The predicted consequence(s) of the variant pub consequence: Option>, /// The position of the variant in the cDNA sequence pub cdna_position: Option, /// The position of the variant in the coding sequence pub cds_position: Option, /// The position of the variant in the protein sequence pub protein_position: Option, /// The amino acid change caused by the variant pub amino_acids: Option, /// The codon change caused by the variant pub codons: Option, /// Known identifiers for this variant pub existing_variation: Option, /// Additional VEP information pub extra: VEPExtra, } impl VEP { pub fn impact(&self) -> Option { self.extra.impact.clone() } pub fn gene(&self) -> Option { self.extra.symbol.clone() } pub fn gene_distance(&self) -> Option { self.extra.distance } pub fn feature(&self) -> Option { self.feature.clone() } pub fn consequences_str(&self) -> Vec { match &self.consequence { Some(csq) => csq.iter().map(|s| s.to_string()).collect(), None => Vec::new(), } } pub fn hgvs(&self) -> Option { if self.extra.hgvs_p.is_some() { return self.extra.hgvs_p.clone(); } if self.extra.hgvs_c.is_some() { return self.extra.hgvs_c.clone(); } None } } /// Represents the predicted consequences of a genetic variant as determined by /// the Ensembl Variant Effect Predictor (VEP). /// /// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe /// the effects of variants on transcripts, proteins, and regulatory regions. /// /// The variants are generally ordered from most to least severe in terms of their /// potential impact on gene function. /// /// For more information, see: /// #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)] pub enum VepConsequence { /// Complete destruction of a transcript TranscriptAblation, /// Variant at the splice acceptor site SpliceAcceptorVariant, /// Variant at the splice donor site SpliceDonorVariant, /// Variant causing a premature stop codon StopGained, /// Insertion or deletion causing a frameshift FrameshiftVariant, /// Variant causing loss of stop codon StopLost, /// Variant causing loss of start codon StartLost, /// Amplification of a transcript TranscriptAmplification, /// Insertion not causing a frameshift InframeInsertion, /// Deletion not causing a frameshift InframeDeletion, /// Variant causing an amino acid change MissenseVariant, /// Variant that changes protein structure ProteinAlteringVariant, /// Variant at the 5th base of a splice donor site SpliceDonor5thBaseVariant, /// Variant in the splice region SpliceRegionVariant, /// Variant in the splice donor region SpliceDonorRegionVariant, /// Variant in the polypyrimidine tract near splice sites SplicePolyrimidineTractVariant, /// Variant in the final, incomplete codon of a transcript IncompleteTerminalCodonVariant, /// Variant in start codon resulting in no change StartRetainedVariant, /// Variant in stop codon resulting in no change StopRetainedVariant, /// Variant not changing the encoded amino acid SynonymousVariant, /// Variant in coding sequence with indeterminate effect CodingSequenceVariant, /// Variant in mature miRNA MatureMiRnaVariant, /// Variant in 5' UTR FivePrimeUtrVariant, /// Variant in 3' UTR ThreePrimeUtrVariant, /// Variant in non-coding exon NonCodingTranscriptExonVariant, /// Variant in intron IntronVariant, /// Variant in transcript subject to nonsense-mediated decay NmdTranscriptVariant, /// Variant in non-coding transcript NonCodingTranscriptVariant, /// Variant upstream of a gene UpstreamGeneVariant, /// Variant downstream of a gene DownstreamGeneVariant, /// Deletion of a transcription factor binding site TfbsAblation, /// Amplification of a transcription factor binding site TfbsAmplification, /// Variant in a transcription factor binding site TfBindingSiteVariant, /// Deletion of a regulatory region RegulatoryRegionAblation, /// Amplification of a regulatory region RegulatoryRegionAmplification, /// Variant causing a feature to be extended FeatureElongation, /// Variant in a regulatory region RegulatoryRegionVariant, /// Variant causing a feature to be shortened FeatureTruncation, /// Variant in intergenic region IntergenicVariant, /// General sequence variant SequenceVariant, } /// Represents the severity of a variant's impact as predicted by the /// Variant Effect Predictor (VEP). /// /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER). #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)] pub enum VepImpact { /// High impact variants are expected to have high (disruptive) impact in the protein, /// probably causing protein truncation, loss of function or triggering nonsense mediated decay. HIGH, /// Moderate impact variants are non-disruptive variants that might change protein effectiveness. MODERATE, /// Low impact variants are mostly harmless or unlikely to change protein behavior. LOW, /// Modifier variants are usually non-coding variants or variants affecting non-coding genes, /// where predictions are difficult or there is no evidence of impact. MODIFIER, } impl Display for VepImpact { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{}", match self { VepImpact::HIGH => "HIGH", VepImpact::MODERATE => "MODERATE", VepImpact::LOW => "LOW", VepImpact::MODIFIER => "MODIFIER", } ) } } impl From for String { /// Converts a VepImpact enum variant to its string representation. fn from(value: VepImpact) -> Self { match value { VepImpact::HIGH => "High", VepImpact::MODERATE => "Moderate", VepImpact::LOW => "Low", VepImpact::MODIFIER => "Modifier", } .to_string() } } impl PartialOrd for VepImpact { /// Implements partial ordering for VepImpact. fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for VepImpact { /// Implements total ordering for VepImpact. /// /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER /// This allows for easy sorting where the most severe impacts come first. fn cmp(&self, other: &Self) -> Ordering { match (self, other) { (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal, (VepImpact::HIGH, _) => Ordering::Less, (VepImpact::MODERATE, VepImpact::HIGH) => Ordering::Greater, (VepImpact::MODERATE, VepImpact::MODERATE) => Ordering::Equal, (VepImpact::MODERATE, _) => Ordering::Less, (VepImpact::LOW, VepImpact::HIGH | VepImpact::MODERATE) => Ordering::Greater, (VepImpact::LOW, VepImpact::LOW) => Ordering::Equal, (VepImpact::LOW, VepImpact::MODIFIER) => Ordering::Less, (VepImpact::MODIFIER, VepImpact::MODIFIER) => Ordering::Equal, (VepImpact::MODIFIER, _) => Ordering::Greater, } } } impl FromStr for VepImpact { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "LOW" => Ok(VepImpact::LOW), "MODERATE" => Ok(VepImpact::MODERATE), "HIGH" => Ok(VepImpact::HIGH), "MODIFIER" => Ok(VepImpact::MODIFIER), _ => Err(anyhow!("Unexpected VEP Impact value")), } } } impl From<&VepConsequence> for VepImpact { fn from(consequence: &VepConsequence) -> Self { match consequence { VepConsequence::TranscriptAblation | VepConsequence::SpliceAcceptorVariant | VepConsequence::SpliceDonorVariant | VepConsequence::StopGained | VepConsequence::FrameshiftVariant | VepConsequence::StopLost | VepConsequence::StartLost | VepConsequence::TranscriptAmplification | VepConsequence::FeatureElongation | VepConsequence::FeatureTruncation => VepImpact::HIGH, VepConsequence::InframeInsertion | VepConsequence::InframeDeletion | VepConsequence::MissenseVariant | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE, VepConsequence::SpliceDonor5thBaseVariant | VepConsequence::SpliceRegionVariant | VepConsequence::SpliceDonorRegionVariant | VepConsequence::SplicePolyrimidineTractVariant | VepConsequence::IncompleteTerminalCodonVariant | VepConsequence::StartRetainedVariant | VepConsequence::StopRetainedVariant | VepConsequence::SynonymousVariant => VepImpact::LOW, VepConsequence::CodingSequenceVariant | VepConsequence::MatureMiRnaVariant | VepConsequence::FivePrimeUtrVariant | VepConsequence::ThreePrimeUtrVariant | VepConsequence::NonCodingTranscriptExonVariant | VepConsequence::IntronVariant | VepConsequence::NmdTranscriptVariant | VepConsequence::NonCodingTranscriptVariant | VepConsequence::UpstreamGeneVariant | VepConsequence::DownstreamGeneVariant | VepConsequence::TfbsAblation | VepConsequence::TfbsAmplification | VepConsequence::TfBindingSiteVariant | VepConsequence::RegulatoryRegionAblation | VepConsequence::RegulatoryRegionAmplification | VepConsequence::RegulatoryRegionVariant | VepConsequence::SequenceVariant | VepConsequence::IntergenicVariant => VepImpact::MODIFIER, } } } impl From for String { fn from(consequence: VepConsequence) -> Self { match consequence { VepConsequence::TranscriptAblation => "transcript_ablation".to_string(), VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(), VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(), VepConsequence::StopGained => "stop_gained".to_string(), VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(), VepConsequence::StopLost => "stop_lost".to_string(), VepConsequence::StartLost => "start_lost".to_string(), VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(), VepConsequence::InframeInsertion => "inframe_insertion".to_string(), VepConsequence::InframeDeletion => "inframe_deletion".to_string(), VepConsequence::MissenseVariant => "missense_variant".to_string(), VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(), VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(), VepConsequence::IncompleteTerminalCodonVariant => { "incomplete_terminal_codon_variant".to_string() } VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(), VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(), VepConsequence::SynonymousVariant => "synonymous_variant".to_string(), VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(), VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(), VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(), VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(), VepConsequence::NonCodingTranscriptExonVariant => { "non_coding_transcript_exon_variant".to_string() } VepConsequence::IntronVariant => "intron_variant".to_string(), VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(), VepConsequence::NonCodingTranscriptVariant => { "non_coding_transcript_variant".to_string() } VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(), VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(), VepConsequence::TfbsAblation => "TFBS_ablation".to_string(), VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(), VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(), VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(), VepConsequence::RegulatoryRegionAmplification => { "regulatory_region_amplification".to_string() } VepConsequence::FeatureElongation => "feature_elongation".to_string(), VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(), VepConsequence::FeatureTruncation => "feature_truncation".to_string(), VepConsequence::SpliceDonor5thBaseVariant => { "splice_donor_5th_base_variant".to_string() } VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(), VepConsequence::SplicePolyrimidineTractVariant => { "splice_polyrimidine_tract_variant".to_string() } VepConsequence::SequenceVariant => "sequence_variant".to_string(), VepConsequence::IntergenicVariant => "intergenic_variant".to_string(), } } } impl Display for VepConsequence { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", String::from(self.clone())) } } impl FromStr for VepConsequence { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "transcript_ablation" => Ok(VepConsequence::TranscriptAblation), "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant), "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant), "stop_gained" => Ok(VepConsequence::StopGained), "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant), "stop_lost" => Ok(VepConsequence::StopLost), "start_lost" => Ok(VepConsequence::StartLost), "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification), "feature_elongation" => Ok(VepConsequence::FeatureElongation), "feature_truncation" => Ok(VepConsequence::FeatureTruncation), "inframe_insertion" => Ok(VepConsequence::InframeInsertion), "inframe_deletion" => Ok(VepConsequence::InframeDeletion), "missense_variant" => Ok(VepConsequence::MissenseVariant), "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant), "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant), "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant), "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant), "splice_polypyrimidine_tract_variant" => { Ok(VepConsequence::SplicePolyrimidineTractVariant) } "incomplete_terminal_codon_variant" => { Ok(VepConsequence::IncompleteTerminalCodonVariant) } "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant), "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant), "synonymous_variant" => Ok(VepConsequence::SynonymousVariant), "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant), "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant), "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant), "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant), "non_coding_transcript_exon_variant" => { Ok(VepConsequence::NonCodingTranscriptExonVariant) } "intron_variant" => Ok(VepConsequence::IntronVariant), "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant), "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant), "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant), "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant), "TFBS_ablation" => Ok(VepConsequence::TfbsAblation), "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification), "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant), "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation), "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification), "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant), "intergenic_variant" => Ok(VepConsequence::IntergenicVariant), "sequence_variant" => Ok(VepConsequence::SequenceVariant), _ => Err(anyhow!("Unknown VepConsequence: {s}")), } } } impl TryFrom<&VepLine> for VEP { type Error = anyhow::Error; /// Attempts to convert a VepLine reference into a VEP struct. /// /// This implementation provides a way to transform the raw VEP output line /// (represented by VepLine) into a more structured and typed VEP representation. /// /// # Arguments /// * `d` - A reference to a VepLine struct /// /// # Returns /// * `Ok(VEP)` if the conversion is successful /// * `Err(anyhow::Error)` if any parsing errors occur /// /// # Behavior /// - Converts "-" strings to None for optional fields /// - Parses the consequence field into a `Vec` /// - Parses the extra field into a VEPExtra struct /// /// # Example /// ``` /// let vep_line = VepLine { /* ... */ }; /// match VEP::try_from(&vep_line) { /// Ok(vep) => println!("Converted VEP: {:?}", vep), /// Err(e) => eprintln!("Conversion failed: {}", e), /// } /// ``` fn try_from(d: &VepLine) -> anyhow::Result { let or_opt = |s: &str| match s { "-" => None, _ => Some(s.to_string()), }; let consequence = or_opt(&d.consequence).map(|c| { c.split(',') .filter_map(|e| e.parse::().ok()) .collect::>() }); Ok(VEP { gene: or_opt(&d.gene), feature: or_opt(&d.feature), feature_type: or_opt(&d.feature_type), consequence, cdna_position: or_opt(&d.feature_type), cds_position: or_opt(&d.cds_position), protein_position: or_opt(&d.protein_position), amino_acids: or_opt(&d.amino_acids), codons: or_opt(&d.codons), existing_variation: or_opt(&d.existing_variation), extra: d.extra.parse()?, }) } } /// Represents additional information from Variant Effect Predictor (VEP) annotations. /// /// This struct encapsulates various optional fields that provide extra context /// about a variant's predicted effect. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)] pub struct VEPExtra { /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER) pub impact: Option, /// The gene symbol associated with the variant pub symbol: Option, /// The distance to the feature (e.g., for upstream/downstream variants) pub distance: Option, /// HGVS notation describing the variant at the coding sequence level pub hgvs_c: Option, /// HGVS notation describing the variant at the protein level pub hgvs_p: Option, } impl FromStr for VEPExtra { type Err = anyhow::Error; /// Parses a VEPExtra instance from a string representation. /// /// This method interprets a semicolon-separated key-value string typically /// found in VEP output and constructs a VEPExtra struct from it. /// /// # Arguments /// * `s` - A string slice containing the VEP extra information /// /// # Returns /// * `Ok(VEPExtra)` if parsing is successful /// * `Err(anyhow::Error)` if parsing fails /// /// # Example /// ``` /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?"); /// assert!(vep_extra.is_ok()); /// ``` /// /// # Note /// This method is flexible and will not fail if some fields are missing. /// It will simply set those fields to None in the resulting struct. fn from_str(s: &str) -> anyhow::Result { let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect(); let impact = kv.get("IMPACT").map(|&v| v.parse()).transpose()?; let symbol = kv.get("SYMBOL").map(ToString::to_string); let distance = kv.get("DISTANCE").map(|&v| v.parse()).transpose()?; let hgvs_c = kv.get("HGVSc").map(ToString::to_string); let hgvs_p = kv.get("HGVSp").map(ToString::to_string); Ok(VEPExtra { impact, symbol, distance, hgvs_c, hgvs_p, }) } } #[derive(Debug)] pub struct VepJob { in_vcf: PathBuf, out_vcf: PathBuf, config: Config, } impl VepJob { pub fn new(in_path: impl AsRef, out_path: impl AsRef, config: &Config) -> Self { VepJob { in_vcf: in_path.as_ref().to_path_buf(), out_vcf: out_path.as_ref().to_path_buf(), config: config.clone(), } } } impl JobCommand for VepJob { fn cmd(&self) -> String { let bind_flags = singularity_bind_flags([ &self.in_vcf.to_string_lossy().to_string(), &self.out_vcf.to_string_lossy().to_string(), &self.config.vep_gff, &self.config.vep_cache_dir, &self.config.reference, ]); let plugins = vec!["SpliceRegion", "Downstream"] .into_iter() .map(|e| format!("--plugin {e}")) .collect::>() .join(" "); format!( "{singularity_bin} exec \ {binds} \ {image} vep \ --dir_cache {dir_cache} \ --cache --offline \ --fasta {fasta} \ --gff {gff} \ --symbol \ {plugins} \ --hgvs \ -i {input} \ -o {output}", singularity_bin = self.config.singularity_bin, binds = bind_flags, image = self.config.vep_image, dir_cache = self.config.vep_cache_dir, fasta = self.config.reference, gff = self.config.vep_gff, plugins = plugins, input = self.in_vcf.to_string_lossy(), output = self.out_vcf.to_string_lossy(), ) } } impl LocalRunner for VepJob {} impl LocalBatchRunner for VepJob {} impl SbatchRunner for VepJob { fn slurm_params(&self) -> crate::commands::SlurmParams { crate::commands::SlurmParams { job_name: Some("VEP".into()), partition: Some("shortq".into()), cpus_per_task: Some(10), mem: Some("10G".into()), gres: None, } } } // impl Run for VepJob { // fn run(&mut self) -> anyhow::Result<()> { // let report = run!(&self.config, self).with_context(|| format!("Failed to filter PASS for {}", self.id))?; // report // .save_to_file(format!("{}/bcftools_pass_", self.config.log_dir)) // .context("Can't save bcftools PASS logs")?;; // Ok(()) // } // // add code here // } /// Selects the "best" VEP annotation from a list, based on biological impact and transcript priority. /// /// # Selection Strategy /// 1. **Group by minimal VEP impact** (e.g., `HIGH`, `MODERATE`, etc.). /// The lowest-impact group is selected (e.g., `HIGH` preferred over `MODIFIER`). /// /// 2. **If only one annotation in the best-impact group**, return it immediately. /// /// 3. **If multiple remain**, rank by: /// - Transcript type: prefer `"NM"` (manually curated) over others /// - Accession number (lower is better) /// /// # Arguments /// * `d` - A slice of `VEP` annotations to choose from. /// /// # Returns /// * `Ok(VEP)` - The best annotation according to the rules above /// * `Err(anyhow::Error)` - If the input is empty or contains no valid features /// /// # Errors /// * Returns an error if: /// - `d` is empty /// - None of the best-impact VEPs have a parsable `feature` field pub fn get_best_vep(d: &[VEP]) -> anyhow::Result { // Step 1: Group by minimal VEP impact let best_impact_veps = d .iter() .chunk_by(|vep| { vep.consequence.as_ref().map_or(VepImpact::MODIFIER, |c| { c.iter() .map(VepImpact::from) .min() .unwrap_or(VepImpact::MODIFIER) }) }) .into_iter() .min_by_key(|(impact, _)| impact.clone()) .map(|(_, group)| group.cloned().collect::>()) .ok_or_else(|| anyhow::anyhow!("No element in VEP vector"))?; // Step 2: If only one, return it if best_impact_veps.len() == 1 { return Ok(best_impact_veps[0].clone()); } // Step 3: Rank by NCBI accession priority let parsed_veps = best_impact_veps .iter() .enumerate() .filter_map(|(i, vep)| { vep.feature.as_ref().and_then(|feat| { feat.parse::() .map(|acc| (i, acc)) .map_err(|e| warn!("Can't parse NCBI accession '{}': {}", feat, e)) .ok() }) }) .sorted_by_key(|(_, acc)| (Reverse(acc.prefix == "NM"), acc.number)) .collect::>(); // Pick the top-ranked transcript parsed_veps .first() .map(|(k, _)| best_impact_veps[*k].clone()) .ok_or_else(|| anyhow::anyhow!("No valid NCBI accession found")) } #[cfg(test)] mod tests { use super::*; use crate::{helpers::test_init, run}; #[test] fn vep_run() -> anyhow::Result<()> { test_init(); let config = Config::default(); let mut vep_job = VepJob { in_vcf: "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/diag/ClairS/DUMCO_diag_clairs_PASSED.vcf.gz".into(), out_vcf: "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/diag/TEST_clairs_PASSED_VEP.vcf".into(), config: config.clone() }; let r = run!(config, &mut vep_job)?; println!("{r:#?}"); Ok(()) } }