use anyhow::anyhow; use hashbrown::HashMap; use itertools::Itertools; use log::{debug, warn}; use serde::{Deserialize, Serialize}; use std::{ cmp::{Ordering, Reverse}, io::{BufRead, BufReader}, process::{Command, Stdio}, str::FromStr, }; use super::ncbi::NCBIAcc; /// Represents a single line of output from the Variant Effect Predictor (VEP). /// /// This struct encapsulates all fields typically found in a VEP output line, /// providing a structured way to work with VEP results. #[derive(Debug, PartialEq, Serialize, Deserialize)] pub struct VepLine { /// The identifier of the input variant pub uploaded_variation: String, /// The genomic location of the variant pub location: String, /// The variant allele pub allele: String, /// The gene symbol or identifier pub gene: String, /// The affected feature (e.g., transcript ID) pub feature: String, /// The type of feature affected (e.g., Transcript, RegulatoryFeature) pub feature_type: String, /// The consequence of the variant pub consequence: String, /// The position of the variant in the cDNA pub cdna_position: String, /// The position of the variant in the CDS pub cds_position: String, /// The position of the variant in the protein pub protein_position: String, /// The amino acid change caused by the variant pub amino_acids: String, /// The codon change caused by the variant pub codons: String, /// Known identifiers for this variant pub existing_variation: String, /// Additional information in key=value format pub extra: String, } impl FromStr for VepLine { type Err = anyhow::Error; /// Parses a VEP output line into a VepLine struct. /// /// This method expects the input string to be a tab-separated line /// containing exactly 14 fields, as per standard VEP output format. /// /// # Arguments /// * `s` - A string slice representing a single line of VEP output /// /// # Returns /// * `Ok(VepLine)` if parsing is successful /// * `Err(anyhow::Error)` if the input doesn't contain the expected number of fields /// /// # Example /// ``` /// let vep_line = VepLine::from_str("").unwrap(); /// assert_eq!(vep_line.gene, "ENSG00000135744"); /// ``` fn from_str(s: &str) -> Result { let parts: Vec<&str> = s.split('\t').collect(); if parts.len() != 14 { return Err(anyhow!("Invalid number of fields in VEP line")); } Ok(VepLine { uploaded_variation: parts[0].to_string(), location: parts[1].to_string(), allele: parts[2].to_string(), gene: parts[3].to_string(), feature: parts[4].to_string(), feature_type: parts[5].to_string(), consequence: parts[6].to_string(), cdna_position: parts[7].to_string(), cds_position: parts[8].to_string(), protein_position: parts[9].to_string(), amino_acids: parts[10].to_string(), codons: parts[11].to_string(), existing_variation: parts[12].to_string(), extra: parts[13].to_string(), }) } } /// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant. /// /// This struct encapsulates various fields that describe the predicted effects /// of a variant on genes, transcripts, and proteins, as determined by VEP. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct VEP { /// The gene affected by the variant pub gene: Option, /// The specific feature (e.g., transcript) affected by the variant pub feature: Option, /// The type of feature (e.g., Transcript, RegulatoryFeature) pub feature_type: Option, /// The predicted consequence(s) of the variant pub consequence: Option>, /// The position of the variant in the cDNA sequence pub cdna_position: Option, /// The position of the variant in the coding sequence pub cds_position: Option, /// The position of the variant in the protein sequence pub protein_position: Option, /// The amino acid change caused by the variant pub amino_acids: Option, /// The codon change caused by the variant pub codons: Option, /// Known identifiers for this variant pub existing_variation: Option, /// Additional VEP information pub extra: VEPExtra, } /// Represents the predicted consequences of a genetic variant as determined by /// the Ensembl Variant Effect Predictor (VEP). /// /// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe /// the effects of variants on transcripts, proteins, and regulatory regions. /// /// The variants are generally ordered from most to least severe in terms of their /// potential impact on gene function. /// /// For more information, see: /// #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum VepConsequence { /// Complete destruction of a transcript TranscriptAblation, /// Variant at the splice acceptor site SpliceAcceptorVariant, /// Variant at the splice donor site SpliceDonorVariant, /// Variant causing a premature stop codon StopGained, /// Insertion or deletion causing a frameshift FrameshiftVariant, /// Variant causing loss of stop codon StopLost, /// Variant causing loss of start codon StartLost, /// Amplification of a transcript TranscriptAmplification, /// Insertion not causing a frameshift InframeInsertion, /// Deletion not causing a frameshift InframeDeletion, /// Variant causing an amino acid change MissenseVariant, /// Variant that changes protein structure ProteinAlteringVariant, /// Variant at the 5th base of a splice donor site SpliceDonor5thBaseVariant, /// Variant in the splice region SpliceRegionVariant, /// Variant in the splice donor region SpliceDonorRegionVariant, /// Variant in the polypyrimidine tract near splice sites SplicePolyrimidineTractVariant, /// Variant in the final, incomplete codon of a transcript IncompleteTerminalCodonVariant, /// Variant in start codon resulting in no change StartRetainedVariant, /// Variant in stop codon resulting in no change StopRetainedVariant, /// Variant not changing the encoded amino acid SynonymousVariant, /// Variant in coding sequence with indeterminate effect CodingSequenceVariant, /// Variant in mature miRNA MatureMiRnaVariant, /// Variant in 5' UTR FivePrimeUtrVariant, /// Variant in 3' UTR ThreePrimeUtrVariant, /// Variant in non-coding exon NonCodingTranscriptExonVariant, /// Variant in intron IntronVariant, /// Variant in transcript subject to nonsense-mediated decay NmdTranscriptVariant, /// Variant in non-coding transcript NonCodingTranscriptVariant, /// Variant upstream of a gene UpstreamGeneVariant, /// Variant downstream of a gene DownstreamGeneVariant, /// Deletion of a transcription factor binding site TfbsAblation, /// Amplification of a transcription factor binding site TfbsAmplification, /// Variant in a transcription factor binding site TfBindingSiteVariant, /// Deletion of a regulatory region RegulatoryRegionAblation, /// Amplification of a regulatory region RegulatoryRegionAmplification, /// Variant causing a feature to be extended FeatureElongation, /// Variant in a regulatory region RegulatoryRegionVariant, /// Variant causing a feature to be shortened FeatureTruncation, /// Variant in intergenic region IntergenicVariant, /// General sequence variant SequenceVariant, } /// Represents the severity of a variant's impact as predicted by the /// Variant Effect Predictor (VEP). /// /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER). #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum VepImpact { /// High impact variants are expected to have high (disruptive) impact in the protein, /// probably causing protein truncation, loss of function or triggering nonsense mediated decay. HIGH, /// Moderate impact variants are non-disruptive variants that might change protein effectiveness. MODERATE, /// Low impact variants are mostly harmless or unlikely to change protein behavior. LOW, /// Modifier variants are usually non-coding variants or variants affecting non-coding genes, /// where predictions are difficult or there is no evidence of impact. MODIFIER, } impl From for String { /// Converts a VepImpact enum variant to its string representation. fn from(value: VepImpact) -> Self { match value { VepImpact::HIGH => "High", VepImpact::MODERATE => "Moderate", VepImpact::LOW => "Low", VepImpact::MODIFIER => "Modifier", } .to_string() } } impl PartialOrd for VepImpact { /// Implements partial ordering for VepImpact. fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for VepImpact { /// Implements total ordering for VepImpact. /// /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER /// This allows for easy sorting where the most severe impacts come first. fn cmp(&self, other: &Self) -> Ordering { match (self, other) { (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal, (VepImpact::HIGH, _) => Ordering::Less, (VepImpact::MODERATE, VepImpact::HIGH) => Ordering::Greater, (VepImpact::MODERATE, VepImpact::MODERATE) => Ordering::Equal, (VepImpact::MODERATE, _) => Ordering::Less, (VepImpact::LOW, VepImpact::HIGH | VepImpact::MODERATE) => Ordering::Greater, (VepImpact::LOW, VepImpact::LOW) => Ordering::Equal, (VepImpact::LOW, VepImpact::MODIFIER) => Ordering::Less, (VepImpact::MODIFIER, VepImpact::MODIFIER) => Ordering::Equal, (VepImpact::MODIFIER, _) => Ordering::Greater, } } } impl FromStr for VepImpact { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "LOW" => Ok(VepImpact::LOW), "MODERATE" => Ok(VepImpact::MODERATE), "HIGH" => Ok(VepImpact::HIGH), "MODIFIER" => Ok(VepImpact::MODIFIER), _ => Err(anyhow!("Unexpected VEP Impact value")), } } } impl From<&VepConsequence> for VepImpact { fn from(consequence: &VepConsequence) -> Self { match consequence { VepConsequence::TranscriptAblation | VepConsequence::SpliceAcceptorVariant | VepConsequence::SpliceDonorVariant | VepConsequence::StopGained | VepConsequence::FrameshiftVariant | VepConsequence::StopLost | VepConsequence::StartLost | VepConsequence::TranscriptAmplification | VepConsequence::FeatureElongation | VepConsequence::FeatureTruncation => VepImpact::HIGH, VepConsequence::InframeInsertion | VepConsequence::InframeDeletion | VepConsequence::MissenseVariant | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE, VepConsequence::SpliceDonor5thBaseVariant | VepConsequence::SpliceRegionVariant | VepConsequence::SpliceDonorRegionVariant | VepConsequence::SplicePolyrimidineTractVariant | VepConsequence::IncompleteTerminalCodonVariant | VepConsequence::StartRetainedVariant | VepConsequence::StopRetainedVariant | VepConsequence::SynonymousVariant => VepImpact::LOW, VepConsequence::CodingSequenceVariant | VepConsequence::MatureMiRnaVariant | VepConsequence::FivePrimeUtrVariant | VepConsequence::ThreePrimeUtrVariant | VepConsequence::NonCodingTranscriptExonVariant | VepConsequence::IntronVariant | VepConsequence::NmdTranscriptVariant | VepConsequence::NonCodingTranscriptVariant | VepConsequence::UpstreamGeneVariant | VepConsequence::DownstreamGeneVariant | VepConsequence::TfbsAblation | VepConsequence::TfbsAmplification | VepConsequence::TfBindingSiteVariant | VepConsequence::RegulatoryRegionAblation | VepConsequence::RegulatoryRegionAmplification | VepConsequence::RegulatoryRegionVariant | VepConsequence::SequenceVariant | VepConsequence::IntergenicVariant => VepImpact::MODIFIER, } } } impl From for String { fn from(consequence: VepConsequence) -> Self { match consequence { VepConsequence::TranscriptAblation => "transcript_ablation".to_string(), VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(), VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(), VepConsequence::StopGained => "stop_gained".to_string(), VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(), VepConsequence::StopLost => "stop_lost".to_string(), VepConsequence::StartLost => "start_lost".to_string(), VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(), VepConsequence::InframeInsertion => "inframe_insertion".to_string(), VepConsequence::InframeDeletion => "inframe_deletion".to_string(), VepConsequence::MissenseVariant => "missense_variant".to_string(), VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(), VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(), VepConsequence::IncompleteTerminalCodonVariant => { "incomplete_terminal_codon_variant".to_string() } VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(), VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(), VepConsequence::SynonymousVariant => "synonymous_variant".to_string(), VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(), VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(), VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(), VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(), VepConsequence::NonCodingTranscriptExonVariant => { "non_coding_transcript_exon_variant".to_string() } VepConsequence::IntronVariant => "intron_variant".to_string(), VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(), VepConsequence::NonCodingTranscriptVariant => { "non_coding_transcript_variant".to_string() } VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(), VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(), VepConsequence::TfbsAblation => "TFBS_ablation".to_string(), VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(), VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(), VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(), VepConsequence::RegulatoryRegionAmplification => { "regulatory_region_amplification".to_string() } VepConsequence::FeatureElongation => "feature_elongation".to_string(), VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(), VepConsequence::FeatureTruncation => "feature_truncation".to_string(), VepConsequence::SpliceDonor5thBaseVariant => { "splice_donor_5th_base_variant".to_string() } VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(), VepConsequence::SplicePolyrimidineTractVariant => { "splice_polyrimidine_tract_variant".to_string() } VepConsequence::SequenceVariant => "sequence_variant".to_string(), VepConsequence::IntergenicVariant => "intergenic_variant".to_string(), } } } impl FromStr for VepConsequence { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { match s { "transcript_ablation" => Ok(VepConsequence::TranscriptAblation), "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant), "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant), "stop_gained" => Ok(VepConsequence::StopGained), "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant), "stop_lost" => Ok(VepConsequence::StopLost), "start_lost" => Ok(VepConsequence::StartLost), "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification), "feature_elongation" => Ok(VepConsequence::FeatureElongation), "feature_truncation" => Ok(VepConsequence::FeatureTruncation), "inframe_insertion" => Ok(VepConsequence::InframeInsertion), "inframe_deletion" => Ok(VepConsequence::InframeDeletion), "missense_variant" => Ok(VepConsequence::MissenseVariant), "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant), "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant), "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant), "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant), "splice_polypyrimidine_tract_variant" => { Ok(VepConsequence::SplicePolyrimidineTractVariant) } "incomplete_terminal_codon_variant" => { Ok(VepConsequence::IncompleteTerminalCodonVariant) } "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant), "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant), "synonymous_variant" => Ok(VepConsequence::SynonymousVariant), "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant), "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant), "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant), "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant), "non_coding_transcript_exon_variant" => { Ok(VepConsequence::NonCodingTranscriptExonVariant) } "intron_variant" => Ok(VepConsequence::IntronVariant), "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant), "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant), "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant), "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant), "TFBS_ablation" => Ok(VepConsequence::TfbsAblation), "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification), "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant), "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation), "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification), "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant), "intergenic_variant" => Ok(VepConsequence::IntergenicVariant), "sequence_variant" => Ok(VepConsequence::SequenceVariant), _ => Err(anyhow!("Unknown VepConsequence: {s}")), } } } impl TryFrom<&VepLine> for VEP { type Error = anyhow::Error; /// Attempts to convert a VepLine reference into a VEP struct. /// /// This implementation provides a way to transform the raw VEP output line /// (represented by VepLine) into a more structured and typed VEP representation. /// /// # Arguments /// * `d` - A reference to a VepLine struct /// /// # Returns /// * `Ok(VEP)` if the conversion is successful /// * `Err(anyhow::Error)` if any parsing errors occur /// /// # Behavior /// - Converts "-" strings to None for optional fields /// - Parses the consequence field into a `Vec` /// - Parses the extra field into a VEPExtra struct /// /// # Example /// ``` /// let vep_line = VepLine { /* ... */ }; /// match VEP::try_from(&vep_line) { /// Ok(vep) => println!("Converted VEP: {:?}", vep), /// Err(e) => eprintln!("Conversion failed: {}", e), /// } /// ``` fn try_from(d: &VepLine) -> anyhow::Result { let or_opt = |s: &str| match s { "-" => None, _ => Some(s.to_string()), }; let consequence = or_opt(&d.consequence).map(|c| { c.split(',') .filter_map(|e| e.parse::().ok()) .collect::>() }); Ok(VEP { gene: or_opt(&d.gene), feature: or_opt(&d.feature), feature_type: or_opt(&d.feature_type), consequence, cdna_position: or_opt(&d.feature_type), cds_position: or_opt(&d.cds_position), protein_position: or_opt(&d.protein_position), amino_acids: or_opt(&d.amino_acids), codons: or_opt(&d.codons), existing_variation: or_opt(&d.existing_variation), extra: d.extra.parse()?, }) } } /// Represents additional information from Variant Effect Predictor (VEP) annotations. /// /// This struct encapsulates various optional fields that provide extra context /// about a variant's predicted effect. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct VEPExtra { /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER) pub impact: Option, /// The gene symbol associated with the variant pub symbol: Option, /// The distance to the feature (e.g., for upstream/downstream variants) pub distance: Option, /// HGVS notation describing the variant at the coding sequence level pub hgvs_c: Option, /// HGVS notation describing the variant at the protein level pub hgvs_p: Option, } impl FromStr for VEPExtra { type Err = anyhow::Error; /// Parses a VEPExtra instance from a string representation. /// /// This method interprets a semicolon-separated key-value string typically /// found in VEP output and constructs a VEPExtra struct from it. /// /// # Arguments /// * `s` - A string slice containing the VEP extra information /// /// # Returns /// * `Ok(VEPExtra)` if parsing is successful /// * `Err(anyhow::Error)` if parsing fails /// /// # Example /// ``` /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?"); /// assert!(vep_extra.is_ok()); /// ``` /// /// # Note /// This method is flexible and will not fail if some fields are missing. /// It will simply set those fields to None in the resulting struct. fn from_str(s: &str) -> anyhow::Result { let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect(); let impact = kv.get("IMPACT").map(|&v| v.parse()).transpose()?; let symbol = kv.get("SYMBOL").map(ToString::to_string); let distance = kv.get("DISTANCE").map(|&v| v.parse()).transpose()?; let hgvs_c = kv.get("HGVSc").map(ToString::to_string); let hgvs_p = kv.get("HGVSp").map(ToString::to_string); Ok(VEPExtra { impact, symbol, distance, hgvs_c, hgvs_p, }) } } /// Runs the Variant Effect Predictor (VEP) on a given input file and outputs the results. /// /// This function executes the VEP tool with specific parameters to annotate genetic variants. /// It uses a predefined set of reference data and plugins to enhance the annotation process. /// /// # Arguments /// * `in_path` - A string slice that holds the path to the input file /// * `out_path` - A string slice that holds the path where the output should be written /// /// # Returns /// * `Ok(())` if VEP runs successfully /// * `Err(anyhow::Error)` if there's an error during execution /// /// # Configuration /// The function uses hardcoded paths for: /// - VEP binary directory /// - VEP cache directory /// - Reference FASTA file /// - GFF annotation file /// /// # VEP Parameters /// - Uses offline cache /// - Includes symbol information /// - Applies SpliceRegion and Downstream plugins /// - Generates HGVS notations /// /// # Error Handling /// - Logs any lines containing "error" from VEP's stderr output as warnings /// - Returns an error if the VEP process fails to complete /// /// # Example /// ``` /// match run_vep("input.vcf", "output.vcf") { /// Ok(()) => println!("VEP annotation completed successfully"), /// Err(e) => eprintln!("VEP annotation failed: {}", e), /// } /// ``` /// /// # Note /// Ensure that the VEP tool and all necessary reference data are correctly installed /// and accessible at the specified paths before running this function. pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> { // VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins debug!("Run VEP for {in_path} and ouput {out_path}"); let bin_dir = "/data/tools/ensembl-vep"; let dir_cache = "/data/ref/hs1/vepcache/"; let fasta = "/data/ref/hs1/chm13v2.0.fa"; let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz"; let mut cmd = Command::new(format!("{}/vep", bin_dir)) .arg("--dir_cache") .arg(dir_cache) .arg("--cache") .arg("--offline") .arg("--fasta") .arg(fasta) .arg("--gff") .arg(gff) .arg("--symbol") .arg("--plugin") .arg("SpliceRegion") .arg("--plugin") .arg("Downstream") .arg("--hgvs") .arg("-i") .arg(in_path) .arg("-o") .arg(out_path) .stderr(Stdio::piped()) .spawn() .expect("VEP failed to start"); let stderr = cmd.stderr.take().unwrap(); let reader = BufReader::new(stderr); reader .lines() .map_while(Result::ok) // .inspect(|y| println!("{y}")) .filter(|line| line.contains("error")) .for_each(|line| warn!("{}", line)); cmd.wait()?; Ok(()) } /// Selects the best Variant Effect Predictor (VEP) annotation from a slice of VEP annotations. /// /// This function implements a prioritization algorithm to choose the most relevant VEP annotation /// based on impact severity and transcript preference. /// /// # Arguments /// * `d` - A slice of VEP annotations /// /// # Returns /// * `Ok(VEP)` containing the best VEP annotation if successful /// * `Err(anyhow::Error)` if no valid VEP annotation can be selected /// /// # Algorithm /// 1. Groups VEPs by their impact (HIGH > MODERATE > LOW > MODIFIER) /// 2. Selects the group with the highest impact /// 3. If there's only one VEP in this group, it's returned /// 4. Otherwise, further filtering is done based on transcript identifiers: /// - Parses NCBI accessions from the feature field /// - Prioritizes 'NM' prefixed accessions over others /// - Sorts by accession number (higher numbers are preferred) /// /// # Errors /// This function will return an error if: /// * The input slice is empty /// * No valid NCBI accession can be parsed from the selected VEPs /// /// # Example /// ``` /// let veps = vec![VEP { /* ... */ }, VEP { /* ... */ }]; /// match get_best_vep(&veps) { /// Ok(best_vep) => println!("Best VEP: {:?}", best_vep), /// Err(e) => eprintln!("Failed to get best VEP: {}", e), /// } /// ``` /// /// # Note /// This function assumes that the VEP annotations are well-formed and contain /// the necessary fields for comparison (consequence, feature). It will log warnings /// for any parsing errors encountered during the selection process. pub fn get_best_vep(d: &[VEP]) -> anyhow::Result { let best_impact_veps = d .iter() .chunk_by(|vep| { vep.consequence.as_ref().map_or(VepImpact::MODIFIER, |c| { c.iter() .map(VepImpact::from) .min() .unwrap_or(VepImpact::MODIFIER) }) }) .into_iter() .min_by_key(|(impact, _)| impact.clone()) .map(|(_, group)| group.cloned().collect::>()) .ok_or_else(|| anyhow!("No element in VEP vector"))?; if best_impact_veps.len() == 1 { return Ok(best_impact_veps[0].clone()); } let parsed_veps = best_impact_veps .iter() .enumerate() .filter_map(|(i, vep)| { vep.feature.as_ref().and_then(|feat| { feat.parse::() .map(|acc| (i, acc)) .map_err(|e| warn!("Can't parse {}: {}", feat, e)) .ok() }) }) .sorted_by_key(|(_, acc)| (Reverse(acc.prefix == "NM"), acc.number)) .collect::>(); parsed_veps .first() .map(|(k, _)| best_impact_veps[*k].clone()) .ok_or_else(|| anyhow!("No valid NCBI accession found")) }