Thomas 1 ano atrás
pai
commit
9008256d21
1 arquivos alterados com 228 adições e 64 exclusões
  1. 228 64
      src/annotation/vep.rs

+ 228 - 64
src/annotation/vep.rs

@@ -1,4 +1,4 @@
-use anyhow::{anyhow, Context, Ok, Result};
+use anyhow::{anyhow, Context};
 use csv::ReaderBuilder;
 use hashbrown::HashMap;
 use log::warn;
@@ -39,7 +39,7 @@ pub struct VEP {
     pub gene: Option<String>,
     pub feature: Option<String>,
     pub feature_type: Option<String>,
-    pub consequence: Option<Vec<String>>,
+    pub consequence: Option<Vec<VepConsequence>>,
     pub cdna_position: Option<String>,
     pub cds_position: Option<String>,
     pub protein_position: Option<String>,
@@ -50,55 +50,219 @@ pub struct VEP {
 }
 
 // ensembl.org/info/genome/variation/prediction/predicted_data.html
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] // one variant per consequence term string handled by the FromStr / From<VepConsequence> impls in this file
 pub enum VepConsequence {
-    Transcript_ablation,
-    Splice_acceptor_variant,
-    Splice_donor_variant,
-    Stop_gained,
-    Frameshift_variant,
-    Stop_lost,
-    Start_lost,
-    Transcript_amplification,
-    Inframe_insertion,
-    Inframe_deletion,
-    Missense_variant,
-    Protein_altering_variant,
-    Splice_region_variant,
-    Incomplete_terminal_codon_variant,
-    Start_retained_variant,
-    Stop_retained_variant,
-    Synonymous_variant,
-    Coding_sequence_variant,
-    Mature_miRNA_variant,
-    Five_prime_UTR_variant,
-    Three_prime_UTR_variant,
-    Non_coding_transcript_exon_variant,
-    Intron_variant,
-    NMD_transcript_variant,
-    Non_coding_transcript_variant,
-    Upstream_gene_variant,
-    Downstream_gene_variant,
-    TFBS_ablation,
-    TFBS_amplification,
-    TF_binding_site_variant,
-    Regulatory_region_ablation,
-    Regulatory_region_amplification,
-    Feature_elongation,
-    Regulatory_region_variant,
-    Feature_truncation,
-    Intergenic_variant,
+    TranscriptAblation,
+    SpliceAcceptorVariant,
+    SpliceDonorVariant,
+    StopGained,
+    FrameshiftVariant,
+    StopLost,
+    StartLost,
+    TranscriptAmplification,
+    InframeInsertion,
+    InframeDeletion,
+    MissenseVariant,
+    ProteinAlteringVariant,
+    SpliceDonor5thBaseVariant,
+    SpliceRegionVariant,
+    SpliceDonorRegionVariant,
+    SplicePolyrimidineTractVariant, // NOTE(review): identifier drops the "py" of "polypyrimidine"; FromStr maps it from "splice_polypyrimidine_tract_variant"
+    IncompleteTerminalCodonVariant,
+    StartRetainedVariant,
+    StopRetainedVariant,
+    SynonymousVariant,
+    CodingSequenceVariant,
+    MatureMiRnaVariant, // term string keeps its original casing: "mature_miRNA_variant"
+    FivePrimeUtrVariant, // term string starts with a digit: "5_prime_UTR_variant"
+    ThreePrimeUtrVariant, // term string starts with a digit: "3_prime_UTR_variant"
+    NonCodingTranscriptExonVariant,
+    IntronVariant,
+    NmdTranscriptVariant, // term string: "NMD_transcript_variant"
+    NonCodingTranscriptVariant,
+    UpstreamGeneVariant,
+    DownstreamGeneVariant,
+    TfbsAblation, // term string: "TFBS_ablation"
+    TfbsAmplification, // term string: "TFBS_amplification"
+    TfBindingSiteVariant, // term string: "TF_binding_site_variant"
+    RegulatoryRegionAblation,
+    RegulatoryRegionAmplification,
+    FeatureElongation,
+    RegulatoryRegionVariant,
+    FeatureTruncation,
+    IntergenicVariant,
+    SequenceVariant,
+}
+
+
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // impact category that VepImpact::from_conseque assigns to a VepConsequence
+pub enum VepImpact {
+    HIGH, // e.g. StopGained, FrameshiftVariant, TranscriptAblation
+    MODERATE, // e.g. MissenseVariant, InframeInsertion, InframeDeletion
+    LOW, // e.g. SynonymousVariant, SpliceRegionVariant
+    MODIFIER, // e.g. IntronVariant, IntergenicVariant, UpstreamGeneVariant
+}
+
+impl VepImpact {
+    pub fn from_conseque(consequence: &VepConsequence) -> VepImpact { // returns the impact category of `consequence`; NOTE(review): name looks truncated ("from_consequence"?) - public, so confirm callers before renaming
+        match consequence { // no wildcard arm: every VepConsequence variant is listed explicitly below
+            VepConsequence::TranscriptAblation |
+            VepConsequence::SpliceAcceptorVariant |
+            VepConsequence::SpliceDonorVariant |
+            VepConsequence::StopGained |
+            VepConsequence::FrameshiftVariant |
+            VepConsequence::StopLost |
+            VepConsequence::StartLost |
+            VepConsequence::TranscriptAmplification |
+            VepConsequence::FeatureElongation |
+            VepConsequence::FeatureTruncation => VepImpact::HIGH,
+            
+            VepConsequence::InframeInsertion |
+            VepConsequence::InframeDeletion |
+            VepConsequence::MissenseVariant |
+            VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE,
+
+            VepConsequence::SpliceDonor5thBaseVariant |
+            VepConsequence::SpliceRegionVariant |
+            VepConsequence::SpliceDonorRegionVariant |
+            VepConsequence::SplicePolyrimidineTractVariant |
+            VepConsequence::IncompleteTerminalCodonVariant |
+            VepConsequence::StartRetainedVariant |
+            VepConsequence::StopRetainedVariant |
+            VepConsequence::SynonymousVariant => VepImpact::LOW,
+
+            VepConsequence::CodingSequenceVariant |
+            VepConsequence::MatureMiRnaVariant |
+            VepConsequence::FivePrimeUtrVariant |
+            VepConsequence::ThreePrimeUtrVariant |
+            VepConsequence::NonCodingTranscriptExonVariant |
+            VepConsequence::IntronVariant |
+            VepConsequence::NmdTranscriptVariant |
+            VepConsequence::NonCodingTranscriptVariant |
+            VepConsequence::UpstreamGeneVariant |
+            VepConsequence::DownstreamGeneVariant |
+            VepConsequence::TfbsAblation |
+            VepConsequence::TfbsAmplification |
+            VepConsequence::TfBindingSiteVariant |
+            VepConsequence::RegulatoryRegionAblation |
+            VepConsequence::RegulatoryRegionAmplification |
+            VepConsequence::RegulatoryRegionVariant |
+            VepConsequence::SequenceVariant |
+            VepConsequence::IntergenicVariant => VepImpact::MODIFIER,
+        }
+    }
+}
+
+impl From<VepConsequence> for String { // renders a consequence as the term string that FromStr for VepConsequence accepts
+    fn from(consequence: VepConsequence) -> Self { // consumes the value and allocates a fresh String for the matching term
+        match consequence { // no wildcard arm: every variant has an explicit string
+            VepConsequence::TranscriptAblation => "transcript_ablation".to_string(),
+            VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(),
+            VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(),
+            VepConsequence::StopGained => "stop_gained".to_string(),
+            VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(),
+            VepConsequence::StopLost => "stop_lost".to_string(),
+            VepConsequence::StartLost => "start_lost".to_string(),
+            VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(),
+            VepConsequence::InframeInsertion => "inframe_insertion".to_string(),
+            VepConsequence::InframeDeletion => "inframe_deletion".to_string(),
+            VepConsequence::MissenseVariant => "missense_variant".to_string(),
+            VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(),
+            VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(),
+            VepConsequence::IncompleteTerminalCodonVariant => "incomplete_terminal_codon_variant".to_string(),
+            VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(),
+            VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(),
+            VepConsequence::SynonymousVariant => "synonymous_variant".to_string(),
+            VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(),
+            VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(),
+            VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(),
+            VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(),
+            VepConsequence::NonCodingTranscriptExonVariant => "non_coding_transcript_exon_variant".to_string(),
+            VepConsequence::IntronVariant => "intron_variant".to_string(),
+            VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(),
+            VepConsequence::NonCodingTranscriptVariant => "non_coding_transcript_variant".to_string(),
+            VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(),
+            VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(),
+            VepConsequence::TfbsAblation => "TFBS_ablation".to_string(),
+            VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(),
+            VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(),
+            VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(),
+            VepConsequence::RegulatoryRegionAmplification => "regulatory_region_amplification".to_string(),
+            VepConsequence::FeatureElongation => "feature_elongation".to_string(),
+            VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(),
+            VepConsequence::FeatureTruncation => "feature_truncation".to_string(),
+            VepConsequence::SpliceDonor5thBaseVariant => "splice_donor_5th_base_variant".to_string(),
+            VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(),
+            VepConsequence::SplicePolyrimidineTractVariant => "splice_polypyrimidine_tract_variant".to_string(), // "polypyrimidine": must equal the literal FromStr accepts, otherwise the emitted term cannot be parsed back
+            VepConsequence::SequenceVariant => "sequence_variant".to_string(),
+            VepConsequence::IntergenicVariant => "intergenic_variant".to_string(),
+        }
+    }
+}
+
+impl FromStr for VepConsequence { // parses a single consequence term string into its enum variant
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> anyhow::Result<Self> { // exact, case-sensitive match on `s`; no trimming or splitting is done here
+        match s {
+            "transcript_ablation" => Ok(VepConsequence::TranscriptAblation),
+            "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant),
+            "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant),
+            "stop_gained" => Ok(VepConsequence::StopGained),
+            "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant),
+            "stop_lost" => Ok(VepConsequence::StopLost),
+            "start_lost" => Ok(VepConsequence::StartLost),
+            "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification),
+            "feature_elongation" => Ok(VepConsequence::FeatureElongation),
+            "feature_truncation" => Ok(VepConsequence::FeatureTruncation),
+
+            "inframe_insertion" => Ok(VepConsequence::InframeInsertion),
+            "inframe_deletion" => Ok(VepConsequence::InframeDeletion),
+            "missense_variant" => Ok(VepConsequence::MissenseVariant),
+            "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant),
+            "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant),
+            "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant),
+            "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant),
+            "splice_polypyrimidine_tract_variant" => Ok(VepConsequence::SplicePolyrimidineTractVariant), // literal is spelled "polypyrimidine"; the variant identifier is spelled differently
+
+            "incomplete_terminal_codon_variant" => Ok(VepConsequence::IncompleteTerminalCodonVariant),
+            "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant),
+            "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant),
+            "synonymous_variant" => Ok(VepConsequence::SynonymousVariant),
+            "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant),
+            "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant),
+            "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant),
+            "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant),
+            "non_coding_transcript_exon_variant" => Ok(VepConsequence::NonCodingTranscriptExonVariant),
+            "intron_variant" => Ok(VepConsequence::IntronVariant),
+
+            "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant),
+            "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant),
+            "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant),
+            "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant),
+            "TFBS_ablation" => Ok(VepConsequence::TfbsAblation),
+            "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification),
+            "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant),
+            "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation),
+            "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification),
+            "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant),
+
+            "intergenic_variant" => Ok(VepConsequence::IntergenicVariant),
+            "sequence_variant" => Ok(VepConsequence::SequenceVariant),
+            _ => Err(anyhow!("Unknown VepConsequence: {s}")), // any other input is an error that echoes the offending string
+        }
+    }
 }
 
 impl VEP {
-    fn from_vep_line(d: &VEPLine) -> Result<VEP> {
+    fn from_vep_line(d: &VEPLine) -> anyhow::Result<VEP> {
         let or_opt = |s: &str| match s {
             "-" => None,
             _ => Some(s.to_string()),
         };
 
         let consequence = or_opt(&d.consequence)
-            .map(|c| c.split(",").map(|e| e.to_string()).collect::<Vec<String>>());
+            .map(|c| c.split(",").map(|e| e.parse()).collect::<Vec<VepConsequence>>());
 
         Ok(VEP {
             gene: or_opt(&d.gene),
@@ -127,7 +291,7 @@ pub struct VEPExtra {
 impl FromStr for VEPExtra {
     type Err = anyhow::Error;
 
-    fn from_str(s: &str) -> Result<Self> {
+    fn from_str(s: &str) -> anyhow::Result<Self> {
         let err = |c| anyhow!("Error {} parsing VEP Extra field {}", c, s);
 
         let elements = s.split(";").collect::<Vec<&str>>();
@@ -165,27 +329,27 @@ impl FromStr for VEPExtra {
     }
 }
 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub enum VEPImpact {
-    Low,
-    Moderate,
-    High,
-    Modifier,
-}
-
-impl FromStr for VEPImpact {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self> {
-        match s {
-            "LOW" => Ok(VEPImpact::Low),
-            "MODERATE" => Ok(VEPImpact::Moderate),
-            "HIGH" => Ok(VEPImpact::High),
-            "MODIFIER" => Ok(VEPImpact::Modifier),
-            _ => Err(anyhow!("Unexpected VEP Impact value")),
-        }
-    }
-}
+// #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+// pub enum VEPImpact {
+//     Low,
+//     Moderate,
+//     High,
+//     Modifier,
+// }
+//
+// impl FromStr for VEPImpact {
+//     type Err = anyhow::Error;
+//
+//     fn from_str(s: &str) -> Result<Self> {
+//         match s {
+//             "LOW" => Ok(VEPImpact::Low),
+//             "MODERATE" => Ok(VEPImpact::Moderate),
+//             "HIGH" => Ok(VEPImpact::High),
+//             "MODIFIER" => Ok(VEPImpact::Modifier),
+//             _ => Err(anyhow!("Unexpected VEP Impact value")),
+//         }
+//     }
+// }
 // pub fn vep_chunk(data: &mut [Variant]) -> Result<()> {
 //     let in_vcf = format!(
 //         "{}/vep_{}.vcf",
@@ -325,7 +489,7 @@ fn run_vep(in_path: &str, out_path: &str) -> Result<()> {
     Ok(())
 }
 
-pub fn get_best_vep(d: &[VEP]) -> Result<VEP> {
+pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
     d.into_iter().filter(|v| v.)
 
     if d.is_empty() {