vep.rs 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743
  1. use anyhow::anyhow;
  2. use hashbrown::HashMap;
  3. use itertools::Itertools;
  4. use log::{debug, warn};
  5. use serde::{Deserialize, Serialize};
  6. use std::{
  7. cmp::{Ordering, Reverse},
  8. io::{BufRead, BufReader},
  9. process::{Command, Stdio},
  10. str::FromStr,
  11. };
  12. use super::ncbi::NCBIAcc;
  13. /// Represents a single line of output from the Variant Effect Predictor (VEP).
  14. ///
  15. /// This struct encapsulates all fields typically found in a VEP output line,
  16. /// providing a structured way to work with VEP results.
  17. #[derive(Debug, PartialEq, Serialize, Deserialize)]
  18. pub struct VepLine {
  19. /// The identifier of the input variant
  20. pub uploaded_variation: String,
  21. /// The genomic location of the variant
  22. pub location: String,
  23. /// The variant allele
  24. pub allele: String,
  25. /// The gene symbol or identifier
  26. pub gene: String,
  27. /// The affected feature (e.g., transcript ID)
  28. pub feature: String,
  29. /// The type of feature affected (e.g., Transcript, RegulatoryFeature)
  30. pub feature_type: String,
  31. /// The consequence of the variant
  32. pub consequence: String,
  33. /// The position of the variant in the cDNA
  34. pub cdna_position: String,
  35. /// The position of the variant in the CDS
  36. pub cds_position: String,
  37. /// The position of the variant in the protein
  38. pub protein_position: String,
  39. /// The amino acid change caused by the variant
  40. pub amino_acids: String,
  41. /// The codon change caused by the variant
  42. pub codons: String,
  43. /// Known identifiers for this variant
  44. pub existing_variation: String,
  45. /// Additional information in key=value format
  46. pub extra: String,
  47. }
  48. impl FromStr for VepLine {
  49. type Err = anyhow::Error;
  50. /// Parses a VEP output line into a VepLine struct.
  51. ///
  52. /// This method expects the input string to be a tab-separated line
  53. /// containing exactly 14 fields, as per standard VEP output format.
  54. ///
  55. /// # Arguments
  56. /// * `s` - A string slice representing a single line of VEP output
  57. ///
  58. /// # Returns
  59. /// * `Ok(VepLine)` if parsing is successful
  60. /// * `Err(anyhow::Error)` if the input doesn't contain the expected number of fields
  61. ///
  62. /// # Example
  63. /// ```
  64. /// let vep_line = VepLine::from_str("").unwrap();
  65. /// assert_eq!(vep_line.gene, "ENSG00000135744");
  66. /// ```
  67. fn from_str(s: &str) -> Result<Self, Self::Err> {
  68. let parts: Vec<&str> = s.split('\t').collect();
  69. if parts.len() != 14 {
  70. return Err(anyhow!("Invalid number of fields in VEP line"));
  71. }
  72. Ok(VepLine {
  73. uploaded_variation: parts[0].to_string(),
  74. location: parts[1].to_string(),
  75. allele: parts[2].to_string(),
  76. gene: parts[3].to_string(),
  77. feature: parts[4].to_string(),
  78. feature_type: parts[5].to_string(),
  79. consequence: parts[6].to_string(),
  80. cdna_position: parts[7].to_string(),
  81. cds_position: parts[8].to_string(),
  82. protein_position: parts[9].to_string(),
  83. amino_acids: parts[10].to_string(),
  84. codons: parts[11].to_string(),
  85. existing_variation: parts[12].to_string(),
  86. extra: parts[13].to_string(),
  87. })
  88. }
  89. }
  90. /// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant.
  91. ///
  92. /// This struct encapsulates various fields that describe the predicted effects
  93. /// of a variant on genes, transcripts, and proteins, as determined by VEP.
  94. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
  95. pub struct VEP {
  96. /// The gene affected by the variant
  97. pub gene: Option<String>,
  98. /// The specific feature (e.g., transcript) affected by the variant
  99. pub feature: Option<String>,
  100. /// The type of feature (e.g., Transcript, RegulatoryFeature)
  101. pub feature_type: Option<String>,
  102. /// The predicted consequence(s) of the variant
  103. pub consequence: Option<Vec<VepConsequence>>,
  104. /// The position of the variant in the cDNA sequence
  105. pub cdna_position: Option<String>,
  106. /// The position of the variant in the coding sequence
  107. pub cds_position: Option<String>,
  108. /// The position of the variant in the protein sequence
  109. pub protein_position: Option<String>,
  110. /// The amino acid change caused by the variant
  111. pub amino_acids: Option<String>,
  112. /// The codon change caused by the variant
  113. pub codons: Option<String>,
  114. /// Known identifiers for this variant
  115. pub existing_variation: Option<String>,
  116. /// Additional VEP information
  117. pub extra: VEPExtra,
  118. }
  119. /// Represents the predicted consequences of a genetic variant as determined by
  120. /// the Ensembl Variant Effect Predictor (VEP).
  121. ///
  122. /// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe
  123. /// the effects of variants on transcripts, proteins, and regulatory regions.
  124. ///
  125. /// The variants are generally ordered from most to least severe in terms of their
  126. /// potential impact on gene function.
  127. ///
  128. /// For more information, see:
  129. /// <https://ensembl.org/info/genome/variation/prediction/predicted_data.html>
  130. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  131. pub enum VepConsequence {
  132. /// Complete destruction of a transcript
  133. TranscriptAblation,
  134. /// Variant at the splice acceptor site
  135. SpliceAcceptorVariant,
  136. /// Variant at the splice donor site
  137. SpliceDonorVariant,
  138. /// Variant causing a premature stop codon
  139. StopGained,
  140. /// Insertion or deletion causing a frameshift
  141. FrameshiftVariant,
  142. /// Variant causing loss of stop codon
  143. StopLost,
  144. /// Variant causing loss of start codon
  145. StartLost,
  146. /// Amplification of a transcript
  147. TranscriptAmplification,
  148. /// Insertion not causing a frameshift
  149. InframeInsertion,
  150. /// Deletion not causing a frameshift
  151. InframeDeletion,
  152. /// Variant causing an amino acid change
  153. MissenseVariant,
  154. /// Variant that changes protein structure
  155. ProteinAlteringVariant,
  156. /// Variant at the 5th base of a splice donor site
  157. SpliceDonor5thBaseVariant,
  158. /// Variant in the splice region
  159. SpliceRegionVariant,
  160. /// Variant in the splice donor region
  161. SpliceDonorRegionVariant,
  162. /// Variant in the polypyrimidine tract near splice sites
  163. SplicePolyrimidineTractVariant,
  164. /// Variant in the final, incomplete codon of a transcript
  165. IncompleteTerminalCodonVariant,
  166. /// Variant in start codon resulting in no change
  167. StartRetainedVariant,
  168. /// Variant in stop codon resulting in no change
  169. StopRetainedVariant,
  170. /// Variant not changing the encoded amino acid
  171. SynonymousVariant,
  172. /// Variant in coding sequence with indeterminate effect
  173. CodingSequenceVariant,
  174. /// Variant in mature miRNA
  175. MatureMiRnaVariant,
  176. /// Variant in 5' UTR
  177. FivePrimeUtrVariant,
  178. /// Variant in 3' UTR
  179. ThreePrimeUtrVariant,
  180. /// Variant in non-coding exon
  181. NonCodingTranscriptExonVariant,
  182. /// Variant in intron
  183. IntronVariant,
  184. /// Variant in transcript subject to nonsense-mediated decay
  185. NmdTranscriptVariant,
  186. /// Variant in non-coding transcript
  187. NonCodingTranscriptVariant,
  188. /// Variant upstream of a gene
  189. UpstreamGeneVariant,
  190. /// Variant downstream of a gene
  191. DownstreamGeneVariant,
  192. /// Deletion of a transcription factor binding site
  193. TfbsAblation,
  194. /// Amplification of a transcription factor binding site
  195. TfbsAmplification,
  196. /// Variant in a transcription factor binding site
  197. TfBindingSiteVariant,
  198. /// Deletion of a regulatory region
  199. RegulatoryRegionAblation,
  200. /// Amplification of a regulatory region
  201. RegulatoryRegionAmplification,
  202. /// Variant causing a feature to be extended
  203. FeatureElongation,
  204. /// Variant in a regulatory region
  205. RegulatoryRegionVariant,
  206. /// Variant causing a feature to be shortened
  207. FeatureTruncation,
  208. /// Variant in intergenic region
  209. IntergenicVariant,
  210. /// General sequence variant
  211. SequenceVariant,
  212. }
  213. /// Represents the severity of a variant's impact as predicted by the
  214. /// Variant Effect Predictor (VEP).
  215. ///
  216. /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER).
  217. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
  218. pub enum VepImpact {
  219. /// High impact variants are expected to have high (disruptive) impact in the protein,
  220. /// probably causing protein truncation, loss of function or triggering nonsense mediated decay.
  221. HIGH,
  222. /// Moderate impact variants are non-disruptive variants that might change protein effectiveness.
  223. MODERATE,
  224. /// Low impact variants are mostly harmless or unlikely to change protein behavior.
  225. LOW,
  226. /// Modifier variants are usually non-coding variants or variants affecting non-coding genes,
  227. /// where predictions are difficult or there is no evidence of impact.
  228. MODIFIER,
  229. }
  230. impl From<VepImpact> for String {
  231. /// Converts a VepImpact enum variant to its string representation.
  232. fn from(value: VepImpact) -> Self {
  233. match value {
  234. VepImpact::HIGH => "High",
  235. VepImpact::MODERATE => "Moderate",
  236. VepImpact::LOW => "Low",
  237. VepImpact::MODIFIER => "Modifier",
  238. }
  239. .to_string()
  240. }
  241. }
  242. impl PartialOrd for VepImpact {
  243. /// Implements partial ordering for VepImpact.
  244. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  245. Some(self.cmp(other))
  246. }
  247. }
  248. impl Ord for VepImpact {
  249. /// Implements total ordering for VepImpact.
  250. ///
  251. /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER
  252. /// This allows for easy sorting where the most severe impacts come first.
  253. fn cmp(&self, other: &Self) -> Ordering {
  254. match (self, other) {
  255. (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal,
  256. (VepImpact::HIGH, _) => Ordering::Less,
  257. (VepImpact::MODERATE, VepImpact::HIGH) => Ordering::Greater,
  258. (VepImpact::MODERATE, VepImpact::MODERATE) => Ordering::Equal,
  259. (VepImpact::MODERATE, _) => Ordering::Less,
  260. (VepImpact::LOW, VepImpact::HIGH | VepImpact::MODERATE) => Ordering::Greater,
  261. (VepImpact::LOW, VepImpact::LOW) => Ordering::Equal,
  262. (VepImpact::LOW, VepImpact::MODIFIER) => Ordering::Less,
  263. (VepImpact::MODIFIER, VepImpact::MODIFIER) => Ordering::Equal,
  264. (VepImpact::MODIFIER, _) => Ordering::Greater,
  265. }
  266. }
  267. }
  268. impl FromStr for VepImpact {
  269. type Err = anyhow::Error;
  270. fn from_str(s: &str) -> anyhow::Result<Self> {
  271. match s {
  272. "LOW" => Ok(VepImpact::LOW),
  273. "MODERATE" => Ok(VepImpact::MODERATE),
  274. "HIGH" => Ok(VepImpact::HIGH),
  275. "MODIFIER" => Ok(VepImpact::MODIFIER),
  276. _ => Err(anyhow!("Unexpected VEP Impact value")),
  277. }
  278. }
  279. }
  280. impl From<&VepConsequence> for VepImpact {
  281. fn from(consequence: &VepConsequence) -> Self {
  282. match consequence {
  283. VepConsequence::TranscriptAblation
  284. | VepConsequence::SpliceAcceptorVariant
  285. | VepConsequence::SpliceDonorVariant
  286. | VepConsequence::StopGained
  287. | VepConsequence::FrameshiftVariant
  288. | VepConsequence::StopLost
  289. | VepConsequence::StartLost
  290. | VepConsequence::TranscriptAmplification
  291. | VepConsequence::FeatureElongation
  292. | VepConsequence::FeatureTruncation => VepImpact::HIGH,
  293. VepConsequence::InframeInsertion
  294. | VepConsequence::InframeDeletion
  295. | VepConsequence::MissenseVariant
  296. | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE,
  297. VepConsequence::SpliceDonor5thBaseVariant
  298. | VepConsequence::SpliceRegionVariant
  299. | VepConsequence::SpliceDonorRegionVariant
  300. | VepConsequence::SplicePolyrimidineTractVariant
  301. | VepConsequence::IncompleteTerminalCodonVariant
  302. | VepConsequence::StartRetainedVariant
  303. | VepConsequence::StopRetainedVariant
  304. | VepConsequence::SynonymousVariant => VepImpact::LOW,
  305. VepConsequence::CodingSequenceVariant
  306. | VepConsequence::MatureMiRnaVariant
  307. | VepConsequence::FivePrimeUtrVariant
  308. | VepConsequence::ThreePrimeUtrVariant
  309. | VepConsequence::NonCodingTranscriptExonVariant
  310. | VepConsequence::IntronVariant
  311. | VepConsequence::NmdTranscriptVariant
  312. | VepConsequence::NonCodingTranscriptVariant
  313. | VepConsequence::UpstreamGeneVariant
  314. | VepConsequence::DownstreamGeneVariant
  315. | VepConsequence::TfbsAblation
  316. | VepConsequence::TfbsAmplification
  317. | VepConsequence::TfBindingSiteVariant
  318. | VepConsequence::RegulatoryRegionAblation
  319. | VepConsequence::RegulatoryRegionAmplification
  320. | VepConsequence::RegulatoryRegionVariant
  321. | VepConsequence::SequenceVariant
  322. | VepConsequence::IntergenicVariant => VepImpact::MODIFIER,
  323. }
  324. }
  325. }
  326. impl From<VepConsequence> for String {
  327. fn from(consequence: VepConsequence) -> Self {
  328. match consequence {
  329. VepConsequence::TranscriptAblation => "transcript_ablation".to_string(),
  330. VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(),
  331. VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(),
  332. VepConsequence::StopGained => "stop_gained".to_string(),
  333. VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(),
  334. VepConsequence::StopLost => "stop_lost".to_string(),
  335. VepConsequence::StartLost => "start_lost".to_string(),
  336. VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(),
  337. VepConsequence::InframeInsertion => "inframe_insertion".to_string(),
  338. VepConsequence::InframeDeletion => "inframe_deletion".to_string(),
  339. VepConsequence::MissenseVariant => "missense_variant".to_string(),
  340. VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(),
  341. VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(),
  342. VepConsequence::IncompleteTerminalCodonVariant => {
  343. "incomplete_terminal_codon_variant".to_string()
  344. }
  345. VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(),
  346. VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(),
  347. VepConsequence::SynonymousVariant => "synonymous_variant".to_string(),
  348. VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(),
  349. VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(),
  350. VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(),
  351. VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(),
  352. VepConsequence::NonCodingTranscriptExonVariant => {
  353. "non_coding_transcript_exon_variant".to_string()
  354. }
  355. VepConsequence::IntronVariant => "intron_variant".to_string(),
  356. VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(),
  357. VepConsequence::NonCodingTranscriptVariant => {
  358. "non_coding_transcript_variant".to_string()
  359. }
  360. VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(),
  361. VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(),
  362. VepConsequence::TfbsAblation => "TFBS_ablation".to_string(),
  363. VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(),
  364. VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(),
  365. VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(),
  366. VepConsequence::RegulatoryRegionAmplification => {
  367. "regulatory_region_amplification".to_string()
  368. }
  369. VepConsequence::FeatureElongation => "feature_elongation".to_string(),
  370. VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(),
  371. VepConsequence::FeatureTruncation => "feature_truncation".to_string(),
  372. VepConsequence::SpliceDonor5thBaseVariant => {
  373. "splice_donor_5th_base_variant".to_string()
  374. }
  375. VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(),
  376. VepConsequence::SplicePolyrimidineTractVariant => {
  377. "splice_polyrimidine_tract_variant".to_string()
  378. }
  379. VepConsequence::SequenceVariant => "sequence_variant".to_string(),
  380. VepConsequence::IntergenicVariant => "intergenic_variant".to_string(),
  381. }
  382. }
  383. }
  384. impl FromStr for VepConsequence {
  385. type Err = anyhow::Error;
  386. fn from_str(s: &str) -> anyhow::Result<Self> {
  387. match s {
  388. "transcript_ablation" => Ok(VepConsequence::TranscriptAblation),
  389. "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant),
  390. "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant),
  391. "stop_gained" => Ok(VepConsequence::StopGained),
  392. "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant),
  393. "stop_lost" => Ok(VepConsequence::StopLost),
  394. "start_lost" => Ok(VepConsequence::StartLost),
  395. "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification),
  396. "feature_elongation" => Ok(VepConsequence::FeatureElongation),
  397. "feature_truncation" => Ok(VepConsequence::FeatureTruncation),
  398. "inframe_insertion" => Ok(VepConsequence::InframeInsertion),
  399. "inframe_deletion" => Ok(VepConsequence::InframeDeletion),
  400. "missense_variant" => Ok(VepConsequence::MissenseVariant),
  401. "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant),
  402. "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant),
  403. "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant),
  404. "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant),
  405. "splice_polypyrimidine_tract_variant" => {
  406. Ok(VepConsequence::SplicePolyrimidineTractVariant)
  407. }
  408. "incomplete_terminal_codon_variant" => {
  409. Ok(VepConsequence::IncompleteTerminalCodonVariant)
  410. }
  411. "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant),
  412. "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant),
  413. "synonymous_variant" => Ok(VepConsequence::SynonymousVariant),
  414. "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant),
  415. "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant),
  416. "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant),
  417. "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant),
  418. "non_coding_transcript_exon_variant" => {
  419. Ok(VepConsequence::NonCodingTranscriptExonVariant)
  420. }
  421. "intron_variant" => Ok(VepConsequence::IntronVariant),
  422. "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant),
  423. "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant),
  424. "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant),
  425. "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant),
  426. "TFBS_ablation" => Ok(VepConsequence::TfbsAblation),
  427. "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification),
  428. "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant),
  429. "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation),
  430. "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification),
  431. "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant),
  432. "intergenic_variant" => Ok(VepConsequence::IntergenicVariant),
  433. "sequence_variant" => Ok(VepConsequence::SequenceVariant),
  434. _ => Err(anyhow!("Unknown VepConsequence: {s}")),
  435. }
  436. }
  437. }
  438. impl TryFrom<&VepLine> for VEP {
  439. type Error = anyhow::Error;
  440. /// Attempts to convert a VepLine reference into a VEP struct.
  441. ///
  442. /// This implementation provides a way to transform the raw VEP output line
  443. /// (represented by VepLine) into a more structured and typed VEP representation.
  444. ///
  445. /// # Arguments
  446. /// * `d` - A reference to a VepLine struct
  447. ///
  448. /// # Returns
  449. /// * `Ok(VEP)` if the conversion is successful
  450. /// * `Err(anyhow::Error)` if any parsing errors occur
  451. ///
  452. /// # Behavior
  453. /// - Converts "-" strings to None for optional fields
  454. /// - Parses the consequence field into a `Vec<VepConsequence>`
  455. /// - Parses the extra field into a VEPExtra struct
  456. ///
  457. /// # Example
  458. /// ```
  459. /// let vep_line = VepLine { /* ... */ };
  460. /// match VEP::try_from(&vep_line) {
  461. /// Ok(vep) => println!("Converted VEP: {:?}", vep),
  462. /// Err(e) => eprintln!("Conversion failed: {}", e),
  463. /// }
  464. /// ```
  465. fn try_from(d: &VepLine) -> anyhow::Result<Self> {
  466. let or_opt = |s: &str| match s {
  467. "-" => None,
  468. _ => Some(s.to_string()),
  469. };
  470. let consequence = or_opt(&d.consequence).map(|c| {
  471. c.split(',')
  472. .filter_map(|e| e.parse::<VepConsequence>().ok())
  473. .collect::<Vec<VepConsequence>>()
  474. });
  475. Ok(VEP {
  476. gene: or_opt(&d.gene),
  477. feature: or_opt(&d.feature),
  478. feature_type: or_opt(&d.feature_type),
  479. consequence,
  480. cdna_position: or_opt(&d.feature_type),
  481. cds_position: or_opt(&d.cds_position),
  482. protein_position: or_opt(&d.protein_position),
  483. amino_acids: or_opt(&d.amino_acids),
  484. codons: or_opt(&d.codons),
  485. existing_variation: or_opt(&d.existing_variation),
  486. extra: d.extra.parse()?,
  487. })
  488. }
  489. }
  490. /// Represents additional information from Variant Effect Predictor (VEP) annotations.
  491. ///
  492. /// This struct encapsulates various optional fields that provide extra context
  493. /// about a variant's predicted effect.
  494. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
  495. pub struct VEPExtra {
  496. /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER)
  497. pub impact: Option<VepImpact>,
  498. /// The gene symbol associated with the variant
  499. pub symbol: Option<String>,
  500. /// The distance to the feature (e.g., for upstream/downstream variants)
  501. pub distance: Option<u32>,
  502. /// HGVS notation describing the variant at the coding sequence level
  503. pub hgvs_c: Option<String>,
  504. /// HGVS notation describing the variant at the protein level
  505. pub hgvs_p: Option<String>,
  506. }
  507. impl FromStr for VEPExtra {
  508. type Err = anyhow::Error;
  509. /// Parses a VEPExtra instance from a string representation.
  510. ///
  511. /// This method interprets a semicolon-separated key-value string typically
  512. /// found in VEP output and constructs a VEPExtra struct from it.
  513. ///
  514. /// # Arguments
  515. /// * `s` - A string slice containing the VEP extra information
  516. ///
  517. /// # Returns
  518. /// * `Ok(VEPExtra)` if parsing is successful
  519. /// * `Err(anyhow::Error)` if parsing fails
  520. ///
  521. /// # Example
  522. /// ```
  523. /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?");
  524. /// assert!(vep_extra.is_ok());
  525. /// ```
  526. ///
  527. /// # Note
  528. /// This method is flexible and will not fail if some fields are missing.
  529. /// It will simply set those fields to None in the resulting struct.
  530. fn from_str(s: &str) -> anyhow::Result<Self> {
  531. let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect();
  532. let impact = kv.get("IMPACT").map(|&v| v.parse()).transpose()?;
  533. let symbol = kv.get("SYMBOL").map(ToString::to_string);
  534. let distance = kv.get("DISTANCE").map(|&v| v.parse()).transpose()?;
  535. let hgvs_c = kv.get("HGVSc").map(ToString::to_string);
  536. let hgvs_p = kv.get("HGVSp").map(ToString::to_string);
  537. Ok(VEPExtra {
  538. impact,
  539. symbol,
  540. distance,
  541. hgvs_c,
  542. hgvs_p,
  543. })
  544. }
  545. }
  546. /// Runs the Variant Effect Predictor (VEP) on a given input file and outputs the results.
  547. ///
  548. /// This function executes the VEP tool with specific parameters to annotate genetic variants.
  549. /// It uses a predefined set of reference data and plugins to enhance the annotation process.
  550. ///
  551. /// # Arguments
  552. /// * `in_path` - A string slice that holds the path to the input file
  553. /// * `out_path` - A string slice that holds the path where the output should be written
  554. ///
  555. /// # Returns
  556. /// * `Ok(())` if VEP runs successfully
  557. /// * `Err(anyhow::Error)` if there's an error during execution
  558. ///
  559. /// # Configuration
  560. /// The function uses hardcoded paths for:
  561. /// - VEP binary directory
  562. /// - VEP cache directory
  563. /// - Reference FASTA file
  564. /// - GFF annotation file
  565. ///
  566. /// # VEP Parameters
  567. /// - Uses offline cache
  568. /// - Includes symbol information
  569. /// - Applies SpliceRegion and Downstream plugins
  570. /// - Generates HGVS notations
  571. ///
  572. /// # Error Handling
  573. /// - Logs any lines containing "error" from VEP's stderr output as warnings
  574. /// - Returns an error if the VEP process fails to complete
  575. ///
  576. /// # Example
  577. /// ```
  578. /// match run_vep("input.vcf", "output.vcf") {
  579. /// Ok(()) => println!("VEP annotation completed successfully"),
  580. /// Err(e) => eprintln!("VEP annotation failed: {}", e),
  581. /// }
  582. /// ```
  583. ///
  584. /// # Note
  585. /// Ensure that the VEP tool and all necessary reference data are correctly installed
  586. /// and accessible at the specified paths before running this function.
  587. pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
  588. // VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
  589. debug!("Run VEP for {in_path} and ouput {out_path}");
  590. let bin_dir = "/data/tools/ensembl-vep";
  591. let dir_cache = "/data/ref/hs1/vepcache/";
  592. let fasta = "/data/ref/hs1/chm13v2.0.fa";
  593. let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz";
  594. let mut cmd = Command::new(format!("{}/vep", bin_dir))
  595. .arg("--dir_cache")
  596. .arg(dir_cache)
  597. .arg("--cache")
  598. .arg("--offline")
  599. .arg("--fasta")
  600. .arg(fasta)
  601. .arg("--gff")
  602. .arg(gff)
  603. .arg("--symbol")
  604. .arg("--plugin")
  605. .arg("SpliceRegion")
  606. .arg("--plugin")
  607. .arg("Downstream")
  608. .arg("--hgvs")
  609. .arg("-i")
  610. .arg(in_path)
  611. .arg("-o")
  612. .arg(out_path)
  613. .stderr(Stdio::piped())
  614. .spawn()
  615. .expect("VEP failed to start");
  616. let stderr = cmd.stderr.take().unwrap();
  617. let reader = BufReader::new(stderr);
  618. reader
  619. .lines()
  620. .map_while(Result::ok)
  621. // .inspect(|y| println!("{y}"))
  622. .filter(|line| line.contains("error"))
  623. .for_each(|line| warn!("{}", line));
  624. cmd.wait()?;
  625. Ok(())
  626. }
  627. /// Selects the best Variant Effect Predictor (VEP) annotation from a slice of VEP annotations.
  628. ///
  629. /// This function implements a prioritization algorithm to choose the most relevant VEP annotation
  630. /// based on impact severity and transcript preference.
  631. ///
  632. /// # Arguments
  633. /// * `d` - A slice of VEP annotations
  634. ///
  635. /// # Returns
  636. /// * `Ok(VEP)` containing the best VEP annotation if successful
  637. /// * `Err(anyhow::Error)` if no valid VEP annotation can be selected
  638. ///
  639. /// # Algorithm
  640. /// 1. Groups VEPs by their impact (HIGH > MODERATE > LOW > MODIFIER)
  641. /// 2. Selects the group with the highest impact
  642. /// 3. If there's only one VEP in this group, it's returned
  643. /// 4. Otherwise, further filtering is done based on transcript identifiers:
  644. /// - Parses NCBI accessions from the feature field
  645. /// - Prioritizes 'NM' prefixed accessions over others
  646. /// - Sorts by accession number (higher numbers are preferred)
  647. ///
  648. /// # Errors
  649. /// This function will return an error if:
  650. /// * The input slice is empty
  651. /// * No valid NCBI accession can be parsed from the selected VEPs
  652. ///
  653. /// # Example
  654. /// ```
  655. /// let veps = vec![VEP { /* ... */ }, VEP { /* ... */ }];
  656. /// match get_best_vep(&veps) {
  657. /// Ok(best_vep) => println!("Best VEP: {:?}", best_vep),
  658. /// Err(e) => eprintln!("Failed to get best VEP: {}", e),
  659. /// }
  660. /// ```
  661. ///
  662. /// # Note
  663. /// This function assumes that the VEP annotations are well-formed and contain
  664. /// the necessary fields for comparison (consequence, feature). It will log warnings
  665. /// for any parsing errors encountered during the selection process.
  666. pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
  667. let best_impact_veps = d
  668. .iter()
  669. .chunk_by(|vep| {
  670. vep.consequence.as_ref().map_or(VepImpact::MODIFIER, |c| {
  671. c.iter()
  672. .map(VepImpact::from)
  673. .min()
  674. .unwrap_or(VepImpact::MODIFIER)
  675. })
  676. })
  677. .into_iter()
  678. .min_by_key(|(impact, _)| impact.clone())
  679. .map(|(_, group)| group.cloned().collect::<Vec<_>>())
  680. .ok_or_else(|| anyhow!("No element in VEP vector"))?;
  681. if best_impact_veps.len() == 1 {
  682. return Ok(best_impact_veps[0].clone());
  683. }
  684. let parsed_veps = best_impact_veps
  685. .iter()
  686. .enumerate()
  687. .filter_map(|(i, vep)| {
  688. vep.feature.as_ref().and_then(|feat| {
  689. feat.parse::<NCBIAcc>()
  690. .map(|acc| (i, acc))
  691. .map_err(|e| warn!("Can't parse {}: {}", feat, e))
  692. .ok()
  693. })
  694. })
  695. .sorted_by_key(|(_, acc)| (Reverse(acc.prefix == "NM"), acc.number))
  696. .collect::<Vec<_>>();
  697. parsed_veps
  698. .first()
  699. .map(|(k, _)| best_impact_veps[*k].clone())
  700. .ok_or_else(|| anyhow!("No valid NCBI accession found"))
  701. }