vep.rs 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805
  1. use anyhow::anyhow;
  2. use bitcode::{Decode, Encode};
  3. use hashbrown::HashMap;
  4. use itertools::Itertools;
  5. use log::warn;
  6. use serde::{Deserialize, Serialize};
  7. use std::{
  8. cmp::{Ordering, Reverse},
  9. fmt::Display,
  10. path::{Path, PathBuf},
  11. str::FromStr,
  12. };
  13. use crate::{
  14. commands::{Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner},
  15. config::Config,
  16. helpers::singularity_bind_flags,
  17. };
  18. use super::ncbi::NCBIAcc;
  19. /// Represents a single line of output from the Variant Effect Predictor (VEP).
  20. ///
  21. /// This struct encapsulates all fields typically found in a VEP output line,
  22. /// providing a structured way to work with VEP results.
  23. #[derive(Debug, PartialEq, Serialize, Deserialize)]
  24. pub struct VepLine {
  25. /// The identifier of the input variant
  26. pub uploaded_variation: String,
  27. /// The genomic location of the variant
  28. pub location: String,
  29. /// The variant allele
  30. pub allele: String,
  31. /// The gene symbol or identifier
  32. pub gene: String,
  33. /// The affected feature (e.g., transcript ID)
  34. pub feature: String,
  35. /// The type of feature affected (e.g., Transcript, RegulatoryFeature)
  36. pub feature_type: String,
  37. /// The consequence of the variant
  38. pub consequence: String,
  39. /// The position of the variant in the cDNA
  40. pub cdna_position: String,
  41. /// The position of the variant in the CDS
  42. pub cds_position: String,
  43. /// The position of the variant in the protein
  44. pub protein_position: String,
  45. /// The amino acid change caused by the variant
  46. pub amino_acids: String,
  47. /// The codon change caused by the variant
  48. pub codons: String,
  49. /// Known identifiers for this variant
  50. pub existing_variation: String,
  51. /// Additional information in key=value format
  52. pub extra: String,
  53. }
  54. impl FromStr for VepLine {
  55. type Err = anyhow::Error;
  56. /// Parses a VEP output line into a VepLine struct.
  57. ///
  58. /// This method expects the input string to be a tab-separated line
  59. /// containing exactly 14 fields, as per standard VEP output format.
  60. ///
  61. /// # Arguments
  62. /// * `s` - A string slice representing a single line of VEP output
  63. ///
  64. /// # Returns
  65. /// * `Ok(VepLine)` if parsing is successful
  66. /// * `Err(anyhow::Error)` if the input doesn't contain the expected number of fields
  67. ///
  68. /// # Example
  69. /// ```
  70. /// let vep_line = VepLine::from_str("").unwrap();
  71. /// assert_eq!(vep_line.gene, "ENSG00000135744");
  72. /// ```
  73. fn from_str(s: &str) -> Result<Self, Self::Err> {
  74. let parts: Vec<&str> = s.split('\t').collect();
  75. if parts.len() != 14 {
  76. return Err(anyhow!("Invalid number of fields in VEP line"));
  77. }
  78. Ok(VepLine {
  79. uploaded_variation: parts[0].to_string(),
  80. location: parts[1].to_string(),
  81. allele: parts[2].to_string(),
  82. gene: parts[3].to_string(),
  83. feature: parts[4].to_string(),
  84. feature_type: parts[5].to_string(),
  85. consequence: parts[6].to_string(),
  86. cdna_position: parts[7].to_string(),
  87. cds_position: parts[8].to_string(),
  88. protein_position: parts[9].to_string(),
  89. amino_acids: parts[10].to_string(),
  90. codons: parts[11].to_string(),
  91. existing_variation: parts[12].to_string(),
  92. extra: parts[13].to_string(),
  93. })
  94. }
  95. }
  96. /// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant.
  97. ///
  98. /// This struct encapsulates various fields that describe the predicted effects
  99. /// of a variant on genes, transcripts, and proteins, as determined by VEP.
  100. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  101. pub struct VEP {
  102. /// The gene affected by the variant
  103. pub gene: Option<String>,
  104. /// The specific feature (e.g., transcript) affected by the variant
  105. pub feature: Option<String>,
  106. /// The type of feature (e.g., Transcript, RegulatoryFeature)
  107. pub feature_type: Option<String>,
  108. /// The predicted consequence(s) of the variant
  109. pub consequence: Option<Vec<VepConsequence>>,
  110. /// The position of the variant in the cDNA sequence
  111. pub cdna_position: Option<String>,
  112. /// The position of the variant in the coding sequence
  113. pub cds_position: Option<String>,
  114. /// The position of the variant in the protein sequence
  115. pub protein_position: Option<String>,
  116. /// The amino acid change caused by the variant
  117. pub amino_acids: Option<String>,
  118. /// The codon change caused by the variant
  119. pub codons: Option<String>,
  120. /// Known identifiers for this variant
  121. pub existing_variation: Option<String>,
  122. /// Additional VEP information
  123. pub extra: VEPExtra,
  124. }
  125. impl VEP {
  126. pub fn impact(&self) -> Option<VepImpact> {
  127. self.extra.impact.clone()
  128. }
  129. pub fn gene(&self) -> Option<String> {
  130. self.extra.symbol.clone()
  131. }
  132. pub fn gene_distance(&self) -> Option<u32> {
  133. self.extra.distance
  134. }
  135. pub fn feature(&self) -> Option<String> {
  136. self.feature.clone()
  137. }
  138. pub fn consequences_str(&self) -> Vec<String> {
  139. match &self.consequence {
  140. Some(csq) => csq.iter().map(|s| s.to_string()).collect(),
  141. None => Vec::new(),
  142. }
  143. }
  144. pub fn hgvs(&self) -> Option<String> {
  145. if self.extra.hgvs_p.is_some() {
  146. return self.extra.hgvs_p.clone();
  147. }
  148. if self.extra.hgvs_c.is_some() {
  149. return self.extra.hgvs_c.clone();
  150. }
  151. None
  152. }
  153. }
  154. /// Represents the predicted consequences of a genetic variant as determined by
  155. /// the Ensembl Variant Effect Predictor (VEP).
  156. ///
  157. /// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe
  158. /// the effects of variants on transcripts, proteins, and regulatory regions.
  159. ///
  160. /// The variants are generally ordered from most to least severe in terms of their
  161. /// potential impact on gene function.
  162. ///
  163. /// For more information, see:
  164. /// <https://ensembl.org/info/genome/variation/prediction/predicted_data.html>
  165. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  166. pub enum VepConsequence {
  167. /// Complete destruction of a transcript
  168. TranscriptAblation,
  169. /// Variant at the splice acceptor site
  170. SpliceAcceptorVariant,
  171. /// Variant at the splice donor site
  172. SpliceDonorVariant,
  173. /// Variant causing a premature stop codon
  174. StopGained,
  175. /// Insertion or deletion causing a frameshift
  176. FrameshiftVariant,
  177. /// Variant causing loss of stop codon
  178. StopLost,
  179. /// Variant causing loss of start codon
  180. StartLost,
  181. /// Amplification of a transcript
  182. TranscriptAmplification,
  183. /// Insertion not causing a frameshift
  184. InframeInsertion,
  185. /// Deletion not causing a frameshift
  186. InframeDeletion,
  187. /// Variant causing an amino acid change
  188. MissenseVariant,
  189. /// Variant that changes protein structure
  190. ProteinAlteringVariant,
  191. /// Variant at the 5th base of a splice donor site
  192. SpliceDonor5thBaseVariant,
  193. /// Variant in the splice region
  194. SpliceRegionVariant,
  195. /// Variant in the splice donor region
  196. SpliceDonorRegionVariant,
  197. /// Variant in the polypyrimidine tract near splice sites
  198. SplicePolyrimidineTractVariant,
  199. /// Variant in the final, incomplete codon of a transcript
  200. IncompleteTerminalCodonVariant,
  201. /// Variant in start codon resulting in no change
  202. StartRetainedVariant,
  203. /// Variant in stop codon resulting in no change
  204. StopRetainedVariant,
  205. /// Variant not changing the encoded amino acid
  206. SynonymousVariant,
  207. /// Variant in coding sequence with indeterminate effect
  208. CodingSequenceVariant,
  209. /// Variant in mature miRNA
  210. MatureMiRnaVariant,
  211. /// Variant in 5' UTR
  212. FivePrimeUtrVariant,
  213. /// Variant in 3' UTR
  214. ThreePrimeUtrVariant,
  215. /// Variant in non-coding exon
  216. NonCodingTranscriptExonVariant,
  217. /// Variant in intron
  218. IntronVariant,
  219. /// Variant in transcript subject to nonsense-mediated decay
  220. NmdTranscriptVariant,
  221. /// Variant in non-coding transcript
  222. NonCodingTranscriptVariant,
  223. /// Variant upstream of a gene
  224. UpstreamGeneVariant,
  225. /// Variant downstream of a gene
  226. DownstreamGeneVariant,
  227. /// Deletion of a transcription factor binding site
  228. TfbsAblation,
  229. /// Amplification of a transcription factor binding site
  230. TfbsAmplification,
  231. /// Variant in a transcription factor binding site
  232. TfBindingSiteVariant,
  233. /// Deletion of a regulatory region
  234. RegulatoryRegionAblation,
  235. /// Amplification of a regulatory region
  236. RegulatoryRegionAmplification,
  237. /// Variant causing a feature to be extended
  238. FeatureElongation,
  239. /// Variant in a regulatory region
  240. RegulatoryRegionVariant,
  241. /// Variant causing a feature to be shortened
  242. FeatureTruncation,
  243. /// Variant in intergenic region
  244. IntergenicVariant,
  245. /// General sequence variant
  246. SequenceVariant,
  247. }
  248. /// Represents the severity of a variant's impact as predicted by the
  249. /// Variant Effect Predictor (VEP).
  250. ///
  251. /// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER).
  252. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
  253. pub enum VepImpact {
  254. /// High impact variants are expected to have high (disruptive) impact in the protein,
  255. /// probably causing protein truncation, loss of function or triggering nonsense mediated decay.
  256. HIGH,
  257. /// Moderate impact variants are non-disruptive variants that might change protein effectiveness.
  258. MODERATE,
  259. /// Low impact variants are mostly harmless or unlikely to change protein behavior.
  260. LOW,
  261. /// Modifier variants are usually non-coding variants or variants affecting non-coding genes,
  262. /// where predictions are difficult or there is no evidence of impact.
  263. MODIFIER,
  264. }
  265. impl Display for VepImpact {
  266. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  267. write!(
  268. f,
  269. "{}",
  270. match self {
  271. VepImpact::HIGH => "HIGH",
  272. VepImpact::MODERATE => "MODERATE",
  273. VepImpact::LOW => "LOW",
  274. VepImpact::MODIFIER => "MODIFIER",
  275. }
  276. )
  277. }
  278. }
  279. impl From<VepImpact> for String {
  280. /// Converts a VepImpact enum variant to its string representation.
  281. fn from(value: VepImpact) -> Self {
  282. match value {
  283. VepImpact::HIGH => "High",
  284. VepImpact::MODERATE => "Moderate",
  285. VepImpact::LOW => "Low",
  286. VepImpact::MODIFIER => "Modifier",
  287. }
  288. .to_string()
  289. }
  290. }
  291. impl PartialOrd for VepImpact {
  292. /// Implements partial ordering for VepImpact.
  293. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  294. Some(self.cmp(other))
  295. }
  296. }
  297. impl Ord for VepImpact {
  298. /// Implements total ordering for VepImpact.
  299. ///
  300. /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER
  301. /// This allows for easy sorting where the most severe impacts come first.
  302. fn cmp(&self, other: &Self) -> Ordering {
  303. match (self, other) {
  304. (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal,
  305. (VepImpact::HIGH, _) => Ordering::Less,
  306. (VepImpact::MODERATE, VepImpact::HIGH) => Ordering::Greater,
  307. (VepImpact::MODERATE, VepImpact::MODERATE) => Ordering::Equal,
  308. (VepImpact::MODERATE, _) => Ordering::Less,
  309. (VepImpact::LOW, VepImpact::HIGH | VepImpact::MODERATE) => Ordering::Greater,
  310. (VepImpact::LOW, VepImpact::LOW) => Ordering::Equal,
  311. (VepImpact::LOW, VepImpact::MODIFIER) => Ordering::Less,
  312. (VepImpact::MODIFIER, VepImpact::MODIFIER) => Ordering::Equal,
  313. (VepImpact::MODIFIER, _) => Ordering::Greater,
  314. }
  315. }
  316. }
  317. impl FromStr for VepImpact {
  318. type Err = anyhow::Error;
  319. fn from_str(s: &str) -> anyhow::Result<Self> {
  320. match s {
  321. "LOW" => Ok(VepImpact::LOW),
  322. "MODERATE" => Ok(VepImpact::MODERATE),
  323. "HIGH" => Ok(VepImpact::HIGH),
  324. "MODIFIER" => Ok(VepImpact::MODIFIER),
  325. _ => Err(anyhow!("Unexpected VEP Impact value")),
  326. }
  327. }
  328. }
  329. impl From<&VepConsequence> for VepImpact {
  330. fn from(consequence: &VepConsequence) -> Self {
  331. match consequence {
  332. VepConsequence::TranscriptAblation
  333. | VepConsequence::SpliceAcceptorVariant
  334. | VepConsequence::SpliceDonorVariant
  335. | VepConsequence::StopGained
  336. | VepConsequence::FrameshiftVariant
  337. | VepConsequence::StopLost
  338. | VepConsequence::StartLost
  339. | VepConsequence::TranscriptAmplification
  340. | VepConsequence::FeatureElongation
  341. | VepConsequence::FeatureTruncation => VepImpact::HIGH,
  342. VepConsequence::InframeInsertion
  343. | VepConsequence::InframeDeletion
  344. | VepConsequence::MissenseVariant
  345. | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE,
  346. VepConsequence::SpliceDonor5thBaseVariant
  347. | VepConsequence::SpliceRegionVariant
  348. | VepConsequence::SpliceDonorRegionVariant
  349. | VepConsequence::SplicePolyrimidineTractVariant
  350. | VepConsequence::IncompleteTerminalCodonVariant
  351. | VepConsequence::StartRetainedVariant
  352. | VepConsequence::StopRetainedVariant
  353. | VepConsequence::SynonymousVariant => VepImpact::LOW,
  354. VepConsequence::CodingSequenceVariant
  355. | VepConsequence::MatureMiRnaVariant
  356. | VepConsequence::FivePrimeUtrVariant
  357. | VepConsequence::ThreePrimeUtrVariant
  358. | VepConsequence::NonCodingTranscriptExonVariant
  359. | VepConsequence::IntronVariant
  360. | VepConsequence::NmdTranscriptVariant
  361. | VepConsequence::NonCodingTranscriptVariant
  362. | VepConsequence::UpstreamGeneVariant
  363. | VepConsequence::DownstreamGeneVariant
  364. | VepConsequence::TfbsAblation
  365. | VepConsequence::TfbsAmplification
  366. | VepConsequence::TfBindingSiteVariant
  367. | VepConsequence::RegulatoryRegionAblation
  368. | VepConsequence::RegulatoryRegionAmplification
  369. | VepConsequence::RegulatoryRegionVariant
  370. | VepConsequence::SequenceVariant
  371. | VepConsequence::IntergenicVariant => VepImpact::MODIFIER,
  372. }
  373. }
  374. }
  375. impl From<VepConsequence> for String {
  376. fn from(consequence: VepConsequence) -> Self {
  377. match consequence {
  378. VepConsequence::TranscriptAblation => "transcript_ablation".to_string(),
  379. VepConsequence::SpliceAcceptorVariant => "splice_acceptor_variant".to_string(),
  380. VepConsequence::SpliceDonorVariant => "splice_donor_variant".to_string(),
  381. VepConsequence::StopGained => "stop_gained".to_string(),
  382. VepConsequence::FrameshiftVariant => "frameshift_variant".to_string(),
  383. VepConsequence::StopLost => "stop_lost".to_string(),
  384. VepConsequence::StartLost => "start_lost".to_string(),
  385. VepConsequence::TranscriptAmplification => "transcript_amplification".to_string(),
  386. VepConsequence::InframeInsertion => "inframe_insertion".to_string(),
  387. VepConsequence::InframeDeletion => "inframe_deletion".to_string(),
  388. VepConsequence::MissenseVariant => "missense_variant".to_string(),
  389. VepConsequence::ProteinAlteringVariant => "protein_altering_variant".to_string(),
  390. VepConsequence::SpliceRegionVariant => "splice_region_variant".to_string(),
  391. VepConsequence::IncompleteTerminalCodonVariant => {
  392. "incomplete_terminal_codon_variant".to_string()
  393. }
  394. VepConsequence::StartRetainedVariant => "start_retained_variant".to_string(),
  395. VepConsequence::StopRetainedVariant => "stop_retained_variant".to_string(),
  396. VepConsequence::SynonymousVariant => "synonymous_variant".to_string(),
  397. VepConsequence::CodingSequenceVariant => "coding_sequence_variant".to_string(),
  398. VepConsequence::MatureMiRnaVariant => "mature_miRNA_variant".to_string(),
  399. VepConsequence::FivePrimeUtrVariant => "5_prime_UTR_variant".to_string(),
  400. VepConsequence::ThreePrimeUtrVariant => "3_prime_UTR_variant".to_string(),
  401. VepConsequence::NonCodingTranscriptExonVariant => {
  402. "non_coding_transcript_exon_variant".to_string()
  403. }
  404. VepConsequence::IntronVariant => "intron_variant".to_string(),
  405. VepConsequence::NmdTranscriptVariant => "NMD_transcript_variant".to_string(),
  406. VepConsequence::NonCodingTranscriptVariant => {
  407. "non_coding_transcript_variant".to_string()
  408. }
  409. VepConsequence::UpstreamGeneVariant => "upstream_gene_variant".to_string(),
  410. VepConsequence::DownstreamGeneVariant => "downstream_gene_variant".to_string(),
  411. VepConsequence::TfbsAblation => "TFBS_ablation".to_string(),
  412. VepConsequence::TfbsAmplification => "TFBS_amplification".to_string(),
  413. VepConsequence::TfBindingSiteVariant => "TF_binding_site_variant".to_string(),
  414. VepConsequence::RegulatoryRegionAblation => "regulatory_region_ablation".to_string(),
  415. VepConsequence::RegulatoryRegionAmplification => {
  416. "regulatory_region_amplification".to_string()
  417. }
  418. VepConsequence::FeatureElongation => "feature_elongation".to_string(),
  419. VepConsequence::RegulatoryRegionVariant => "regulatory_region_variant".to_string(),
  420. VepConsequence::FeatureTruncation => "feature_truncation".to_string(),
  421. VepConsequence::SpliceDonor5thBaseVariant => {
  422. "splice_donor_5th_base_variant".to_string()
  423. }
  424. VepConsequence::SpliceDonorRegionVariant => "splice_donor_region_variant".to_string(),
  425. VepConsequence::SplicePolyrimidineTractVariant => {
  426. "splice_polyrimidine_tract_variant".to_string()
  427. }
  428. VepConsequence::SequenceVariant => "sequence_variant".to_string(),
  429. VepConsequence::IntergenicVariant => "intergenic_variant".to_string(),
  430. }
  431. }
  432. }
  433. impl Display for VepConsequence {
  434. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  435. write!(f, "{}", String::from(self.clone()))
  436. }
  437. }
  438. impl FromStr for VepConsequence {
  439. type Err = anyhow::Error;
  440. fn from_str(s: &str) -> anyhow::Result<Self> {
  441. match s {
  442. "transcript_ablation" => Ok(VepConsequence::TranscriptAblation),
  443. "splice_acceptor_variant" => Ok(VepConsequence::SpliceAcceptorVariant),
  444. "splice_donor_variant" => Ok(VepConsequence::SpliceDonorVariant),
  445. "stop_gained" => Ok(VepConsequence::StopGained),
  446. "frameshift_variant" => Ok(VepConsequence::FrameshiftVariant),
  447. "stop_lost" => Ok(VepConsequence::StopLost),
  448. "start_lost" => Ok(VepConsequence::StartLost),
  449. "transcript_amplification" => Ok(VepConsequence::TranscriptAmplification),
  450. "feature_elongation" => Ok(VepConsequence::FeatureElongation),
  451. "feature_truncation" => Ok(VepConsequence::FeatureTruncation),
  452. "inframe_insertion" => Ok(VepConsequence::InframeInsertion),
  453. "inframe_deletion" => Ok(VepConsequence::InframeDeletion),
  454. "missense_variant" => Ok(VepConsequence::MissenseVariant),
  455. "protein_altering_variant" => Ok(VepConsequence::ProteinAlteringVariant),
  456. "splice_donor_5th_base_variant" => Ok(VepConsequence::SpliceDonor5thBaseVariant),
  457. "splice_region_variant" => Ok(VepConsequence::SpliceRegionVariant),
  458. "splice_donor_region_variant" => Ok(VepConsequence::SpliceDonorRegionVariant),
  459. "splice_polypyrimidine_tract_variant" => {
  460. Ok(VepConsequence::SplicePolyrimidineTractVariant)
  461. }
  462. "incomplete_terminal_codon_variant" => {
  463. Ok(VepConsequence::IncompleteTerminalCodonVariant)
  464. }
  465. "start_retained_variant" => Ok(VepConsequence::StartRetainedVariant),
  466. "stop_retained_variant" => Ok(VepConsequence::StopRetainedVariant),
  467. "synonymous_variant" => Ok(VepConsequence::SynonymousVariant),
  468. "coding_sequence_variant" => Ok(VepConsequence::CodingSequenceVariant),
  469. "mature_miRNA_variant" => Ok(VepConsequence::MatureMiRnaVariant),
  470. "5_prime_UTR_variant" => Ok(VepConsequence::FivePrimeUtrVariant),
  471. "3_prime_UTR_variant" => Ok(VepConsequence::ThreePrimeUtrVariant),
  472. "non_coding_transcript_exon_variant" => {
  473. Ok(VepConsequence::NonCodingTranscriptExonVariant)
  474. }
  475. "intron_variant" => Ok(VepConsequence::IntronVariant),
  476. "NMD_transcript_variant" => Ok(VepConsequence::NmdTranscriptVariant),
  477. "non_coding_transcript_variant" => Ok(VepConsequence::NonCodingTranscriptVariant),
  478. "upstream_gene_variant" => Ok(VepConsequence::UpstreamGeneVariant),
  479. "downstream_gene_variant" => Ok(VepConsequence::DownstreamGeneVariant),
  480. "TFBS_ablation" => Ok(VepConsequence::TfbsAblation),
  481. "TFBS_amplification" => Ok(VepConsequence::TfbsAmplification),
  482. "TF_binding_site_variant" => Ok(VepConsequence::TfBindingSiteVariant),
  483. "regulatory_region_ablation" => Ok(VepConsequence::RegulatoryRegionAblation),
  484. "regulatory_region_amplification" => Ok(VepConsequence::RegulatoryRegionAmplification),
  485. "regulatory_region_variant" => Ok(VepConsequence::RegulatoryRegionVariant),
  486. "intergenic_variant" => Ok(VepConsequence::IntergenicVariant),
  487. "sequence_variant" => Ok(VepConsequence::SequenceVariant),
  488. _ => Err(anyhow!("Unknown VepConsequence: {s}")),
  489. }
  490. }
  491. }
  492. impl TryFrom<&VepLine> for VEP {
  493. type Error = anyhow::Error;
  494. /// Attempts to convert a VepLine reference into a VEP struct.
  495. ///
  496. /// This implementation provides a way to transform the raw VEP output line
  497. /// (represented by VepLine) into a more structured and typed VEP representation.
  498. ///
  499. /// # Arguments
  500. /// * `d` - A reference to a VepLine struct
  501. ///
  502. /// # Returns
  503. /// * `Ok(VEP)` if the conversion is successful
  504. /// * `Err(anyhow::Error)` if any parsing errors occur
  505. ///
  506. /// # Behavior
  507. /// - Converts "-" strings to None for optional fields
  508. /// - Parses the consequence field into a `Vec<VepConsequence>`
  509. /// - Parses the extra field into a VEPExtra struct
  510. ///
  511. /// # Example
  512. /// ```
  513. /// let vep_line = VepLine { /* ... */ };
  514. /// match VEP::try_from(&vep_line) {
  515. /// Ok(vep) => println!("Converted VEP: {:?}", vep),
  516. /// Err(e) => eprintln!("Conversion failed: {}", e),
  517. /// }
  518. /// ```
  519. fn try_from(d: &VepLine) -> anyhow::Result<Self> {
  520. let or_opt = |s: &str| match s {
  521. "-" => None,
  522. _ => Some(s.to_string()),
  523. };
  524. let consequence = or_opt(&d.consequence).map(|c| {
  525. c.split(',')
  526. .filter_map(|e| e.parse::<VepConsequence>().ok())
  527. .collect::<Vec<VepConsequence>>()
  528. });
  529. Ok(VEP {
  530. gene: or_opt(&d.gene),
  531. feature: or_opt(&d.feature),
  532. feature_type: or_opt(&d.feature_type),
  533. consequence,
  534. cdna_position: or_opt(&d.feature_type),
  535. cds_position: or_opt(&d.cds_position),
  536. protein_position: or_opt(&d.protein_position),
  537. amino_acids: or_opt(&d.amino_acids),
  538. codons: or_opt(&d.codons),
  539. existing_variation: or_opt(&d.existing_variation),
  540. extra: d.extra.parse()?,
  541. })
  542. }
  543. }
  544. /// Represents additional information from Variant Effect Predictor (VEP) annotations.
  545. ///
  546. /// This struct encapsulates various optional fields that provide extra context
  547. /// about a variant's predicted effect.
  548. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
  549. pub struct VEPExtra {
  550. /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER)
  551. pub impact: Option<VepImpact>,
  552. /// The gene symbol associated with the variant
  553. pub symbol: Option<String>,
  554. /// The distance to the feature (e.g., for upstream/downstream variants)
  555. pub distance: Option<u32>,
  556. /// HGVS notation describing the variant at the coding sequence level
  557. pub hgvs_c: Option<String>,
  558. /// HGVS notation describing the variant at the protein level
  559. pub hgvs_p: Option<String>,
  560. }
  561. impl FromStr for VEPExtra {
  562. type Err = anyhow::Error;
  563. /// Parses a VEPExtra instance from a string representation.
  564. ///
  565. /// This method interprets a semicolon-separated key-value string typically
  566. /// found in VEP output and constructs a VEPExtra struct from it.
  567. ///
  568. /// # Arguments
  569. /// * `s` - A string slice containing the VEP extra information
  570. ///
  571. /// # Returns
  572. /// * `Ok(VEPExtra)` if parsing is successful
  573. /// * `Err(anyhow::Error)` if parsing fails
  574. ///
  575. /// # Example
  576. /// ```
  577. /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?");
  578. /// assert!(vep_extra.is_ok());
  579. /// ```
  580. ///
  581. /// # Note
  582. /// This method is flexible and will not fail if some fields are missing.
  583. /// It will simply set those fields to None in the resulting struct.
  584. fn from_str(s: &str) -> anyhow::Result<Self> {
  585. let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect();
  586. let impact = kv.get("IMPACT").map(|&v| v.parse()).transpose()?;
  587. let symbol = kv.get("SYMBOL").map(ToString::to_string);
  588. let distance = kv.get("DISTANCE").map(|&v| v.parse()).transpose()?;
  589. let hgvs_c = kv.get("HGVSc").map(ToString::to_string);
  590. let hgvs_p = kv.get("HGVSp").map(ToString::to_string);
  591. Ok(VEPExtra {
  592. impact,
  593. symbol,
  594. distance,
  595. hgvs_c,
  596. hgvs_p,
  597. })
  598. }
  599. }
  600. #[derive(Debug)]
  601. pub struct VepJob {
  602. in_vcf: PathBuf,
  603. out_vcf: PathBuf,
  604. config: Config,
  605. }
  606. impl VepJob {
  607. pub fn new(in_path: impl AsRef<Path>, out_path: impl AsRef<Path>, config: &Config) -> Self {
  608. VepJob {
  609. in_vcf: in_path.as_ref().to_path_buf(),
  610. out_vcf: out_path.as_ref().to_path_buf(),
  611. config: config.clone(),
  612. }
  613. }
  614. }
  615. impl JobCommand for VepJob {
  616. fn cmd(&self) -> String {
  617. let bind_flags = singularity_bind_flags([
  618. &self.in_vcf.to_string_lossy().to_string(),
  619. &self.out_vcf.to_string_lossy().to_string(),
  620. &self.config.vep_gff,
  621. &self.config.vep_cache_dir,
  622. &self.config.reference,
  623. ]);
  624. let plugins = vec!["SpliceRegion", "Downstream"]
  625. .into_iter()
  626. .map(|e| format!("--plugin {e}"))
  627. .collect::<Vec<String>>()
  628. .join(" ");
  629. format!(
  630. "{singularity_bin} exec \
  631. {binds} \
  632. {image} vep \
  633. --dir_cache {dir_cache} \
  634. --cache --offline \
  635. --fasta {fasta} \
  636. --gff {gff} \
  637. --symbol \
  638. {plugins} \
  639. --hgvs \
  640. -i {input} \
  641. -o {output}",
  642. singularity_bin = self.config.singularity_bin,
  643. binds = bind_flags,
  644. image = self.config.vep_image,
  645. dir_cache = self.config.vep_cache_dir,
  646. fasta = self.config.reference,
  647. gff = self.config.vep_gff,
  648. plugins = plugins,
  649. input = self.in_vcf.to_string_lossy(),
  650. output = self.out_vcf.to_string_lossy(),
  651. )
  652. }
  653. }
  654. impl LocalRunner for VepJob {}
  655. impl LocalBatchRunner for VepJob {}
  656. impl SbatchRunner for VepJob {
  657. fn slurm_params(&self) -> crate::commands::SlurmParams {
  658. crate::commands::SlurmParams {
  659. job_name: Some("VEP".into()),
  660. partition: Some("shortq".into()),
  661. cpus_per_task: Some(10),
  662. mem: Some("10G".into()),
  663. gres: None,
  664. }
  665. }
  666. }
  667. // impl Run for VepJob {
  668. // fn run(&mut self) -> anyhow::Result<()> {
  669. // let report = run!(&self.config, self).with_context(|| format!("Failed to filter PASS for {}", self.id))?;
  670. // report
  671. // .save_to_file(format!("{}/bcftools_pass_", self.config.log_dir))
  672. // .context("Can't save bcftools PASS logs")?;;
  673. // Ok(())
  674. // }
  675. // // add code here
  676. // }
  677. /// Selects the "best" VEP annotation from a list, based on biological impact and transcript priority.
  678. ///
  679. /// # Selection Strategy
  680. /// 1. **Group by minimal VEP impact** (e.g., `HIGH`, `MODERATE`, etc.).
  681. /// The lowest-impact group is selected (e.g., `HIGH` preferred over `MODIFIER`).
  682. ///
  683. /// 2. **If only one annotation in the best-impact group**, return it immediately.
  684. ///
  685. /// 3. **If multiple remain**, rank by:
  686. /// - Transcript type: prefer `"NM"` (manually curated) over others
  687. /// - Accession number (lower is better)
  688. ///
  689. /// # Arguments
  690. /// * `d` - A slice of `VEP` annotations to choose from.
  691. ///
  692. /// # Returns
  693. /// * `Ok(VEP)` - The best annotation according to the rules above
  694. /// * `Err(anyhow::Error)` - If the input is empty or contains no valid features
  695. ///
  696. /// # Errors
  697. /// * Returns an error if:
  698. /// - `d` is empty
  699. /// - None of the best-impact VEPs have a parsable `feature` field
  700. pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
  701. // Step 1: Group by minimal VEP impact
  702. let best_impact_veps = d
  703. .iter()
  704. .chunk_by(|vep| {
  705. vep.consequence.as_ref().map_or(VepImpact::MODIFIER, |c| {
  706. c.iter()
  707. .map(VepImpact::from)
  708. .min()
  709. .unwrap_or(VepImpact::MODIFIER)
  710. })
  711. })
  712. .into_iter()
  713. .min_by_key(|(impact, _)| impact.clone())
  714. .map(|(_, group)| group.cloned().collect::<Vec<_>>())
  715. .ok_or_else(|| anyhow::anyhow!("No element in VEP vector"))?;
  716. // Step 2: If only one, return it
  717. if best_impact_veps.len() == 1 {
  718. return Ok(best_impact_veps[0].clone());
  719. }
  720. // Step 3: Rank by NCBI accession priority
  721. let parsed_veps = best_impact_veps
  722. .iter()
  723. .enumerate()
  724. .filter_map(|(i, vep)| {
  725. vep.feature.as_ref().and_then(|feat| {
  726. feat.parse::<NCBIAcc>()
  727. .map(|acc| (i, acc))
  728. .map_err(|e| warn!("Can't parse NCBI accession '{}': {}", feat, e))
  729. .ok()
  730. })
  731. })
  732. .sorted_by_key(|(_, acc)| (Reverse(acc.prefix == "NM"), acc.number))
  733. .collect::<Vec<_>>();
  734. // Pick the top-ranked transcript
  735. parsed_veps
  736. .first()
  737. .map(|(k, _)| best_impact_veps[*k].clone())
  738. .ok_or_else(|| anyhow::anyhow!("No valid NCBI accession found"))
  739. }
  740. #[cfg(test)]
  741. mod tests {
  742. use super::*;
  743. use crate::{helpers::test_init, run};
  744. #[test]
  745. fn vep_run() -> anyhow::Result<()> {
  746. test_init();
  747. let config = Config::default();
  748. let mut vep_job = VepJob { in_vcf: "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/diag/ClairS/DUMCO_diag_clairs_PASSED.vcf.gz".into(), out_vcf: "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/diag/TEST_clairs_PASSED_VEP.vcf".into(), config: config.clone() };
  749. let r = run!(config, &mut vep_job)?;
  750. println!("{r:#?}");
  751. Ok(())
  752. }
  753. }