|
|
@@ -0,0 +1,367 @@
|
|
|
+use anyhow::{anyhow, Context, Ok, Result};
|
|
|
+use csv::ReaderBuilder;
|
|
|
+use hashbrown::HashMap;
|
|
|
+use log::warn;
|
|
|
+use serde::{Deserialize, Serialize};
|
|
|
+use std::io::Write;
|
|
|
+use std::{
|
|
|
+ env::temp_dir,
|
|
|
+ fs::{self, File},
|
|
|
+ io::{BufRead, BufReader},
|
|
|
+ process::{Command, Stdio},
|
|
|
+ str::FromStr,
|
|
|
+};
|
|
|
+
|
|
|
+use crate::io::vcf::vcf_header;
|
|
|
+
|
|
|
+use super::ncbi::NCBIAcc;
|
|
|
+
|
|
|
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
|
|
+pub struct VEPLine {
|
|
|
+ pub uploaded_variation: String,
|
|
|
+ pub location: String,
|
|
|
+ pub allele: String,
|
|
|
+ pub gene: String,
|
|
|
+ pub feature: String,
|
|
|
+ pub feature_type: String,
|
|
|
+ pub consequence: String,
|
|
|
+ pub cdna_position: String,
|
|
|
+ pub cds_position: String,
|
|
|
+ pub protein_position: String,
|
|
|
+ pub amino_acids: String,
|
|
|
+ pub codons: String,
|
|
|
+ pub existing_variation: String,
|
|
|
+ pub extra: String,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
|
|
+pub struct VEP {
|
|
|
+ pub gene: Option<String>,
|
|
|
+ pub feature: Option<String>,
|
|
|
+ pub feature_type: Option<String>,
|
|
|
+ pub consequence: Option<Vec<String>>,
|
|
|
+ pub cdna_position: Option<String>,
|
|
|
+ pub cds_position: Option<String>,
|
|
|
+ pub protein_position: Option<String>,
|
|
|
+ pub amino_acids: Option<String>,
|
|
|
+ pub codons: Option<String>,
|
|
|
+ pub existing_variation: Option<String>,
|
|
|
+ pub extra: VEPExtra,
|
|
|
+}
|
|
|
+
|
|
|
/// Consequence terms that VEP can assign to a variant.
///
/// Reference: ensembl.org/info/genome/variation/prediction/predicted_data.html
///
/// Variant names deliberately mirror the textual terms (first letter
/// upper-cased) rather than Rust's `UpperCamelCase`, so the lint is silenced
/// instead of renaming the public variants. The declaration order appears to
/// follow the table on the reference page; the enum does not derive `Ord`, so
/// that order carries no meaning in code.
#[allow(non_camel_case_types)] // names mirror the VEP consequence terms
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VepConsequence {
    Transcript_ablation,
    Splice_acceptor_variant,
    Splice_donor_variant,
    Stop_gained,
    Frameshift_variant,
    Stop_lost,
    Start_lost,
    Transcript_amplification,
    Inframe_insertion,
    Inframe_deletion,
    Missense_variant,
    Protein_altering_variant,
    Splice_region_variant,
    Incomplete_terminal_codon_variant,
    Start_retained_variant,
    Stop_retained_variant,
    Synonymous_variant,
    Coding_sequence_variant,
    Mature_miRNA_variant,
    Five_prime_UTR_variant,
    Three_prime_UTR_variant,
    Non_coding_transcript_exon_variant,
    Intron_variant,
    NMD_transcript_variant,
    Non_coding_transcript_variant,
    Upstream_gene_variant,
    Downstream_gene_variant,
    TFBS_ablation,
    TFBS_amplification,
    TF_binding_site_variant,
    Regulatory_region_ablation,
    Regulatory_region_amplification,
    Feature_elongation,
    Regulatory_region_variant,
    Feature_truncation,
    Intergenic_variant,
}
|
|
|
+
|
|
|
+impl VEP {
|
|
|
+ fn from_vep_line(d: &VEPLine) -> Result<VEP> {
|
|
|
+ let or_opt = |s: &str| match s {
|
|
|
+ "-" => None,
|
|
|
+ _ => Some(s.to_string()),
|
|
|
+ };
|
|
|
+
|
|
|
+ let consequence = or_opt(&d.consequence)
|
|
|
+ .map(|c| c.split(",").map(|e| e.to_string()).collect::<Vec<String>>());
|
|
|
+
|
|
|
+ Ok(VEP {
|
|
|
+ gene: or_opt(&d.gene),
|
|
|
+ feature: or_opt(&d.feature),
|
|
|
+ feature_type: or_opt(&d.feature_type),
|
|
|
+ consequence,
|
|
|
+ cdna_position: or_opt(&d.feature_type),
|
|
|
+ cds_position: or_opt(&d.cds_position),
|
|
|
+ protein_position: or_opt(&d.protein_position),
|
|
|
+ amino_acids: or_opt(&d.amino_acids),
|
|
|
+ codons: or_opt(&d.codons),
|
|
|
+ existing_variation: or_opt(&d.existing_variation),
|
|
|
+ extra: d.extra.parse()?,
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
|
|
+pub struct VEPExtra {
|
|
|
+ pub impact: Option<VEPImpact>,
|
|
|
+ pub symbol: Option<String>,
|
|
|
+ pub distance: Option<u32>,
|
|
|
+ pub hgvs_c: Option<String>,
|
|
|
+ pub hgvs_p: Option<String>,
|
|
|
+}
|
|
|
+impl FromStr for VEPExtra {
|
|
|
+ type Err = anyhow::Error;
|
|
|
+
|
|
|
+ fn from_str(s: &str) -> Result<Self> {
|
|
|
+ let err = |c| anyhow!("Error {} parsing VEP Extra field {}", c, s);
|
|
|
+
|
|
|
+ let elements = s.split(";").collect::<Vec<&str>>();
|
|
|
+
|
|
|
+ let mut kv = HashMap::new();
|
|
|
+
|
|
|
+ for e in elements.iter() {
|
|
|
+ let (k, v) = e.split_once("=").ok_or(err("in split '='"))?;
|
|
|
+ if kv.insert(k, v).is_some() {
|
|
|
+ return Err(err("kv insert"));
|
|
|
+ };
|
|
|
+ }
|
|
|
+
|
|
|
+ let impact: Option<VEPImpact> = if let Some(v) = kv.get("IMPACT") {
|
|
|
+ Some(v.parse()?)
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ };
|
|
|
+ let symbol: Option<String> = kv.get("SYMBOL").map(|v| v.to_string());
|
|
|
+ let distance: Option<u32> = if let Some(v) = kv.get("DISTANCE") {
|
|
|
+ Some(v.parse()?)
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ };
|
|
|
+ let hgvs_c: Option<String> = kv.get("HGVSc").map(|v| v.to_string());
|
|
|
+ let hgvs_p: Option<String> = kv.get("HGVSp").map(|v| v.to_string());
|
|
|
+
|
|
|
+ Ok(VEPExtra {
|
|
|
+ impact,
|
|
|
+ symbol,
|
|
|
+ distance,
|
|
|
+ hgvs_c,
|
|
|
+ hgvs_p,
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
|
|
+pub enum VEPImpact {
|
|
|
+ Low,
|
|
|
+ Moderate,
|
|
|
+ High,
|
|
|
+ Modifier,
|
|
|
+}
|
|
|
+
|
|
|
+impl FromStr for VEPImpact {
|
|
|
+ type Err = anyhow::Error;
|
|
|
+
|
|
|
+ fn from_str(s: &str) -> Result<Self> {
|
|
|
+ match s {
|
|
|
+ "LOW" => Ok(VEPImpact::Low),
|
|
|
+ "MODERATE" => Ok(VEPImpact::Moderate),
|
|
|
+ "HIGH" => Ok(VEPImpact::High),
|
|
|
+ "MODIFIER" => Ok(VEPImpact::Modifier),
|
|
|
+ _ => Err(anyhow!("Unexpected VEP Impact value")),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+// pub fn vep_chunk(data: &mut [Variant]) -> Result<()> {
|
|
|
+// let in_vcf = format!(
|
|
|
+// "{}/vep_{}.vcf",
|
|
|
+// temp_dir().to_str().unwrap(),
|
|
|
+// uuid::Uuid::new_v4()
|
|
|
+// );
|
|
|
+// let out_vep = format!(
|
|
|
+// "{}/vep_{}.txt",
|
|
|
+// temp_dir().to_str().unwrap(),
|
|
|
+// uuid::Uuid::new_v4()
|
|
|
+// );
|
|
|
+//
|
|
|
+// let mut vcf = File::create(&in_vcf).unwrap();
|
|
|
+// let vcf_header = vcf_header("/data/ref/hs1/chm13v2.0.dict")?;
|
|
|
+//
|
|
|
+// writeln!(vcf, "{}", vcf_header.join("\n")).unwrap();
|
|
|
+//
|
|
|
+// for (i, row) in data.iter().enumerate() {
|
|
|
+// writeln!(
|
|
|
+// vcf,
|
|
|
+// "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
|
|
|
+// row.contig,
|
|
|
+// row.position,
|
|
|
+// i + 1,
|
|
|
+// row.reference,
|
|
|
+// row.alternative
|
|
|
+// )?;
|
|
|
+// }
|
|
|
+//
|
|
|
+// if let Err(err) = run_vep(&in_vcf, &out_vep) {
|
|
|
+// panic!("{err}");
|
|
|
+// };
|
|
|
+//
|
|
|
+// // read the results in txt file, parse and add to HashMap
|
|
|
+// let mut reader_vep = ReaderBuilder::new()
|
|
|
+// .delimiter(b'\t')
|
|
|
+// .has_headers(false)
|
|
|
+// .comment(Some(b'#'))
|
|
|
+// .flexible(true)
|
|
|
+// .from_reader(fs::File::open(out_vep.clone())?);
|
|
|
+//
|
|
|
+// let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
|
|
|
+// for line in reader_vep.deserialize::<VEPLine>() {
|
|
|
+// if let std::result::Result::Ok(line) = line {
|
|
|
+// if let std::result::Result::Ok(k) = line.uploaded_variation.parse::<u64>() {
|
|
|
+// lines
|
|
|
+// .raw_entry_mut()
|
|
|
+// .from_key(&k)
|
|
|
+// .or_insert_with(|| (k, vec![]))
|
|
|
+// .1
|
|
|
+// .push(line);
|
|
|
+// } else {
|
|
|
+// return Err(anyhow!("Error while parsing: {:?}", line));
|
|
|
+// }
|
|
|
+// } else {
|
|
|
+// return Err(anyhow!("Error while parsing: {:?}", line));
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// // remove input and result file
|
|
|
+// fs::remove_file(in_vcf)?;
|
|
|
+// fs::remove_file(out_vep)?;
|
|
|
+//
|
|
|
+// let mut n_not_vep = 0;
|
|
|
+// data.iter_mut().enumerate().for_each(|(i, entry)| {
|
|
|
+// let k = (i + 1) as u64;
|
|
|
+//
|
|
|
+// match lines.get(&k) {
|
|
|
+// Some(vep_lines) => {
|
|
|
+// let vep: Vec<VEP> = vep_lines
|
|
|
+// .iter()
|
|
|
+// .map(|e| match VEP::from_vep_line(e) {
|
|
|
+// std::result::Result::Ok(r) => r,
|
|
|
+// Err(err) => panic!("Error while parsing: {} line: {:?}", err, e),
|
|
|
+// })
|
|
|
+// .collect();
|
|
|
+// entry.annotations.push(AnnotationType::VEP(vep.to_vec()));
|
|
|
+// }
|
|
|
+// None => {
|
|
|
+// n_not_vep += 1;
|
|
|
+// }
|
|
|
+// };
|
|
|
+// });
|
|
|
+//
|
|
|
+// if n_not_vep > 0 {
|
|
|
+// warn!("{} variants not annotated by VEP", n_not_vep);
|
|
|
+// }
|
|
|
+//
|
|
|
+// Ok(())
|
|
|
+// }
|
|
|
+//
|
|
|
+// VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
|
|
|
+fn run_vep(in_path: &str, out_path: &str) -> Result<()> {
|
|
|
+ let bin_dir = "/data/tools/ensembl-vep";
|
|
|
+ let dir_cache = "/data/ref/hs1/vepcache/";
|
|
|
+ let fasta = "/data/ref/hs1/chm13v2.0.fa";
|
|
|
+ let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz";
|
|
|
+ // let gff = "/data/ref/hs1/ncbi_dataset/data/GCF_009914755.1/genomic_chr_sorted.gff.gz";
|
|
|
+
|
|
|
+ // info!("Running VEP for {}", in_path);
|
|
|
+ let mut cmd = Command::new(format!("{}/vep", bin_dir))
|
|
|
+ .arg("--dir_cache")
|
|
|
+ .arg(dir_cache)
|
|
|
+ .arg("--cache")
|
|
|
+ .arg("--offline")
|
|
|
+ .arg("--fasta")
|
|
|
+ .arg(fasta)
|
|
|
+ .arg("--gff")
|
|
|
+ .arg(gff)
|
|
|
+ .arg("--symbol")
|
|
|
+ .arg("--plugin")
|
|
|
+ .arg("SpliceRegion")
|
|
|
+ .arg("--plugin")
|
|
|
+ .arg("Downstream")
|
|
|
+ .arg("--hgvs")
|
|
|
+ .arg("-i")
|
|
|
+ .arg(in_path)
|
|
|
+ .arg("-o")
|
|
|
+ .arg(out_path)
|
|
|
+ .stderr(Stdio::piped())
|
|
|
+ // .stderr(Stdio::null())
|
|
|
+ .spawn()
|
|
|
+ .expect("VEP failed to start");
|
|
|
+ // .stderr
|
|
|
+ // .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "Could not capture standard output.")).unwrap();
|
|
|
+
|
|
|
+ let stderr = cmd.stderr.take().unwrap();
|
|
|
+ let reader = BufReader::new(stderr);
|
|
|
+ reader
|
|
|
+ .lines()
|
|
|
+ .map_while(Result::ok)
|
|
|
+ // .inspect(|y| println!("{y}"))
|
|
|
+ .filter(|line| line.contains("error"))
|
|
|
+ .for_each(|line| warn!("{}", line));
|
|
|
+
|
|
|
+ cmd.wait()?;
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+pub fn get_best_vep(d: &[VEP]) -> Result<VEP> {
|
|
|
+ d.into_iter().filter(|v| v.)
|
|
|
+
|
|
|
+ if d.is_empty() {
|
|
|
+ return Err(anyhow!("No element in VEP vector"));
|
|
|
+ }
|
|
|
+ if d.len() == 1 {
|
|
|
+ return Ok(d.first().unwrap().clone());
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut parsed: Vec<(usize, NCBIAcc)> = Vec::new();
|
|
|
+ for (i, vep) in d.iter().enumerate() {
|
|
|
+ if let Some(feat) = &vep.feature {
|
|
|
+ if let std::result::Result::Ok(f) = feat
|
|
|
+ .parse::<NCBIAcc>()
|
|
|
+ .context("Error parsing NCBI accession")
|
|
|
+ {
|
|
|
+ parsed.push((i, f));
|
|
|
+ } else {
|
|
|
+ warn!("Can't parse {}", feat);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ parsed.sort_by(|(_, a), (_, b)| a.number.cmp(&b.number));
|
|
|
+
|
|
|
+ let nm: Vec<(usize, NCBIAcc)> = parsed
|
|
|
+ .clone()
|
|
|
+ .into_iter()
|
|
|
+ .filter(|(_, e)| e.prefix == *"NM")
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ if !nm.is_empty() {
|
|
|
+ let (k, _) = nm.first().unwrap();
|
|
|
+ return Ok(d.get(*k).unwrap().clone());
|
|
|
+ } else {
|
|
|
+ let (k, _) = parsed.first().unwrap();
|
|
|
+ return Ok(d.get(*k).unwrap().clone());
|
|
|
+ }
|
|
|
+}
|