|
|
@@ -6,7 +6,7 @@ use std::{
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use csv::ReaderBuilder;
|
|
|
-use log::{info, warn};
|
|
|
+use log::{debug, info, warn};
|
|
|
use rayon::prelude::*;
|
|
|
use uuid::Uuid;
|
|
|
|
|
|
@@ -42,8 +42,7 @@ impl VariantCollection {
|
|
|
}
|
|
|
|
|
|
/// Filters `self.variants` in place, keeping only the variants whose
/// `hash()` is contained in `keys_to_keep`; every other variant is removed.
// NOTE(review): the `-` and `+` lines below are the old and new formatting of the same statement.
pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
|
|
|
- self.variants
|
|
|
- .retain(|v| keys_to_keep.contains(&v.hash()));
|
|
|
+ self.variants.retain(|v| keys_to_keep.contains(&v.hash()));
|
|
|
}
|
|
|
|
|
|
pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
|
|
|
@@ -233,7 +232,7 @@ impl ExternalAnnotation {
|
|
|
let mut unfound = Vec::new();
|
|
|
|
|
|
for variant in variants {
|
|
|
- let hash = variant.hash();
|
|
|
+ let hash = variant.hash();
|
|
|
let mut has_pushed = false;
|
|
|
|
|
|
// Check COSMIC
|
|
|
@@ -305,8 +304,14 @@ impl ExternalAnnotation {
|
|
|
|
|
|
let header = vcf_header("/data/ref/hs1/chm13v2.0.dict")?.join("\n");
|
|
|
|
|
|
+ let min_chunk_size = 1000;
|
|
|
+ let max_chunks = 150;
|
|
|
+
|
|
|
+ let optimal_chunk_size = unfound.len().div_ceil(max_chunks as usize);
|
|
|
+ let optimal_chunk_size = optimal_chunk_size.max(min_chunk_size);
|
|
|
+
|
|
|
let results = unfound
|
|
|
- .par_chunks(unfound.len() / 33)
|
|
|
+ .par_chunks(optimal_chunk_size)
|
|
|
.flat_map(|chunk| -> anyhow::Result<Vec<_>> {
|
|
|
let in_tmp = temp_dir.join(format!("echtvar_in_{}.vcf", Uuid::new_v4()));
|
|
|
let out_tmp = temp_dir.join(format!("echtvar_out_{}.vcf.gz", Uuid::new_v4()));
|
|
|
@@ -450,72 +455,181 @@ impl ExternalAnnotation {
|
|
|
|
|
|
let header = vcf_header("/data/ref/hs1/chm13v2.0.dict")?.join("\n");
|
|
|
|
|
|
- let results = unfound
|
|
|
- .par_chunks(unfound.len() / 33)
|
|
|
- .flat_map(|chunk| -> anyhow::Result<Vec<_>> {
|
|
|
- let in_tmp = temp_file_path("vcf")?.to_str().unwrap().to_string();
|
|
|
- let out_vep = temp_file_path("_vep.txt")?.to_str().unwrap().to_string();
|
|
|
+ let (sv, unfound): (Vec<VcfVariant>, Vec<VcfVariant>) =
|
|
|
+ unfound.into_iter().partition(|v| v.has_svtype());
|
|
|
|
|
|
- // Write input VCF
|
|
|
- let mut vcf = File::create(&in_tmp)?;
|
|
|
- writeln!(vcf, "{}", header)?;
|
|
|
- for (i, row) in chunk.iter().enumerate() {
|
|
|
- writeln!(
|
|
|
- vcf,
|
|
|
- "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
|
|
|
- row.position.contig(),
|
|
|
- row.position.position + 1, // vcf
|
|
|
- i + 1,
|
|
|
- row.reference,
|
|
|
- row.alternative
|
|
|
- )?;
|
|
|
- }
|
|
|
+ warn!("SV {}", sv.len());
|
|
|
|
|
|
- run_vep(&in_tmp, &out_vep)?;
|
|
|
+ let min_chunk_size = 1000;
|
|
|
+ let max_chunks = 150;
|
|
|
+
|
|
|
+ let mut results = if !unfound.is_empty() {
|
|
|
+ let optimal_chunk_size = unfound.len().div_ceil(max_chunks as usize);
|
|
|
+ let optimal_chunk_size = optimal_chunk_size.max(min_chunk_size);
|
|
|
+
|
|
|
+ unfound
|
|
|
+ .par_chunks(optimal_chunk_size)
|
|
|
+ .flat_map(|chunk| -> anyhow::Result<Vec<_>> {
|
|
|
+ let in_tmp = temp_file_path("vcf")?.to_str().unwrap().to_string();
|
|
|
+ let out_vep = temp_file_path("_vep.txt")?.to_str().unwrap().to_string();
|
|
|
+ let out_summary = format!("{out_vep}_summary.html");
|
|
|
+ let out_warnings = format!("{out_vep}_warnings.txt");
|
|
|
+
|
|
|
+ // Write input VCF
|
|
|
+ let mut vcf = File::create(&in_tmp)?;
|
|
|
+ writeln!(vcf, "{}", header)?;
|
|
|
+ for (i, row) in chunk.iter().enumerate() {
|
|
|
+ writeln!(
|
|
|
+ vcf,
|
|
|
+ "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
|
|
|
+ row.position.contig(),
|
|
|
+ row.position.position + 1, // vcf
|
|
|
+ i + 1,
|
|
|
+ row.reference,
|
|
|
+ row.alternative
|
|
|
+ )?;
|
|
|
+ }
|
|
|
|
|
|
- let mut reader_vep = ReaderBuilder::new()
|
|
|
- .delimiter(b'\t')
|
|
|
- .has_headers(false)
|
|
|
- .comment(Some(b'#'))
|
|
|
- .flexible(true)
|
|
|
- .from_reader(fs::File::open(out_vep.clone())?);
|
|
|
+ run_vep(&in_tmp, &out_vep)?;
|
|
|
|
|
|
- let mut lines: HashMap<u64, Vec<VepLine>> = HashMap::new();
|
|
|
- for line in reader_vep.deserialize() {
|
|
|
- let line: VepLine = line.context("Failed to deserialize VepLine")?;
|
|
|
- let key = line
|
|
|
- .uploaded_variation
|
|
|
- .parse::<u64>()
|
|
|
- .context("Failed to parse uploaded_variation as u64")?;
|
|
|
+ let mut reader_vep = ReaderBuilder::new()
|
|
|
+ .delimiter(b'\t')
|
|
|
+ .has_headers(false)
|
|
|
+ .comment(Some(b'#'))
|
|
|
+ .flexible(true)
|
|
|
+ .from_reader(fs::File::open(out_vep.clone())?);
|
|
|
|
|
|
- lines.entry(key).or_default().push(line);
|
|
|
- }
|
|
|
+ let mut lines: HashMap<u64, Vec<VepLine>> = HashMap::new();
|
|
|
+ for line in reader_vep.deserialize() {
|
|
|
+ let line: VepLine = line.context("Failed to deserialize VepLine")?;
|
|
|
+ let key = line
|
|
|
+ .uploaded_variation
|
|
|
+ .parse::<u64>()
|
|
|
+ .context("Failed to parse uploaded_variation as u64")?;
|
|
|
|
|
|
- fs::remove_file(in_tmp)?;
|
|
|
- fs::remove_file(out_vep)?;
|
|
|
+ lines.entry(key).or_default().push(line);
|
|
|
+ }
|
|
|
+
|
|
|
+ fs::remove_file(in_tmp)?;
|
|
|
+ fs::remove_file(out_vep)?;
|
|
|
|
|
|
- let mut n_not_vep = 0;
|
|
|
- let mut chunk_results: Vec<(Hash128, Vec<VEP>)> = Vec::new();
|
|
|
+ let mut n_not_vep = 0;
|
|
|
+ let mut chunk_results: Vec<(Hash128, Vec<VEP>)> = Vec::new();
|
|
|
|
|
|
- chunk.iter().enumerate().for_each(|(i, entry)| {
|
|
|
- let k = (i + 1) as u64;
|
|
|
+ chunk.iter().enumerate().for_each(|(i, entry)| {
|
|
|
+ let k = (i + 1) as u64;
|
|
|
|
|
|
- if let Some(vep_lines) = lines.get(&k) {
|
|
|
- if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
|
|
|
- chunk_results.push((entry.hash(), veps));
|
|
|
+ if let Some(vep_lines) = lines.get(&k) {
|
|
|
+ if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
|
|
|
+ chunk_results.push((entry.hash(), veps));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ warn!(
|
|
|
+ "No VEP entry for {}:{}>{}",
|
|
|
+ entry.position.to_string(),
|
|
|
+ entry.reference.to_string(),
|
|
|
+ entry.alternative.to_string()
|
|
|
+ );
|
|
|
+ n_not_vep += 1;
|
|
|
}
|
|
|
- } else {
|
|
|
- n_not_vep += 1;
|
|
|
+ });
|
|
|
+
|
|
|
+ if n_not_vep > 0 {
|
|
|
+ debug!("{n_not_vep} variants not annotated by VEP.");
|
|
|
+ let warnings = fs::read_to_string(&out_warnings)
|
|
|
+ .context(format!("Can't read VEP warnings: {out_warnings}"))?;
|
|
|
+ warn!("VEP warnings:\n{warnings}");
|
|
|
+ }
|
|
|
+ fs::remove_file(out_warnings)?;
|
|
|
+ fs::remove_file(out_summary)?;
|
|
|
+ Ok(chunk_results)
|
|
|
+ })
|
|
|
+ .flatten()
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ } else {
|
|
|
+ Vec::new()
|
|
|
+ };
|
|
|
+
|
|
|
+ if !sv.is_empty() {
|
|
|
+ let optimal_chunk_size = sv.len().div_ceil(max_chunks as usize);
|
|
|
+ let optimal_chunk_size = optimal_chunk_size.max(min_chunk_size);
|
|
|
+
|
|
|
+ let results_sv = sv
|
|
|
+ .par_chunks(optimal_chunk_size)
|
|
|
+ .flat_map(|chunk| -> anyhow::Result<Vec<_>> {
|
|
|
+ let in_tmp = temp_file_path(".vcf")?.to_str().unwrap().to_string();
|
|
|
+ let out_vep = temp_file_path("_vep.txt")?.to_str().unwrap().to_string();
|
|
|
+ let out_summary = format!("{out_vep}_summary.html");
|
|
|
+ let out_warnings = format!("{out_vep}_warnings.txt");
|
|
|
+
|
|
|
+ // Write input VCF
|
|
|
+ let mut vcf = File::create(&in_tmp)?;
|
|
|
+ writeln!(vcf, "{}", header)?;
|
|
|
+ for (i, mut row) in chunk.iter().cloned().enumerate() {
|
|
|
+ row.id = (i + 1).to_string();
|
|
|
+ let s = row.into_vcf_row();
|
|
|
+ writeln!(vcf, "{s}",)?;
|
|
|
}
|
|
|
- });
|
|
|
|
|
|
- if n_not_vep > 0 {
|
|
|
- warn!("{n_not_vep} variants not annotated by VEP");
|
|
|
- }
|
|
|
- Ok(chunk_results)
|
|
|
- })
|
|
|
- .flatten()
|
|
|
- .collect::<Vec<_>>();
|
|
|
+ run_vep(&in_tmp, &out_vep)?;
|
|
|
+
|
|
|
+ let mut reader_vep = ReaderBuilder::new()
|
|
|
+ .delimiter(b'\t')
|
|
|
+ .has_headers(false)
|
|
|
+ .comment(Some(b'#'))
|
|
|
+ .flexible(true)
|
|
|
+ .from_reader(fs::File::open(out_vep.clone())?);
|
|
|
+
|
|
|
+ let mut lines: HashMap<u64, Vec<VepLine>> = HashMap::new();
|
|
|
+ for line in reader_vep.deserialize() {
|
|
|
+ let line: VepLine = line.context("Failed to deserialize VepLine")?;
|
|
|
+ let key = line
|
|
|
+ .uploaded_variation
|
|
|
+ .parse::<u64>()
|
|
|
+ .context("Failed to parse uploaded_variation as u64")?;
|
|
|
+
|
|
|
+ lines.entry(key).or_default().push(line);
|
|
|
+ }
|
|
|
+
|
|
|
+ fs::remove_file(in_tmp)?;
|
|
|
+ fs::remove_file(out_vep)?;
|
|
|
+
|
|
|
+ let mut n_not_vep = 0;
|
|
|
+ let mut chunk_results: Vec<(Hash128, Vec<VEP>)> = Vec::new();
|
|
|
+
|
|
|
+ chunk.iter().enumerate().for_each(|(i, entry)| {
|
|
|
+ let k = (i + 1) as u64;
|
|
|
+
|
|
|
+ if let Some(vep_lines) = lines.get(&k) {
|
|
|
+ if let Ok(veps) = vep_lines.iter().map(VEP::try_from).collect() {
|
|
|
+ chunk_results.push((entry.hash(), veps));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ warn!(
|
|
|
+ "No VEP entry for {}\t{}\t{}",
|
|
|
+ entry.position.to_string(),
|
|
|
+ entry.reference.to_string(),
|
|
|
+ entry.alternative.to_string()
|
|
|
+ );
|
|
|
+ n_not_vep += 1;
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ if n_not_vep > 0 {
|
|
|
+ debug!("{n_not_vep} variants not annotated by VEP.");
|
|
|
+ let warnings = fs::read_to_string(&out_warnings)
|
|
|
+ .context(format!("Can't read VEP warnings: {out_warnings}"))?;
|
|
|
+ warn!("VEP warnings:\n{warnings}");
|
|
|
+ }
|
|
|
+ fs::remove_file(out_warnings)?;
|
|
|
+ fs::remove_file(out_summary)?;
|
|
|
+ Ok(chunk_results)
|
|
|
+ })
|
|
|
+ .flatten()
|
|
|
+ .collect::<Vec<_>>();
|
|
|
+
|
|
|
+ results.extend(results_sv);
|
|
|
+ }
|
|
|
|
|
|
for (hash, veps) in results {
|
|
|
// self.update_database(hash, "vep", &serde_json::to_vec(&veps)?)?;
|