|
|
@@ -1,16 +1,18 @@
|
|
|
use std::{
|
|
|
collections::{HashMap, HashSet},
|
|
|
fs::{self, File},
|
|
|
- io::Write,
|
|
|
+ io::{Read, Write},
|
|
|
};
|
|
|
|
|
|
use anyhow::Context;
|
|
|
+use bgzip::{BGZFReader, BGZFWriter};
|
|
|
use csv::ReaderBuilder;
|
|
|
use log::{debug, info, warn};
|
|
|
use rayon::prelude::*;
|
|
|
+use serde::{Deserialize, Serialize};
|
|
|
use uuid::Uuid;
|
|
|
|
|
|
-use super::variant::{AlterationCategory, VcfVariant};
|
|
|
+use super::variant::{AlterationCategory, ReferenceAlternative, VcfVariant};
|
|
|
use crate::{
|
|
|
annotation::{
|
|
|
cosmic::Cosmic,
|
|
|
@@ -26,6 +28,7 @@ use crate::{
|
|
|
helpers::{app_storage_dir, estimate_shannon_entropy, temp_file_path, Hash128},
|
|
|
io::{readers::get_reader, vcf::vcf_header},
|
|
|
pipes::somatic::sequence_at,
|
|
|
+ positions::GenomePosition,
|
|
|
};
|
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
@@ -184,8 +187,126 @@ impl VariantCollection {
|
|
|
self.variants.len()
|
|
|
);
|
|
|
}
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Serialize, Deserialize)]
|
|
|
+pub struct Variant {
|
|
|
+ pub hash: Hash128,
|
|
|
+ pub position: GenomePosition,
|
|
|
+ pub reference: ReferenceAlternative,
|
|
|
+ pub alternative: ReferenceAlternative,
|
|
|
+ pub vcf_variants: Vec<VcfVariant>,
|
|
|
+ pub annotations: Vec<Annotation>,
|
|
|
+}
|
|
|
+
|
|
|
+impl PartialEq for Variant {
|
|
|
+ fn eq(&self, other: &Self) -> bool {
|
|
|
+ self.position == other.position
|
|
|
+ && self.reference == other.reference
|
|
|
+ && self.alternative == other.alternative
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Default, Serialize, Deserialize)]
|
|
|
+pub struct Variants {
|
|
|
+ pub data: Vec<Variant>,
|
|
|
+}
|
|
|
+
|
|
|
+impl Variants {
|
|
|
+ pub fn sort(&mut self) {
|
|
|
+ self.data
|
|
|
+ .sort_unstable_by(|a, b| a.position.cmp(&b.position));
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn merge(&mut self, others: VariantCollection, annotations: &Annotations) {
|
|
|
+ let mut result = Vec::new();
|
|
|
+ let mut n_merged = 0;
|
|
|
+
|
|
|
+ let mut self_iter = self.data.drain(..).peekable(); // Iterator for self.data
|
|
|
+ let mut others_iter = others.variants.into_iter().peekable(); // Iterator for others.variants
|
|
|
+
|
|
|
+ // Merge using two-pointer technique
|
|
|
+ while let (Some(self_variant), Some(other_variant)) = (self_iter.peek(), others_iter.peek())
|
|
|
+ {
|
|
|
+ match self_variant.position.cmp(&other_variant.position) {
|
|
|
+ std::cmp::Ordering::Less => {
|
|
|
+ result.push(self_iter.next().unwrap());
|
|
|
+ }
|
|
|
+ std::cmp::Ordering::Greater => {
|
|
|
+ result.push(create_variant(
|
|
|
+ vec![others_iter.next().unwrap()],
|
|
|
+ annotations,
|
|
|
+ ));
|
|
|
+ }
|
|
|
+ std::cmp::Ordering::Equal => {
|
|
|
+ match (
|
|
|
+ self_variant.reference == other_variant.reference,
|
|
|
+ self_variant.alternative == other_variant.alternative,
|
|
|
+ ) {
|
|
|
+ (true, true) => {
|
|
|
+ let mut merged_variant = self_iter.next().unwrap();
|
|
|
+
|
|
|
+ merged_variant
|
|
|
+ .vcf_variants
|
|
|
+ .push(others_iter.next().unwrap());
|
|
|
+ n_merged += 1;
|
|
|
+ result.push(merged_variant);
|
|
|
+ }
|
|
|
+ _ => {
|
|
|
+ result.push(self_iter.next().unwrap());
|
|
|
+ result.push(create_variant(
|
|
|
+ vec![others_iter.next().unwrap()],
|
|
|
+ annotations,
|
|
|
+ ));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Drain remaining elements from iterators
|
|
|
+ result.extend(self_iter);
|
|
|
+ result.extend(others_iter.map(|v| create_variant(vec![v], annotations)));
|
|
|
+
|
|
|
+ info!("n merged: {}", n_merged);
|
|
|
+ self.data = result;
|
|
|
+ }
|
|
|
|
|
|
- pub fn external_annotation() {}
|
|
|
+ pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
|
|
|
+ let json = serde_json::to_string(self)?;
|
|
|
+ let file = File::create(filename)?;
|
|
|
+ let mut writer = BGZFWriter::new(file, bgzip::Compression::default());
|
|
|
+ writer.write_all(json.as_bytes())?;
|
|
|
+ writer.close()?;
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
|
|
|
+ let file = File::open(filename)?;
|
|
|
+ let mut reader = BGZFReader::new(file)?;
|
|
|
+ let mut json = String::new();
|
|
|
+ reader.read_to_string(&mut json)?;
|
|
|
+ let variants: Variants = serde_json::from_str(&json)?;
|
|
|
+ Ok(variants)
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+fn create_variant(vcf_variants: Vec<VcfVariant>, annotations: &Annotations) -> Variant {
|
|
|
+ let first = &vcf_variants[0];
|
|
|
+ let annotations = annotations
|
|
|
+ .store
|
|
|
+ .get(&first.hash)
|
|
|
+ .map(|v| v.value().to_vec())
|
|
|
+ .unwrap_or_default();
|
|
|
+ Variant {
|
|
|
+ hash: first.hash,
|
|
|
+ position: first.position.clone(),
|
|
|
+ reference: first.reference.clone(),
|
|
|
+ alternative: first.alternative.clone(),
|
|
|
+ vcf_variants,
|
|
|
+ annotations,
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
pub enum ExtAnnotationSource {
|