Thomas 1 rok temu
rodzic
commit
ba6a45e9ce
4 zmienionych plików z 1954 dodań i 212 usunięć
  1. 1764 0
      :
  2. 1 0
      src/annotations/mod.rs
  3. 9 0
      src/annotations/phase.rs
  4. 180 212
      src/variants.rs

+ 1764 - 0
:

@@ -0,0 +1,1764 @@
+use crate::{
+    annotations::{
+        cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, ncbi_gff::NCBIGFF, pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin}, vep::{get_best_vep, vep_chunk, VEP}
+    },
+    callers::{
+        clairs::{ClairSFormat, ClairSInfo},
+        deepvariant::{DeepVariantFormat, DeepVariantInfo},
+        nanomonsv::{NanomonsvFormat, NanomonsvInfo},
+        sniffles::{SnifflesFormat, SnifflesInfo},
+    },
+    config::{self, Config},
+    in_out::{
+        self,
+        dict_reader::read_dict,
+        get_reader,
+        vcf_reader::{read_vcf, VCFRow},
+        vcf_writer::{vcf_header_from, VariantWritter},
+    },
+    sql::{stats_sql::insert_stats, variants_sql::insert_variants},
+    utils::{
+        chi_square_test_for_proportions, count_repetitions, estimate_shannon_entropy,
+        get_hts_nt_pileup, new_pg, new_pg_speed, print_stat_cat,
+    },
+};
+use anyhow::{anyhow, Context, Ok, Result};
+use csv::ReaderBuilder;
+use dashmap::DashMap;
+use hashbrown::HashMap;
+use indicatif::{MultiProgress, ParallelProgressIterator};
+use log::{info, warn};
+use noodles_core::{region::Region, Position};
+use noodles_fasta::indexed_reader::Builder as FastaBuilder;
+use noodles_gff as gff;
+
+use rayon::prelude::*;
+use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
+use std::{fs, io::Write};
+use std::{
+    env::temp_dir,
+    fmt,
+    fs::File,
+    str::FromStr,
+    sync::{
+        atomic::{AtomicI32, Ordering},
+        Arc,
+    },
+};
+use utoipa::{openapi::schema, ToSchema};
+
+// chr12:25116542|G>T KRAS
+#[derive(Debug, Clone)]
+pub struct Variants {
+    pub name: String,
+    pub data: Vec<Variant>,
+    pub constit: DashMap<String, Variant>,
+    pub stats_vcf: StatsVCF,
+    pub stats_bam: StatsBAM,
+    pub cfg: Config,
+    pub mp: MultiProgress,
+}
+
+impl Serialize for Variants {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        // 3 is the number of fields in the struct.
+        let mut state = serializer.serialize_struct("Variants", 5)?;
+        state.serialize_field("name", &self.name)?;
+        state.serialize_field("data", &self.data)?;
+        state.serialize_field("constit", &self.constit)?;
+        state.serialize_field("stats_vcf", &self.stats_vcf)?;
+        state.serialize_field("stats_bam", &self.stats_bam)?;
+        state.end()
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct StatsVCF {
+    n_tumoral_init: usize,
+    n_constit_init: usize,
+    n_constit: i32,
+    n_loh: i32,
+    n_low_mrd_depth: i32,
+}
+
+impl fmt::Display for StatsVCF {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let k = 100.0 / self.n_tumoral_init as f64;
+        let string = format!(
+            "VCF filters found {} ({:.1}%) constit, {} ({:.1}%) LOH, {} ({:.1}%) Low depth for constit variants",
+            self.n_constit, self.n_constit as f64 * k,
+            self.n_loh, self.n_loh as f64 * k,
+            self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k
+        );
+        write!(f, "{}", string)
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct StatsBAM {
+    n_lasting: i32,
+    n_constit: i32,
+    n_low_mrd_depth: i32,
+    n_low_diversity: i32,
+    n_somatic: i32,
+}
+
+impl fmt::Display for StatsBAM {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let k = 100.0 / self.n_lasting as f64;
+        let string = format!(
+            "BAM filters found {} ({:.1}%) constit, {} ({:.1}%) low depth for constit variants, {} ({:.1}%) low diversity of sequence at the variant position, {} ({:.1}%) somatic variants",
+            self.n_constit, self.n_constit as f64 * k,
+            self.n_low_mrd_depth, self.n_low_mrd_depth as f64 * k,
+            self.n_low_diversity, self.n_low_diversity as f64 * k,
+            self.n_somatic, self.n_somatic as f64 * k
+        );
+        write!(f, "{}", string)
+    }
+}
+
+impl Variants {
+    pub fn from_vec(name: String, mp: &MultiProgress, data: Vec<Variant>) -> Self {
+        Self {
+            name,
+            data,
+            constit: DashMap::new(),
+            stats_vcf: StatsVCF::default(),
+            stats_bam: StatsBAM::default(),
+            cfg: Config::get().unwrap(),
+            mp: mp.clone(),
+        }
+    }
+
+    pub fn from_vcfs(
+        name: String,
+        v: Vec<(&str, &VCFSource, &VariantType)>,
+        cfg: &Config,
+        mp: MultiProgress,
+    ) -> Result<Self> {
+        let pg = mp.add(new_pg(v.len() as u64));
+        pg.set_message("Reading VCF");
+
+        let constit: Arc<DashMap<String, Variant>> = Arc::new(DashMap::new());
+        let n_constit = AtomicI32::new(0);
+        let data: Vec<Variant> = v
+            .par_iter()
+            // .progress_count(v.len() as u64)
+            .flat_map(|(path, source, variant_type)| {
+                let r = match variant_type {
+                    VariantType::Somatic => read_vcf(path, source, variant_type).unwrap(),
+                    VariantType::Constitutionnal => {
+                        read_vcf(path, source, variant_type)
+                            .unwrap()
+                            .par_iter()
+                            .for_each(|e| {
+                                n_constit.fetch_add(1, Ordering::SeqCst);
+                                constit.insert(
+                                    format!(
+                                        "{}:{}|{}>{}",
+                                        e.contig, e.position, e.reference, e.alternative
+                                    ),
+                                    e.clone(),
+                                );
+                            });
+                        vec![]
+                    }
+                };
+                pg.inc(1);
+                r
+            })
+            .collect();
+
+        let stats_vcf = StatsVCF::default();
+        let stats_bam = StatsBAM::default();
+
+        let constit = Arc::try_unwrap(constit).unwrap();
+        let elapsed = pg.elapsed();
+        pg.finish();
+        info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
+        let cfg = cfg.clone();
+
+        return Ok(Self {
+            name,
+            data,
+            constit,
+            stats_vcf,
+            stats_bam,
+            cfg,
+            mp: mp.clone(),
+        });
+    }
+
+    pub fn vcf_filters(&mut self) {
+        let cfg = &self.cfg;
+        let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
+        pg.set_message("VCF filtering");
+
+        let n_tumoral_init = self.len();
+        let n_constit_init = self.constit_len();
+        let min_loh_diff = cfg.deepvariant_loh_pval as f64;
+        let min_mrd_depth = cfg.min_mrd_depth;
+
+        info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
+        let n_constit = AtomicI32::new(0);
+        let n_loh = AtomicI32::new(0);
+        let n_low_mrd_depth = AtomicI32::new(0);
+        self.data = self
+            .data
+            .par_iter()
+            .map(|e| {
+                let mut tumoral = e.clone();
+                let k = format!(
+                    "{}:{}|{}>{}",
+                    tumoral.contig, tumoral.position, tumoral.reference, tumoral.alternative
+                );
+
+                if let Some(mut constit) = self.constit.get_mut(&k) {
+                    if constit.get_depth() < min_mrd_depth {
+                        n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
+                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                            VariantCategory::LowMRDDepth,
+                        ));
+                    } else if constit.get_n_alt() == constit.get_depth()
+                        && tumoral.get_n_alt() == tumoral.get_depth()
+                    {
+                        n_constit.fetch_add(1, Ordering::SeqCst);
+                        tumoral
+                            .annotations
+                            .push(AnnotationType::VariantCategory(VariantCategory::Constit));
+                    } else {
+                        let pval = chi_square_test_for_proportions(
+                            tumoral.get_n_alt() as f64,
+                            tumoral.get_depth() as f64,
+                            constit.get_n_alt() as f64,
+                            constit.get_depth() as f64,
+                        )
+                        .unwrap();
+                        if pval != 0.0 && pval <= min_loh_diff {
+                            n_loh.fetch_add(1, Ordering::SeqCst);
+                            tumoral
+                                .annotations
+                                .push(AnnotationType::VariantCategory(VariantCategory::LOH));
+                        } else {
+                            n_constit.fetch_add(1, Ordering::SeqCst);
+                            tumoral
+                                .annotations
+                                .push(AnnotationType::VariantCategory(VariantCategory::Constit));
+                        }
+                    }
+                // If not un Constit registry, ClairS look for VCF constit depth and n_alt
+                } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
+                {
+                    if format.ndp < min_mrd_depth {
+                        n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
+                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                            VariantCategory::LowMRDDepth,
+                        ));
+                    } else if let ReferenceAlternative::Nucleotide(alt_base) = &tumoral.alternative
+                    {
+                        let mrd_n_alt = match alt_base {
+                            Base::A => format.nau,
+                            Base::T => format.ntu,
+                            Base::C => format.ncu,
+                            Base::G => format.ngu,
+                            _ => 0,
+                        };
+                        if mrd_n_alt != 0 {
+                            n_constit.fetch_add(1, Ordering::SeqCst);
+                            tumoral
+                                .annotations
+                                .push(AnnotationType::VariantCategory(VariantCategory::Constit));
+                        }
+                    }
+                }
+                pg.inc(1);
+                tumoral
+            })
+            .collect();
+
+        let n_constit = n_constit.load(Ordering::SeqCst);
+        let n_loh = n_loh.load(Ordering::SeqCst);
+        let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
+
+        self.stats_vcf = StatsVCF {
+            n_tumoral_init,
+            n_constit_init,
+            n_constit,
+            n_loh,
+            n_low_mrd_depth,
+        };
+        // let elapsed = start.elapsed();
+        let elapsed = pg.elapsed();
+        pg.finish();
+        info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
+    }
+
+    /// Filter variants by reading informations from constist BAM.
+    pub fn bam_filters(&mut self, mrd_bam: &str) {
+        let cfg = &self.cfg;
+        // let start = Instant::now();
+        let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
+        pg.set_message("BAM filtering");
+
+        let min_mrd_depth = cfg.min_mrd_depth;
+        info!("Filtering Constitutionnal (Alt base found in BAM pileup), LowDiversity (sequence +/- 20nt around variant with entropy < {}), LowMRDDepth (BAM pileup depth < {}) variants by BAM pileup fetching of {} likely somatic variants", cfg.min_diversity, min_mrd_depth, self.stats_vcf.n_tumoral_init - (self.stats_vcf.n_constit + self.stats_vcf.n_loh + self.stats_vcf.n_low_mrd_depth) as usize);
+
+        let n_already = AtomicI32::new(0);
+        let n_constit = AtomicI32::new(0);
+        let n_low_mrd_depth = AtomicI32::new(0);
+        let n_low_diversity = AtomicI32::new(0);
+        let n_somatic = AtomicI32::new(0);
+        self.data.par_chunks_mut(10_000).for_each(|chunk| {
+            let mut bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam)
+                .context(anyhow!("Reading {}", mrd_bam))
+                .unwrap();
+            let mut genome_reader = FastaBuilder::default()
+                .build_from_path(&cfg.reference_fa)
+                .unwrap();
+
+            for tumoral in chunk.iter_mut() {
+                pg.inc(1);
+
+                if tumoral.annotations.len() > 0 {
+                    n_already.fetch_add(1, Ordering::SeqCst);
+                    continue;
+                }
+                let (pos, is_ins) = match tumoral.alt_cat() {
+                    AlterationCategory::INS => (tumoral.position, true),
+                    AlterationCategory::DEL => (tumoral.position, false),
+                    _ => (tumoral.position, false),
+                };
+                match get_hts_nt_pileup(
+                    &mut bam,
+                    &tumoral.contig,
+                    pos as i32,
+                    is_ins, // tumoral.position as i32,
+                ) {
+                    std::result::Result::Ok(bases) => {
+                        let depth = bases.len() as u32;
+
+                        if depth < min_mrd_depth {
+                            n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
+                            tumoral.annotations.push(AnnotationType::VariantCategory(
+                                VariantCategory::LowMRDDepth,
+                            ));
+                        } else {
+                            // Check local diversity
+                            let start =
+                                Position::try_from((tumoral.position - 20) as usize).unwrap();
+                            let end = Position::try_from((tumoral.position + 19) as usize).unwrap();
+                            let r = Region::new(tumoral.contig.to_string(), start..=end);
+                            if let std::result::Result::Ok(reg) = genome_reader.query(&r) {
+                                let s = reg.sequence();
+                                let u = s.as_ref();
+                                let s = String::from_utf8(u.to_vec()).unwrap();
+                                let ent = estimate_shannon_entropy(&s.to_lowercase());
+
+                                if ent < cfg.min_diversity {
+                                    n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::LowDiversity,
+                                    ));
+                                    continue;
+                                }
+
+                                // Check triplets or doublets if DeepVariant
+                                let callers = tumoral.callers();
+
+                                if callers.len() == 1 {
+                                    if callers[0] == "DeepVariant".to_string() {
+                                        let seq_left = &s[0..20];
+                                        let seq_right = &s[21..s.len() - 1];
+
+                                        // Triplet right
+                                        if count_repetitions(seq_right, 3) >= 3 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(
+                                                AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                ),
+                                            );
+                                            continue;
+                                        }
+
+                                        // Doublet right
+                                        if count_repetitions(seq_right, 2) >= 4 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(
+                                                AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                ),
+                                            );
+                                            continue;
+                                        }
+
+                                        // Triplet left
+                                        if count_repetitions(seq_left, 3) >= 3 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(
+                                                AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                ),
+                                            );
+                                            continue;
+                                        }
+
+                                        // Doublet left
+                                        if count_repetitions(seq_left, 2) >= 4 {
+                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                            tumoral.annotations.push(
+                                                AnnotationType::VariantCategory(
+                                                    VariantCategory::LowDiversity,
+                                                ),
+                                            );
+                                            continue;
+                                        }
+                                    }
+                                }
+                            }
+
+                            // Check if the base is in constitutionnal pileup
+                            if let ReferenceAlternative::Nucleotide(alt_b) = &tumoral.alternative {
+                                let alt_b = alt_b.clone().into_u8();
+                                let n_alt_mrd = bases
+                                    .clone()
+                                    .into_iter()
+                                    .filter(|e| *e == alt_b)
+                                    .collect::<Vec<_>>()
+                                    .len();
+                                if n_alt_mrd > 0 {
+                                    n_constit.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Constit,
+                                    ));
+                                } else {
+                                    n_somatic.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Somatic,
+                                    ));
+                                }
+                            } else if tumoral.is_ins() {
+                                let n_alt_mrd =
+                                    bases.clone().into_iter().filter(|e| *e == b'I').count();
+                                if n_alt_mrd > 0 {
+                                    n_constit.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Constit,
+                                    ));
+                                } else {
+                                    n_somatic.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Somatic,
+                                    ));
+                                }
+                            } else if tumoral.alt_cat() == AlterationCategory::DEL {
+                                let n_alt_mrd =
+                                    bases.clone().into_iter().filter(|e| *e == b'D').count();
+                                if n_alt_mrd > 0 {
+                                    n_constit.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Constit,
+                                    ));
+                                } else {
+                                    n_somatic.fetch_add(1, Ordering::SeqCst);
+                                    tumoral.annotations.push(AnnotationType::VariantCategory(
+                                        VariantCategory::Somatic,
+                                    ));
+                                }
+                            }
+                        }
+                    }
+                    Err(r) => panic!("{}", r),
+                }
+            }
+        });
+        let n_constit = n_constit.load(Ordering::SeqCst);
+        let n_low_mrd_depth = n_low_mrd_depth.load(Ordering::SeqCst);
+        let n_low_diversity = n_low_diversity.load(Ordering::SeqCst);
+        let n_somatic = n_somatic.load(Ordering::SeqCst);
+        let n_lasting = self.data.len() as i32 - n_already.load(Ordering::SeqCst);
+        self.stats_bam = StatsBAM {
+            n_lasting,
+            n_constit,
+            n_low_mrd_depth,
+            n_low_diversity,
+            n_somatic,
+        };
+        let elapsed = pg.elapsed();
+        pg.finish();
+        info!("{}. Executed in {}s", self.stats_vcf, elapsed.as_secs());
+    }
+
+    pub fn get_cat(&mut self, cat: &VariantCategory) -> Vec<Variant> {
+        let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
+        pg.set_message(format!("Get cat {:?}", cat));
+        self.data
+            .par_iter()
+            .progress_with(pg)
+            .flat_map(|e| {
+                if e.annotations
+                    .iter()
+                    .filter(|e| match e {
+                        AnnotationType::VariantCategory(vc) => vc == cat,
+                        _ => false,
+                    })
+                    .count()
+                    > 0
+                {
+                    vec![e.clone()]
+                } else {
+                    vec![]
+                }
+            })
+            .collect::<Vec<Variant>>()
+    }
+
+    pub fn write_vcf_cat(&mut self, path: &str, cat: &VariantCategory) -> Result<()> {
+        info!("Writing VCF {}", path);
+
+        let mut to_write = sort_variants(self.get_cat(cat), &self.cfg.dict_file)?;
+        let pg = self.mp.add(new_pg_speed(to_write.len() as u64));
+        pg.set_message("Writing VCF");
+
+        let mut w = VariantWritter::new(path, &self.cfg.dict_file)?;
+        for row in to_write.iter_mut() {
+            w.write_variant(row)?;
+            pg.inc(1);
+        }
+        w.write_index_finish()?;
+        Ok(())
+    }
+
+    /// Keep variants annotated Somatic
+    pub fn keep_somatics_un(&mut self) {
+        let pg = self.mp.add(new_pg_speed(self.data.len() as u64));
+        pg.set_message("Filtering Variants");
+
+        self.data = self
+            .data
+            .par_iter_mut()
+            .progress_with(pg)
+            .flat_map(|e| {
+                // keep unannotated and somatic
+                if e.annotations
+                    .iter()
+                    .filter(|a| match a {
+                        AnnotationType::VariantCategory(vc) => match vc {
+                            VariantCategory::Somatic => false,
+                            _ => true,
+                        },
+                        _ => false,
+                    })
+                    .count()
+                    == 0
+                {
+                    vec![e]
+                } else {
+                    vec![]
+                }
+            })
+            .map(|e| e.clone())
+            .collect();
+    }
+
+    /// Annotate with VEP
+    pub fn vep(&mut self) {
+        let pg = self.mp.add(new_pg_speed(self.len() as u64));
+        pg.set_message("VEP");
+        self.data
+            .par_chunks_mut(self.cfg.vep_chunk_size)
+            .progress_with(pg)
+            .for_each(|chunks| vep_chunk(chunks).unwrap());
+    }
+
+    /// sort_variants TODO
+    pub fn sort(&mut self) -> Result<()> {
+        let cfg = &self.cfg;
+        self.data = sort_variants(self.data.clone(), &cfg.dict_file)?;
+        Ok(())
+    }
+
+    ///
+    pub fn merge(&mut self) {
+        let pg = self.mp.add(new_pg_speed(self.len() as u64));
+        pg.set_message("Merging Variants by contig, positions, ref, alt");
+        let hm: DashMap<String, Variant> = DashMap::new();
+        self.data.par_iter().progress_with(pg).for_each(|e| {
+            let k = format!(
+                "{}:{}|{}>{}",
+                e.contig, e.position, e.reference, e.alternative
+            );
+
+            if let Some(mut v) = hm.get_mut(&k) {
+                let v = v.value_mut();
+                e.callers_data.iter().for_each(|cd| {
+                    v.callers_data.push(cd.clone());
+                    v.callers_data.dedup();
+                });
+                v.source.extend(e.source.clone());
+                v.source.dedup();
+            } else {
+                hm.insert(k, e.clone());
+            }
+        });
+        self.data = hm.iter().map(|e| e.value().clone()).collect();
+    }
+
+    pub fn annotate_gff_feature(&mut self, gff_path: &str) -> Result<()> {
+        let gff_path = gff_path.to_string();
+        let len = self.data.len();
+        let pg = self.mp.add(new_pg_speed(self.len() as u64));
+        pg.set_message("GFF Annotate");
+
+        self.data
+            .par_chunks_mut(len / 33)
+            .progress_with(pg)
+            .for_each(|chunk| {
+                let mut reader = File::open(gff_path.to_string())
+                    .map(noodles_bgzf::Reader::new)
+                    .map(gff::Reader::new)
+                    .unwrap();
+
+                let index = noodles_csi::read(format!("{}.csi", gff_path)).unwrap();
+
+                for v in chunk.iter_mut() {
+                    let start = Position::try_from(v.position as usize).unwrap();
+                    let r = Region::new(v.contig.to_string(), start..=start);
+                    if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
+                        for row in rows {
+                            let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
+                            v.annotations.push(AnnotationType::NCBIGFF(ncbi));
+                        }
+                    }
+                }
+            });
+        Ok(())
+    }
+
+    pub fn echtvar_annotate(&mut self, header_path: &str) -> Result<()> {
+        let len = self.len();
+        let header = vcf_header_from(header_path)?;
+        let pg = self.mp.add(new_pg_speed(len as u64));
+        pg.set_message("Echtvar Annotate");
+
+        self.data
+            .par_chunks_mut(len / 33)
+            .progress_with(pg)
+            .for_each(|chunk| {
+                let in_tmp = format!(
+                    "{}/echtvar_in_{}.vcf",
+                    temp_dir().to_str().unwrap(),
+                    uuid::Uuid::new_v4()
+                );
+
+                let out_tmp = format!(
+                    "{}/echtvar_in_{}.vcf.gz",
+                    temp_dir().to_str().unwrap(),
+                    uuid::Uuid::new_v4()
+                );
+                let mut vcf = File::create(&in_tmp).unwrap();
+
+                let _ = writeln!(vcf, "{}", header);
+
+                for (i, row) in chunk.iter().enumerate() {
+                    let _ = writeln!(
+                        vcf,
+                        "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
+                        row.contig,
+                        row.position,
+                        i + 1,
+                        row.reference,
+                        row.alternative,
+                        ".",
+                        ".",
+                        "."
+                    );
+                }
+
+                run_echtvar(&in_tmp, &out_tmp).unwrap();
+
+                let mut reader = ReaderBuilder::new()
+                    .delimiter(b'\t')
+                    .has_headers(false)
+                    .comment(Some(b'#'))
+                    .flexible(true)
+                    .from_reader(get_reader(&out_tmp).unwrap());
+
+                // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
+                let mut last: usize = 1;
+                for line in reader.deserialize::<VCFRow>() {
+                    if let std::result::Result::Ok(row) = line {
+                        let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
+                        let id: usize = row.id.parse().unwrap();
+                        if id != last {
+                            panic!("Echtvar output not in input order!");
+                        }
+                        if let Some(c) = cosmic {
+                            chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
+                        }
+                        if let Some(g) = gnomad {
+                            chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
+                        }
+                        last += 1;
+                    }
+                }
+            });
+        Ok(())
+    }
+
+    pub fn category_iter(&self, category: &VariantCategory) -> Vec<&Variant> {
+        self.data
+            .par_iter()
+            .filter(|v| {
+                for annotation in v.annotations.iter() {
+                    match annotation {
+                        AnnotationType::VariantCategory(cat) => {
+                            if cat == category {
+                                return true;
+                            }
+                        }
+                        _ => (),
+                    }
+                }
+                return false;
+            })
+            .collect::<Vec<&Variant>>()
+    }
+
+    /// Filter based on GnomAD if gnomad_af < max_gnomad_af
+    pub fn filter_snp(&mut self) -> Result<i32> {
+        let n_snp = AtomicI32::new(0);
+        self.data = self
+            .data
+            .clone()
+            .into_par_iter()
+            .filter(|e| {
+                let mut res = true;
+                e.annotations.iter().for_each(|a| {
+                    match a {
+                        AnnotationType::GnomAD(g) => {
+                            res = g.gnomad_af < self.cfg.max_gnomad_af;
+                        }
+                        _ => (),
+                    };
+                });
+                if !res {
+                    n_snp.fetch_add(1, Ordering::SeqCst);
+                }
+                res
+            })
+            .collect();
+        let n = n_snp.load(Ordering::SeqCst);
+        Ok(n)
+    }
+
+    pub fn pangolin(&mut self) -> Result<()> {
+        let tmp_file = pangolin_save_variants(&self)?;
+        let res_file = run_pangolin(&tmp_file)?;
+        
+        fs::remove_file(tmp_file)?;
+        let res = pangolin_parse_results(&res_file)?;
+        let mut res = res.iter();
+        fs::remove_file(res_file)?;
+        info!("Adding pangolin results for {} variants.", res.len());
+
+        let mut n_added = 0;
+        if let Some(r) = res.next() {
+            let mut curr = r.clone();
+            for variant in self.data.iter_mut() {
+                if variant.contig == curr.0 && variant.position == curr.1 {
+                    if variant.reference == curr.2 {
+                        if variant.alternative == curr.3  {
+                            variant.annotations.push(AnnotationType::Pangolin(curr.4));
+                            n_added += 1;
+                            if let Some(r) = res.next() {
+                                curr = r.clone();
+                            } else {
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        assert_eq!(res.len(), 0);
+
+        Ok(())
+    }
+
+    pub fn len(&self) -> usize {
+        self.data.len()
+    }
+
+    pub fn constit_len(&self) -> usize {
+        self.constit.len()
+    }
+
+    pub fn get_variant(&self, contig: &str, pos: u32) -> Vec<Variant> {
+        self.data
+            .par_iter()
+            .filter(|v| v.contig == contig && v.position == pos)
+            .map(|v| v.clone())
+            .collect()
+    }
+
+    pub fn stats(&self) -> Result<Vec<Stat>> {
+        let mut callers_cat = HashMap::new();
+        let mut n_caller_data = 0;
+
+        let mut variants_cat = HashMap::new();
+        let mut n_variants_wcat = 0;
+
+        let mut ncbi_feature = HashMap::new();
+        let mut n_ncbi_feature = 0;
+
+        let mut cosmic_sup_1 = HashMap::new();
+        let mut n_cosmic_sup_1 = 0;
+
+        let mut cons_cat = HashMap::new();
+        let mut n_csq = 0;
+
+        let add_hm = |hm: &mut HashMap<String, u32>, k: &str| {
+            let (_, v) = hm.raw_entry_mut().from_key(k).or_insert(k.to_string(), 1);
+            *v += 1;
+        };
+
+        for ele in self.data.iter() {
+            // Callers
+            let mut callers = Vec::new();
+            for cd in &ele.callers_data {
+                callers.push(
+                    match cd.format {
+                        Format::DeepVariant(_) => "DeepVariant",
+                        Format::ClairS(_) => "ClairS",
+                        Format::Sniffles(_) => "Sniffles",
+                        Format::Nanomonsv(_) => "Nanomonsv",
+                    }
+                    .to_string(),
+                );
+            }
+
+            if !callers.is_empty() {
+                n_caller_data += 1;
+                callers.sort();
+                let k = callers.join(",");
+
+                let (_, v) = callers_cat
+                    .raw_entry_mut()
+                    .from_key(&k)
+                    .or_insert(k.clone(), 1);
+                *v += 1;
+            }
+
+            // Var cat
+
+            // Annotations
+            for annot in ele.annotations.iter() {
+                let mut features = Vec::new();
+                let mut variant_cat = Vec::new();
+                let mut cosmic_m1 = false;
+
+                match annot {
+                    AnnotationType::NCBIGFF(ncbi) => {
+                        features.push(ncbi.feature.to_string());
+                    }
+                    AnnotationType::Cosmic(c) => {
+                        if c.cosmic_cnt > 1 {
+                            cosmic_m1 = true;
+                        }
+                    }
+                    AnnotationType::VariantCategory(vc) => {
+                        let s = serde_json::to_string(vc)?;
+                        variant_cat.push(s);
+                    }
+                    _ => (),
+                };
+
+                if !features.is_empty() {
+                    features.sort();
+                    add_hm(&mut ncbi_feature, &features.join(","));
+                    n_ncbi_feature += 1;
+                }
+
+                if !variant_cat.is_empty() {
+                    add_hm(&mut variants_cat, &variant_cat.join(","));
+                    n_variants_wcat += 1;
+                }
+
+                if cosmic_m1 {
+                    add_hm(&mut cosmic_sup_1, "Cosmic > 1");
+                    n_cosmic_sup_1 += 1;
+                }
+            }
+
+            // VEP
+            let d: Vec<VEP> = ele
+                .annotations
+                .iter()
+                .flat_map(|e| {
+                    if let AnnotationType::VEP(e) = e {
+                        e.clone()
+                    } else {
+                        vec![]
+                    }
+                })
+                .collect();
+            if let std::result::Result::Ok(vep) = get_best_vep(&d) {
+                if let Some(csq) = vep.consequence {
+                    n_csq += 1;
+                    let csq = csq.join(",");
+                    let (_, v) = cons_cat
+                        .raw_entry_mut()
+                        .from_key(&csq)
+                        .or_insert(csq.clone(), 1);
+                    *v += 1;
+                }
+            }
+        }
+
+        print_stat_cat(&cons_cat, n_csq as u32);
+        print_stat_cat(&ncbi_feature, n_ncbi_feature as u32);
+        print_stat_cat(&cosmic_sup_1, n_cosmic_sup_1 as u32);
+        print_stat_cat(&callers_cat, n_caller_data as u32);
+
+        // let file = File::create(path)?;
+        // let mut writer = BufWriter::new(file);
+        let mut results = Vec::new();
+        results.push(Stat::new(
+            "consequences".to_string(),
+            cons_cat,
+            n_csq as u32,
+        ));
+        results.push(Stat::new(
+            "variants_cat".to_string(),
+            variants_cat,
+            n_variants_wcat as u32,
+        ));
+        results.push(Stat::new(
+            "ncbi_feature".to_string(),
+            ncbi_feature,
+            n_ncbi_feature as u32,
+        ));
+        results.push(Stat::new(
+            "callers_cat".to_string(),
+            callers_cat,
+            n_caller_data as u32,
+        ));
+
+        // let res = serde_json::to_string(&results)?;
+
+        Ok(results)
+    }
+
+    pub fn save_sql(&self, path: &str) -> Result<()> {
+        insert_variants(self, path)
+    }
+
+    pub fn stats_sql(&self, path: &str) -> Result<()> {
+        insert_stats(
+            "VCF".to_string(),
+            serde_json::to_string(&self.stats_vcf)?,
+            path,
+        )?;
+        insert_stats(
+            "BAM".to_string(),
+            serde_json::to_string(&self.stats_bam)?,
+            path,
+        )?;
+        Ok(())
+    }
+
+    pub fn save_bytes(&self, path: &str) -> Result<()> {
+        let serialized = pot::to_vec(&self.data)?;
+        let mut w = noodles_bgzf::writer::Builder::default().build_with_writer(File::create(path)?);
+        w.write_all(&serialized)?;
+        Ok(())
+    }
+
+    pub fn new_from_bytes(name: &str, path: &str, mp: MultiProgress) -> Result<Self> {
+        info!("Loading variants from: {path}");
+        let r = in_out::get_reader_progress(path, &mp)?;
+
+        let data: Vec<Variant> = pot::from_reader(r)?;
+        Ok(Self {
+            name: name.to_string(),
+            data,
+            constit: DashMap::new(),
+            stats_vcf: StatsVCF::default(),
+            stats_bam: StatsBAM::default(),
+            cfg: Config::get()?,
+            mp,
+        })
+    }
+
+    pub fn filter_category(&self, and_categories: &Vec<Category>) -> Vec<&Variant> {
+        self.data
+            .par_iter()
+            .flat_map(|v| {
+                if v.is_from_category(and_categories) {
+                    vec![v]
+                } else {
+                    vec![]
+                }
+            })
+            .collect()
+    }
+}
+
+#[derive(Debug, Clone, Serialize, ToSchema)]
+pub struct Stat {
+    name: String,
+    counts: HashMap<String, u32>,
+    n_with_annotation: u32,
+}
+
+impl Stat {
+    pub fn new(name: String, counts: HashMap<String, u32>, n_with_annotation: u32) -> Self {
+        Stat {
+            counts,
+            n_with_annotation,
+            name,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub struct Variant {
+    pub contig: String,
+    pub position: u32,
+    pub reference: ReferenceAlternative,
+    pub alternative: ReferenceAlternative,
+    pub callers_data: Vec<CallerData>,
+    pub n_alt: Option<u32>,
+    pub n_ref: Option<u32>,
+    pub vaf: Option<f32>,
+    pub depth: Option<u32>,
+    pub variant_type: VariantType,
+    pub source: Vec<VCFSource>,
+    pub annotations: Vec<AnnotationType>,
+}
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, ToSchema)]
+pub struct CallerData {
+    pub qual: Option<f32>,
+    pub format: Format,
+    pub info: Info,
+}
+
+impl CallerData {
+    pub fn get_vaf(&self) -> f64 {
+        match &self.format {
+            Format::DeepVariant(v) => v.vaf as f64,
+            Format::ClairS(v) => v.af,
+            Format::Sniffles(v) => v.dv as f64 / (v.dv as f64 + v.dr as f64),
+            Format::Nanomonsv(v) => v.vr as f64 / v.tr as f64,
+        }
+    }
+    pub fn get_depth(&mut self) -> u32 {
+        match &self.format {
+            Format::DeepVariant(v) => v.dp,
+            Format::ClairS(v) => v.dp,
+            Format::Sniffles(v) => v.dv + v.dr,
+            Format::Nanomonsv(v) => v.tr,
+        }
+    }
+    pub fn get_n_alt(&mut self) -> u32 {
+        match &self.format {
+            Format::DeepVariant(v) => v.ad.get(1).unwrap().to_owned(),
+            Format::ClairS(v) => v.ad.get(1).unwrap().to_owned(),
+            Format::Sniffles(v) => v.dv,
+            Format::Nanomonsv(v) => v.tr - v.vr,
+        }
+    }
+
+    /// Variants filter rules
+    pub fn should_filter(&self) -> bool {
+        if let Info::Sniffles(info) = &self.info {
+            let imprecise = info
+                .tags
+                .iter()
+                .filter(|s| s.to_string() == "IMPRECISE".to_string())
+                .count();
+            let mut n_alt = 0;
+            if let Format::Sniffles(f) = &self.format {
+                n_alt = f.dv;
+            }
+            if imprecise == 0 && n_alt >= 3 {
+                return false;
+            } else {
+                return true;
+            }
+        } else {
+            return false;
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Eq, PartialEq, Deserialize, ToSchema)]
+pub enum VariantType {
+    Somatic,
+    Constitutionnal,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum VCFSource {
+    DeepVariant,
+    ClairS,
+    Sniffles,
+    Nanomonsv,
+}
+
+impl FromStr for VCFSource {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        match s {
+            "DeepVariant" => Ok(VCFSource::DeepVariant),
+            "ClairS" => Ok(VCFSource::ClairS),
+            "Sniffles" => Ok(VCFSource::Sniffles),
+            "Nanomonsv" => Ok(VCFSource::Nanomonsv),
+            _ => Err(anyhow!("Error parsing VCFSource")),
+        }
+    }
+}
+
+impl ToString for VCFSource {
+    fn to_string(&self) -> String {
+        let s = match self {
+            VCFSource::DeepVariant => "DeepVariant",
+            VCFSource::ClairS => "ClairS",
+            VCFSource::Sniffles => "Sniffles",
+            VCFSource::Nanomonsv => "Nanomonsv",
+        };
+        s.to_string()
+    }
+}
+
+impl Variant {
+    pub fn from_vcfrow(row: &VCFRow, source: VCFSource, variant_type: VariantType) -> Result<Self> {
+        let callers_data = vec![CallerData {
+            qual: row.qual.parse::<f32>().ok(),
+            info: parse_info(&row.info, &source).context(anyhow!(
+                "Can't parse {:?} info for {}",
+                source,
+                row.info
+            ))?,
+            format: parse_format(&source, &row.value).context(anyhow!(
+                "Can't parse {:?} format for {}",
+                source,
+                row.value
+            ))?,
+        }];
+
+        Ok(Variant {
+            contig: row.chr.to_string(),
+            position: row.pos,
+            reference: row
+                .reference
+                .parse()
+                .context(anyhow!("Error while parsing {}", row.reference))?,
+            alternative: row
+                .alt
+                .parse()
+                .context(anyhow!("Error while parsing {}", row.alt))?,
+            n_ref: None,
+            n_alt: None,
+            vaf: None,
+            depth: None,
+            callers_data,
+            source: vec![source],
+            variant_type,
+            annotations: Vec::new(),
+        })
+    }
+
+    pub fn get_depth(&mut self) -> u32 {
+        if let Some(depth) = self.depth {
+            return depth;
+        } else {
+            let depth = self
+                .callers_data
+                .iter_mut()
+                .map(|v| v.get_depth())
+                .max()
+                .unwrap();
+            self.depth = Some(depth);
+            return depth;
+        }
+    }
+
+    pub fn get_n_alt(&mut self) -> u32 {
+        if let Some(n_alt) = self.n_alt {
+            return n_alt;
+        } else {
+            let n_alt = self
+                .callers_data
+                .iter_mut()
+                .map(|v| v.get_n_alt())
+                .max()
+                .unwrap();
+            self.n_alt = Some(n_alt);
+            return n_alt;
+        }
+    }
+
+    pub fn vaf(&mut self) -> f32 {
+        let n_alt = self.get_n_alt() as f32;
+        let depth = self.get_depth() as f32;
+        self.vaf = Some(n_alt / depth);
+        self.vaf.unwrap()
+    }
+
+    fn is_ins(&self) -> bool {
+        match (&self.reference, &self.alternative) {
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
+            _ => false,
+        }
+    }
+
+    fn alt_cat(&self) -> AlterationCategory {
+        match (&self.reference, &self.alternative) {
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::SNV
+            }
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::INS
+            }
+            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::DEL
+            }
+            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
+                let a = a.len();
+                let b = b.len();
+                if a < b {
+                    AlterationCategory::INS
+                } else if a > b {
+                    AlterationCategory::DEL
+                } else {
+                    AlterationCategory::REP
+                }
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotide(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::Other
+            }
+            (ReferenceAlternative::Unstructured(_), ReferenceAlternative::Unstructured(_)) => {
+                AlterationCategory::Other
+            }
+        }
+    }
+
+    pub fn to_min_string(&mut self) -> String {
+        let depth = self.get_depth();
+        let n_alt = self.get_n_alt();
+
+        format!(
+            "DP:AD\t{}:{}",
+            depth,
+            vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
+        )
+    }
+
+    pub fn get_veps(&self) -> Vec<VEP> {
+        self.annotations
+            .iter()
+            .flat_map(|e| {
+                if let AnnotationType::VEP(e) = e {
+                    e.clone()
+                } else {
+                    vec![]
+                }
+            })
+            .collect()
+    }
+    pub fn get_best_vep(&self) -> Result<VEP> {
+        get_best_vep(&self.get_veps())
+    }
+
+    pub fn is_from_category(&self, and_categories: &Vec<Category>) -> bool {
+        let mut vec_bools = Vec::new();
+        for category in and_categories.iter() {
+            match category {
+                Category::VariantCategory(vc) => {
+                    for annotations in self.annotations.iter() {
+                        match annotations {
+                            AnnotationType::VariantCategory(vvc) => {
+                                if vc == vvc {
+                                    vec_bools.push(true);
+                                    break;
+                                }
+                            }
+                            _ => (),
+                        }
+                    }
+                }
+                Category::PositionRange { contig, from, to } => {
+                    if self.contig == *contig {
+                        match (from, to) {
+                            (None, None) => vec_bools.push(true),
+                            (None, Some(to)) => vec_bools.push(self.position <= *to),
+                            (Some(from), None) => vec_bools.push(self.position >= *from),
+                            (Some(from), Some(to)) => {
+                                vec_bools.push(self.position >= *from && self.position <= *to)
+                            }
+                        }
+                    } else {
+                        vec_bools.push(false);
+                    }
+                }
+                Category::VCFSource(_) => (),
+                Category::NCosmic(n) => {
+                    let mut bools = Vec::new();
+                    for annotations in self.annotations.iter() {
+                        match annotations {
+                            AnnotationType::Cosmic(c) => {
+                                bools.push(c.cosmic_cnt >= *n);
+                                break;
+                            }
+                            _ => (),
+                        }
+                    }
+                    vec_bools.push(bools.iter().any(|&b| b));
+                }
+                Category::NCBIFeature(ncbi_feature) => {
+                    let mut bools = Vec::new();
+                    for annotations in self.annotations.iter() {
+                        match annotations {
+                            AnnotationType::NCBIGFF(v) => {
+                                bools.push(v.feature == *ncbi_feature);
+                            }
+                            _ => (),
+                        }
+                    }
+                    vec_bools.push(bools.iter().any(|&b| b));
+                }
+                Category::VAF { min, max } => {
+                    let v = if self.vaf.is_none() {
+                        let mut s = self.clone();
+                        s.vaf()
+                    } else {
+                        self.vaf.unwrap()
+                    };
+                    vec_bools.push(v >= *min && v <= *max);
+                }
+                Category::Pangolin => {
+                    vec_bools.push(self.annotations.iter().filter(|a| match a {
+                        AnnotationType::Pangolin(_) => true,
+                        _ => false 
+                    }).count() > 0);
+                }
+            }
+        }
+        vec_bools.iter().all(|&x| x)
+    }
+
+    pub fn callers(&self) -> Vec<String> {
+        self.source
+            .iter()
+            .map(|source| source.to_string())
+            .collect()
+    }
+}
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+enum AlterationCategory {
+    SNV,
+    INS,
+    DEL,
+    REP,
+    Other,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum AnnotationType {
+    VariantCategory(VariantCategory),
+    VEP(Vec<VEP>),
+    Cluster(i32),
+    Cosmic(Cosmic),
+    GnomAD(GnomAD),
+    NCBIGFF(NCBIGFF),
+    Pangolin(Pangolin)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
+pub enum VariantCategory {
+    Somatic,
+    LowMRDDepth,
+    LOH,
+    Constit,
+    LowDiversity,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
+pub enum ReferenceAlternative {
+    Nucleotide(Base),
+    Nucleotides(Vec<Base>),
+    Unstructured(String),
+}
+
+impl FromStr for ReferenceAlternative {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self> {
+        let mut possible_bases = s.as_bytes().iter();
+        let mut res: Vec<Base> = Vec::new();
+        while let Some(&base) = possible_bases.next() {
+            match base.try_into() {
+                std::result::Result::Ok(b) => res.push(b),
+                Err(_) => {
+                    return Ok(Self::Unstructured(s.to_string()));
+                }
+            }
+        }
+
+        if res.len() == 1 {
+            return Ok(Self::Nucleotide(res.pop().unwrap()));
+        } else {
+            return Ok(Self::Nucleotides(res));
+        }
+    }
+}
+
+impl fmt::Display for ReferenceAlternative {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let string = match self {
+            ReferenceAlternative::Nucleotide(b) => b.to_string(),
+            ReferenceAlternative::Nucleotides(bases) => bases
+                .iter()
+                .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
+            ReferenceAlternative::Unstructured(s) => s.to_string(),
+        };
+        write!(f, "{}", string)
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, ToSchema)]
+pub enum Base {
+    A,
+    T,
+    C,
+    G,
+    N,
+}
+
+impl TryFrom<u8> for Base {
+    type Error = anyhow::Error;
+    fn try_from(base: u8) -> Result<Self> {
+        match base {
+            b'A' => Ok(Base::A),
+            b'T' => Ok(Base::T),
+            b'C' => Ok(Base::C),
+            b'G' => Ok(Base::G),
+            b'N' => Ok(Base::N),
+            _ => Err(anyhow!(
+                "Unknown base: {}",
+                String::from_utf8_lossy(&vec![base])
+            )),
+        }
+    }
+}
+
+impl Base {
+    pub fn into_u8(self) -> u8 {
+        return match self {
+            Base::A => b'A',
+            Base::T => b'T',
+            Base::C => b'C',
+            Base::G => b'G',
+            Base::N => b'N',
+        };
+    }
+}
+
+impl fmt::Display for Base {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // Use `self.number` to refer to each positional data point.
+        let str = match self {
+            Base::A => "A",
+            Base::T => "T",
+            Base::C => "C",
+            Base::G => "G",
+            Base::N => "N",
+        };
+        write!(f, "{}", str)
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
+pub enum Format {
+    DeepVariant(DeepVariantFormat),
+    ClairS(ClairSFormat),
+    Sniffles(SnifflesFormat),
+    Nanomonsv(NanomonsvFormat),
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, ToSchema)]
+pub enum Info {
+    #[schema(value_type=String)]
+    DeepVariant(DeepVariantInfo),
+    #[schema(value_type=String)]
+    ClairS(ClairSInfo),
+    #[schema(value_type=String)]
+    Sniffles(SnifflesInfo),
+    #[schema(value_type=String)]
+    Nanomonsv(NanomonsvInfo),
+}
+
+fn parse_info(s: &str, source: &VCFSource) -> Result<Info> {
+    match source {
+        VCFSource::DeepVariant => Ok(Info::DeepVariant(s.parse()?)),
+        VCFSource::ClairS => Ok(Info::ClairS(s.parse()?)),
+        VCFSource::Sniffles => Ok(Info::Sniffles(s.parse()?)),
+        VCFSource::Nanomonsv => Ok(Info::Nanomonsv(s.parse()?)),
+    }
+}
+
+fn parse_format(vcf_source: &VCFSource, data: &str) -> Result<Format> {
+    let res = match vcf_source {
+        VCFSource::DeepVariant => Format::DeepVariant(data.parse()?),
+        VCFSource::ClairS => Format::ClairS(data.parse()?),
+        VCFSource::Sniffles => Format::Sniffles(data.parse()?),
+        VCFSource::Nanomonsv => Format::Nanomonsv(data.parse()?),
+    };
+    Ok(res)
+}
+
+pub fn sort_variants(d: Vec<Variant>, dict_path: &str) -> Result<Vec<Variant>> {
+    info!("Sorting {} entries", d.len());
+    let dict = read_dict(dict_path)?;
+
+    let mut store: HashMap<String, Vec<Variant>> = HashMap::new();
+
+    // add to store
+    d.iter().for_each(|e| {
+        if let Some(vec) = store.get_mut(&e.contig) {
+            vec.push(e.clone());
+        } else {
+            store.insert(e.contig.to_string(), vec![e.clone()]);
+        }
+    });
+
+    // sort in each contig
+    store
+        .iter_mut()
+        .for_each(|(_, vec)| vec.sort_by(|a, b| a.position.partial_cmp(&b.position).unwrap()));
+
+    // return contig in the order of dict file
+    Ok(dict
+        .iter()
+        .flat_map(|(chr, _)| {
+            if let Some((_, vec)) = store.remove_entry(chr) {
+                vec
+            } else {
+                vec![]
+            }
+        })
+        .collect())
+}
+
+#[derive(Debug)]
+pub enum Category {
+    VariantCategory(VariantCategory),
+    PositionRange {
+        contig: String,
+        from: Option<u32>,
+        to: Option<u32>,
+    },
+    VCFSource(VCFSource),
+    NCosmic(u64),
+    NCBIFeature(String),
+    VAF {
+        min: f32,
+        max: f32,
+    },
+    Pangolin
+}
+
+pub fn run_pipe(name: &str, multi: &MultiProgress) -> Result<()> {
+    let cfg = config::Config::get()?;
+    let deepvariant_diag_vcf = format!(
+        "{}/{name}/diag/DeepVariant/{name}_diag_DeepVariant_PASSED.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&deepvariant_diag_vcf).exists() {
+        return Err(anyhow!("{deepvariant_diag_vcf} is required"));
+        // panic!("{deepvariant_diag_vcf} is required")
+    }
+    let deepvariant_mrd_vcf = format!(
+        "{}/{name}/mrd/DeepVariant/{name}_mrd_DeepVariant_PASSED.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&deepvariant_mrd_vcf).exists() {
+        return Err(anyhow!("{deepvariant_mrd_vcf} is required"));
+    }
+    let mrd_bam = format!(
+        "{}/{name}/mrd/{name}_mrd_hs1.bam",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&mrd_bam).exists() {
+        return Err(anyhow!("{mrd_bam} is required"));
+    }
+    let clairs_vcf = format!(
+        "{}/{name}/diag/ClairS/{name}_diag_clairs_PASSED.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&clairs_vcf).exists() {
+        return Err(anyhow!("{clairs_vcf} is required"));
+    }
+    let clairs_indels_vcf = format!(
+        "{}/{name}/diag/ClairS/{name}_diag_clairs_indel_PASSED.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&clairs_indels_vcf).exists() {
+        return Err(anyhow!("{clairs_indels_vcf} is required"));
+    }
+    let sniffles_vcf = format!(
+        "{}/{name}/diag/Sniffles/{name}_diag_sniffles.vcf",
+        cfg.longreads_results_dir
+    );
+    let sniffles_mrd_vcf = format!(
+        "{}/{name}/mrd/Sniffles/{name}_mrd_sniffles.vcf",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&sniffles_vcf).exists() {
+        return Err(anyhow!("{sniffles_vcf} is required"));
+    }
+    let nanomonsv_vcf = format!(
+        "{}/{name}/diag/nanomonsv/{name}_diag_nanomonsv_PASSED.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    if !std::path::Path::new(&nanomonsv_vcf).exists() {
+        return Err(anyhow!("{nanomonsv_vcf} is required"));
+    }
+
+    // let db_path = "/data/db_results.sqlite".to_string();
+    // `${data_dir}/${name}/diag/${name}_variants.sqlite`
+    let db_path = format!(
+        "{}/{name}/diag/{name}_variants.sqlite",
+        cfg.longreads_results_dir
+    );
+    let bytes_path = format!(
+        "{}/{name}/diag/{name}_variants.bytes.gz",
+        cfg.longreads_results_dir
+    );
+
+    let loh_path = format!(
+        "{}/{name}/diag/{name}_loh.vcf.gz",
+        cfg.longreads_results_dir
+    );
+    // let db_constit_path = format!(
+    //     "{}/{name}/diag/{name}_constit.sqlite",
+    //     cfg.longreads_results_dir
+    // );
+    let bytes_constit_path = format!(
+        "{}/{name}/diag/{name}_constit.bytes.gz",
+        cfg.longreads_results_dir
+    );
+
+    let sources = vec![
+        (
+            deepvariant_diag_vcf.as_str(),
+            &VCFSource::DeepVariant,
+            &VariantType::Somatic,
+        ),
+        (
+            deepvariant_mrd_vcf.as_str(),
+            &VCFSource::DeepVariant,
+            &VariantType::Constitutionnal,
+        ),
+        (
+            clairs_vcf.as_str(),
+            &VCFSource::ClairS,
+            &VariantType::Somatic,
+        ),
+        (
+            sniffles_vcf.as_str(),
+            &VCFSource::Sniffles,
+            &VariantType::Somatic,
+        ),
+        (
+            sniffles_mrd_vcf.as_str(),
+            &VCFSource::Sniffles,
+            &VariantType::Constitutionnal,
+        ),
+        (
+            nanomonsv_vcf.as_str(),
+            &VCFSource::Nanomonsv,
+            &VariantType::Somatic,
+        ),
+    ];
+    let mut variants = Variants::from_vcfs(name.to_string(), sources, &cfg, multi.clone())?;
+
+    variants.vcf_filters();
+    variants.write_vcf_cat(&loh_path, &VariantCategory::LOH)?;
+    variants.bam_filters(&mrd_bam);
+
+    let constits = variants.get_cat(&VariantCategory::Constit);
+    let constits = Variants::from_vec(name.to_string(), &multi, constits);
+    constits.save_bytes(&bytes_constit_path)?;
+
+    variants.keep_somatics_un();
+    info!("Variants retained: {}", variants.len());
+
+    // TODO check if SNP are matching
+    if variants.len() > 100_000 {
+        return Err(anyhow!(
+            "Too many variants, verify if somatic and tumoral samples match."
+        ));
+    }
+
+    variants.merge();
+    variants.sort()?;
+    info!("Variants retained: {}", variants.len());
+    variants.vep();
+    variants.pangolin()?;
+
+    variants.annotate_gff_feature(&cfg.gff_path)?;
+
+    variants.echtvar_annotate(&deepvariant_mrd_vcf)?;
+    variants.filter_snp()?;
+
+    variants.save_bytes(&bytes_path)?;
+    // variants.stats()?;
+    //
+    // if std::path::Path::new(&db_path).exists() {
+    //     crate::sql::variants_sql::remove_variants_names(&db_path, &name)?;
+    // }
+    //
+    variants.save_sql(&db_path)?;
+    variants.stats_sql(&db_path)?;
+    info!("Variants : {}", variants.len());
+
+    Ok(())
+}
+
+// pub fn cluster_variants(d: &mut Vec<Variant>, max_dist: u32) -> i32 {
+//     let mut cluster_id = 0;
+//     let first = d.get(0).unwrap();
+//     let mut last_pos = first.position;
+//     let mut last_contig = first.contig.to_string();
+//
+//     d.iter_mut().for_each(|e| {
+//         if e.contig != last_contig {
+//             cluster_id += 1;
+//             last_contig = e.contig.to_string();
+//         } else if e.position - last_pos > max_dist {
+//             cluster_id += 1;
+//         }
+//         e.annotations.push(AnnotationType::Cluster(cluster_id));
+//         last_pos = e.position;
+//     });
+//
+//     cluster_id
+// }

+ 1 - 0
src/annotations/mod.rs

@@ -4,3 +4,4 @@ pub mod ncbi_gff;
 pub mod cosmic;
 pub mod gnomad;
 pub mod pangolin;
+pub mod phase;

+ 9 - 0
src/annotations/phase.rs

@@ -0,0 +1,9 @@
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+
+
+
+#[derive(Debug, PartialEq, Serialize, Deserialize, Clone, ToSchema)]
+pub struct Phase {
+    
+}

+ 180 - 212
src/variants.rs

@@ -1,6 +1,11 @@
 use crate::{
     annotations::{
-        cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, ncbi_gff::NCBIGFF, pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin}, vep::{get_best_vep, vep_chunk, VEP}
+        cosmic::Cosmic,
+        echtvar::{parse_echtvar_val, run_echtvar},
+        gnomad::GnomAD,
+        ncbi_gff::NCBIGFF,
+        pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin},
+        vep::{get_best_vep, vep_chunk, VEP},
     },
     callers::{
         clairs::{ClairSFormat, ClairSInfo},
@@ -27,17 +32,16 @@ use csv::ReaderBuilder;
 use dashmap::DashMap;
 use hashbrown::HashMap;
 use indicatif::{MultiProgress, ParallelProgressIterator};
-use log::{info, warn};
+use log::info;
 use noodles_core::{region::Region, Position};
 use noodles_fasta::indexed_reader::Builder as FastaBuilder;
 use noodles_gff as gff;
 
 use rayon::prelude::*;
 use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
-use std::{fs, io::Write};
 use std::{
     env::temp_dir,
-    fmt,
+    fmt::{self, Display},
     fs::File,
     str::FromStr,
     sync::{
@@ -45,7 +49,8 @@ use std::{
         Arc,
     },
 };
-use utoipa::{openapi::schema, ToSchema};
+use std::{fs, io::Write};
+use utoipa::ToSchema;
 
 // chr12:25116542|G>T KRAS
 #[derive(Debug, Clone)]
@@ -181,7 +186,7 @@ impl Variants {
         info!("{} variants parsed from somatic VCFs and {} variants positions parsed from constitutionnal VCFs. Executed in {}s", data.len(), constit.len(), elapsed.as_secs());
         let cfg = cfg.clone();
 
-        return Ok(Self {
+        Ok(Self {
             name,
             data,
             constit,
@@ -189,7 +194,7 @@ impl Variants {
             stats_bam,
             cfg,
             mp: mp.clone(),
-        });
+        })
     }
 
     pub fn vcf_filters(&mut self) {
@@ -199,7 +204,7 @@ impl Variants {
 
         let n_tumoral_init = self.len();
         let n_constit_init = self.constit_len();
-        let min_loh_diff = cfg.deepvariant_loh_pval as f64;
+        let min_loh_diff = cfg.deepvariant_loh_pval;
         let min_mrd_depth = cfg.min_mrd_depth;
 
         info!("Filtering Constitutionnal (reported variant in constit), LOH (VAF proportion test < {}), LowMRDDepth (< {} in constit) variants by VCF annotations of {} likely somatic variants", min_loh_diff, min_mrd_depth, n_tumoral_init);
@@ -250,7 +255,7 @@ impl Variants {
                         }
                     }
                 // If not un Constit registry, ClairS look for VCF constit depth and n_alt
-                } else if let Format::ClairS(format) = &tumoral.callers_data.get(0).unwrap().format
+                } else if let Format::ClairS(format) = &tumoral.callers_data.first().unwrap().format
                 {
                     if format.ndp < min_mrd_depth {
                         n_low_mrd_depth.fetch_add(1, Ordering::SeqCst);
@@ -322,13 +327,13 @@ impl Variants {
             for tumoral in chunk.iter_mut() {
                 pg.inc(1);
 
-                if tumoral.annotations.len() > 0 {
+                if !tumoral.annotations.is_empty() {
                     n_already.fetch_add(1, Ordering::SeqCst);
                     continue;
                 }
                 let (pos, is_ins) = match tumoral.alt_cat() {
-                    AlterationCategory::INS => (tumoral.position, true),
-                    AlterationCategory::DEL => (tumoral.position, false),
+                    AlterationCategory::Ins => (tumoral.position, true),
+                    AlterationCategory::Del => (tumoral.position, false),
                     _ => (tumoral.position, false),
                 };
                 match get_hts_nt_pileup(
@@ -368,54 +373,44 @@ impl Variants {
                                 // Check triplets or doublets if DeepVariant
                                 let callers = tumoral.callers();
 
-                                if callers.len() == 1 {
-                                    if callers[0] == "DeepVariant".to_string() {
-                                        let seq_left = &s[0..20];
-                                        let seq_right = &s[21..s.len() - 1];
-
-                                        // Triplet right
-                                        if count_repetitions(seq_right, 3) >= 3 {
-                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
-                                            tumoral.annotations.push(
-                                                AnnotationType::VariantCategory(
-                                                    VariantCategory::LowDiversity,
-                                                ),
-                                            );
-                                            continue;
-                                        }
-
-                                        // Doublet right
-                                        if count_repetitions(seq_right, 2) >= 4 {
-                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
-                                            tumoral.annotations.push(
-                                                AnnotationType::VariantCategory(
-                                                    VariantCategory::LowDiversity,
-                                                ),
-                                            );
-                                            continue;
-                                        }
-
-                                        // Triplet left
-                                        if count_repetitions(seq_left, 3) >= 3 {
-                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
-                                            tumoral.annotations.push(
-                                                AnnotationType::VariantCategory(
-                                                    VariantCategory::LowDiversity,
-                                                ),
-                                            );
-                                            continue;
-                                        }
-
-                                        // Doublet left
-                                        if count_repetitions(seq_left, 2) >= 4 {
-                                            n_low_diversity.fetch_add(1, Ordering::SeqCst);
-                                            tumoral.annotations.push(
-                                                AnnotationType::VariantCategory(
-                                                    VariantCategory::LowDiversity,
-                                                ),
-                                            );
-                                            continue;
-                                        }
+                                if callers.len() == 1 && callers[0] == *"DeepVariant" {
+                                    let seq_left = &s[0..20];
+                                    let seq_right = &s[21..s.len() - 1];
+
+                                    // Triplet right
+                                    if count_repetitions(seq_right, 3) >= 3 {
+                                        n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                                            VariantCategory::LowDiversity,
+                                        ));
+                                        continue;
+                                    }
+
+                                    // Doublet right
+                                    if count_repetitions(seq_right, 2) >= 4 {
+                                        n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                                            VariantCategory::LowDiversity,
+                                        ));
+                                        continue;
+                                    }
+
+                                    // Triplet left
+                                    if count_repetitions(seq_left, 3) >= 3 {
+                                        n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                                            VariantCategory::LowDiversity,
+                                        ));
+                                        continue;
+                                    }
+
+                                    // Doublet left
+                                    if count_repetitions(seq_left, 2) >= 4 {
+                                        n_low_diversity.fetch_add(1, Ordering::SeqCst);
+                                        tumoral.annotations.push(AnnotationType::VariantCategory(
+                                            VariantCategory::LowDiversity,
+                                        ));
+                                        continue;
                                     }
                                 }
                             }
@@ -454,7 +449,7 @@ impl Variants {
                                         VariantCategory::Somatic,
                                     ));
                                 }
-                            } else if tumoral.alt_cat() == AlterationCategory::DEL {
+                            } else if tumoral.alt_cat() == AlterationCategory::Del {
                                 let n_alt_mrd =
                                     bases.clone().into_iter().filter(|e| *e == b'D').count();
                                 if n_alt_mrd > 0 {
@@ -546,10 +541,9 @@ impl Variants {
                 if e.annotations
                     .iter()
                     .filter(|a| match a {
-                        AnnotationType::VariantCategory(vc) => match vc {
-                            VariantCategory::Somatic => false,
-                            _ => true,
-                        },
+                        AnnotationType::VariantCategory(vc) => {
+                            !matches!(vc, VariantCategory::Somatic)
+                        }
                         _ => false,
                     })
                     .count()
@@ -581,7 +575,6 @@ impl Variants {
         Ok(())
     }
 
-    ///
     pub fn merge(&mut self) {
         let pg = self.mp.add(new_pg_speed(self.len() as u64));
         pg.set_message("Merging Variants by contig, positions, ref, alt");
@@ -617,7 +610,7 @@ impl Variants {
             .par_chunks_mut(len / 33)
             .progress_with(pg)
             .for_each(|chunk| {
-                let mut reader = File::open(gff_path.to_string())
+                let mut reader = File::open(&gff_path)
                     .map(noodles_bgzf::Reader::new)
                     .map(gff::Reader::new)
                     .unwrap();
@@ -629,7 +622,7 @@ impl Variants {
                     let r = Region::new(v.contig.to_string(), start..=start);
                     if let std::result::Result::Ok(rows) = reader.query(&index, &r.clone()) {
                         for row in rows {
-                            let ncbi = NCBIGFF::try_from(row.unwrap()).unwrap();
+                            let ncbi = NCBIGFF::from(row.unwrap());
                             v.annotations.push(AnnotationType::NCBIGFF(ncbi));
                         }
                     }
@@ -666,15 +659,12 @@ impl Variants {
                 for (i, row) in chunk.iter().enumerate() {
                     let _ = writeln!(
                         vcf,
-                        "{}\t{}\t{}\t{}\t{}\t{}\tPASS\t.\t{}\t{}",
+                        "{}\t{}\t{}\t{}\t{}\t.\tPASS\t.\t.\t.",
                         row.contig,
                         row.position,
                         i + 1,
                         row.reference,
-                        row.alternative,
-                        ".",
-                        ".",
-                        "."
+                        row.alternative
                     );
                 }
 
@@ -689,21 +679,19 @@ impl Variants {
 
                 // let mut lines: HashMap<u64, Vec<VEPLine>> = HashMap::new();
                 let mut last: usize = 1;
-                for line in reader.deserialize::<VCFRow>() {
-                    if let std::result::Result::Ok(row) = line {
-                        let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
-                        let id: usize = row.id.parse().unwrap();
-                        if id != last {
-                            panic!("Echtvar output not in input order!");
-                        }
-                        if let Some(c) = cosmic {
-                            chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
-                        }
-                        if let Some(g) = gnomad {
-                            chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
-                        }
-                        last += 1;
+                for row in reader.deserialize::<VCFRow>().flatten() {
+                    let (cosmic, gnomad) = parse_echtvar_val(&row.info).unwrap();
+                    let id: usize = row.id.parse().unwrap();
+                    if id != last {
+                        panic!("Echtvar output not in input order!");
                     }
+                    if let Some(c) = cosmic {
+                        chunk[id - 1].annotations.push(AnnotationType::Cosmic(c));
+                    }
+                    if let Some(g) = gnomad {
+                        chunk[id - 1].annotations.push(AnnotationType::GnomAD(g));
+                    }
+                    last += 1;
                 }
             });
         Ok(())
@@ -714,16 +702,13 @@ impl Variants {
             .par_iter()
             .filter(|v| {
                 for annotation in v.annotations.iter() {
-                    match annotation {
-                        AnnotationType::VariantCategory(cat) => {
-                            if cat == category {
-                                return true;
-                            }
+                    if let AnnotationType::VariantCategory(cat) = annotation {
+                        if cat == category {
+                            return true;
                         }
-                        _ => (),
                     }
                 }
-                return false;
+                false
             })
             .collect::<Vec<&Variant>>()
     }
@@ -738,11 +723,8 @@ impl Variants {
             .filter(|e| {
                 let mut res = true;
                 e.annotations.iter().for_each(|a| {
-                    match a {
-                        AnnotationType::GnomAD(g) => {
-                            res = g.gnomad_af < self.cfg.max_gnomad_af;
-                        }
-                        _ => (),
+                    if let AnnotationType::GnomAD(g) = a {
+                        res = g.gnomad_af < self.cfg.max_gnomad_af;
                     };
                 });
                 if !res {
@@ -756,9 +738,9 @@ impl Variants {
     }
 
     pub fn pangolin(&mut self) -> Result<()> {
-        let tmp_file = pangolin_save_variants(&self)?;
+        let tmp_file = pangolin_save_variants(self)?;
         let res_file = run_pangolin(&tmp_file)?;
-        
+
         fs::remove_file(tmp_file)?;
         let res = pangolin_parse_results(&res_file)?;
         let mut res = res.iter();
@@ -769,24 +751,23 @@ impl Variants {
         if let Some(r) = res.next() {
             let mut curr = r.clone();
             for variant in self.data.iter_mut() {
-                if variant.contig == curr.0 {
-                    if variant.position == curr.1 {
-                        if variant.reference == curr.2 {
-                            if variant.alternative == curr.3  {
-                                variant.annotations.push(AnnotationType::Pangolin(curr.4));
-                                n_added += 1;
-                                if let Some(r) = res.next() {
-                                    curr = r.clone();
-                                } else {
-                                    break;
-                                }
-                            }
-                        }
+                if variant.contig == curr.0
+                    && variant.position == curr.1
+                    && variant.reference == curr.2
+                    && variant.alternative == curr.3
+                {
+                    variant.annotations.push(AnnotationType::Pangolin(curr.4));
+                    n_added += 1;
+                    if let Some(r) = res.next() {
+                        curr = r.clone();
+                    } else {
+                        break;
                     }
                 }
             }
         }
 
+        info!("Pangolin annotations {}", n_added);
         assert_eq!(res.len(), 0);
 
         Ok(())
@@ -844,7 +825,7 @@ impl Variants {
                 );
             }
 
-            if callers.len() > 0 {
+            if !callers.is_empty() {
                 n_caller_data += 1;
                 callers.sort();
                 let k = callers.join(",");
@@ -880,13 +861,13 @@ impl Variants {
                     _ => (),
                 };
 
-                if features.len() > 0 {
+                if !features.is_empty() {
                     features.sort();
                     add_hm(&mut ncbi_feature, &features.join(","));
                     n_ncbi_feature += 1;
                 }
 
-                if variant_cat.len() > 0 {
+                if !variant_cat.is_empty() {
                     add_hm(&mut variants_cat, &variant_cat.join(","));
                     n_variants_wcat += 1;
                 }
@@ -929,35 +910,26 @@ impl Variants {
 
         // let file = File::create(path)?;
         // let mut writer = BufWriter::new(file);
-        let mut results = Vec::new();
-        results.push(Stat::new(
-            "consequences".to_string(),
-            cons_cat,
-            n_csq as u32,
-        ));
-        results.push(Stat::new(
-            "variants_cat".to_string(),
-            variants_cat,
-            n_variants_wcat as u32,
-        ));
-        results.push(Stat::new(
-            "ncbi_feature".to_string(),
-            ncbi_feature,
-            n_ncbi_feature as u32,
-        ));
-        results.push(Stat::new(
-            "callers_cat".to_string(),
-            callers_cat,
-            n_caller_data as u32,
-        ));
-
-        // let res = serde_json::to_string(&results)?;
+        let results = vec![
+            Stat::new("consequences".to_string(), cons_cat, n_csq as u32),
+            Stat::new(
+                "variants_cat".to_string(),
+                variants_cat,
+                n_variants_wcat as u32,
+            ),
+            Stat::new(
+                "ncbi_feature".to_string(),
+                ncbi_feature,
+                n_ncbi_feature as u32,
+            ),
+            Stat::new("callers_cat".to_string(), callers_cat, n_caller_data as u32),
+        ];
 
         Ok(results)
     }
 
     pub fn save_sql(&self, path: &str) -> Result<()> {
-        insert_variants(&self, path)
+        insert_variants(self, path)
     }
 
     pub fn stats_sql(&self, path: &str) -> Result<()> {
@@ -997,7 +969,7 @@ impl Variants {
         })
     }
 
-    pub fn filter_category(&self, and_categories: &Vec<Category>) -> Vec<&Variant> {
+    pub fn filter_category(&self, and_categories: &[Category]) -> Vec<&Variant> {
         self.data
             .par_iter()
             .flat_map(|v| {
@@ -1083,19 +1055,15 @@ impl CallerData {
             let imprecise = info
                 .tags
                 .iter()
-                .filter(|s| s.to_string() == "IMPRECISE".to_string())
+                .filter(|s| s.to_string() == *"IMPRECISE")
                 .count();
             let mut n_alt = 0;
             if let Format::Sniffles(f) = &self.format {
                 n_alt = f.dv;
             }
-            if imprecise == 0 && n_alt >= 3 {
-                return false;
-            } else {
-                return true;
-            }
+            !(imprecise == 0 && n_alt >= 3)
         } else {
-            return false;
+            false
         }
     }
 }
@@ -1128,15 +1096,15 @@ impl FromStr for VCFSource {
     }
 }
 
-impl ToString for VCFSource {
-    fn to_string(&self) -> String {
+impl fmt::Display for VCFSource {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let s = match self {
             VCFSource::DeepVariant => "DeepVariant",
             VCFSource::ClairS => "ClairS",
             VCFSource::Sniffles => "Sniffles",
             VCFSource::Nanomonsv => "Nanomonsv",
         };
-        s.to_string()
+        write!(f, "{}", s)
     }
 }
 
@@ -1180,7 +1148,7 @@ impl Variant {
 
     pub fn get_depth(&mut self) -> u32 {
         if let Some(depth) = self.depth {
-            return depth;
+            depth
         } else {
             let depth = self
                 .callers_data
@@ -1189,13 +1157,13 @@ impl Variant {
                 .max()
                 .unwrap();
             self.depth = Some(depth);
-            return depth;
+            depth
         }
     }
 
     pub fn get_n_alt(&mut self) -> u32 {
         if let Some(n_alt) = self.n_alt {
-            return n_alt;
+            n_alt
         } else {
             let n_alt = self
                 .callers_data
@@ -1204,7 +1172,7 @@ impl Variant {
                 .max()
                 .unwrap();
             self.n_alt = Some(n_alt);
-            return n_alt;
+            n_alt
         }
     }
 
@@ -1215,37 +1183,42 @@ impl Variant {
         self.vaf.unwrap()
     }
 
-    fn is_ins(&self) -> bool {
-        match (&self.reference, &self.alternative) {
-            (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => true,
-            _ => false,
-        }
+    pub fn is_ins(&self) -> bool {
+        matches!(
+            (&self.reference, &self.alternative),
+            (
+                ReferenceAlternative::Nucleotide(_),
+                ReferenceAlternative::Nucleotides(_)
+            )
+        )
     }
 
-    fn alt_cat(&self) -> AlterationCategory {
+    pub fn alt_cat(&self) -> AlterationCategory {
         match (&self.reference, &self.alternative) {
             (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotide(_)) => {
-                AlterationCategory::SNV
+                AlterationCategory::Snv
             }
             (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Nucleotides(_)) => {
-                AlterationCategory::INS
+                AlterationCategory::Ins
             }
             (ReferenceAlternative::Nucleotide(_), ReferenceAlternative::Unstructured(_)) => {
                 AlterationCategory::Other
             }
             (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotide(_)) => {
-                AlterationCategory::DEL
+                AlterationCategory::Del
             }
-            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b)) => {
-                let a = a.len();
-                let b = b.len();
-                if a < b {
-                    AlterationCategory::INS
-                } else if a > b {
-                    AlterationCategory::DEL
-                } else {
-                    AlterationCategory::REP
-                }
+            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
+                if a.len() < b.len() =>
+            {
+                AlterationCategory::Ins
+            }
+            (ReferenceAlternative::Nucleotides(a), ReferenceAlternative::Nucleotides(b))
+                if a.len() > b.len() =>
+            {
+                AlterationCategory::Del
+            }
+            (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Nucleotides(_)) => {
+                AlterationCategory::Rep
             }
             (ReferenceAlternative::Nucleotides(_), ReferenceAlternative::Unstructured(_)) => {
                 AlterationCategory::Other
@@ -1269,7 +1242,7 @@ impl Variant {
         format!(
             "DP:AD\t{}:{}",
             depth,
-            vec![(depth - n_alt).to_string(), n_alt.to_string()].join(",")
+            [(depth - n_alt).to_string(), n_alt.to_string()].join(",")
         )
     }
 
@@ -1289,20 +1262,17 @@ impl Variant {
         get_best_vep(&self.get_veps())
     }
 
-    pub fn is_from_category(&self, and_categories: &Vec<Category>) -> bool {
+    pub fn is_from_category(&self, and_categories: &[Category]) -> bool {
         let mut vec_bools = Vec::new();
         for category in and_categories.iter() {
             match category {
                 Category::VariantCategory(vc) => {
                     for annotations in self.annotations.iter() {
-                        match annotations {
-                            AnnotationType::VariantCategory(vvc) => {
-                                if vc == vvc {
-                                    vec_bools.push(true);
-                                    break;
-                                }
+                        if let AnnotationType::VariantCategory(vvc) = annotations {
+                            if vc == vvc {
+                                vec_bools.push(true);
+                                break;
                             }
-                            _ => (),
                         }
                     }
                 }
@@ -1324,12 +1294,9 @@ impl Variant {
                 Category::NCosmic(n) => {
                     let mut bools = Vec::new();
                     for annotations in self.annotations.iter() {
-                        match annotations {
-                            AnnotationType::Cosmic(c) => {
-                                bools.push(c.cosmic_cnt >= *n);
-                                break;
-                            }
-                            _ => (),
+                        if let AnnotationType::Cosmic(c) = annotations {
+                            bools.push(c.cosmic_cnt >= *n);
+                            break;
                         }
                     }
                     vec_bools.push(bools.iter().any(|&b| b));
@@ -1337,11 +1304,8 @@ impl Variant {
                 Category::NCBIFeature(ncbi_feature) => {
                     let mut bools = Vec::new();
                     for annotations in self.annotations.iter() {
-                        match annotations {
-                            AnnotationType::NCBIGFF(v) => {
-                                bools.push(v.feature == *ncbi_feature);
-                            }
-                            _ => (),
+                        if let AnnotationType::NCBIGFF(v) = annotations {
+                            bools.push(v.feature == *ncbi_feature);
                         }
                     }
                     vec_bools.push(bools.iter().any(|&b| b));
@@ -1356,10 +1320,13 @@ impl Variant {
                     vec_bools.push(v >= *min && v <= *max);
                 }
                 Category::Pangolin => {
-                    vec_bools.push(self.annotations.iter().filter(|a| match a {
-                        AnnotationType::Pangolin(_) => true,
-                        _ => false 
-                    }).count() > 0);
+                    vec_bools.push(
+                        self.annotations
+                            .iter()
+                            .filter(|a| matches!(a, AnnotationType::Pangolin(_)))
+                            .count()
+                            > 0,
+                    );
                 }
             }
         }
@@ -1375,10 +1342,10 @@ impl Variant {
 }
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 enum AlterationCategory {
-    SNV,
-    INS,
-    DEL,
-    REP,
+    Snv,
+    Ins,
+    Del,
+    Rep,
     Other,
 }
 
@@ -1390,7 +1357,8 @@ pub enum AnnotationType {
     Cosmic(Cosmic),
     GnomAD(GnomAD),
     NCBIGFF(NCBIGFF),
-    Pangolin(Pangolin)
+    Pangolin(Pangolin),
+    // Phase(Phase)
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
@@ -1413,9 +1381,9 @@ impl FromStr for ReferenceAlternative {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> Result<Self> {
-        let mut possible_bases = s.as_bytes().iter();
+        let possible_bases = s.as_bytes().iter();
         let mut res: Vec<Base> = Vec::new();
-        while let Some(&base) = possible_bases.next() {
+        for &base in possible_bases {
             match base.try_into() {
                 std::result::Result::Ok(b) => res.push(b),
                 Err(_) => {
@@ -1425,9 +1393,9 @@ impl FromStr for ReferenceAlternative {
         }
 
         if res.len() == 1 {
-            return Ok(Self::Nucleotide(res.pop().unwrap()));
+            Ok(Self::Nucleotide(res.pop().unwrap()))
         } else {
-            return Ok(Self::Nucleotides(res));
+            Ok(Self::Nucleotides(res))
         }
     }
 }
@@ -1438,7 +1406,7 @@ impl fmt::Display for ReferenceAlternative {
             ReferenceAlternative::Nucleotide(b) => b.to_string(),
             ReferenceAlternative::Nucleotides(bases) => bases
                 .iter()
-                .fold(String::new(), |acc, e| format!("{}{}", acc, e.to_string())),
+                .fold(String::new(), |acc, e| format!("{}{}", acc, e)),
             ReferenceAlternative::Unstructured(s) => s.to_string(),
         };
         write!(f, "{}", string)
@@ -1465,7 +1433,7 @@ impl TryFrom<u8> for Base {
             b'N' => Ok(Base::N),
             _ => Err(anyhow!(
                 "Unknown base: {}",
-                String::from_utf8_lossy(&vec![base])
+                String::from_utf8_lossy(&[base])
             )),
         }
     }
@@ -1584,7 +1552,7 @@ pub enum Category {
         min: f32,
         max: f32,
     },
-    Pangolin
+    Pangolin,
 }
 
 pub fn run_pipe(name: &str, multi: &MultiProgress) -> Result<()> {
@@ -1707,7 +1675,7 @@ pub fn run_pipe(name: &str, multi: &MultiProgress) -> Result<()> {
     variants.bam_filters(&mrd_bam);
 
     let constits = variants.get_cat(&VariantCategory::Constit);
-    let constits = Variants::from_vec(name.to_string(), &multi, constits);
+    let constits = Variants::from_vec(name.to_string(), multi, constits);
     constits.save_bytes(&bytes_constit_path)?;
 
     variants.keep_somatics_un();