Thomas 1 жил өмнө
parent
commit
fce7a0a5ea

+ 3 - 2
Cargo.lock

@@ -1130,6 +1130,7 @@ dependencies = [
  "pot",
  "prettytable-rs",
  "rayon",
+ "regex",
  "rust-htslib",
  "rust-lapper",
  "serde",
@@ -1345,9 +1346,9 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.10.4"
+version = "1.10.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
+checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
 dependencies = [
  "aho-corasick",
  "memchr",

+ 1 - 0
Cargo.toml

@@ -43,4 +43,5 @@ crossbeam-deque = "0.8.5"
 trc = "1.2.4"
 pot = "=3.0.0"
 utoipa = "4.2.3"
+regex = "1.10.5"
 

+ 1 - 0
src/annotations/mod.rs

@@ -3,3 +3,4 @@ pub mod echtvar;
 pub mod ncbi_gff;
 pub mod cosmic;
 pub mod gnomad;
+pub mod pangolin;

+ 140 - 0
src/annotations/pangolin.rs

@@ -0,0 +1,140 @@
+use anyhow::{Context, Result};
+use log::{info, warn};
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+use std::io::{self, Write};
+use std::str::FromStr;
+use std::{
+    fs::File,
+    io::{BufRead, BufReader},
+    process::{Command, Stdio},
+};
+use uuid::Uuid;
+
+use crate::variants::{ReferenceAlternative, Variants};
+
+#[derive(Debug, ToSchema, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Pangolin {
+    pub predictions: Vec<(u32, f64)>
+}
+
+// pangolin -c CHROM,POS,REF,ALT -s 0.1 /tmp/hattab_test_pango.csv /data/ref/hs1/hs1_simple_chr.fa /data/ref/hs1/gencode.v44.liftedTohs1.db gg.vcf
+pub fn run_pangolin(in_path: &str) -> Result<String> {
+    let tmp_file = format!("/tmp/{}", Uuid::new_v4());
+
+    let bin_dir = "/home/prom/.local/bin";
+    let ref_fa = "/data/ref/hs1/hs1_simple_chr.fa";
+    let db = "/data/ref/hs1/gencode.v44.liftedTohs1.db";
+
+    let mut cmd = Command::new(format!("{bin_dir}/pangolin"))
+        .arg("-c")
+        .arg("CHROM,POS,REF,ALT")
+        .arg("-s")
+        .arg("0.1")
+        .arg(in_path)
+        .arg(ref_fa)
+        .arg(db)
+        .arg(tmp_file.clone())
+        .stdout(Stdio::piped())
+        .spawn()
+        .context("pangolin failed to start")?;
+
+    let stdout = cmd.stdout.take().unwrap();
+
+    let reader = BufReader::new(stdout);
+    reader
+        .lines()
+        .filter_map(|line| line.ok())
+        .filter(|line| line.find("error").is_some())
+        .for_each(|line| warn!("{}", line));
+
+    cmd.wait()?;
+
+    Ok(format!("{}.csv", tmp_file))
+}
+
+pub fn pangolin_save_variants(variants: &Variants) -> Result<String> {
+    let tmp_file = format!("/tmp/{}.csv", Uuid::new_v4());
+    let mut file = File::create(&tmp_file)?;
+
+    writeln!(file, "CHROM,POS,REF,ALT")?;
+    let mut lines = 0;
+    for v in variants.data.iter() {
+        // let u = v.callers();
+        if v.callers().contains(&"Nanomonsv".to_string()) {
+            lines += 1;
+            continue;
+        }
+        writeln!(
+            file,
+            "{}",
+            vec![
+                v.contig.to_string(),
+                v.position.to_string(),
+                v.reference.to_string(),
+                v.alternative.to_string()
+            ]
+            .join(",")
+        )?;
+        lines += 1;
+    }
+    assert_eq!(lines, variants.len());
+
+    Ok(tmp_file.to_string())
+}
+
+pub fn pangolin_parse_results(
+    path: &str,
+) -> Result<
+    Vec<(
+        String,
+        u32,
+        ReferenceAlternative,
+        ReferenceAlternative,
+        Pangolin,
+    )>,
+> {
+    let re = Regex::new(r"^[0-9]*:[0-9]*").unwrap();
+
+    let file = File::open(path)?;
+    let reader = BufReader::new(file);
+
+    let mut lines = reader.lines();
+    lines.next(); // Skip the first line
+
+    let mut res = Vec::new();
+    for line in lines {
+        let line = line?; // Unwrap the Result
+        let parts: Vec<&str> = line.split(',').collect(); // Split the line by comma
+        if parts.len() != 5 {
+            continue;
+        }
+        if parts[4] == "" {
+            continue;
+        }
+
+        let pangolin_res: Vec<&str> = parts[4].split("|").collect();
+        let pangolin_res: Vec<(u32, f64)> = pangolin_res
+            .into_iter()
+            .filter(|s| re.is_match(s))
+            .map(|s| {
+                let (a, b) = s.split_once(":").unwrap();
+                (a.parse().unwrap(), b.parse().unwrap())
+            })
+            .collect();
+
+        if pangolin_res.len() > 0 {
+            res.push((
+                parts[0].to_string(),
+                parts[1].parse::<u32>()?,
+                ReferenceAlternative::from_str(parts[2])?,
+                ReferenceAlternative::from_str(parts[3])?,
+                Pangolin { predictions: pangolin_res}
+            ));
+        }
+    }
+
+    info!("{} pangolin results parsed", res.len());
+    Ok(res)
+}

+ 55 - 2
src/lib.rs

@@ -13,7 +13,14 @@ mod tests {
     use indicatif_log_bridge::LogWrapper;
     use noodles_core::{Position, Region};
 
-    use crate::{config::Config, sql::variants_sql::load_variants_name, utils::count_repetitions};
+    use crate::{
+        config::Config,
+        sql::variants_sql::load_variants_name,
+        utils::count_repetitions,
+        variants::{Category, Variants},
+    };
+
+    use self::annotations::pangolin::pangolin_parse_results;
 
     use super::*;
     #[test]
@@ -48,7 +55,7 @@ mod tests {
         LogWrapper::new(multi.clone(), logger).try_init().unwrap();
 
         let mut variants = load_variants_name(name, &multi)?;
-        let _variants = variants.get_cat(&variants::VariantCategory::Constit);
+        // let _variants = variants.get_cat(&variants::VariantCategory::Constit);
 
         variants.write_vcf_cat("test.vcf.gz", &variants::VariantCategory::Somatic)?;
         println!("{} variants loaded from db.", variants.len());
@@ -80,4 +87,50 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn write_vcf() -> Result<()> {
+        let name = "HATTAB";
+        let logger =
+            env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+                .build();
+        let multi = MultiProgress::new();
+        LogWrapper::new(multi.clone(), logger).try_init().unwrap();
+
+        let mut variants = Variants::new_from_bytes(
+            name,
+            &format!("/data/longreads_basic_pipe/{name}/diag/{name}_variants.bytes.gz"),
+            multi,
+        )?;
+        println!("{} variants", variants.len());
+
+        variants.write_vcf_cat("/tmp/test_hs1.vcf.gz", &variants::VariantCategory::Somatic)?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn p_pangolin() -> Result<()> {
+        let name = "BECERRA";
+        let logger =
+            env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+                .build();
+        let multi = MultiProgress::new();
+        LogWrapper::new(multi.clone(), logger).try_init().unwrap();
+
+        let mut variants = Variants::new_from_bytes(
+            name,
+            &format!("/data/longreads_basic_pipe/{name}/diag/{name}_variants.bytes.gz"),
+            multi,
+        )?;
+        variants.pangolin()?;
+
+        let u = variants.filter_category(&vec![Category::Pangolin]);
+
+        println!("{u:?}");
+
+        assert_eq!(u.len(), 3);
+
+        Ok(())
+    }
 }

+ 47 - 6
src/variants.rs

@@ -1,10 +1,6 @@
 use crate::{
     annotations::{
-        cosmic::Cosmic,
-        echtvar::{parse_echtvar_val, run_echtvar},
-        gnomad::GnomAD,
-        ncbi_gff::NCBIGFF,
-        vep::{get_best_vep, vep_chunk, VEP},
+        cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, ncbi_gff::NCBIGFF, pangolin::{pangolin_parse_results, pangolin_save_variants, run_pangolin, Pangolin}, vep::{get_best_vep, vep_chunk, VEP}
     },
     callers::{
         clairs::{ClairSFormat, ClairSInfo},
@@ -38,7 +34,7 @@ use noodles_gff as gff;
 
 use rayon::prelude::*;
 use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer};
-use std::io::Write;
+use std::{fs, io::Write};
 use std::{
     env::temp_dir,
     fmt,
@@ -759,6 +755,43 @@ impl Variants {
         Ok(n)
     }
 
+    pub fn pangolin(&mut self) -> Result<()> {
+        let tmp_file = pangolin_save_variants(&self)?;
+        let res_file = run_pangolin(&tmp_file)?;
+        
+        fs::remove_file(tmp_file)?;
+        let res = pangolin_parse_results(&res_file)?;
+        let mut res = res.iter();
+        fs::remove_file(res_file)?;
+        info!("Adding pangolin results for {} variants.", res.len());
+
+        let mut n_added = 0;
+        if let Some(r) = res.next() {
+            let mut curr = r.clone();
+            for variant in self.data.iter_mut() {
+                if variant.contig == curr.0 {
+                    if variant.position == curr.1 {
+                        if variant.reference == curr.2 {
+                            if variant.alternative == curr.3  {
+                                variant.annotations.push(AnnotationType::Pangolin(curr.4));
+                                n_added += 1;
+                                if let Some(r) = res.next() {
+                                    curr = r.clone();
+                                } else {
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        assert_eq!(res.len(), 0);
+
+        Ok(())
+    }
+
     pub fn len(&self) -> usize {
         self.data.len()
     }
@@ -1322,6 +1355,12 @@ impl Variant {
                     };
                     vec_bools.push(v >= *min && v <= *max);
                 }
+                Category::Pangolin => {
+                    vec_bools.push(self.annotations.iter().filter(|a| match a {
+                        AnnotationType::Pangolin(_) => true,
+                        _ => false 
+                    }).count() > 0);
+                }
             }
         }
         vec_bools.iter().all(|&x| x)
@@ -1351,6 +1390,7 @@ pub enum AnnotationType {
     Cosmic(Cosmic),
     GnomAD(GnomAD),
     NCBIGFF(NCBIGFF),
+    Pangolin(Pangolin)
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)]
@@ -1544,6 +1584,7 @@ pub enum Category {
         min: f32,
         max: f32,
     },
+    Pangolin
 }
 
 pub fn run_pipe(name: &str, multi: &MultiProgress) -> Result<()> {

BIN
test.vcf.gz


BIN
test.vcf.gz.tbi