Browse Source

bcftools keep only in a + modkit pileup

Thomas 1 year ago
parent
commit
e0894f5af7
7 changed files with 259 additions and 15 deletions
  1. 1 1
      README.md
  2. 119 3
      src/callers/nanomonsv.rs
  3. 1 1
      src/collection/bam.rs
  4. 39 9
      src/commands/bcftools.rs
  5. 1 0
      src/commands/mod.rs
  6. 76 0
      src/commands/modkit.rs
  7. 22 1
      src/lib.rs

+ 1 - 1
README.md

@@ -16,5 +16,5 @@ sudo apt install cmake libclang-dev
 * (bcftools)[https://www.htslib.org/download/]
 * (modkit)[https://github.com/nanoporetech/modkit]
 * VEP: cf pandora_lib_variants for VEP install
-
+* nanomonsv (cf dependencies at github, TODO: use racon)
 ###

+ 119 - 3
src/callers/nanomonsv.rs

@@ -1,10 +1,14 @@
-use std::{fs, path::Path, thread};
+use std::{
+    fs::{self},
+    path::{Path, PathBuf},
+    thread,
+};
 
 use anyhow::Context;
-use log::info;
+use log::{error, info, warn};
 
 use crate::{
-    commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
+    commands::bcftools::{bcftools_concat, bcftools_keep_pass, BcftoolsConfig},
     runners::{run_wait, CommandRun, RunReport},
 };
 
@@ -292,3 +296,115 @@ pub fn nanomonsv_get(
     let res = run_wait(&mut cmd_run)?;
     Ok(res)
 }
+
+pub fn nanomonsv_create_pon(config: NanomonSVConfig, pon_path: &str) -> anyhow::Result<()> {
+    let mut passed_mrd = Vec::new();
+    for mrd_dir in find_nanomonsv_dirs(&PathBuf::from(&config.result_dir), "mrd", 0, 3) {
+        let mut passed = None;
+        let mut passed_csi = None;
+        let mut result = None;
+
+        for entry in fs::read_dir(&mrd_dir)
+            .with_context(|| format!("Failed to read directory: {}", mrd_dir.display()))?
+        {
+            let entry =
+                entry.with_context(|| format!("Failed to read entry in: {}", mrd_dir.display()))?;
+            let file_name = entry.file_name().to_string_lossy().to_string();
+            let path = entry.path();
+
+            if file_name.ends_with("_mrd_nanomonsv_PASSED.vcf.gz") {
+                passed = Some(path);
+            } else if file_name.ends_with("_mrd_nanomonsv_PASSED.vcf.gz.csi") {
+                passed_csi = Some(path);
+            } else if file_name.ends_with("_mrd.nanomonsv.result.vcf") {
+                result = Some(path);
+            }
+        }
+
+        match (result, passed.clone(), passed_csi) {
+            (None, None, None) => error!("No result in {}", mrd_dir.display()),
+            (Some(p), None, None) => {
+                let output = replace_filename_suffix(
+                    &p,
+                    "_mrd.nanomonsv.result.vcf",
+                    "_mrd_nanomonsv_PASSED.vcf.gz",
+                );
+                info!("Do pass for {} to {}", p.display(), output.display());
+
+                if let Err(r) = bcftools_keep_pass(
+                    p.to_str().unwrap(),
+                    output.to_str().unwrap(),
+                    BcftoolsConfig::default(),
+                ) {
+                    error!("{r}");
+                } else {
+                    passed_mrd.push(output);
+                }
+            }
+            (Some(_), Some(p), None) => warn!("Do csi for {}", p.display()),
+            (Some(_), Some(p), Some(_)) => passed_mrd.push(p),
+            _ => {} // All files found
+        }
+    }
+
+    println!("{} vcf to concat", passed_mrd.len());
+    bcftools_concat(
+        passed_mrd
+            .iter()
+            .map(|p| p.to_string_lossy().to_string())
+            .collect(),
+        pon_path,
+        BcftoolsConfig::default(),
+    )?;
+
+    Ok(())
+}
+
+pub fn find_nanomonsv_dirs(
+    root: &Path,
+    time_point: &str,
+    depth: usize,
+    max_depth: usize,
+) -> Vec<PathBuf> {
+    if depth >= max_depth {
+        return Vec::new();
+    }
+
+    let entries: Vec<_> = match fs::read_dir(root) {
+        Ok(entries) => entries.filter_map(Result::ok).collect(),
+        Err(_) => return Vec::new(),
+    };
+
+    let mut result: Vec<PathBuf> = entries
+        .iter()
+        .filter_map(|entry| {
+            let path = entry.path();
+            if entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false)
+                && path
+                    .to_string_lossy()
+                    .contains(&format!("{}/nanomonsv", time_point))
+            {
+                Some(path)
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    result.extend(
+        entries
+            .iter()
+            .filter(|entry| entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false))
+            .flat_map(|dir| find_nanomonsv_dirs(&dir.path(), time_point, depth + 1, max_depth)),
+    );
+
+    result
+}
+
+pub fn replace_filename_suffix(path: &Path, from: &str, to: &str) -> PathBuf {
+    let file_name = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
+
+    let new_file_name = file_name.replace(from, to);
+
+    path.with_file_name(new_file_name)
+}

+ 1 - 1
src/collection/bam.rs

@@ -40,7 +40,7 @@ impl Bam {
         let stem = path
             .clone()
             .file_stem()
-            .context("Can't parse stem from {path}")?
+            .context(format!("Can't parse stem from {}", path.display()))?
             .to_string_lossy()
             .to_string();
         let stem: Vec<&str> = stem.split('_').collect();

+ 39 - 9
src/commands/bcftools.rs

@@ -1,3 +1,4 @@
+use anyhow::Context;
 use std::fs;
 use uuid::Uuid;
 
@@ -26,15 +27,7 @@ pub fn bcftools_keep_pass(
     let tmp_file = format!("/tmp/{}", Uuid::new_v4());
 
     // First sort
-    let mut cmd_run = CommandRun::new(
-        &config.bin,
-        &[
-            "sort",
-            input,
-            "-o",
-            &tmp_file,
-        ],
-    );
+    let mut cmd_run = CommandRun::new(&config.bin, &["sort", input, "-o", &tmp_file]);
     let _ = run_wait(&mut cmd_run)?;
 
     // Then filter
@@ -56,3 +49,40 @@ pub fn bcftools_keep_pass(
     fs::remove_file(tmp_file)?;
     Ok(res)
 }
+
+pub fn bcftools_concat(
+    inputs: Vec<String>,
+    output: &str,
+    config: BcftoolsConfig,
+) -> anyhow::Result<RunReport> {
+    let tmp_file = format!("/tmp/{}", Uuid::new_v4());
+    fs::write(&tmp_file, inputs.join("\n"))?;
+
+    let args = [
+        "concat",
+        "--write-index",
+        "--threads",
+        &config.threads.to_string(),
+        "-a",
+        "-D",
+        "-f",
+        &tmp_file,
+        "-o",
+        output,
+    ];
+    // Then filter
+    let mut cmd_run = CommandRun::new(&config.bin, &args);
+    let res = run_wait(&mut cmd_run)?;
+    fs::remove_file(tmp_file)?;
+    Ok(res)
+}
+
+pub fn bcftools_keep_only_in_a(a: &str, b: &str, out: &str, config: &BcftoolsConfig) -> anyhow::Result<()> {
+    let args = ["isec", "-C", "-w", "1", a, b, "-o", out];
+    let mut cmd_run = CommandRun::new(&config.bin, &args);
+    let _ = run_wait(&mut cmd_run).context(format!(
+        "Error while running bcftools isec {}",
+        args.join(" ")
+    ))?;
+    Ok(())
+}

+ 1 - 0
src/commands/mod.rs

@@ -1,2 +1,3 @@
 pub mod dorado;
 pub mod bcftools;
+pub mod modkit;

+ 76 - 0
src/commands/modkit.rs

@@ -0,0 +1,76 @@
+use std::{fs, path::Path};
+
+use anyhow::Context;
+
+use crate::runners::{run_wait, CommandRun};
+
+#[derive(Debug)]
+pub struct ModkitConfig {
+    pub bin: String,
+    pub crabz_bin: String,
+    pub tabix_bin: String,
+    pub threads: u8,
+}
+
+impl Default for ModkitConfig {
+    fn default() -> Self {
+        Self {
+            bin: "modkit".to_string(),
+            crabz_bin: "crabz".to_string(),
+            tabix_bin: "tabix".to_string(),
+            threads: 155,
+        }
+    }
+}
+
+pub fn bed_methyl(bam: &Path, config: &ModkitConfig) -> anyhow::Result<()> {
+    let dir = bam
+        .parent()
+        .context(format!("No parent dir for {}", bam.display()))?;
+    let meth_dir = dir.join("5mC_5hmC");
+    if !meth_dir.exists() {
+        fs::create_dir(&meth_dir).context(format!("Can't create {}", meth_dir.display()))?;
+    }
+    let time_point_dir = dir
+        .parent()
+        .context(format!("No parent dir for {}", dir.display()))?;
+    let time_point = time_point_dir.file_name().unwrap().to_str().unwrap();
+
+    let id_dir = time_point_dir
+        .parent()
+        .context(format!("No parent dir for {}", time_point_dir.display()))?;
+    let id = id_dir.file_name().unwrap().to_str().unwrap();
+
+    let pileup_bed = meth_dir.join(format!("{id}_${time_point}_5mC_5hmC_modPileup.bed"));
+
+    let pileup_bed_str = pileup_bed.to_str().unwrap();
+    let mut cmd_run = CommandRun::new(
+        &config.bin,
+        &[
+            "pileup",
+            "-t",
+            &config.threads.to_string(),
+            bam.to_str().unwrap(),
+            pileup_bed_str,
+            "--log-filepath",
+            meth_dir.join("pileup.log").to_str().unwrap(),
+        ],
+    );
+    let _ = run_wait(&mut cmd_run)?;
+
+    let mut cmd_compress =
+        CommandRun::new(&config.crabz_bin, &["-I", "-f", "bgzf", pileup_bed_str]);
+    let _ = run_wait(&mut cmd_compress).context(format!(
+        "Error while runnig crabz for {}",
+        pileup_bed.display()
+    ))?;
+
+    let final_bed = format!("{pileup_bed_str}.gz");
+    let mut cmd_tabix = CommandRun::new(&config.tabix_bin, &[&final_bed]);
+    let _ =
+        run_wait(&mut cmd_tabix).context(format!("Error while runnig tabix for {final_bed}"))?;
+
+    Ok(())
+}
+
+// odkit dmr pair -m C -t 160 --ref /data/ref/hs1/chm13v2.0.fa -r /data/ref/hs1/chm13v2.0_CGI_corrected.bed -a /data/longreads_basic_pipe/BACOU/mrd/5mC_5hmC/BACOU_mrd_5mC_5hmC.bed.gz -b /data/longreads_basic_pipe/BACOU/diag/5mC_5hmC/BACOU_diag_5mC_5hmC.bed.gz -o /data/longreads_basic_pipe/BACOU/diag/5mC_5hmC/BACOU_diag_5mC_5hmC_DMR_C_mrd.bed.gz

+ 22 - 1
src/lib.rs

@@ -21,11 +21,12 @@ lazy_static! {
 mod tests {
     use std::{fs, path::Path};
 
+    use callers::nanomonsv::nanomonsv_create_pon;
     use log::info;
 
     use self::{callers::deep_variant::DeepVariantConfig, collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{callers::{clairs::{ClairS, ClairSConfig}, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVConfig}}, collection::{bam::{self, BamType}, run_tasks, variants::VariantsCollection, vcf::VcfCollection, Collections, CollectionsConfig}, commands::{bcftools::{bcftools_keep_pass, BcftoolsConfig}, dorado::Dorado}};
+    use crate::{callers::{clairs::{ClairS, ClairSConfig}, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVConfig, NanomonSVSolo}}, collection::{bam::{self, BamType}, run_tasks, variants::VariantsCollection, vcf::VcfCollection, Collections, CollectionsConfig}, commands::{bcftools::{bcftools_keep_pass, BcftoolsConfig}, dorado::Dorado}};
 
     fn init() {
         let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
@@ -137,6 +138,20 @@ mod tests {
         NanomonSV::new("test_a", "/data/test_data/subset.bam", "/data/test_data/subset_mrd.bam", config).run()
     }
 
+    #[test]
+    fn nanomonsv_solo() -> anyhow::Result<()> {
+        init();
+        // let id = "CAZIER";
+        // let time_point = "diag";
+        // NanomonSVSolo::new(id, &format!("/data/longreads_basic_pipe/{id}/{time_point}/{id}_{time_point}_hs1.bam"), time_point, NanomonSVConfig::default()).run()?;
+        let time_point = "mrd";
+        for id in ["MERY", "CAMARA", "FRANIATTE", "FERATI", "IQBAL", "COLLE", "JOLIVET", "BAFFREAU", "MANCUS", "BELARBI", "BENGUIRAT", "HENAUX", "MEDDAH"] {
+
+        NanomonSVSolo::new(id, &format!("/data/longreads_basic_pipe/{id}/{time_point}/{id}_{time_point}_hs1.bam"), time_point, NanomonSVConfig::default()).run()?;
+        }
+        Ok(())
+    }
+
     // cargo test run -- --nocapture; ~/run_scripts/notify_finish.sh &
     #[test]
     fn todo_all() -> anyhow::Result<()> {
@@ -224,4 +239,10 @@ mod tests {
         collections.todo_assembler()?;
         Ok(())
     }
+
+    #[test]
+    fn sv_pon() -> anyhow::Result<()> {
+        init();
+        nanomonsv_create_pon(NanomonSVConfig::default(), "/data/ref/hs1/nanomonsv_pon.vcf.gz")
+    }
 }