Thomas 1 year ago
parent
commit
b74a501c6c
5 changed files with 352 additions and 40 deletions
  1. 219 5
      Cargo.lock
  2. 4 0
      Cargo.toml
  3. 81 28
      src/bam.rs
  4. 16 3
      src/lib.rs
  5. 32 4
      src/pod5.rs

+ 219 - 5
Cargo.lock

@@ -518,7 +518,7 @@ dependencies = [
  "arrow-select 49.0.0",
  "num",
  "regex",
- "regex-syntax",
+ "regex-syntax 0.8.4",
 ]
 
 [[package]]
@@ -535,7 +535,7 @@ dependencies = [
  "memchr",
  "num",
  "regex",
- "regex-syntax",
+ "regex-syntax 0.8.4",
 ]
 
 [[package]]
@@ -1389,6 +1389,19 @@ name = "log"
 version = "0.4.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+dependencies = [
+ "value-bag",
+]
+
+[[package]]
+name = "logtest"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb3e43a8657c1d64516dcc9db8ca03826a4aceaf89d5ce1b37b59f6ff0e43026"
+dependencies = [
+ "lazy_static",
+ "log",
+]
 
 [[package]]
 name = "lzma-sys"
@@ -1401,6 +1414,15 @@ dependencies = [
  "pkg-config",
 ]
 
+[[package]]
+name = "matchers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
+dependencies = [
+ "regex-automata 0.1.10",
+]
+
 [[package]]
 name = "memchr"
 version = "2.7.4"
@@ -1452,6 +1474,16 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.3"
@@ -1579,6 +1611,12 @@ dependencies = [
  "vcpkg",
 ]
 
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "pandora_lib_bindings"
 version = "0.1.0"
@@ -1627,11 +1665,15 @@ dependencies = [
  "env_logger 0.11.3",
  "glob",
  "log",
+ "logtest",
  "pandora_lib_bindings",
  "pandora_lib_pileup",
  "pandora_lib_pod5",
  "regex",
  "serde",
+ "test-log",
+ "tracing",
+ "tracing-test",
 ]
 
 [[package]]
@@ -1760,8 +1802,17 @@ checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata",
- "regex-syntax",
+ "regex-automata 0.4.7",
+ "regex-syntax 0.8.4",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
 ]
 
 [[package]]
@@ -1772,9 +1823,15 @@ checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.8.4",
 ]
 
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
 [[package]]
 name = "regex-syntax"
 version = "0.8.4"
@@ -1894,6 +1951,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -1984,6 +2050,28 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "test-log"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3dffced63c2b5c7be278154d76b479f9f9920ed34e7574201407f0b14e2bbb93"
+dependencies = [
+ "env_logger 0.11.3",
+ "test-log-macros",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "test-log-macros"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5999e24eaa32083191ba4e425deb75cdf25efefabe5aaccb7446dd0d4122a3f5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.71",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.62"
@@ -2004,6 +2092,16 @@ dependencies = [
  "syn 2.0.71",
 ]
 
+[[package]]
+name = "thread_local"
+version = "1.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+]
+
 [[package]]
 name = "tiny-keccak"
 version = "2.0.2"
@@ -2058,6 +2156,88 @@ dependencies = [
  "syn 2.0.71",
 ]
 
+[[package]]
+name = "tracing"
+version = "0.1.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.71",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+dependencies = [
+ "once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+]
+
+[[package]]
+name = "tracing-test"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
+dependencies = [
+ "tracing-core",
+ "tracing-subscriber",
+ "tracing-test-macro",
+]
+
+[[package]]
+name = "tracing-test-macro"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
+dependencies = [
+ "quote",
+ "syn 2.0.71",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.15"
@@ -2112,6 +2292,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "valuable"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
+
+[[package]]
+name = "value-bag"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101"
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
@@ -2184,6 +2376,22 @@ version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
 
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
 [[package]]
 name = "winapi-util"
 version = "0.1.8"
@@ -2193,6 +2401,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
 [[package]]
 name = "windows-core"
 version = "0.52.0"

+ 4 - 0
Cargo.toml

@@ -15,4 +15,8 @@ regex = "1.10.5"
 chrono = "0.4.38"
 csv = "1.3.0"
 serde = { version = "1.0.204", features = ["derive"] }
+tracing-test = "0.2.5"
+tracing = "0.1.40"
+logtest = "2.0.0"
+test-log = "0.2.16"
 

+ 81 - 28
src/bam.rs

@@ -5,6 +5,8 @@ use std::{
 };
 
 use anyhow::{anyhow, Context};
+use glob::glob;
+use log::{info, warn};
 use pandora_lib_bindings::{
     progs::cramino::{Cramino, CraminoRes},
     utils::RunBin,
@@ -14,39 +16,83 @@ use pandora_lib_bindings::{
 pub struct Bam {
     pub id: String,
     pub time_point: String,
+    pub reference_genome: String,
+    pub bam_type: BamType,
     pub path: PathBuf,
     pub file_metadata: Metadata,
-    pub cramino: CraminoRes,
+    pub cramino: Option<CraminoRes>,
+}
+
+#[derive(Debug, PartialEq)]
+pub enum BamType {
+    WGS,
+    Panel(String),
+    ChIP(String),
 }
 
 impl Bam {
-    pub fn new(bam_dir: &str, id: &str, time_point: &str) -> anyhow::Result<Self> {
-        let bam_path = format!("{bam_dir}/{id}/{time_point}/{id}_{time_point}_hs1.bam");
-        let path = PathBuf::from_str(&bam_path)?;
-        if !path.exists() {
-            return Err(anyhow!("Bam file missing {bam_path}"));
+    pub fn new(path: PathBuf) -> anyhow::Result<Self> {
+        let stem = path
+            .clone()
+            .file_stem()
+            .context("Can't parse stem from {path}")?
+            .to_string_lossy()
+            .to_string();
+        let stem: Vec<&str> = stem.split('_').collect();
+
+        if stem.len() > 4 || stem.len() < 3 {
+            return Err(anyhow!("Error in bam name formating"));
         }
 
-        let tp_dir = path.parent().context("Can't parse parent from: {bam_path}")?;
-        let cramino_path = format!("{}/{id}_{time_point}_hs1_cramino.txt", tp_dir.to_string_lossy());
+        let id = stem[0].to_string();
+        let time_point = stem[1].to_string();
+        let reference_genome = stem.last().context("Can't get last from stem {stem}")?.to_string();
+
+        let bam_type = if stem.len() == 4 {
+            match stem[2] {
+                "oncoT" => BamType::Panel("oncoT".to_string()),
+                "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
+                "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
+                _ => return Err(anyhow!("Error in bam name formating {}", path.display())),
+            }
+        } else {
+            BamType::WGS
+        };
+
+        let tp_dir = path
+            .parent()
+            .context("Can't parse parent from: {bam_path}")?;
+        let cramino_path = format!(
+            "{}/{id}_{time_point}_hs1_cramino.txt",
+            tp_dir.to_string_lossy()
+        );
         let file_metadata = fs::metadata(&path)?;
-        if !PathBuf::from_str(&cramino_path)?.exists() {
-            return Err(anyhow!("Cramino file missing {cramino_path}"));
-        }
-        let mut cramino = Cramino::default().with_result_path(&cramino_path);
-        cramino.parse_results()?;
 
-        if let Some(cramino) = cramino.results {
-            Ok(Self {
-                path,
-                file_metadata,
-                cramino,
-                id: id.to_string(),
-                time_point: time_point.to_string(),
-            })
+        let cramino = if bam_type == BamType::WGS {
+            if !PathBuf::from_str(&cramino_path)?.exists() {
+                return Err(anyhow!("Cramino file missing {cramino_path}"));
+            }
+            let mut cramino = Cramino::default().with_result_path(&cramino_path);
+            cramino.parse_results()?;
+
+            if let Some(cramino) = cramino.results {
+                Some(cramino)
+            } else {
+                return Err(anyhow!("Cramino results parsing failed"));
+            }
         } else {
-            Err(anyhow!("Cramino results parsing failed"))
-        }
+            None
+        };
+
+        Ok(Self {
+            path,
+            file_metadata,
+            cramino,
+            id: id.to_string(),
+            time_point: time_point.to_string(),
+            bam_type,
+            reference_genome,
+        })
     }
 }
 
@@ -55,14 +101,21 @@ pub struct BamCollection {
     pub bams: Vec<Bam>,
 }
 
-pub fn load_bam_collection(ids: Vec<String>, bam_dir: &str, time_points: Vec<&str>) -> BamCollection {
+pub fn load_bam_collection(
+    bam_dir: &str,
+) -> BamCollection {
     let mut bams = Vec::new();
+    let pattern = format!("{}/**/*.bam", bam_dir);
 
-    for id in ids {
-        for tp in time_points.iter() {
-            if let std::result::Result::Ok(bam) = Bam::new(bam_dir, &id,tp) {
-                bams.push(bam);
+    for entry in glob(&pattern).expect("Failed to read glob pattern") {
+        match entry {
+            Ok(path) => {
+                match Bam::new(path) {
+                    Ok(bam) => bams.push(bam),
+                    Err(e) => warn!("{e}"),
+                }
             }
+            Err(e) => warn!("Error: {:?}", e),
         }
     }
 

+ 16 - 3
src/lib.rs

@@ -5,6 +5,13 @@ pub mod pod5;
 
 #[cfg(test)]
 mod tests {
+
+    use std::fs;
+
+    use log::{info, warn};
+
+    use crate::bam::BamType;
+
     use self::{bam::Bam, dorado::Run, pod5::Runs};
     use super::*;
 
@@ -71,10 +78,16 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[test_log::test]
     fn bam() -> anyhow::Result<()> {
-        // let bam_collection = ;
-        // println!("{bam:#?}");
+        let bam_collection = bam::load_bam_collection(
+            "/data/longreads_basic_pipe",
+        );
+
+        // bam_collection.bams.iter().filter(|b| b.bam_type == BamType::Panel(_)).for_each(|b| println!("{b:#?}"));
+        bam_collection.bams.iter().filter(|b| matches!(b.bam_type, BamType::Panel(_))).for_each(|b| println!("{b:#?}"));
+
+        // println!("{:#?}", bam_collection.bams);
         Ok(())
     }
 }

+ 32 - 4
src/pod5.rs

@@ -113,10 +113,16 @@ pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
 
     for entry in glob(&pattern).expect("Failed to read glob pattern") {
         match entry {
-            Ok(path) => match Pod5::from_path(&path, &conf) {
-                Ok(pod5) => pod_files.push(pod5),
-                Err(e) => warn!("{e}"),
-            },
+            Ok(path) => {
+                let p = path.to_str().context("Can't parse path to string {path}")?;
+                if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
+                    continue;
+                }
+                match Pod5::from_path(&path, &conf) {
+                    Ok(pod5) => pod_files.push(pod5),
+                    Err(e) => warn!("{e}"),
+                }
+            }
             Err(e) => warn!("Error: {:?}", e),
         }
     }
@@ -459,6 +465,28 @@ impl Runs {
             }
         }
     }
+
+    pub fn ids(&self) -> Vec<String> {
+        let mut ids: Vec<String> = self
+            .runs
+            .iter()
+            .flat_map(|r| {
+                r.flowcells
+                    .iter()
+                    .flat_map(|f| {
+                        return f
+                            .cases
+                            .iter()
+                            .map(|c| c.id.clone())
+                            .collect::<Vec<String>>();
+                    })
+                    .collect::<Vec<String>>()
+            })
+            .collect();
+        ids.sort();
+        ids.dedup();
+        ids
+    }
 }
 
 #[derive(Debug, Deserialize, Clone)]