Thomas 1 vuosi sitten
vanhempi
commit
8fdc44f8b5
6 muutettua tiedostoa jossa 156 lisäystä ja 58 poistoa
  1. 1 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 14 15
      src/dorado.rs
  4. 3 9
      src/lib.rs
  5. 11 12
      src/modkit.rs
  6. 126 22
      src/pod5.rs

+ 1 - 0
Cargo.lock

@@ -931,6 +931,7 @@ name = "pandora_lib_promethion"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "chrono",
  "env_logger",
  "glob",
  "log",

+ 1 - 0
Cargo.toml

@@ -10,3 +10,4 @@ anyhow = "1.0.86"
 glob = "0.3.1"
 pandora_lib_pod5 = { git = "https://git.t0m4.fr/Thomas/pandora_lib_pod5.git" }
 regex = "1.10.5"
+chrono = "0.4.38"

+ 14 - 15
src/dorado.rs

@@ -1,8 +1,8 @@
 use std::{
     fs,
-    io::{self, BufRead, BufReader, Write},
+    io::{BufRead, BufReader},
     process::Command,
-    time::{Duration, SystemTime},
+    time::SystemTime,
 };
 
 pub trait Run {
@@ -23,7 +23,7 @@ pub struct Dorado {
     start_time: SystemTime,
     end_time: SystemTime,
     is_done: bool,
-    log: Vec<String>
+    log: Vec<String>,
 }
 
 impl Dorado {
@@ -59,7 +59,7 @@ impl Run for Dorado {
 
         if !std::path::Path::new(ref_mmi).exists() {
             Command::new("minimap2")
-                .args(&["-x", "map-ont", "-d", ref_mmi, ref_fa])
+                .args(["-x", "map-ont", "-d", ref_mmi, ref_fa])
                 .output()
                 .expect("Failed to execute minimap2");
         }
@@ -76,7 +76,7 @@ impl Run for Dorado {
 
         if !std::path::Path::new(&bam).exists() {
             let dorado_output = Command::new(dorado_bin)
-                .args(&[
+                .args([
                     "basecaller",
                     "sup,5mC_5hmC",
                     pod_dir,
@@ -95,7 +95,7 @@ impl Run for Dorado {
             }
 
             let samtools_view_output = Command::new("samtools")
-                .args(&["view", "-h", "-@ 20", "-b", "/dev/stdin"])
+                .args(["view", "-h", "-@ 20", "-b", "/dev/stdin"])
                 .stdin(dorado_output.stdout.unwrap())
                 .stdout(std::process::Stdio::piped())
                 .stderr(std::process::Stdio::piped())
@@ -107,7 +107,7 @@ impl Run for Dorado {
             }
 
             Command::new("samtools")
-                .args(&["sort", "-@ 30", "/dev/stdin", "-o", &bam])
+                .args(["sort", "-@ 30", "/dev/stdin", "-o", &bam])
                 .stdin(samtools_view_output.stdout.unwrap())
                 .output()
                 .expect("Failed to execute samtools sort");
@@ -141,7 +141,7 @@ impl Run for Dorado {
             // }
 
             Command::new("samtools")
-                .args(&["index", "-@ 150", &bam])
+                .args(["index", "-@ 150", &bam])
                 .output()
                 .expect("Failed to execute samtools index");
         }
@@ -151,16 +151,15 @@ impl Run for Dorado {
             println!("[pipe] Quality control of BAM: {}", bam);
 
             Command::new("cramino")
-                .args(&["-t", "150", "--hist", "--checksum", "--karyotype", &bam])
+                .args(["-t", "150", "--hist", "--checksum", "--karyotype", &bam])
                 .output()
-                .expect("Failed to execute cramino")
-                .stdout;
+                .expect("Failed to execute cramino");
         }
 
         let mod_summary = format!("{}/{}_{}_5mC_5hmC_summary.txt", time_dir, name, time);
         if !std::path::Path::new(&mod_summary).exists() {
             Command::new("modkit")
-                .args(&["summary", "-t", "50", &bam])
+                .args(["summary", "-t", "50", &bam])
                 .output()
                 .expect("Failed to execute modkit summary");
         }
@@ -168,13 +167,13 @@ impl Run for Dorado {
         let fastq = format!("{}/{}/{}/{}_{}.fastq.gz", case_dir, name, time, name, time);
         if !std::path::Path::new(&fastq).exists() {
             Command::new("samtools")
-                .args(&["fastq", "-@ 150", &bam])
+                .args(["fastq", "-@ 150", &bam])
                 .stdout(std::process::Stdio::piped())
                 .spawn()
                 .expect("Failed to execute samtools fastq");
 
             Command::new("crabz")
-                .args(&["-f", "bgzf", "-", "-o", &fastq])
+                .args(["-f", "bgzf", "-", "-o", &fastq])
                 .stdin(std::process::Stdio::piped())
                 .output()
                 .expect("Failed to execute crabz");
@@ -199,7 +198,7 @@ fn print_stderr(stderr: std::process::ChildStderr, save: &mut Vec<String>) {
             Ok(line) => {
                 eprintln!("{}", line);
                 save.push(line);
-            },
+            }
             Err(err) => eprintln!("Error reading stderr: {}", err),
         }
     }

+ 3 - 9
src/lib.rs

@@ -2,14 +2,9 @@ pub mod dorado;
 pub mod modkit;
 pub mod pod5;
 
-pub fn add(left: usize, right: usize) -> usize {
-    left + right
-}
-
 #[cfg(test)]
 mod tests {
-    use self::dorado::Run;
-
+    use self::{dorado::Run, pod5::Runs};
     use super::*;
 
     #[test]
@@ -35,9 +30,8 @@ mod tests {
         let _ =
             env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
                 .build();
-        let files = pod5::list_pod_files("/data/run_data")?;
-        println!("{files:#?}");
-        println!("{}", files.len());
+        let runs = Runs::import_dir("/data/run_data")?;
+        runs.print_info();
         Ok(())
     }
 }

+ 11 - 12
src/modkit.rs

@@ -1,7 +1,6 @@
 use std::{
-    io::{self, BufRead, BufReader, Write},
+    io::{BufRead, BufReader},
     process::Command,
-    time::Duration,
 };
 
 pub fn modkit(bam_path: &str) {
@@ -54,16 +53,16 @@ pub fn modkit(bam_path: &str) {
     child.wait().expect("Command didn't finish");
 }
 
-fn print_stderr(stderr: std::process::ChildStderr) {
-    let stderr_reader = BufReader::new(stderr);
-    for line in stderr_reader.lines() {
-        match line {
-            Ok(line) => eprintln!("{}", line),
-            Err(err) => eprintln!("Error reading stderr: {}", err),
-        }
-    }
-}
-
+// fn print_stderr(stderr: std::process::ChildStderr) {
+//     let stderr_reader = BufReader::new(stderr);
+//     for line in stderr_reader.lines() {
+//         match line {
+//             Ok(line) => eprintln!("{}", line),
+//             Err(err) => eprintln!("Error reading stderr: {}", err),
+//         }
+//     }
+// }
+//
 fn print_stdout(stdout: std::process::ChildStdout) {
     let stdout_reader = BufReader::new(stdout);
     for line in stdout_reader.lines() {

+ 126 - 22
src/pod5.rs

@@ -1,25 +1,44 @@
 use anyhow::{anyhow, Context};
+use chrono::{DateTime, Utc};
 use glob::glob;
 use log::warn;
 use pandora_lib_pod5::Pod5Info;
-use regex::Regex;
-use std::{path::PathBuf, str::FromStr, usize};
-
-#[derive(Debug)]
+use std::{
+    collections::HashMap,
+    fmt::Display,
+    fs::{self, Metadata},
+    os::unix::fs::MetadataExt,
+    path::PathBuf,
+    usize,
+};
+
+#[derive(Debug, Clone)]
 pub struct Pod5 {
     pub path: String,
     pub pod5_type: Pod5Type,
     pub run_name: String,
     pub flowcell_name: String,
+    pub file_metadata: Metadata,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub enum Pod5Type {
     Raw,
     Demuxed,
 }
 
-#[derive(Debug)]
+/// Renders a [`Pod5Type`] as its lowercase label: `raw` for
+/// [`Pod5Type::Raw`] and `demuxed` for [`Pod5Type::Demuxed`].
+impl Display for Pod5Type {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            Pod5Type::Raw => "raw",
+            Pod5Type::Demuxed => "demuxed",
+        })
+    }
+}
+
+#[derive(Debug, Clone)]
 pub struct Pod5Config {
     pub base_dir: String,
     pub type_raw: String,
@@ -53,14 +72,19 @@ impl Pod5 {
             return Err(anyhow!("Can't find the pod5 type {s}"));
         };
 
+        let file_metadata = fs::metadata(path)?;
+
         let sr = s.replace(&config.base_dir, "");
-        let components: Vec<&str> = sr
-            .split("/")
-            .filter(|c| !c.is_empty())
-            .collect();
+        let components: Vec<&str> = sr.split('/').filter(|c| !c.is_empty()).collect();
 
-        let run_name = components.get(config.run_dir_n as usize).context("Can't get run_name")?.to_string();
-        let flowcell_name = components.get(config.flowcell_dir_n as usize).context("Can't get flowcell_name")?.to_string();
+        let run_name = components
+            .get(config.run_dir_n as usize)
+            .context("Can't get run_name")?
+            .to_string();
+        let flowcell_name = components
+            .get(config.flowcell_dir_n as usize)
+            .context("Can't get flowcell_name")?
+            .to_string();
 
         // let info = Pod5Info::from_pod5(s);
 
@@ -68,7 +92,8 @@ impl Pod5 {
             path: s.to_string(),
             pod5_type,
             run_name,
-            flowcell_name
+            flowcell_name,
+            file_metadata,
         })
     }
 }
@@ -86,21 +111,100 @@ pub fn list_pod_files(dir: &str) -> anyhow::Result<Vec<Pod5>> {
             Err(e) => warn!("Error: {:?}", e),
         }
     }
-
     Ok(pod_files)
 }
 
-
 #[derive(Debug)]
-struct Run {
-    run_name: String,
-    flowcells: Vec<Flowcell>
+pub struct Run {
+    pub run_name: String,
+    pub flowcells: Vec<FlowCell>,
+}
+
+#[derive(Debug, Clone)]
+pub struct FlowCell {
+    pub flowcell_name: String,
+    pub run_name: String,
+    pub pod5_type: Pod5Type,
+    pub pod5_info: Pod5Info,
+    pub pod5: Vec<Pod5>,
 }
 
 #[derive(Debug)]
-struct Flowcell {
-    flowcell_name: String,
-    pod5_type: String,
-    info: Pod5Info
+pub struct Runs {
+    pub importation_date: DateTime<Utc>,
+    pub runs: Vec<Run>,
+}
+
+impl Runs {
+    /// Scans `dir` for pod5 files and groups them into runs and flow cells.
+    ///
+    /// Files are grouped by `(run_name, flowcell_name)`; each flow cell reads its
+    /// [`Pod5Info`] from its first file. Runs and flow cells are sorted by name.
+    /// Errors from [`list_pod_files`] are returned unchanged.
+    pub fn import_dir(dir: &str) -> anyhow::Result<Self> {
+        // Tuple key: a "run-flowcell" string is ambiguous when names contain '-'.
+        let mut by_flowcell: HashMap<(String, String), Vec<Pod5>> = HashMap::new();
+        for pod in list_pod_files(dir)? {
+            let key = (pod.run_name.clone(), pod.flowcell_name.clone());
+            by_flowcell.entry(key).or_default().push(pod);
+        }
+        let mut by_run: HashMap<String, Vec<FlowCell>> = HashMap::new();
+        for ((run_name, flowcell_name), pods) in by_flowcell {
+            // Never empty: a group only exists once a file was pushed into it.
+            let first = &pods[0];
+            let flow_cell = FlowCell {
+                flowcell_name,
+                run_name: run_name.clone(),
+                pod5_type: first.pod5_type.clone(),
+                pod5_info: Pod5Info::from_pod5(&first.path),
+                pod5: pods,
+            };
+            by_run.entry(run_name).or_default().push(flow_cell);
+        }
+
+        let mut runs: Vec<Run> = by_run
+            .into_iter()
+            .map(|(run_name, mut flowcells)| {
+                flowcells.sort_by(|a, b| a.flowcell_name.cmp(&b.flowcell_name));
+                Run {
+                    run_name,
+                    flowcells,
+                }
+            })
+            .collect();
+        runs.sort_by(|a, b| a.run_name.cmp(&b.run_name));
+        Ok(Self {
+            importation_date: Utc::now(),
+            runs,
+        })
+    }
+
+    /// Prints one tab-separated line per flow cell to stdout.
+    ///
+    /// Columns: run name, first and last file time (creation, else modification,
+    /// else `NA`), file count, size in bytes, flow cell name, type, acquisition id.
+    pub fn print_info(&self) {
+        let show = |d: Option<DateTime<Utc>>| d.map_or("NA".to_string(), |d| d.to_string());
+        for run in &self.runs {
+            for fc in &run.flowcells {
+                let metas = fc.pod5.iter().map(|p| &p.file_metadata);
+                // `created()` fails where birth time is unsupported: use mtime then.
+                let times = metas.filter_map(|m| m.created().or_else(|_| m.modified()).ok());
+                let dates: Vec<DateTime<Utc>> = times.map(DateTime::<Utc>::from).collect();
+                let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
+                let s = [
+                    run.run_name.clone(),
+                    show(dates.iter().copied().min()),
+                    show(dates.iter().copied().max()),
+                    fc.pod5.len().to_string(),
+                    total_size.to_string(),
+                    fc.flowcell_name.to_string(),
+                    fc.pod5_type.to_string(),
+                    fc.pod5_info.acquisition_id.clone(),
+                ];
+                println!("{}", s.join("\t"));
+            }
+        }
+    }
 }