Thomas 3 ay önce
ebeveyn
işleme
5d8b2bf45e
4 değiştirilmiş dosya ile 163 ekleme ve 177 silme
  1. 0 97
      src/cases/case.rs
  2. 0 2
      src/cases/mod.rs
  3. 163 77
      src/collection/pod5.rs
  4. 0 1
      src/lib.rs

+ 0 - 97
src/cases/case.rs

@@ -1,97 +0,0 @@
-use rusqlite::{Connection, OptionalExtension};
-
-use crate::config::Config;
-
-#[derive(Debug)]
-pub struct Case {
-    pub id: String,
-    pub n_runs: CaseType,
-}
-
-#[derive(Debug)]
-pub enum CaseType {
-    Paired { n_runs_diag: u8, n_runs_mrd: u8 },
-    Diag { n_runs: u8 },
-}
-
-pub struct DBCases {
-    conn: Connection,
-}
-
-impl DBCases {
-    pub fn new(config: &Config) -> anyhow::Result<Self> {
-        let conn = Connection::open(&config.db_cases_path)?;
-        Ok(DBCases { conn })
-    }
-
-    pub fn add_case(&self, case: &Case) -> anyhow::Result<()> {
-        match case.n_runs {
-            CaseType::Paired {
-                n_runs_diag,
-                n_runs_mrd,
-            } => {
-                self.conn.execute(
-                    "INSERT INTO cases (id, type, n_runs_diag, n_runs_mrd) VALUES (?1, ?2, ?3, ?4)",
-                    (&case.id, "Paired", n_runs_diag, n_runs_mrd),
-                )?;
-            }
-            CaseType::Diag { n_runs } => {
-                self.conn.execute(
-                    "INSERT INTO cases (id, type, n_runs) VALUES (?1, ?2, ?3)",
-                    (&case.id, "Diag", n_runs),
-                )?;
-            }
-        }
-        Ok(())
-    }
-
-    pub fn remove_case(&self, id: &str) -> anyhow::Result<()> {
-        self.conn.execute("DELETE FROM cases WHERE id = ?1", [id])?;
-        Ok(())
-    }
-
-    pub fn search_case(&self, id: &str) -> anyhow::Result<Option<Case>> {
-        let mut stmt = self
-            .conn
-            .prepare("SELECT id, type, n_runs_diag, n_runs_mrd, n_runs FROM cases WHERE id = ?1")?;
-        let case = stmt
-            .query_row([id], |row| {
-                let case_type: String = row.get(1)?;
-                let n_runs = match case_type.as_str() {
-                    "Paired" => CaseType::Paired {
-                        n_runs_diag: row.get(2)?,
-                        n_runs_mrd: row.get(3)?,
-                    },
-                    "Diag" => CaseType::Diag {
-                        n_runs: row.get(4)?,
-                    },
-                    _ => {
-                        return Err(rusqlite::Error::InvalidColumnType(
-                            1,
-                            "Unknown case type".into(),
-                            rusqlite::types::Type::Text,
-                        ))
-                    }
-                };
-                Ok(Case {
-                    id: row.get(0)?,
-                    n_runs,
-                })
-            }).optional()?;
-        Ok(case)
-    }
-
-    pub fn create_table(&self) -> anyhow::Result<()> {
-        self.conn.execute(
-            "CREATE TABLE IF NOT EXISTS cases (
-                id TEXT PRIMARY KEY,
-                type TEXT NOT NULL,
-                n_runs_diag INTEGER,
-                n_runs_mrd INTEGER,
-                n_runs INTEGER
-            )",
-            [],
-        )?;
-        Ok(())
-    }
-}

+ 0 - 2
src/cases/mod.rs

@@ -1,2 +0,0 @@
-pub mod case;
-

+ 163 - 77
src/collection/pod5.rs

@@ -1,14 +1,15 @@
 use std::{
-    fmt,
-    fs::File,
-    io::{BufReader, Write},
-    path::{Path, PathBuf},
+    collections::HashSet, fmt, fs, path::{Path, PathBuf}
 };
 
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 
-use crate::{helpers::{human_size, list_files_with_ext}, io::pod5_infos::Pod5Info};
+use crate::{
+    collection::flowcells::IdInput,
+    helpers::{human_size, list_files_with_ext},
+    io::pod5_infos::Pod5Info,
+};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Pod5 {
@@ -76,13 +77,17 @@ impl Pod5 {
     }
 }
 
-#[derive(Debug)]
-pub struct Pod5sFlowCell {
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Pod5sRun {
+    pub run_id: String,
     pub flow_cell_id: String,
+    pub sequencing_kit: String,
+    pub cases: Vec<IdInput>,
     pub pod5s: Vec<Pod5>,
+    pub dir: PathBuf,
 }
 
-impl Pod5sFlowCell {
+impl Pod5sRun {
     /// Load all `.pod5` files from a directory and build a collection.
     ///
     /// The directory is scanned using `list_files_with_ext`.  
@@ -95,98 +100,77 @@ impl Pod5sFlowCell {
 
         let mut pod5s = Vec::with_capacity(pod_paths.len());
         let mut flow_cell_id: Option<String> = None;
+        let mut sequencing_kit: Option<String> = None;
+        let mut run_id: Option<String> = None;
 
         for p in pod_paths {
             let pod = Pod5::from_path(&p)?;
+            // run_id uniqueness check (the first pod defines the expected run)
+            match &run_id {
+                None => run_id = Some(pod.protocol_run_id.clone()),
+                Some(exp) if &pod.protocol_run_id != exp => {
+                    anyhow::bail!(
+                        "Mixed run IDs: expected '{}', found '{}' (file: {})",
+                        exp,
+                        pod.protocol_run_id,
+                        pod.path.display()
+                    );
+                }
+                _ => {}
+            }
 
+            // flow_cell_id uniqueness check
             match &flow_cell_id {
-                None => {
-                    // First pod defines the flowcell
-                    flow_cell_id = Some(pod.flow_cell_id.clone());
+                None => flow_cell_id = Some(pod.flow_cell_id.clone()),
+                Some(exp) if &pod.flow_cell_id != exp => {
+                    anyhow::bail!(
+                        "Mixed flow cells: expected '{}', found '{}' (file: {})",
+                        exp,
+                        pod.flow_cell_id,
+                        pod.path.display()
+                    );
                 }
-                Some(expected_id) => {
-                    if &pod.flow_cell_id != expected_id {
-                        anyhow::bail!(format!(
-                            "Mixed flow cells in directory: expected '{}', found '{}' (file: {})",
-                            expected_id,
-                            pod.flow_cell_id,
-                            pod.path.display(),
-                        ));
-                    }
+                _ => {}
+            }
+
+            // sequencing_kit uniqueness check
+            match &sequencing_kit {
+                None => sequencing_kit = Some(pod.sequencing_kit.clone()),
+                Some(exp) if &pod.sequencing_kit != exp => {
+                    anyhow::bail!(
+                        "Mixed sequencing kits: expected '{}', found '{}' (file: {})",
+                        exp,
+                        pod.sequencing_kit,
+                        pod.path.display()
+                    );
                 }
+                _ => {}
             }
 
             pod5s.push(pod);
         }
 
+        let run_id = run_id.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
         let flow_cell_id = flow_cell_id.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
+        let sequencing_kit = sequencing_kit.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
 
         Ok(Self {
+            run_id,
             flow_cell_id,
+            sequencing_kit,
+            cases: Vec::new(),
             pod5s,
+            dir: dir.as_ref().into(),
         })
     }
 
-    /// Save the collection as JSON to the given path.
-    ///
-    /// The output is a single JSON array containing all `Pod5` entries.
-    /// Existing files are overwritten.
-    pub fn save_to_json<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn std::error::Error>> {
-        let mut f = File::create(path)?;
-        let data = serde_json::to_vec_pretty(&self.pod5s)?;
-        f.write_all(&data)?;
-        Ok(())
-    }
-
-    /// Load a collection from a JSON file.
-    ///
-    /// The file must contain a JSON array of `Pod5` objects.
-    pub fn load_from_json<P: AsRef<Path>>(path: P) -> Result<Self, Box<dyn std::error::Error>> {
-        let f = File::open(path)?;
-        let reader = BufReader::new(f);
-
-        // Expect JSON array of Pod5
-        let loaded: Vec<Pod5> = serde_json::from_reader(reader)?;
-
-        if loaded.is_empty() {
-            return Err("JSON contains no Pod5 entries".into());
-        }
-
-        let mut pod5s = Vec::with_capacity(loaded.len());
-        let mut flow_cell_id: Option<String> = None;
-
-        for pod in loaded {
-            match &flow_cell_id {
-                None => {
-                    // First pod defines the flowcell
-                    flow_cell_id = Some(pod.flow_cell_id.clone());
-                }
-                Some(expected_id) => {
-                    if &pod.flow_cell_id != expected_id {
-                        return Err(format!(
-                            "Mixed flow cells in JSON: expected '{}', found '{}' (file: {})",
-                            expected_id,
-                            pod.flow_cell_id,
-                            pod.path.display(),
-                        )
-                        .into());
-                    }
-                }
-            }
-
-            pod5s.push(pod);
-        }
-
-        Ok(Self {
-            flow_cell_id: flow_cell_id.expect("flow_cell_id must be set"),
-            pod5s,
-        })
-    }
     /// Compute summary statistics for the collection.
     pub fn stats(&self) -> Pod5sFlowCellStats {
         if self.pod5s.is_empty() {
             return Pod5sFlowCellStats {
+                run_id: self.run_id.clone(),
                 flow_cell_id: self.flow_cell_id.clone(),
+                sequencing_kit: self.sequencing_kit.clone(),
                 count: 0,
                 total_size: 0,
                 min_acq: None,
@@ -220,7 +204,9 @@ impl Pod5sFlowCell {
             Some(self.pod5s.iter().map(|p| p.sample_rate as f64).sum::<f64>() / count as f64);
 
         Pod5sFlowCellStats {
+            run_id: self.run_id.clone(),
             flow_cell_id: self.flow_cell_id.clone(),
+            sequencing_kit: self.sequencing_kit.clone(),
             count,
             total_size,
             min_acq: Some(min_acq),
@@ -234,7 +220,9 @@ impl Pod5sFlowCell {
 
 #[derive(Debug, Clone)]
 pub struct Pod5sFlowCellStats {
+    pub run_id: String,
     pub flow_cell_id: String,
+    pub sequencing_kit: String,
     pub count: usize,
     pub total_size: u64,
     pub min_acq: Option<DateTime<Utc>>,
@@ -248,9 +236,16 @@ impl fmt::Display for Pod5sFlowCellStats {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "Pod5 Flow Cell Stats")?;
         writeln!(f, "---------------------")?;
+        writeln!(f, "Run ID: {}", self.run_id)?;
         writeln!(f, "Flow Cell ID: {}", self.flow_cell_id)?;
+        writeln!(f, "Sequencing kit: {}", self.sequencing_kit)?;
         writeln!(f, "Count: {}", self.count)?;
-        writeln!(f, "Total Size: {} ({} bytes)", human_size(self.total_size), self.total_size)?;
+        writeln!(
+            f,
+            "Total Size: {} ({} bytes)",
+            human_size(self.total_size),
+            self.total_size
+        )?;
 
         if let Some(t) = self.min_acq {
             writeln!(f, "Acquisition Start (min): {}", t)?;
@@ -274,6 +269,94 @@ impl fmt::Display for Pod5sFlowCellStats {
     }
 }
 
+/// Collection of sequencing runs built from directories of `.pod5` files.
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct Pod5sRuns {
+    /// Runs in insertion order; `add_from_dir` merges runs sharing the same identifiers.
+    pub data: Vec<Pod5sRun>,
+}
+
+impl Pod5sRuns {
+    /// Create an empty collection.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add a new `Pod5sRun` by scanning a directory of `.pod5` files.
+    ///
+    /// - Builds a `Pod5sRun` via [`Pod5sRun::load_from_dir`].
+    /// - If **no** existing run has the same `(run_id, flow_cell_id, sequencing_kit)`,
+    ///   the new run is appended to `data`.
+    /// - If a run **already exists** with these three identifiers, its `pod5s` list is
+    ///   **merged** with the new one:
+    ///   - New `Pod5` entries are only added if their file name (from `pod.path.file_name()`)
+    ///     does **not** already exist in the run.
+    ///   - Duplicate file names are silently skipped (no error).
+    ///   - Entries whose path has no file name are always kept.
+    ///   - The existing run keeps its own `dir` and `cases`.
+    ///
+    /// Errors from [`Pod5sRun::load_from_dir`] are propagated to the caller.
+    pub fn add_from_dir<P: AsRef<Path>>(&mut self, dir: P) -> anyhow::Result<()> {
+        let new_run = Pod5sRun::load_from_dir(&dir)?;
+
+        // Try to find an existing run with same identifiers
+        if let Some(existing) = self.data.iter_mut().find(|r| {
+            r.run_id == new_run.run_id
+                && r.flow_cell_id == new_run.flow_cell_id
+                && r.sequencing_kit == new_run.sequencing_kit
+        }) {
+            // File names already present in the existing run
+            let mut seen: HashSet<String> = existing
+                .pod5s
+                .iter()
+                .filter_map(|p| p.path.file_name().map(|n| n.to_string_lossy().into_owned()))
+                .collect();
+
+            // `HashSet::insert` returns `false` for a name that is already present, so one
+            // call both detects duplicates and records each newly merged file name.
+            existing.pod5s.extend(new_run.pod5s.into_iter().filter(|p| {
+                p.path
+                    .file_name()
+                    .map_or(true, |n| seen.insert(n.to_string_lossy().into_owned()))
+            }));
+        } else {
+            // No matching run: add as a new entry
+            self.data.push(new_run);
+        }
+        Ok(())
+    }
+
+    /// Serialize the whole collection as pretty-printed JSON and write it to `path`.
+    pub fn save_json<P: AsRef<Path>>(&self, path: P) -> anyhow::Result<()> {
+        let s = serde_json::to_string_pretty(self)?;
+        fs::write(path, s)?;
+        Ok(())
+    }
+
+    /// Load a collection written by [`Pod5sRuns::save_json`].
+    ///
+    /// The `pod5s` of each run are rebuilt by rescanning its stored `dir` through
+    /// [`Pod5sRuns::add_from_dir`]; the stored `cases` are then re-attached to the
+    /// rebuilt run that carries the same identifiers.
+    pub fn load_json<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
+        let raw: Pod5sRuns = serde_json::from_str(&fs::read_to_string(path)?)?;
+
+        let mut rebuilt = Pod5sRuns::new();
+        for saved in raw.data {
+            rebuilt.add_from_dir(&saved.dir)?;
+            // A rescan yields empty `cases`; re-attach the saved ones so they are not lost.
+            if let Some(run) = rebuilt.data.iter_mut().find(|r| {
+                r.run_id == saved.run_id
+                    && r.flow_cell_id == saved.flow_cell_id
+                    && r.sequencing_kit == saved.sequencing_kit
+            }) {
+                run.cases.extend(saved.cases);
+            }
+        }
+        Ok(rebuilt)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::helpers::test_init;
@@ -285,8 +368,11 @@ mod tests {
         test_init();
 
         let dir = "/mnt/beegfs02/scratch/t_steimle/prom_runs/A/20251117_0915_P2I-00461-A_PBI55810_22582b29/pod5_recovered";
+        let saved_runs = "~/data/seq_runs_cases.json";
+
 
-        let flow_cell = Pod5sFlowCell::load_from_dir(dir)?;
+        let flow_cell = Pod5sRun::load_from_dir(dir)?;
+        println!("{:#?}", flow_cell.pod5s.first());
         let stats = flow_cell.stats();
 
         println!("{stats}");

+ 0 - 1
src/lib.rs

@@ -134,7 +134,6 @@ use std::sync::{Arc, Mutex};
 
 pub mod annotation;
 pub mod callers;
-pub mod cases;
 pub mod collection;
 pub mod commands;
 pub mod config;