|
|
@@ -1,14 +1,15 @@
|
|
|
use std::{
|
|
|
- fmt,
|
|
|
- fs::File,
|
|
|
- io::{BufReader, Write},
|
|
|
- path::{Path, PathBuf},
|
|
|
+ collections::HashSet, fmt, fs, path::{Path, PathBuf}
|
|
|
};
|
|
|
|
|
|
use chrono::{DateTime, Utc};
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
|
-use crate::{helpers::{human_size, list_files_with_ext}, io::pod5_infos::Pod5Info};
|
|
|
+use crate::{
|
|
|
+ collection::flowcells::IdInput,
|
|
|
+ helpers::{human_size, list_files_with_ext},
|
|
|
+ io::pod5_infos::Pod5Info,
|
|
|
+};
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
pub struct Pod5 {
|
|
|
@@ -76,13 +77,17 @@ impl Pod5 {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-#[derive(Debug)]
|
|
|
-pub struct Pod5sFlowCell {
|
|
|
+/// A group of `Pod5` files that share one run ID, one flow cell ID and one
+/// sequencing kit, together with the directory they were loaded from.
+#[derive(Debug, Serialize, Deserialize)]
|
|
|
+pub struct Pod5sRun {
|
|
|
+ /// Run identifier; `load_from_dir` fills it from the `protocol_run_id` of the loaded files.
+ pub run_id: String,
|
|
|
+ /// Flow cell identifier shared by every entry of `pod5s`.
pub flow_cell_id: String,
|
|
|
+ /// Sequencing kit shared by every entry of `pod5s`.
+ pub sequencing_kit: String,
|
|
|
+ /// Cases attached to this run; `load_from_dir` leaves the list empty.
+ pub cases: Vec<IdInput>,
|
|
|
+ /// The `Pod5` entries belonging to this run.
pub pod5s: Vec<Pod5>,
|
|
|
+ /// Directory the run was loaded from.
+ pub dir: PathBuf,
|
|
|
}
|
|
|
|
|
|
-impl Pod5sFlowCell {
|
|
|
+impl Pod5sRun {
|
|
|
/// Load all `.pod5` files from a directory and build a collection.
|
|
|
///
|
|
|
/// The directory is scanned using `list_files_with_ext`.
|
|
|
@@ -95,98 +100,77 @@ impl Pod5sFlowCell {
|
|
|
|
|
|
let mut pod5s = Vec::with_capacity(pod_paths.len());
|
|
|
let mut flow_cell_id: Option<String> = None;
|
|
|
+ let mut sequencing_kit: Option<String> = None;
|
|
|
+ let mut run_id: Option<String> = None;
|
|
|
|
|
|
for p in pod_paths {
|
|
|
let pod = Pod5::from_path(&p)?;
|
|
|
+ // run_id uniqueness check: the first file fixes the expected protocol run ID,
+ // any later file carrying a different one aborts the load.
|
|
|
+ match &run_id {
|
|
|
+ None => run_id = Some(pod.protocol_run_id.clone()),
|
|
|
+ Some(exp) if &pod.protocol_run_id != exp => {
|
|
|
+ anyhow::bail!(
|
|
|
+ "Mixed run IDs: expected '{}', found '{}' (file: {})",
|
|
|
+ exp,
|
|
|
+ pod.protocol_run_id,
|
|
|
+ pod.path.display()
|
|
|
+ );
|
|
|
+ }
|
|
|
+ _ => {}
|
|
|
+ }
|
|
|
|
|
|
+ // flow_cell_id uniqueness check
|
|
|
match &flow_cell_id {
|
|
|
- None => {
|
|
|
- // First pod defines the flowcell
|
|
|
- flow_cell_id = Some(pod.flow_cell_id.clone());
|
|
|
+ None => flow_cell_id = Some(pod.flow_cell_id.clone()),
|
|
|
+ Some(exp) if &pod.flow_cell_id != exp => {
|
|
|
+ anyhow::bail!(
|
|
|
+ "Mixed flow cells: expected '{}', found '{}' (file: {})",
|
|
|
+ exp,
|
|
|
+ pod.flow_cell_id,
|
|
|
+ pod.path.display()
|
|
|
+ );
|
|
|
}
|
|
|
- Some(expected_id) => {
|
|
|
- if &pod.flow_cell_id != expected_id {
|
|
|
- anyhow::bail!(format!(
|
|
|
- "Mixed flow cells in directory: expected '{}', found '{}' (file: {})",
|
|
|
- expected_id,
|
|
|
- pod.flow_cell_id,
|
|
|
- pod.path.display(),
|
|
|
- ));
|
|
|
- }
|
|
|
+ _ => {}
|
|
|
+ }
|
|
|
+
|
|
|
+ // sequencing_kit uniqueness check
|
|
|
+ match &sequencing_kit {
|
|
|
+ None => sequencing_kit = Some(pod.sequencing_kit.clone()),
|
|
|
+ Some(exp) if &pod.sequencing_kit != exp => {
|
|
|
+ anyhow::bail!(
|
|
|
+ "Mixed sequencing kits: expected '{}', found '{}' (file: {})",
|
|
|
+ exp,
|
|
|
+ pod.sequencing_kit,
|
|
|
+ pod.path.display()
|
|
|
+ );
|
|
|
}
|
|
|
+ _ => {}
|
|
|
}
|
|
|
|
|
|
pod5s.push(pod);
|
|
|
}
|
|
|
|
|
|
+ let run_id = run_id.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
|
|
|
let flow_cell_id = flow_cell_id.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
|
|
|
+ let sequencing_kit = sequencing_kit.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
|
|
|
|
|
|
Ok(Self {
|
|
|
+ run_id,
|
|
|
flow_cell_id,
|
|
|
+ sequencing_kit,
|
|
|
+ cases: Vec::new(),
|
|
|
pod5s,
|
|
|
+ dir: dir.as_ref().into(),
|
|
|
})
|
|
|
}
|
|
|
|
|
|
- /// Save the collection as JSON to the given path.
|
|
|
- ///
|
|
|
- /// The output is a single JSON array containing all `Pod5` entries.
|
|
|
- /// Existing files are overwritten.
|
|
|
- pub fn save_to_json<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn std::error::Error>> {
|
|
|
- let mut f = File::create(path)?;
|
|
|
- let data = serde_json::to_vec_pretty(&self.pod5s)?;
|
|
|
- f.write_all(&data)?;
|
|
|
- Ok(())
|
|
|
- }
|
|
|
-
|
|
|
- /// Load a collection from a JSON file.
|
|
|
- ///
|
|
|
- /// The file must contain a JSON array of `Pod5` objects.
|
|
|
- pub fn load_from_json<P: AsRef<Path>>(path: P) -> Result<Self, Box<dyn std::error::Error>> {
|
|
|
- let f = File::open(path)?;
|
|
|
- let reader = BufReader::new(f);
|
|
|
-
|
|
|
- // Expect JSON array of Pod5
|
|
|
- let loaded: Vec<Pod5> = serde_json::from_reader(reader)?;
|
|
|
-
|
|
|
- if loaded.is_empty() {
|
|
|
- return Err("JSON contains no Pod5 entries".into());
|
|
|
- }
|
|
|
-
|
|
|
- let mut pod5s = Vec::with_capacity(loaded.len());
|
|
|
- let mut flow_cell_id: Option<String> = None;
|
|
|
-
|
|
|
- for pod in loaded {
|
|
|
- match &flow_cell_id {
|
|
|
- None => {
|
|
|
- // First pod defines the flowcell
|
|
|
- flow_cell_id = Some(pod.flow_cell_id.clone());
|
|
|
- }
|
|
|
- Some(expected_id) => {
|
|
|
- if &pod.flow_cell_id != expected_id {
|
|
|
- return Err(format!(
|
|
|
- "Mixed flow cells in JSON: expected '{}', found '{}' (file: {})",
|
|
|
- expected_id,
|
|
|
- pod.flow_cell_id,
|
|
|
- pod.path.display(),
|
|
|
- )
|
|
|
- .into());
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- pod5s.push(pod);
|
|
|
- }
|
|
|
-
|
|
|
- Ok(Self {
|
|
|
- flow_cell_id: flow_cell_id.expect("flow_cell_id must be set"),
|
|
|
- pod5s,
|
|
|
- })
|
|
|
- }
|
|
|
/// Compute summary statistics for the collection.
|
|
|
pub fn stats(&self) -> Pod5sFlowCellStats {
|
|
|
if self.pod5s.is_empty() {
|
|
|
return Pod5sFlowCellStats {
|
|
|
+ run_id: self.run_id.clone(),
|
|
|
flow_cell_id: self.flow_cell_id.clone(),
|
|
|
+ sequencing_kit: self.sequencing_kit.clone(),
|
|
|
count: 0,
|
|
|
total_size: 0,
|
|
|
min_acq: None,
|
|
|
@@ -220,7 +204,9 @@ impl Pod5sFlowCell {
|
|
|
Some(self.pod5s.iter().map(|p| p.sample_rate as f64).sum::<f64>() / count as f64);
|
|
|
|
|
|
Pod5sFlowCellStats {
|
|
|
+ run_id: self.run_id.clone(),
|
|
|
flow_cell_id: self.flow_cell_id.clone(),
|
|
|
+ sequencing_kit: self.sequencing_kit.clone(),
|
|
|
count,
|
|
|
total_size,
|
|
|
min_acq: Some(min_acq),
|
|
|
@@ -234,7 +220,9 @@ impl Pod5sFlowCell {
|
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct Pod5sFlowCellStats {
|
|
|
+ pub run_id: String,
|
|
|
pub flow_cell_id: String,
|
|
|
+ pub sequencing_kit: String,
|
|
|
pub count: usize,
|
|
|
pub total_size: u64,
|
|
|
pub min_acq: Option<DateTime<Utc>>,
|
|
|
@@ -248,9 +236,16 @@ impl fmt::Display for Pod5sFlowCellStats {
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
writeln!(f, "Pod5 Flow Cell Stats")?;
|
|
|
writeln!(f, "---------------------")?;
|
|
|
+ writeln!(f, "Run ID: {}", self.run_id)?;
|
|
|
writeln!(f, "Flow Cell ID: {}", self.flow_cell_id)?;
|
|
|
+ writeln!(f, "Sequencing kit: {}", self.sequencing_kit)?;
|
|
|
writeln!(f, "Count: {}", self.count)?;
|
|
|
- writeln!(f, "Total Size: {} ({} bytes)", human_size(self.total_size), self.total_size)?;
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ "Total Size: {} ({} bytes)",
|
|
|
+ human_size(self.total_size),
|
|
|
+ self.total_size
|
|
|
+ )?;
|
|
|
|
|
|
if let Some(t) = self.min_acq {
|
|
|
writeln!(f, "Acquisition Start (min): {}", t)?;
|
|
|
@@ -274,6 +269,94 @@ impl fmt::Display for Pod5sFlowCellStats {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Collection of [`Pod5sRun`] entries.
+///
+/// When populated through [`Pod5sRuns::add_from_dir`] or [`Pod5sRuns::load_json`],
+/// it holds at most one entry per `(run_id, flow_cell_id, sequencing_kit)` triple:
+/// directories that resolve to the same triple are merged into a single run.
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct Pod5sRuns {
+    /// Known runs, in the order in which they were first added.
+    pub data: Vec<Pod5sRun>,
+}
+
+impl Pod5sRuns {
+    /// Create an empty collection.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add a new `Pod5sRun` by scanning a directory of `.pod5` files.
+    ///
+    /// - Builds a `Pod5sRun` via [`Pod5sRun::load_from_dir`].
+    /// - If **no** existing run has the same `(run_id, flow_cell_id, sequencing_kit)`,
+    ///   the new run is appended to `data`.
+    /// - If a run **already exists** with these three identifiers, the new run is
+    ///   **merged** into it:
+    ///   - New `Pod5` entries are only added if their file name (from `pod.path.file_name()`)
+    ///     does **not** already exist in the run.
+    ///   - Duplicate file names are silently skipped (no error).
+    ///   - The `dir` of the existing run is left unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by [`Pod5sRun::load_from_dir`]; in that case the
+    /// collection is left untouched.
+    pub fn add_from_dir<P: AsRef<Path>>(&mut self, dir: P) -> anyhow::Result<()> {
+        let new_run = Pod5sRun::load_from_dir(&dir)?;
+        self.merge_run(new_run);
+        Ok(())
+    }
+
+    /// Insert `new_run`, merging it into an existing run with the same identifiers.
+    ///
+    /// On a merge, `Pod5` entries are de-duplicated by file name and the `cases` of
+    /// `new_run` are appended to the existing run (no de-duplication of cases).
+    fn merge_run(&mut self, mut new_run: Pod5sRun) {
+        let matching = self.data.iter_mut().find(|r| {
+            r.run_id == new_run.run_id
+                && r.flow_cell_id == new_run.flow_cell_id
+                && r.sequencing_kit == new_run.sequencing_kit
+        });
+
+        if let Some(existing) = matching {
+            // File names already present in the existing run.
+            let mut seen: HashSet<String> = existing
+                .pod5s
+                .iter()
+                .filter_map(|p| p.path.file_name().map(|n| n.to_string_lossy().into_owned()))
+                .collect();
+
+            // `HashSet::insert` returns `true` only for a name not seen before, so a
+            // single call both tests for a duplicate and records the new name.
+            // Entries whose path has no file name cannot be compared and are kept.
+            new_run.pod5s.retain(|p| match p.path.file_name() {
+                Some(name) => seen.insert(name.to_string_lossy().into_owned()),
+                None => true,
+            });
+
+            existing.pod5s.extend(new_run.pod5s);
+            // Empty for a freshly scanned directory; carries the saved cases when
+            // called from `load_json`.
+            existing.cases.extend(new_run.cases);
+        } else {
+            // No matching run: add as a new entry.
+            self.data.push(new_run);
+        }
+    }
+
+    /// Save the collection as pretty-printed JSON at `path`.
+    ///
+    /// Only the serialized description of each run is written; the `.pod5` files
+    /// themselves are not copied. An existing file at `path` is overwritten.
+    ///
+    /// # Errors
+    ///
+    /// Fails if serialization fails or the file cannot be written.
+    pub fn save_json<P: AsRef<Path>>(&self, path: P) -> anyhow::Result<()> {
+        let s = serde_json::to_string_pretty(self)?;
+        fs::write(path, s)?;
+        Ok(())
+    }
+
+    /// Load a collection from JSON and restore each run by rescanning its directory.
+    ///
+    /// For every saved run, `pod5s` and the three identifiers are rebuilt with
+    /// [`Pod5sRun::load_from_dir`] on the saved `dir`, while the saved `cases` are
+    /// carried over to the rebuilt run.
+    ///
+    /// Only the stored `dir` of each run is rescanned, so `Pod5` entries that had
+    /// been merged in from a different directory are not restored.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the file cannot be read or parsed, or if rescanning any saved
+    /// directory fails.
+    pub fn load_json<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
+        let raw: Pod5sRuns = serde_json::from_str(&fs::read_to_string(path)?)?;
+
+        let mut rebuilt = Pod5sRuns::new();
+        for saved in raw.data {
+            let mut run = Pod5sRun::load_from_dir(&saved.dir)?;
+            // `load_from_dir` always starts with no cases; keep the persisted ones.
+            run.cases = saved.cases;
+            rebuilt.merge_run(run);
+        }
+        Ok(rebuilt)
+    }
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use crate::helpers::test_init;
|
|
|
@@ -285,8 +368,11 @@ mod tests {
|
|
|
test_init();
|
|
|
|
|
|
let dir = "/mnt/beegfs02/scratch/t_steimle/prom_runs/A/20251117_0915_P2I-00461-A_PBI55810_22582b29/pod5_recovered";
|
|
|
+ let saved_runs = "~/data/seq_runs_cases.json";
|
|
|
+
|
|
|
|
|
|
- let flow_cell = Pod5sFlowCell::load_from_dir(dir)?;
|
|
|
+ let flow_cell = Pod5sRun::load_from_dir(dir)?;
|
|
|
+ println!("{:#?}", flow_cell.pod5s.first());
|
|
|
let stats = flow_cell.stats();
|
|
|
|
|
|
println!("{stats}");
|