@@ -0,0 +1,736 @@
+//! # PromRun Module
+//!
+//! This module handles Oxford Nanopore Technologies (ONT) PromethION sequencing run data,
+//! including BAM file metadata extraction and run directory management.
+//!
+//! ## Overview
+//!
+//! The module provides two main structures:
+//! - [`PromBam`]: Represents metadata extracted from a single BAM file produced by MinKNOW
+//! - [`PromRun`]: Represents a complete sequencing run directory with all associated files
+//!
+//! ## Usage
+//!
+//! ```rust,ignore
+//! use crate::collection::prom::{PromBam, PromRun};
+//! use crate::config::Config;
+//!
+//! // Parse a single BAM file
+//! let bam = PromBam::from_path("/path/to/file.bam")?;
+//! println!("{}", bam);
+//!
+//! // Import an entire run directory
+//! let config = Config::default();
+//! let run = PromRun::from_dir("/path/to/run_dir", &config)?;
+//! println!("{}", run);
+//! ```
+
+use std::{
+    collections::BTreeMap,
+    fmt,
+    fs::{self, File},
+    io::{BufReader, BufWriter},
+    path::{Path, PathBuf},
+};
+
+use anyhow::Context;
+use chrono::{DateTime, Utc};
+use rayon::{
+    iter::{IntoParallelRefIterator, ParallelIterator},
+    ThreadPoolBuilder,
+};
+use rust_htslib::bam::{self, Read};
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    collection::{
+        flowcells::IdInput,
+        minknow::{parse_pore_activity_from_reader, MinKnowSampleSheet, PoreStateEntry},
+        pod5::Pod5,
+    },
+    config::Config,
+    helpers::list_files_recursive,
+};
+
+/// Metadata extracted from an ONT PromethION BAM file header.
+///
+/// This structure parses and stores relevant information from BAM headers
+/// produced by MinKNOW basecalling, including run identifiers, basecalling
+/// models, and instrument information.
+///
+/// # Header Parsing
+///
+/// The parser extracts information from two SAM header record types:
+/// - `@RG` (Read Group): Run ID, basecall model, sample info, timestamps
+/// - `@PG` (Program): MinKNOW version and GPU information
+///
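+/// As a rough illustration (all tag values below are made up, not taken from a real
+/// run), the parser looks at records shaped like this; the `@RG` record is a single
+/// tab-separated header line, wrapped here for readability:
+///
+/// ```text
+/// @RG  ID:...  PL:ONT  PM:P2I  PU:PBI00000  SM:sample_a  LB:lib_a  DT:2024-01-01T00:00:00+00:00
+///      DS:runid=abc123 basecall_model=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 modbase_models=...
+/// @PG  ID:...  PN:minknow  VN:...  DS:<GPU information>
+/// ```
+///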
+/// # Example
+///
+/// ```rust,ignore
+/// let bam = PromBam::from_path("/path/to/reads.bam")?;
+///
+/// println!("Run: {}", bam.run_id);
+/// println!("Sample: {}", bam.sample);
+/// println!("Basecaller: {}", bam.basecall_model);
+/// ```
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromBam {
+    /// Absolute path to the BAM file.
+    pub path: PathBuf,
+
+    /// Last modification timestamp of the file.
+    pub modified: DateTime<Utc>,
+
+    /// File size in bytes.
+    pub bam_size: u64,
+
+    /// Unique run identifier from the `@RG DS:runid=` field.
+    pub run_id: String,
+
+    /// Basecalling model name (e.g., `dna_r10.4.1_e8.2_400bps_sup@v4.2.0`).
+    pub basecall_model: String,
+
+    /// Modified base detection models, space-separated if multiple.
+    pub modbase_models: String,
+
+    /// Instrument/device model (e.g., `PromethION`, `P2I`).
+    pub instrument: String,
+
+    /// Flowcell barcode identifier (e.g., `PBI55810`).
+    pub flowcell_id: String,
+
+    /// Sample identifier from the sample sheet.
+    pub sample: String,
+
+    /// Library identifier.
+    pub library: String,
+
+    /// Full read group ID string from `@RG ID:`.
+    pub read_group_id: String,
+
+    /// Sequencing timestamp in ISO 8601 format from `@RG DT:`.
+    pub timestamp: String,
+
+    /// GPU information from MinKNOW `@PG DS:` field.
+    pub gpu_info: String,
+
+    /// MinKNOW software version.
+    pub minknow_version: String,
+}
+
+impl PromBam {
+    /// Parses BAM header metadata from the file at the given path.
+    ///
+    /// Opens the BAM file, reads the SAM header, and extracts ONT-specific
+    /// metadata from `@RG` and `@PG` records.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - Path to a BAM file (must be readable and in valid BAM format)
+    ///
+    /// # Returns
+    ///
+    /// Returns `Ok(PromBam)` with populated fields, or an error if:
+    /// - The file cannot be opened or read
+    /// - The file is not in valid BAM format
+    /// - The header cannot be parsed as UTF-8
+    ///
+    /// # Notes
+    ///
+    /// - Fields may be empty strings if the corresponding header tags are missing
+    /// - Only processes `@RG` records with `PL:ONT` (Oxford Nanopore platform)
+    /// - Only processes `@PG` records with `PN:minknow`
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let bam = PromBam::from_path("sample.bam")?;
+    /// assert!(!bam.run_id.is_empty());
+    /// ```
+    pub fn from_path(path: impl AsRef<Path>) -> anyhow::Result<Self> {
+        let path = path.as_ref().to_path_buf();
+
+        // Retrieve file metadata
+        let meta = fs::metadata(&path)
+            .with_context(|| format!("Failed to read metadata for {}", path.display()))?;
+        let modified: DateTime<Utc> = meta
+            .modified()
+            .map(DateTime::<Utc>::from)
+            .with_context(|| format!("Failed to get modification time for {}", path.display()))?;
+        let bam_size = meta.len();
+
+        // Open BAM and read header
+        let reader = bam::Reader::from_path(&path)
+            .with_context(|| format!("Failed to open BAM file: {}", path.display()))?;
+
+        let header = reader.header().clone();
+        let text =
+            std::str::from_utf8(header.as_bytes()).context("BAM header contains invalid UTF-8")?;
+
+        // Initialize with defaults
+        let mut prom = PromBam {
+            path,
+            modified,
+            bam_size,
+            run_id: String::new(),
+            basecall_model: String::new(),
+            modbase_models: String::new(),
+            instrument: String::new(),
+            flowcell_id: String::new(),
+            sample: String::new(),
+            library: String::new(),
+            read_group_id: String::new(),
+            timestamp: String::new(),
+            gpu_info: String::new(),
+            minknow_version: String::new(),
+        };
+
+        // Parse header lines
+        for line in text.lines() {
+            if !line.starts_with('@') {
+                continue;
+            }
+
+            let mut fields = line.split('\t');
+            let Some(tag) = fields.next() else {
+                continue;
+            };
+            let tag = tag.trim_start_matches('@');
+
+            // Collect the remaining tab-separated fields as (tag, value) pairs
+            let kv: Vec<(&str, &str)> = fields
+                .filter_map(|f| {
+                    let mut it = f.splitn(2, ':');
+                    Some((it.next()?, it.next().unwrap_or("")))
+                })
+                .collect();
+
+            match tag {
+                "PG" => Self::parse_pg_record(&kv, &mut prom),
+                "RG" => Self::parse_rg_record(&kv, &mut prom),
+                _ => {}
+            }
+        }
+
+        Ok(prom)
+    }
+
+    /// Parses `@PG` (Program) header record for MinKNOW-specific fields.
+    fn parse_pg_record(kv: &[(&str, &str)], prom: &mut PromBam) {
+        // Only process MinKNOW program records
+        let is_minknow = kv.iter().any(|(k, v)| *k == "PN" && *v == "minknow");
+        if !is_minknow {
+            return;
+        }
+
+        for (k, v) in kv {
+            match *k {
+                "DS" => prom.gpu_info = (*v).to_string(),
+                "VN" => prom.minknow_version = (*v).to_string(),
+                _ => {}
+            }
+        }
+    }
+
+    /// Parses `@RG` (Read Group) header record for ONT-specific fields.
+    fn parse_rg_record(kv: &[(&str, &str)], prom: &mut PromBam) {
+        // Only process ONT platform read groups
+        let is_ont = kv.iter().any(|(k, v)| *k == "PL" && *v == "ONT");
+        if !is_ont {
+            return;
+        }
+
+        // Parse composite DS field: runid=... basecall_model=... modbase_models=...
+        if let Some((_, ds_raw)) = kv.iter().find(|(k, _)| *k == "DS") {
+            for part in ds_raw.split_whitespace() {
+                if let Some(val) = part.strip_prefix("runid=") {
+                    prom.run_id = val.to_string();
+                } else if let Some(val) = part.strip_prefix("basecall_model=") {
+                    prom.basecall_model = val.to_string();
+                } else if let Some(val) = part.strip_prefix("modbase_models=") {
+                    // Normalise the comma-separated modbase model list to space-separated
+                    prom.modbase_models = val.replace(',', " ");
+                }
+            }
+        }
+
+        // Extract standard SAM tags
+        for (k, v) in kv {
+            match *k {
+                "ID" => prom.read_group_id = (*v).to_string(),
+                "DT" => prom.timestamp = (*v).to_string(),
+                "LB" => prom.library = (*v).to_string(),
+                "PM" => prom.instrument = (*v).to_string(),
+                "PU" => prom.flowcell_id = (*v).to_string(),
+                "SM" => prom.sample = (*v).to_string(),
+                _ => {}
+            }
+        }
+    }
+}
+
+impl fmt::Display for PromBam {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "PromBam")?;
+        writeln!(f, " Path: {}", self.path.display())?;
+        writeln!(f, " Modified: {}", self.modified)?;
+        writeln!(
+            f,
+            " BamSize: {:.2} GiB",
+            self.bam_size as f64 / (1024.0_f64.powi(3))
+        )?;
+        writeln!(f)?;
+        writeln!(f, " Run ID: {}", self.run_id)?;
+        writeln!(f, " Basecaller: {}", self.basecall_model)?;
+        writeln!(f, " Modbase Models: {}", self.modbase_models)?;
+        writeln!(f)?;
+        writeln!(f, " Instrument: {}", self.instrument)?;
+        writeln!(f, " Flowcell ID: {}", self.flowcell_id)?;
+        writeln!(f, " Sample: {}", self.sample)?;
+        writeln!(f, " Library: {}", self.library)?;
+        writeln!(f)?;
+        writeln!(f, " RG ID: {}", self.read_group_id)?;
+        writeln!(f, " Timestamp: {}", self.timestamp)?;
+        writeln!(f)?;
+        writeln!(f, " GPU: {}", self.gpu_info)?;
+        writeln!(f, " MinKNOW Ver: {}", self.minknow_version)
+    }
+}
+
+/// A complete ONT PromethION sequencing run with all associated files.
+///
+/// Represents an imported run directory containing BAM files, POD5 raw signal
+/// files, sample sheets, and pore activity logs. Provides JSON serialization
+/// for caching parsed metadata.
+///
+/// # Directory Structure
+///
+/// The importer recursively scans for files, supporting various MinKNOW output layouts:
+///
+/// ```text
+/// run_dir/
+/// ├── sample_sheet_*.csv    # Required: MinKNOW sample sheet
+/// ├── pore_activity_*.csv   # Optional: pore state logs
+/// │
+/// │   # POD5 files (any structure supported):
+/// ├── pod5/                 # Non-multiplexed runs
+/// │   └── *.pod5
+/// ├── pod5_pass/            # Multiplexed runs (pass reads)
+/// │   └── barcode*/
+/// │       └── *.pod5
+/// ├── pod5_fail/            # Failed reads (optional)
+/// │   └── *.pod5
+/// │
+/// │   # BAM files (any structure supported):
+/// ├── bam/                  # Non-multiplexed runs
+/// │   └── *.bam
+/// └── bam_pass/             # Multiplexed runs
+///     └── barcode*/
+///         └── *.bam
+/// ```
+///
+/// Files are discovered recursively regardless of subdirectory naming.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let config = Config::default();
+///
+/// // Import a new run
+/// let run = PromRun::from_dir("/data/runs/20240101_run", &config)?;
+///
+/// // Load a cached run
+/// let run = PromRun::open("protocol_run_id_here", &config)?;
+///
+/// println!("Flowcell: {}", run.flow_cell_id);
+/// println!("BAM files: {}", run.bams.len());
+/// println!("POD5 files: {}", run.pod5s.len());
+/// ```
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromRun {
+    /// Root directory of the sequencing run.
+    pub dir: PathBuf,
+
+    /// Timestamp when this run was imported/parsed.
+    pub import_date: DateTime<Utc>,
+
+    /// Unique identifier for the protocol run (from sample sheet).
+    pub protocol_run_id: String,
+
+    /// Flowcell position identifier (e.g., device slot or port).
+    pub position_id: String,
+
+    /// Flowcell barcode/identifier (e.g., `FAB12345`).
+    pub flow_cell_id: String,
+
+    /// Sample ID from the sample sheet.
+    pub sample_id: String,
+
+    /// Experiment ID from the sample sheet.
+    pub experiment_id: String,
+
+    /// Flowcell product code (e.g., `FLO-MIN106`, `FLO-PRO002`).
+    pub flow_cell_product_code: String,
+
+    /// Library preparation kit identifier (e.g., `SQK-LSK114`).
+    pub kit: String,
+
+    /// Associated case identifiers for clinical/research tracking.
+    pub cases: Vec<IdInput>,
+
+    /// All POD5 raw signal files found in the run directory.
+    pub pod5s: Vec<Pod5>,
+
+    /// All BAM files found in the run directory.
+    pub bams: Vec<PromBam>,
+
+    /// Pore activity/state log entries, if available.
+    pub pore_activity: Option<Vec<PoreStateEntry>>,
+}
+
+impl PromRun {
+    /// Imports a sequencing run from a directory.
+    ///
+    /// Recursively scans the directory for BAM files, POD5 files, sample sheets,
+    /// and pore activity logs. Parses all files in parallel and caches the result.
+    ///
+    /// # Arguments
+    ///
+    /// * `dir` - Path to the run directory (must exist and be a directory)
+    /// * `config` - Application configuration (provides thread count and cache paths)
+    ///
+    /// # Returns
+    ///
+    /// Returns `Ok(PromRun)` on success, or an error if:
+    /// - `dir` is not a directory
+    /// - No sample sheet (`sample_sheet_*.csv`) is found
+    /// - Sample sheet parsing fails
+    /// - Thread pool creation fails
+    ///
+    /// # Side Effects
+    ///
+    /// Automatically saves the parsed run to the JSON cache directory specified
+    /// in `config.run_cache_dir`.
+    ///
+    /// # Performance
+    ///
+    /// BAM and POD5 file parsing is parallelized using Rayon with the thread
+    /// count from `config.threads`.
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let config = Config::default();
+    /// let run = PromRun::from_dir("/data/runs/my_run", &config)?;
+    /// println!("Imported {} BAM files", run.bams.len());
+    /// ```
+    pub fn from_dir(dir: impl AsRef<Path>, config: &Config) -> anyhow::Result<Self> {
+        let dir = dir.as_ref().to_path_buf();
+        if !dir.is_dir() {
+            anyhow::bail!(
+                "Failed to import Run: input path is not a directory: {}",
+                dir.display()
+            );
+        }
+
+        // Collect file paths by type
+        let mut bam_paths = Vec::new();
+        let mut sample_sheet_path = None;
+        let mut pod5_paths = Vec::new();
+        let mut pore_activity_path = None;
+
+        for file in list_files_recursive(&dir) {
+            let name = file.file_name().and_then(|s| s.to_str()).unwrap_or("");
+            let ext = file.extension().and_then(|e| e.to_str()).unwrap_or("");
+
+            match ext {
+                "bam" => bam_paths.push(file),
+                "pod5" => pod5_paths.push(file),
+                "csv" => {
+                    if name.starts_with("sample_sheet") {
+                        sample_sheet_path = Some(file);
+                    } else if name.starts_with("pore_activity") {
+                        pore_activity_path = Some(file);
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        // Parse required sample sheet
+        let sample_sheet = sample_sheet_path
+            .ok_or_else(|| anyhow::anyhow!("No sample_sheet_*.csv found in {}", dir.display()))
+            .and_then(MinKnowSampleSheet::from_path)?;
+
+        // Parse optional pore activity; open/parse failures are ignored and leave this as None
+        let pore_activity = pore_activity_path
+            .and_then(|path| File::open(path).ok())
+            .and_then(|mut reader| parse_pore_activity_from_reader(&mut reader).ok());
+
+        // Build thread pool for parallel parsing
+        let pool = ThreadPoolBuilder::new()
+            .num_threads(config.threads.into())
+            .build()
+            .context("Failed to build Rayon thread pool")?;
+
+        // Parse BAM files in parallel
+        let bams: Vec<PromBam> = pool.install(|| {
+            bam_paths
+                .par_iter()
+                .filter_map(|p| match PromBam::from_path(p) {
+                    Ok(bam) => Some(bam),
+                    Err(e) => {
+                        log::warn!("Failed to parse BAM {}: {}", p.display(), e);
+                        None
+                    }
+                })
+                .collect()
+        });
+
+        // Parse POD5 files in parallel
+        let pod5s: Vec<Pod5> = pool.install(|| {
+            pod5_paths
+                .par_iter()
+                .filter_map(|p| match Pod5::from_path(p) {
+                    Ok(pod5) => Some(pod5),
+                    Err(e) => {
+                        log::warn!("Failed to parse POD5 {}: {}", p.display(), e);
+                        None
+                    }
+                })
+                .collect()
+        });
+
+        let prom_run = Self {
+            dir,
+            import_date: Utc::now(),
+            protocol_run_id: sample_sheet.protocol_run_id,
+            position_id: sample_sheet.position_id,
+            flow_cell_id: sample_sheet.flow_cell_id,
+            sample_id: sample_sheet.sample_id,
+            experiment_id: sample_sheet.experiment_id,
+            flow_cell_product_code: sample_sheet.flow_cell_product_code,
+            kit: sample_sheet.kit,
+            cases: Vec::new(),
+            pod5s,
+            bams,
+            pore_activity,
+        };
+
+        // Cache to disk
+        prom_run.save_json(prom_run.cache_path(config))?;
+
+        Ok(prom_run)
+    }
+
+    /// Serializes this run to a JSON file.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - Output file path (will be created or overwritten)
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if file creation or JSON serialization fails.
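+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; the output path here is arbitrary:
+    ///
+    /// ```rust,ignore
+    /// run.save_json("/tmp/prom_run.json")?;
+    /// ```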
+    pub fn save_json(&self, path: impl AsRef<Path>) -> anyhow::Result<()> {
+        let path = path.as_ref();
+        let file = BufWriter::new(
+            File::create(path)
+                .with_context(|| format!("Failed to create JSON file: {}", path.display()))?,
+        );
+        serde_json::to_writer_pretty(file, self)
+            .with_context(|| format!("Failed to serialize PromRun to {}", path.display()))?;
+        Ok(())
+    }
+
+    /// Deserializes a run from a JSON file.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - Path to a previously saved JSON cache file
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file cannot be read or parsed.
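+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; the cache path here is arbitrary:
+    ///
+    /// ```rust,ignore
+    /// let run = PromRun::load_json("/tmp/prom_run.json")?;
+    /// println!("{run}");
+    /// ```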
+    pub fn load_json(path: impl AsRef<Path>) -> anyhow::Result<Self> {
+        let path = path.as_ref();
+        let file = BufReader::new(
+            File::open(path)
+                .with_context(|| format!("Failed to open JSON file: {}", path.display()))?,
+        );
+        let data = serde_json::from_reader(file)
+            .with_context(|| format!("Failed to parse PromRun from {}", path.display()))?;
+        Ok(data)
+    }
+
+    /// Returns the cache file path for this run.
+    ///
+    /// The cache path is `{config.run_cache_dir}/{protocol_run_id}.json`.
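+    ///
+    /// For example, assuming `run_cache_dir` is `/data/cache` and the protocol run ID
+    /// is `abc123`, this returns `/data/cache/abc123.json`:
+    ///
+    /// ```rust,ignore
+    /// let path = run.cache_path(&config);
+    /// run.save_json(&path)?;
+    /// ```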
+    #[must_use]
+    pub fn cache_path(&self, config: &Config) -> PathBuf {
+        let mut path = PathBuf::from(&config.run_cache_dir);
+        path.push(format!("{}.json", self.protocol_run_id));
+        path
+    }
+
+    /// Opens a previously cached run by its protocol run ID.
+    ///
+    /// # Arguments
+    ///
+    /// * `run_id` - The protocol run ID (filename stem in the cache directory)
+    /// * `config` - Application configuration with `run_cache_dir`
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let run = PromRun::open("abc123-def456", &config)?;
+    /// ```
+    pub fn open(run_id: &str, config: &Config) -> anyhow::Result<Self> {
+        let mut path = PathBuf::from(&config.run_cache_dir);
+        path.push(format!("{run_id}.json"));
+        Self::load_json(path)
+    }
+
+    /// Returns the total size of all BAM files in bytes.
+    #[must_use]
+    pub fn total_bam_size(&self) -> u64 {
+        self.bams.iter().map(|b| b.bam_size).sum()
+    }
+
+    /// Returns the total size of all POD5 files in bytes.
+    #[must_use]
+    pub fn total_pod5_size(&self) -> u64 {
+        self.pod5s.iter().map(|p| p.file_size).sum()
+    }
+}
+
+/// Statistics for files in a single directory.
+#[derive(Default)]
+struct DirStats {
+    pod5_count: usize,
+    pod5_size: u64,
+    bam_count: usize,
+    bam_size: u64,
+}
+
+impl fmt::Display for PromRun {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        /// Converts bytes to GiB.
+        fn to_gib(bytes: u64) -> f64 {
+            bytes as f64 / 1024.0_f64.powi(3)
+        }
+
+        let mut dir_stats: BTreeMap<String, DirStats> = BTreeMap::new();
+
+        // Aggregate POD5 stats by directory
+        for pod5 in &self.pod5s {
+            let rel = pod5.path.strip_prefix(&self.dir).unwrap_or(&pod5.path);
+            let dir = rel.parent().unwrap_or(Path::new("."));
+            let key = dir.display().to_string();
+
+            let entry = dir_stats.entry(key).or_default();
+            entry.pod5_count += 1;
+            entry.pod5_size += pod5.file_size;
+        }
+
+        // Aggregate BAM stats by directory
+        for bam in &self.bams {
+            let rel = bam.path.strip_prefix(&self.dir).unwrap_or(&bam.path);
+            let dir = rel.parent().unwrap_or(Path::new("."));
+            let key = dir.display().to_string();
+
+            let entry = dir_stats.entry(key).or_default();
+            entry.bam_count += 1;
+            entry.bam_size += bam.bam_size;
+        }
+
+        // Header information
+        writeln!(f, "📦 PromRun")?;
+        writeln!(f, " dir : {}", self.dir.display())?;
+        writeln!(f, " imported : {}", self.import_date)?;
+        writeln!(f, " run id : {}", self.protocol_run_id)?;
+        writeln!(f, " position : {}", self.position_id)?;
+        writeln!(
+            f,
+            " flow cell : {} ({})",
+            self.flow_cell_id, self.flow_cell_product_code
+        )?;
+        writeln!(f, " sample id : {}", self.sample_id)?;
+        writeln!(f, " experiment : {}", self.experiment_id)?;
+        writeln!(f, " kit : {}", self.kit)?;
+        writeln!(f, " cases : {}", self.cases.len())?;
+
+        match &self.pore_activity {
+            Some(pa) => writeln!(f, " pore act. : {} entries", pa.len())?,
+            None => writeln!(f, " pore act. : none")?,
+        }
+
+        writeln!(f)?;
+        writeln!(
+            f,
+            "📁 Files by directory (relative to {})",
+            self.dir.display()
+        )?;
+
+        if dir_stats.is_empty() {
+            writeln!(f, " (no files)")?;
+            return Ok(());
+        }
+
+        for (dir, stats) in dir_stats {
+            writeln!(f, " - {dir}")?;
+            if stats.pod5_count > 0 {
+                writeln!(
+                    f,
+                    " POD5 : {:>3} files, {:6.2} GiB",
+                    stats.pod5_count,
+                    to_gib(stats.pod5_size)
+                )?;
+            }
+            if stats.bam_count > 0 {
+                writeln!(
+                    f,
+                    " BAM : {:>3} files, {:6.2} GiB",
+                    stats.bam_count,
+                    to_gib(stats.bam_size)
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use log::info;
+
+    use crate::helpers::test_init;
+
+    #[test]
+    fn prom_run_bam() -> anyhow::Result<()> {
+        test_init();
+
+        let bam_file = "/mnt/beegfs02/scratch/t_steimle/test_data/inputs/test_run_A/bam_pass/barcode02/PBI55810_pass_barcode02_22582b29_d02c5bb8_0.bam";
+        let bam = PromBam::from_path(bam_file)?;
+        info!("{bam}");
+
+        let bam_file = "/home/t_steimle/mnt/prom/20251121_001_01_CD/01/20251121_1531_P2I-00461-A_PBI52256_b1dd5673/bam_pass/PBI52256_pass_b1dd5673_414982db_0.bam";
+        let bam = PromBam::from_path(bam_file)?;
+        info!("{bam}");
+        Ok(())
+    }
+
+    #[test]
+    fn prom_run_import() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let dir = "/home/t_steimle/mnt/prom/20251121_001_01_CD/01/20251121_1531_P2I-00461-A_PBI52256_b1dd5673";
+
+        let prom_run = PromRun::from_dir(dir, &config)?;
+        info!("{prom_run}");
+        Ok(())
+    }
+}