Browse Source

Flowcells / MinKnow csv

Thomas 8 months ago
parent
commit
7911999078
7 changed files with 1708 additions and 573 deletions
  1. 760 0
      src/collection/flowcells.rs
  2. 426 0
      src/collection/minknow.rs
  3. 2 1
      src/collection/mod.rs
  4. 457 536
      src/collection/pod5.rs
  5. 19 19
      src/lib.rs
  6. 12 9
      src/variant/variant.rs
  7. 32 8
      src/variant/variant_collection.rs

+ 760 - 0
src/collection/flowcells.rs

@@ -0,0 +1,760 @@
+use std::{collections::{HashMap, HashSet}, fmt, fs::{self, File}, io::{BufReader, Read}, os::unix::fs::MetadataExt, path::Path};
+
+use anyhow::Context;
+use chrono::{DateTime, TimeZone, Utc};
+use glob::glob;
+use csv::ReaderBuilder;
+use log::{info, warn};
+use serde::{Deserialize, Serialize};
+
+use crate::helpers::{find_files, list_directories};
+
+use super::minknow::MinKnowSampleSheet;
+
/// A collection of [`IdInput`] records, with utility methods
/// for loading, saving, deduplication, and construction from TSV.
///
/// Duplicates are defined by full-field equality of [`IdInput`]
/// (see its `PartialEq`/`Hash` derives).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct IdsInput {
    /// The list of ID entries (order of insertion is preserved by `dedup`).
    pub data: Vec<IdInput>,
}
+
+impl IdsInput {
+    /// Load `IdsInput` from a JSON file.
+    ///
+    /// # Arguments
+    /// * `path` - Path to a JSON file containing an array of `IdInput`.
+    ///
+    /// # Errors
+    /// Returns an error if the file cannot be opened or parsed.
+    pub fn load_json(path: &str) -> anyhow::Result<Self> {
+        let f = File::open(path)?;
+        let s: Self = serde_json::from_reader(f)?;
+        Ok(s)
+    }
+
+    /// Save `IdsInput` to a JSON file.
+    ///
+    /// # Arguments
+    /// * `path` - Destination file path.
+    ///
+    /// # Errors
+    /// Returns an error if the file cannot be created or written.
+    pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
+        let f = File::create(path)?;
+        serde_json::to_writer(f, self)?;
+        Ok(())
+    }
+
+    /// Remove duplicate `IdInput` entries.
+    ///
+    /// Keeps the first occurrence of each unique `IdInput`.
+    pub fn dedup(&mut self) {
+        let mut unique = HashSet::new();
+        self.data.retain(|item| unique.insert(item.clone()));
+    }
+
+    /// Load `IdsInput` from a TSV file using corrected flowcell names.
+    ///
+    /// This method internally deduplicates the data.
+    ///
+    /// # Arguments
+    /// * `path` - Path to the TSV file.
+    ///
+    /// # Errors
+    /// Returns an error if loading or parsing fails.
+    pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
+        let inputs = load_flowcells_corrected_names(path)?;
+        let data = inputs
+            .iter()
+            .map(|line| IdInput {
+                id: line.id.to_string(),
+                time_point: line.sample_type.to_string(),
+                barcode: line.barcode_number.to_string(),
+                flow_cell: line.flow_cell.to_string(),
+                run: line.run.to_string(),
+            })
+            .collect();
+
+        let mut res = Self { data };
+        res.dedup();
+        Ok(res)
+    }
+
+    /// Add a new `IdInput` and deduplicate the collection.
+    ///
+    /// # Arguments
+    /// * `values` - A new `IdInput` record.
+    pub fn add_input(&mut self, values: IdInput) {
+        self.data.push(values);
+        self.dedup();
+    }
+}
+
/// A unique sample identifier from sequencing metadata.
///
/// Uniqueness is defined by the combination of all fields; the
/// `PartialEq`/`Eq`/`Hash` derives are what `IdsInput::dedup` relies on.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
pub struct IdInput {
    /// Sample or patient ID.
    pub id: String,
    /// Time point or sample type (populated from `FCLine::sample_type` when
    /// loaded via `IdsInput::load_from_tsv`).
    pub time_point: String,
    /// Barcode number (sequencing index).
    pub barcode: String,
    /// Flow cell identifier.
    pub flow_cell: String,
    /// Run identifier.
    pub run: String,
}
+
/// Represents a single record describing a barcode-flowcell pairing,
/// including original and corrected metadata.
///
/// This struct is typically deserialized from a TSV file and used to map
/// `.pod5` files to metadata like corrected flowcell names and experimental time points.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct FCLine {
    /// Unique identifier for the sample or barcode group (e.g., "P001X03").
    /// Normalized to uppercase by `load_flowcells_corrected_names`.
    pub id: String,

    /// Sample type associated with this record (e.g., "normal", "tumoral").
    /// Normalized to lowercase by `load_flowcells_corrected_names`.
    pub sample_type: String,

    /// The barcode number (e.g., "NB01", "NB02").
    pub barcode_number: String,

    /// Original flowcell name as found in the raw `.pod5` metadata.
    pub flow_cell: String,

    /// Sequencing run name this flowcell belongs to (e.g., "20240101_FAB123").
    pub run: String,

    /// Original path to data (can be absolute or relative).
    pub path: String,

    /// Corrected flowcell name used to resolve naming inconsistencies.
    pub ref_flow_cell: String,
}
+
+/// Loads corrected flowcell metadata from a tab-delimited file.
+///
+/// This function parses a TSV file where each row is deserialized into an `FCLine`.
+/// It also normalizes some fields (e.g., lowercases `sample_type`, uppercases `id`)
+/// for consistency in downstream processing.
+///
+/// # Arguments
+/// - `file_path`: Path to the TSV file containing flowcell correction data.
+///
+/// # Returns
+/// A vector of `FCLine` records, one per line in the file.
+///
+/// # Errors
+/// Returns an error if the file cannot be opened or if any line fails to deserialize.
+///
+/// # Expected Format (TSV with header)
+/// ```text
+/// id    sample_type    barcode_number    flow_cell    run_path    ref_flow_cell
+/// P001X03    tumoral    NB01    FC123    RUN123    /path/to/data    FC123_CORR
+/// ```
+///
+/// # Example
+/// ```
+/// let fc_lines = load_flowcells_corrected_names("flowcells.tsv")?;
+/// assert!(!fc_lines.is_empty());
+/// ```
+pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
+    let file = File::open(file_path)?;
+
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(true)
+        .from_reader(file);
+
+    let mut records = Vec::new();
+    for result in rdr.deserialize() {
+        let mut record: FCLine = result?;
+
+        // formating
+        record.sample_type = record.sample_type.to_lowercase();
+        record.id = record.id.to_uppercase();
+
+        records.push(record);
+    }
+
+    Ok(records)
+}
+
+
+
/// Container for a deduplicated and enriched collection of flowcells (`FlowCel`).
///
/// `FlowCells` represents the aggregated result of scanning multiple sources:
/// - A cached archive of flowcells (`archive_store_path`)
/// - A live scan of the local run directory (`local_run_dir`)
///
/// Each [`FlowCel`] contains all necessary metadata for downstream processing,
/// including parsed MinKNOW sample sheet data, `.pod5` file statistics, experiment layout,
/// and optional sample/case annotations from an [`IdsInput`] file.
///
/// The [`FlowCells::load`] method performs the following:
/// - Loads existing flowcells from the archive if available
/// - Scans local directories for new or updated flowcells
/// - Deduplicates flowcells using the `flowcell_id`
/// - Retains the most recently modified version of each flowcell
/// - Enriches each flowcell with case-level annotations
///
/// # Fields
/// - `flow_cells`: A deduplicated list of fully parsed [`FlowCel`] instances.
///   The order of this list is unspecified: `load` collects it out of a
///   `HashMap`, so callers must not rely on any particular ordering.
///
/// # Example
/// ```
/// let flow_cells = FlowCells::load(
///     "/mnt/data/runs",
///     "inputs.json",
///     "flowcell_cache.json"
/// )?;
/// println!("Loaded {} unique flowcells", flow_cells.flow_cells.len());
/// ```
///
/// # Deduplication
/// Flowcells are uniquely identified by their `flowcell_id`, a combination of
/// `{experiment_id}/{sample_id}`. If both archived and local versions exist,
/// the one with the latest `.pod5` modification time is retained.
///
/// # Related Types
/// - [`FlowCel`]: Describes a flowcell, its metadata, and files
/// - [`FlowCellExperiment`]: Muxed vs. demuxed layout classification
/// - [`FlowCellLocation`]: Indicates source (local or archive)
/// - [`MinKnowSampleSheet`]: Parsed sample sheet data
/// - [`IdInput`]: Case-level annotation applied to flowcells
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct FlowCells {
    /// A collection of parsed flowcell metadata records (unordered).
    pub flow_cells: Vec<FlowCel>,
}
+
impl FlowCells {
    /// Loads and merges `FlowCel` objects from both archive and local filesystem, deduplicating by `flowcell_id`.
    ///
    /// This function combines flowcells from:
    /// - a precomputed archive (JSON),
    /// - and a dynamic scan of local run directories.
    ///
    /// The result is deduplicated by `flowcell_id`, and enriched with case-level annotations
    /// from an `IdsInput` file based on `sample_id` and `experiment_id`.
    ///
    /// # Deduplication Logic
    /// If a flowcell appears in both sources, the one with the more recent `modified` timestamp is retained.
    pub fn load(
        local_run_dir: &str,
        inputs_path: &str,
        archive_store_path: &str,
    ) -> anyhow::Result<Self> {
        // Keyed by `flowcell_id` so archived and locally scanned entries merge.
        let mut merged_map: HashMap<String, FlowCel> = HashMap::new();

        // Load from archive if present
        if Path::new(archive_store_path).exists() {
            let file = File::open(archive_store_path)?;
            let archived: Vec<FlowCel> = serde_json::from_reader(BufReader::new(file))?;

            for fc in archived {
                merged_map.insert(fc.flowcell_id.clone(), fc);
            }
        }

        // Scan local sample_sheets.
        // Note: a directory with several files matching `sample_sheet*` is scanned
        // once per match; the dedup on `flowcell_id` below absorbs the duplicates.
        let sample_sheets = find_files(&format!("{local_run_dir}/**/sample_sheet*"))?;
        for sample_sheet_path in sample_sheets {
            let dir = sample_sheet_path.parent().ok_or_else(|| {
                anyhow::anyhow!(
                    "Failed to get directory from path: {}",
                    sample_sheet_path.display()
                )
            })?;
            let dir_str = dir.to_string_lossy().to_string();

            let (sample_sheet, files) = scan_local(&dir_str)?;
            let fc = FlowCel::new(sample_sheet, FlowCellLocation::Local(dir_str), files)?;

            // Dedup by flowcell_id, retain most recently modified
            merged_map
                .entry(fc.flowcell_id.clone())
                .and_modify(|existing| {
                    if fc.modified > existing.modified {
                        *existing = fc.clone();
                    }
                })
                .or_insert(fc);
        }

        // Load input metadata and annotate flowcells.
        // NOTE(review): cases are matched with `info.flow_cell == run.sample_id`
        // and `info.run == run.experiment_id` — confirm this cross-field mapping
        // (flow_cell vs. sample_id) is intentional.
        let inputs = IdsInput::load_json(inputs_path)?;
        for fc in merged_map.values_mut() {
            fc.cases = inputs
                .data
                .iter()
                .filter(|info| {
                    info.flow_cell == fc.run.sample_id && info.run == fc.run.experiment_id
                })
                .cloned()
                .collect();
        }

        Ok(Self {
            flow_cells: merged_map.into_values().collect(),
        })
    }

    /// Updates a JSON archive of `FlowCel` objects by scanning `.tar` archives in a directory.
    ///
    /// This function is used to **discover new archived flowcells** by scanning all `.tar` files
    /// in a given directory, parsing their contents using [`scan_archive`] and [`FlowCel::new`],
    /// and then appending the results to an existing JSON file (if present).
    /// Flowcells are **deduplicated** by `flowcell_id`, and the updated result is saved back to disk.
    ///
    /// # Arguments
    /// - `archive_path`: Path to a directory containing `.tar` archives produced by MinKNOW.
    /// - `save_path`: Path to a JSON file where the deduplicated list of `FlowCel` objects will be saved.
    ///
    /// # Behavior
    /// - If `save_path` exists, the function loads existing flowcells from it.
    /// - Then it scans all `.tar` files in `archive_path`, one by one:
    ///     - Extracts `sample_sheet` and `.pod5` file metadata using [`scan_archive`]
    ///     - Builds a new [`FlowCel`] using [`FlowCel::new`] with location `FlowCellLocation::Archived(...)`
    ///     - Logs and skips entries that fail to parse
    /// - All new flowcells are added to the existing list and deduplicated.
    /// - The updated list is sorted and written back to `save_path`.
    ///
    /// # Deduplication
    /// - Entries are sorted by `flowcell_id` (stable sort) and then deduplicated with
    ///   `.dedup_by_key(|fc| fc.flowcell_id.clone())`, which keeps the *first* entry of
    ///   each run of equal ids. Since previously saved entries are appended before the
    ///   newly scanned ones, an existing archive entry wins over a rescan of the same id.
    ///
    /// # Returns
    /// - `Ok(())` if scanning and update succeeds.
    /// - `Err` if the archive directory, `.tar` files, or save path cannot be processed.
    ///
    /// # Example
    /// ```
    /// update_archive_from_scan("archives/", "flowcell_cache.json")?;
    /// ```
    ///
    /// # See Also
    /// - [`scan_archive`]
    /// - [`FlowCel`]
    /// - [`FlowCellLocation::Archived`]
    pub fn update_archive_from_scan(archive_path: &str, save_path: &str) -> anyhow::Result<()> {
        // Load existing archive, if any
        let mut all: Vec<FlowCel> = if Path::new(save_path).exists() {
            let file = File::open(save_path)?;
            serde_json::from_reader(BufReader::new(file))?
        } else {
            Vec::new()
        };

        let n_before = all.len();
        let pattern = format!("{archive_path}/*.tar");

        // Scan all .tar archives; failures are logged and skipped, not fatal.
        let res: Vec<FlowCel> = glob(&pattern)?
            .filter_map(Result::ok)
            .filter_map(|path| {
                let archive_str = path.to_string_lossy();
                let (sample_sheet, files) = match scan_archive(&archive_str) {
                    Ok(r) => r,
                    Err(e) => {
                        warn!("Failed to scan archive {}: {e}", archive_str);
                        return None;
                    }
                };
                // NOTE(review): the location stores the archive *directory*
                // (`archive_path`), not the specific tar file (`archive_str`) —
                // confirm whether the per-tar path was intended here.
                match FlowCel::new(
                    sample_sheet,
                    FlowCellLocation::Archived(archive_path.to_string()),
                    files,
                ) {
                    Ok(fc) => Some(fc),
                    Err(e) => {
                        warn!("Failed to create FlowCel from {}: {e}", archive_str);
                        None
                    }
                }
            })
            .collect();

        // Merge, deduplicate (first occurrence per id wins), and write back.
        all.extend(res);
        all.sort_by(|a, b| a.flowcell_id.cmp(&b.flowcell_id));
        all.dedup_by_key(|v| v.flowcell_id.clone());

        let n_final = all.len();
        info!("{} new archive(s) discovered.", n_final.saturating_sub(n_before));

        let json = serde_json::to_string_pretty(&all)
            .map_err(|e| anyhow::anyhow!("Can't convert into json.\n{e}"))?;

        fs::write(save_path, json)
            .map_err(|e| anyhow::anyhow!("Can't write file: {save_path}.\n{e}"))?;

        Ok(())
    }
}
+
/// Represents a fully described flowcell unit, including experimental metadata,
/// physical location (local or archived), sample sheet data, and associated pod5 files.
///
/// A `FlowCel` object serves as the central unit in the data model for sample aggregation
/// and downstream processing.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct FlowCel {
    /// Compound identifier, formatted as `{experiment_id}/{sample_id}`.
    pub flowcell_id: String,
    /// Experiment layout inferred from `.pod5` paths (see [`FlowCellExperiment`]).
    pub experiment: FlowCellExperiment,
    /// Whether the flowcell was loaded from a local directory or an archive store.
    pub location: FlowCellLocation,
    /// Latest modification timestamp among `.pod5` files
    /// (`DateTime::<Utc>::MIN_UTC` if no `.pod5` file was found).
    pub modified: DateTime<Utc>,
    /// The original MinKNOW sample sheet metadata.
    pub run: MinKnowSampleSheet,
    /// Total size (in bytes) of `.pod5` files.
    pub pod5_size: usize,
    /// Number of `.pod5` files found.
    pub n_pod5: usize,
    /// Case-level annotations associated with this flowcell (from [`IdInput`]);
    /// empty until populated by `FlowCells::load`.
    pub cases: Vec<IdInput>,
}
+
/// Describes the physical origin of a flowcell when loaded.
///
/// This is used to differentiate flowcells discovered during a local scan
/// versus those restored from an archived store. The `Display` impl renders
/// just the variant tag (`"local"` / `"archived"`), not the inner path.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum FlowCellLocation {
    /// Flowcell discovered in a local filesystem path (the scanned directory).
    Local(String),

    /// Flowcell restored from a `.tar` archive or a serialized cache.
    Archived(String),
}
+
+impl fmt::Display for FlowCellLocation {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                FlowCellLocation::Local(_) => "local",
+                FlowCellLocation::Archived(_) => "archived",
+            }
+        )
+    }
+}
+impl FlowCel {
+    /// Constructs a new `FlowCel` from a sample sheet and associated file list.
+    ///
+    /// This method aggregates information from a parsed `MinKnowSampleSheet` and the
+    /// corresponding `.pod5` file metadata, and infers the experiment type from
+    /// file paths using `FlowCellExperiment::from_pod5_paths`.
+    ///
+    /// # Arguments
+    /// - `sample_sheet`: Parsed sample sheet metadata.
+    /// - `location`: Origin of the flowcell (local or archived).
+    /// - `files`: List of files associated with the flowcell, each with:
+    ///   - `String`: file path
+    ///   - `u64`: size in bytes
+    ///   - `DateTime<Utc>`: modification time
+    ///
+    /// # Returns
+    /// - `Ok(FlowCel)` if experiment type and file metadata are successfully resolved.
+    /// - `Err` if the experiment type cannot be determined.
+    ///
+    /// # Errors
+    /// - If `FlowCellExperiment::from_pod5_paths` fails (e.g., unknown layout).
+    ///
+    /// # Example
+    /// ```
+    /// let fc = FlowCel::new(sample_sheet, FlowCellLocation::Local(dir), files)?;
+    /// println!("Flowcell ID: {}", fc.flowcell_id);
+    /// ```
+    pub fn new(
+        sample_sheet: MinKnowSampleSheet,
+        location: FlowCellLocation,
+        files: Vec<(String, u64, DateTime<Utc>)>,
+    ) -> anyhow::Result<Self> {
+        let flowcell_id = format!("{}/{}", sample_sheet.experiment_id, sample_sheet.sample_id);
+
+        // Filter .pod5 files
+        let pod5s: Vec<_> = files
+            .iter()
+            .filter(|(path, _, _)| path.ends_with(".pod5"))
+            .cloned()
+            .collect();
+        let n_pod5 = pod5s.len();
+
+        // Infer experiment type from pod5 paths
+        let experiment = FlowCellExperiment::from_pod5_paths(
+            &files.iter().map(|(p, _, _)| p.to_string()).collect(),
+        )
+        .ok_or_else(|| anyhow::anyhow!("Can't find experiment type for {flowcell_id}"))?;
+
+        // Aggregate pod5 size and latest modification time
+        let (pod5_size, modified): (usize, DateTime<Utc>) = files
+            .into_iter()
+            .filter(|(path, _, _)| path.ends_with(".pod5"))
+            .fold(
+                (0, DateTime::<Utc>::MIN_UTC),
+                |(acc_size, acc_time), (_, size, time)| {
+                    (
+                        acc_size + size as usize,
+                        if acc_time < time { time } else { acc_time },
+                    )
+                },
+            );
+
+        Ok(Self {
+            flowcell_id,
+            experiment,
+            location,
+            modified,
+            run: sample_sheet,
+            pod5_size,
+            n_pod5,
+            cases: Vec::new(),
+        })
+    }
+}
+
/// Describes the type of experiment layout based on `.pod5` file structure.
///
/// Used to distinguish between whole-genome sequencing (WGS) `.pod5` files
/// organized in a single (muxed) directory or demultiplexed (`pod5_pass`) structure.
///
/// The `String` payload of each variant is the path that triggered the
/// classification (see `from_path` / `from_pod5_paths`); retrieve it with `inner()`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum FlowCellExperiment {
    /// `.pod5` files are stored in a single unbarcoded directory, typically `/pod5/`.
    WGSPod5Mux(String),

    /// `.pod5` files are organized by barcode in subdirectories, typically `/pod5_pass/`.
    WGSPod5Demux(String),
}
+
+impl fmt::Display for FlowCellExperiment {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                FlowCellExperiment::WGSPod5Mux(_) => "WGS Pod5 Muxed",
+                FlowCellExperiment::WGSPod5Demux(_) => "WGS Pod5 Demuxed",
+            }
+        )
+    }
+}
+
+impl FlowCellExperiment {
+    /// Attempts to infer the experiment type from the immediate subdirectories of the given path.
+    ///
+    /// This is useful when scanning a flowcell directory directly and checking
+    /// whether it contains a `pod5/` or `pod5_pass/` structure.
+    ///
+    /// # Arguments
+    /// - `flowcell_path`: Path to the root of a flowcell directory.
+    ///
+    /// # Returns
+    /// - `Some(FlowCellExperiment)` if a known subdirectory is found.
+    /// - `None` if no match is detected.
+    pub fn from_path(flowcell_path: &str) -> Option<Self> {
+        for dir in list_directories(flowcell_path).ok().unwrap_or_default() {
+            if dir == "pod5" {
+                return Some(FlowCellExperiment::WGSPod5Mux(dir.to_string()));
+            }
+            if dir == "pod5_pass" {
+                return Some(FlowCellExperiment::WGSPod5Demux(dir.to_string()));
+            }
+        }
+        None
+    }
+
+    /// Attempts to infer the experiment type from a list of `.pod5` file paths.
+    ///
+    /// This is typically used when files have already been collected and their
+    /// parent directories can be checked for naming conventions.
+    ///
+    /// # Arguments
+    /// - `all_paths`: Vector of paths (as strings) to `.pod5` files or directories.
+    ///
+    /// # Returns
+    /// - `Some(FlowCellExperiment)` if a known suffix is detected.
+    /// - `None` if no matching pattern is found.
+    pub fn from_pod5_paths(all_paths: &Vec<String>) -> Option<Self> {
+        for path in all_paths {
+            if path.ends_with("/pod5/") || path.ends_with("/pod5") {
+                return Some(FlowCellExperiment::WGSPod5Mux(path.to_string()));
+            }
+
+            if path.ends_with("/pod5_pass/") || path.ends_with("/pod5_pass") {
+                return Some(FlowCellExperiment::WGSPod5Demux(path.to_string()));
+            }
+        }
+        None
+    }
+
+    /// Returns the underlying string (directory path) for the experiment.
+    ///
+    /// This is useful when you need access to the directory path used to classify the experiment.
+    pub fn inner(&self) -> &str {
+        match self {
+            FlowCellExperiment::WGSPod5Mux(v) => v,
+            FlowCellExperiment::WGSPod5Demux(v) => v,
+        }
+    }
+}
+
/// Represents the result of scanning a MinKNOW experiment source.
///
/// This tuple includes:
/// - `MinKnowSampleSheet`: Parsed metadata describing the experiment/sample.
/// - `Vec<(String, u64, DateTime<Utc>)>`: A list of files with:
///   - `String`: file path
///   - `u64`: file size in bytes
///   - `DateTime<Utc>`: last modification time (UTC, seconds resolution —
///     both scanners build it from whole seconds since the epoch)
type ExperimentData = (MinKnowSampleSheet, Vec<(String, u64, DateTime<Utc>)>);
+
+/// Scans a local directory for MinKNOW experiment files and metadata.
+///
+/// This function recursively walks a directory using globbing,
+/// collects file paths, sizes, and modification timestamps,
+/// and identifies a `sample_sheet` file to parse as `MinKnowSampleSheet`.
+///
+/// # Arguments
+/// - `dir`: Root directory to scan (absolute or relative).
+///
+/// # Returns
+/// - `Ok(ExperimentData)` containing the sample sheet and a list of file records.
+/// - `Err` if the directory can't be accessed, no sample sheet is found, or parsing fails.
+///
+/// # Requirements
+/// - A file path containing `"sample_sheet"` must be present and readable.
+/// - The sample sheet must be formatted according to MinKNOW expectations
+///   (1 header + 1 data row).
+///
+/// # Errors
+/// - If reading files or metadata fails.
+/// - If the sample sheet is missing or invalid.
+///
+/// # Example
+/// ```
+/// let (sheet, files) = scan_local("/data/run001")?;
+/// println!("Kit used: {}", sheet.kit);
+/// println!("Number of files found: {}", files.len());
+/// ```
+pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
+    let mut result = Vec::new();
+    let mut sample_sheet: Option<String> = None;
+
+    for entry in glob(&format!("{}/**/*", dir))? {
+        let file = entry.context("Failed to read an entry from the tar archive")?;
+
+        // Extract file properties safely
+        let metadata = file.metadata().context(format!(
+            "Failed to access file metadata: {}",
+            file.display()
+        ))?;
+        let size = metadata.size();
+        let modified = metadata.mtime();
+        let modified_utc: DateTime<Utc> = Utc.timestamp_opt(modified as i64, 0).unwrap();
+
+        let path = file.to_string_lossy().into_owned();
+
+        if path.contains("sample_sheet") {
+            sample_sheet = Some(path.clone());
+        }
+
+        result.push((path, size, modified_utc));
+    }
+
+    let sample_sheet = sample_sheet.ok_or(anyhow::anyhow!("No sample sheet detected in: {dir}"))?;
+    let sample_sheet = MinKnowSampleSheet::from_path(&sample_sheet)
+        .context(anyhow::anyhow!("Can't parse sample sheet data"))?;
+
+    Ok((sample_sheet, result))
+}
+
+/// Scans a `.tar` archive containing a MinKNOW sequencing experiment.
+///
+/// This function opens a TAR archive, searches for the `sample_sheet` CSV file,
+/// extracts its metadata, and parses it into a `MinKnowSampleSheet`.
+/// All other entries in the archive are collected with their path, size, and modification time.
+///
+/// # Arguments
+/// - `tar_path`: Path to the `.tar` archive file.
+///
+/// # Returns
+/// - `Ok(ExperimentData)`: A tuple containing the parsed sample sheet and the list of file metadata.
+/// - `Err`: If the archive is unreadable, malformed, or missing the `sample_sheet`.
+///
+/// # Archive Requirements
+/// - Must contain exactly one file matching `"sample_sheet"` in its path.
+/// - The sample sheet must contain a valid CSV header and a single data row.
+///
+/// # Errors
+/// - Fails if the archive can't be opened or read.
+/// - Fails if any entry is malformed (e.g., missing timestamp).
+/// - Fails if no sample sheet is found or if it is malformed.
+///
+/// # Example
+/// ```no_run
+/// let (sample_sheet, files) = scan_archive("archive.tar")?;
+/// println!("Sample ID: {}", sample_sheet.sample_id);
+/// println!("Total files in archive: {}", files.len());
+/// ```
+pub fn scan_archive(tar_path: &str) -> anyhow::Result<ExperimentData> {
+    info!("Scanning archive: {tar_path}");
+
+    let file = File::open(tar_path)
+        .with_context(|| format!("Failed to open tar file at path: {}", tar_path))?;
+
+    let mut archive = tar::Archive::new(file);
+    let mut result = Vec::new();
+    let mut sample_sheet: Option<String> = None;
+
+    // Iterate through the entries in the archive
+    for entry in archive.entries_with_seek()? {
+        let mut file = entry.context("Failed to read an entry from the tar archive")?;
+
+        // Extract file properties safely
+        let size = file.size();
+        let modified = file
+            .header()
+            .mtime()
+            .context("Failed to get modification time")?;
+        let modified_utc: DateTime<Utc> = Utc.timestamp_opt(modified as i64, 0).unwrap();
+
+        let path = file
+            .path()
+            .context("Failed to get file path from tar entry")?
+            .to_string_lossy()
+            .into_owned();
+
+        if path.contains("sample_sheet") {
+            let mut buffer = String::new();
+            file.read_to_string(&mut buffer)
+                .context("Failed to read file contents as string")?;
+            sample_sheet = Some(buffer);
+        }
+
+        result.push((path, size, modified_utc));
+    }
+
+    let sample_sheet = sample_sheet.ok_or(anyhow::anyhow!(
+        "No sample sheet detected in archive: {tar_path}"
+    ))?;
+    let (_, data) = sample_sheet
+        .split_once("\n")
+        .ok_or(anyhow::anyhow!("Can't parse sample sheet data"))?;
+    let sample_sheet: MinKnowSampleSheet = data
+        .try_into()
+        .map_err(|e| anyhow::anyhow!("Can't parse sample sheet.\n{e}"))?;
+    Ok((sample_sheet, result))
+}

+ 426 - 0
src/collection/minknow.rs

@@ -0,0 +1,426 @@
+use std::{fs::File, str::FromStr};
+
+use serde::{Deserialize, Serialize};
+
+
/// Represents a single entry from a MinKNOW sample sheet CSV file.
///
/// This structure captures the metadata associated with a sample flowcell
/// as defined by the Oxford Nanopore MinKNOW software.
///
/// Expected format (CSV):
/// ```text
/// protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit
/// 0ef3f65c-aa2b-4936-b49b-e55d361e9d85,1F,PBC97196,Projet_143,Projet_143,FLO-PRO114M,
/// ```
///
/// Note: the trailing `kit` field may be empty, as in the example above.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct MinKnowSampleSheet {
    /// Unique identifier for the protocol run.
    pub protocol_run_id: String,

    /// Identifier for the flowcell position (e.g., port or device slot).
    pub position_id: String,

    /// Flowcell barcode or identifier (e.g., PBC97196).
    pub flow_cell_id: String,

    /// Sample ID associated with this run.
    pub sample_id: String,

    /// Experiment ID assigned in the sample sheet.
    pub experiment_id: String,

    /// Product code for the flowcell (e.g., FLO-PRO114M).
    pub flow_cell_product_code: String,

    /// Kit identifier used for sample preparation (e.g., SQK-LSK109);
    /// may be an empty string when absent from the sheet.
    pub kit: String,
}
+
+impl TryFrom<&str> for MinKnowSampleSheet {
+    type Error = anyhow::Error;
+
+    /// Attempts to parse a single comma-separated line (CSV row) into a `MinKnowSampleSheet`.
+    ///
+    /// # Arguments
+    /// - `value`: A CSV-formatted string representing a single data row (excluding the header).
+    ///
+    /// # Returns
+    /// - `Ok(MinKnowSampleSheet)` if parsing succeeds.
+    /// - `Err` if the row does not contain exactly 7 fields.
+    ///
+    /// # Example
+    /// ```
+    /// let row = "1234-ABCD,ABC123,FAB001,SAMPLE001,EXP001,FCP001,KIT001";
+    /// let sheet = MinKnowSampleSheet::try_from(row)?;
+    /// assert_eq!(sheet.sample_id, "SAMPLE001");
+    /// ```
+    fn try_from(value: &str) -> anyhow::Result<Self> {
+        let cells: Vec<&str> = value.split(',').collect();
+        if cells.len() != 7 {
+            return Err(anyhow::anyhow!(
+                "Number of fields is not equal to 7: {value}"
+            ));
+        }
+
+        Ok(Self {
+            protocol_run_id: cells[0].to_string(),
+            position_id: cells[1].to_string(),
+            flow_cell_id: cells[2].to_string(),
+            sample_id: cells[3].to_string(),
+            experiment_id: cells[4].to_string(),
+            flow_cell_product_code: cells[5].to_string(),
+            kit: cells[6].to_string(),
+        })
+    }
+}
+
+impl MinKnowSampleSheet {
+    /// Loads a `MinKnowSampleSheet` from a file path containing a 2-line CSV:
+    /// a header and a single data row.
+    ///
+    /// # Arguments
+    /// - `path`: Path to the MinKNOW sample sheet file.
+    ///
+    /// # Returns
+    /// - `Ok(MinKnowSampleSheet)` if the file is well-formed.
+    /// - `Err` if the file is missing, malformed, or has an invalid header.
+    ///
+    /// # Expected Format
+    /// The file must contain:
+    /// - A single header line:
+    ///   `"protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit"`
+    /// - One data line corresponding to a sample.
+    ///
+    /// # Errors
+    /// - If the file is missing, empty, or contains malformed data.
+    ///
+    /// # Example
+    /// ```
+    /// let sheet = MinKnowSampleSheet::from_path("samplesheet.csv")?;
+    /// println!("Sample ID: {}", sheet.sample_id);
+    /// ```
+    pub fn from_path(path: &str) -> anyhow::Result<Self> {
+        use std::fs::File;
+        use std::io::{self, BufRead};
+
+        let file =
+            File::open(path).map_err(|e| anyhow::anyhow!("Can't open file: {path}\n\t{e}"))?;
+        let reader = io::BufReader::new(file);
+
+        let mut lines = reader.lines();
+
+        // Validate header line
+        if let Some(header_line) = lines.next() {
+            let header_line =
+                header_line.map_err(|e| anyhow::anyhow!("Error reading header line: {e}"))?;
+
+            if header_line
+                != "protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit"
+            {
+                return Err(anyhow::anyhow!(
+                    "File header doesn't match MinKnow sample sheet format: {header_line}"
+                ));
+            }
+        } else {
+            return Err(anyhow::anyhow!("File is empty or missing a header."));
+        }
+
+        // Parse the data row
+        if let Some(data_line) = lines.next() {
+            let data_line =
+                data_line.map_err(|e| anyhow::anyhow!("Error reading data line: {e}"))?;
+            return data_line.as_str().try_into();
+        }
+
+        Err(anyhow::anyhow!(
+            "File doesn't contain the expected second line (data row)."
+        ))
+    }
+}
+
+/// Loads Nanopore channel state entries from a CSV file.
+///
+/// This function parses a CSV file that contains time-series data for
+/// individual sequencing channels, including their status (`adapter`, `strand`, etc.),
+/// time since experiment start, and duration in samples.
+///
+/// The CSV file is expected to have the following headers:
+/// - `Channel`
+/// - `State`
+/// - `Experiment Time (minutes)`
+/// - `State Time (samples)`
+///
+/// The `State` column is deserialized into a `NanoporeChannelStatus` enum.
+///
+/// # Arguments
+///
+/// * `path` - Path to the CSV file containing the channel state data.
+///
+/// # Returns
+///
+/// A `Result` containing:
+/// - `Ok(Vec<ChannelStateEntry>)` on success
+/// - `Err(Box<dyn Error>)` if the file can't be opened or parsed
+///
+/// # Errors
+///
+/// This function will return an error if:
+/// - The file cannot be opened
+/// - The CSV is malformed or missing expected headers
+/// - A status value cannot be parsed into `NanoporeChannelStatus`
+///
+/// # Example
+///
+/// ```rust
+/// let entries = load_channel_states("nanopore_data.csv")?;
+/// for entry in entries {
+///     println!("{:?}", entry);
+/// }
+/// ```
+///
+/// # Dependencies
+///
+/// Requires the [`csv`](https://docs.rs/csv), [`serde`](https://docs.rs/serde), and [`serde_derive`](https://docs.rs/serde_derive) crates.
+///
+/// # See Also
+///
+/// - [`ChannelStateEntry`]
+/// - [`NanoporeChannelStatus`]
+pub fn load_channel_states(path: &str) -> anyhow::Result<Vec<ChannelStateEntry>> {
+    let file = std::fs::File::open(path)?;
+    let mut rdr = csv::ReaderBuilder::new()
+        .delimiter(b',')
+        .has_headers(true)
+        .from_reader(file);
+
+    let mut records = Vec::new();
+    for result in rdr.deserialize() {
+        let record: ChannelStateEntry = result?;
+        records.push(record);
+    }
+
+    Ok(records)
+}
+
/// One entry from a Nanopore channel state CSV.
///
/// Produced by [`load_channel_states`]; each record is one timestamped
/// observation of a single channel's state.
#[derive(Debug, Deserialize)]
pub struct ChannelStateEntry {
    /// Channel identifier (e.g., 2, 3, 4...).
    #[serde(rename = "Channel")]
    pub channel: u32,

    /// Current status of the channel (adapter, strand, etc),
    /// parsed from the raw CSV text via `deserialize_status`.
    #[serde(rename = "State", deserialize_with = "deserialize_status")]
    pub state: NanoporeChannelStatus,

    /// Time since experiment started, in minutes.
    #[serde(rename = "Experiment Time (minutes)")]
    pub experiment_time_minutes: f32,

    /// Duration of this state in samples (not time units).
    #[serde(rename = "State Time (samples)")]
    pub state_time_samples: u64,
}
+
/// Represents the current status of a Nanopore sequencing channel.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NanoporeChannelStatus {
    /// An adapter is detected in the pore, but sequencing hasn't started yet.
    Adapter,

    /// Channel has been disabled by the system or user.
    Disabled,

    /// Channel is reserved or blocked, typically during transitions like muxing.
    Locked,

    /// More than one nanopore detected in the channel, which is undesirable.
    Multiple,

    /// No nanopore detected in the channel.
    NoPore,

    /// Channel is flagged for manual reset due to an error or inactivity.
    PendingManualReset,

    /// Channel is about to undergo a multiplexer change.
    PendingMuxChange,

    /// A functional nanopore is detected and ready to start sequencing.
    Pore,

    /// Signal is saturated, often due to blockage or abnormal activity.
    Saturated,

    /// Channel is actively sequencing a strand of DNA or RNA.
    Strand,

    /// Channel is not accessible or not reporting status.
    Unavailable,

    /// Channel is currently undergoing unblocking to restore functionality.
    Unblocking,

    /// Channel is in an undefined or unclassified state.
    Unclassified,

    /// Channel is unclassified following a reset attempt.
    UnclassifiedFollowingReset,

    /// Unknown status with a negative signal or pattern.
    UnknownNegative,

    /// Unknown status with a positive signal or pattern.
    UnknownPositive,

    /// No signal detected; the channel appears inactive or disconnected.
    Zero,
}

impl FromStr for NanoporeChannelStatus {
    type Err = String;

    /// Parses a status token as written in MinKNOW CSV output.
    /// Matching is case-insensitive and ignores surrounding whitespace.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.trim().to_lowercase();
        match normalized.as_str() {
            "adapter" => Ok(Self::Adapter),
            "disabled" => Ok(Self::Disabled),
            "locked" => Ok(Self::Locked),
            "multiple" => Ok(Self::Multiple),
            "no_pore" => Ok(Self::NoPore),
            "pending_manual_reset" => Ok(Self::PendingManualReset),
            "pending_mux_change" => Ok(Self::PendingMuxChange),
            "pore" => Ok(Self::Pore),
            "saturated" => Ok(Self::Saturated),
            "strand" => Ok(Self::Strand),
            "unavailable" => Ok(Self::Unavailable),
            "unblocking" => Ok(Self::Unblocking),
            "unclassified" => Ok(Self::Unclassified),
            "unclassified_following_reset" => Ok(Self::UnclassifiedFollowingReset),
            "unknown_negative" => Ok(Self::UnknownNegative),
            "unknown_positive" => Ok(Self::UnknownPositive),
            "zero" => Ok(Self::Zero),
            _ => Err(format!("Unknown channel status: {}", s)),
        }
    }
}
+
+// Used by serde to parse the status field
+fn deserialize_status<'de, D>(deserializer: D) -> Result<NanoporeChannelStatus, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    let s = String::deserialize(deserializer)?;
+    NanoporeChannelStatus::from_str(&s).map_err(serde::de::Error::custom)
+}
+
/// Represents a single timepoint of flowcell throughput metrics from a Nanopore sequencing run.
///
/// Each record summarizes various metrics such as the number of reads,
/// basecalled reads, raw samples, and throughput estimates at a given minute
/// of the experiment. Produced by [`load_throughput_entries`].
#[derive(Debug, Deserialize)]
pub struct ThroughputEntry {
    /// Time since the start of the experiment, in minutes.
    #[serde(rename = "Experiment Time (minutes)")]
    pub experiment_time_minutes: u32,

    /// Total number of reads detected.
    #[serde(rename = "Reads")]
    pub reads: u64,

    /// Number of reads that passed basecalling filters.
    #[serde(rename = "Basecalled Reads Passed")]
    pub basecalled_reads_passed: u64,

    /// Number of reads that failed basecalling filters.
    #[serde(rename = "Basecalled Reads Failed")]
    pub basecalled_reads_failed: u64,

    /// Number of reads skipped during basecalling.
    #[serde(rename = "Basecalled Reads Skipped")]
    pub basecalled_reads_skipped: u64,

    /// Number of raw signal samples selected for processing.
    #[serde(rename = "Selected Raw Samples")]
    pub selected_raw_samples: u64,

    /// Number of events (e.g., current transitions) selected.
    #[serde(rename = "Selected Events")]
    pub selected_events: u64,

    /// Estimated number of base pairs sequenced.
    #[serde(rename = "Estimated Bases")]
    pub estimated_bases: u64,

    /// Actual number of basecalled base pairs.
    #[serde(rename = "Basecalled Bases")]
    pub basecalled_bases: u64,

    /// Number of raw signal samples used for basecalling.
    #[serde(rename = "Basecalled Samples")]
    pub basecalled_samples: u64,
}
+
+/// Loads flowcell throughput statistics from a CSV file.
+///
+/// This function reads per-minute summary statistics from a CSV file
+/// generated by a Nanopore sequencing run. These metrics include the number
+/// of reads, basecalled results, and estimated throughput.
+///
+/// The CSV file must have the following headers:
+/// - `Experiment Time (minutes)`
+/// - `Reads`
+/// - `Basecalled Reads Passed`
+/// - `Basecalled Reads Failed`
+/// - `Basecalled Reads Skipped`
+/// - `Selected Raw Samples`
+/// - `Selected Events`
+/// - `Estimated Bases`
+/// - `Basecalled Bases`
+/// - `Basecalled Samples`
+///
+/// # Arguments
+///
+/// * `path` - Path to the CSV file containing throughput metrics.
+///
+/// # Returns
+///
+/// A `Result` containing:
+/// - `Ok(Vec<ThroughputEntry>)` if parsing succeeds
+/// - `Err(Box<dyn Error>)` if an error occurs reading or deserializing the file
+///
+/// # Errors
+///
+/// Will return an error if:
+/// - The file does not exist or is unreadable
+/// - The CSV format is invalid or missing headers
+///
+/// # Example
+///
+/// ```rust
+/// let throughput = load_throughput_entries("throughput.csv")?;
+/// for entry in throughput {
+///     println!("{:?}", entry);
+/// }
+/// ```
+///
+/// # Dependencies
+///
+/// Requires the `csv` and `serde` crates.
+pub fn load_throughput_entries(path: &str) -> anyhow::Result<Vec<ThroughputEntry>> {
+    let file = File::open(path)?;
+    let mut rdr = csv::ReaderBuilder::new()
+        .delimiter(b',')
+        .has_headers(true)
+        .from_reader(file);
+
+    let mut records = Vec::new();
+    for result in rdr.deserialize() {
+        let record: ThroughputEntry = result?;
+        records.push(record);
+    }
+
+    Ok(records)
+}

+ 2 - 1
src/collection/mod.rs

@@ -35,6 +35,8 @@ pub mod bam;
 pub mod modbases;
 pub mod pod5;
 pub mod vcf;
+pub mod flowcells;
+pub mod minknow;
 
 #[derive(Debug, Clone)]
 pub struct CollectionsConfig {
@@ -80,7 +82,6 @@ impl Collections {
             ..
         } = &config;
         let pod5 = Pod5Collection::new(pod_dir, corrected_fc_path, result_dir)?;
-        // let pod5 = Pod5Collection::default();
         let bam = BamCollection::new(result_dir);
         let vcf = VcfCollection::new(result_dir);
         let modbases = ModBasesCollection::new(result_dir);

File diff suppressed because it is too large
+ 457 - 536
src/collection/pod5.rs


+ 19 - 19
src/lib.rs

@@ -177,7 +177,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::SavanaCN}, collection::{bam, flowcells::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig, ShouldRun}, commands::dorado::Dorado, helpers::find_files, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges}, scan::scan::somatic_scan, variant::{variant::AlterationCategory, variants_stats::{self, VariantsStats}}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -665,24 +665,24 @@ mod tests {
     #[test]
     fn pipe_somatic() -> anyhow::Result<()> {   
         init();
-        // let collections = Collections::new(
-        //     CollectionsConfig::default()
-        // )?;
-        // for (a, _) in collections.bam_pairs().iter() {
-        //     if a.id.as_str() != "CHAMPION" {
-        //         continue;
-        //     }
-        //     if let Err(e) = SomaticPipe::initialize(&a.id, Config::default()).map(|mut p| if p.should_run() {
-        //         if let Err(e) = p.run() {
-        //             error!("{e}");
-        //         }
-        //     }) {
-        //         error!("{e}");
-        //     }
-        // }
-        // Ok(())
-        let id = "JEANSELME";
-        SomaticPipe::initialize(id, Config::default())?.run()
+        let collections = Collections::new(
+            CollectionsConfig::default()
+        )?;
+        for (a, _) in collections.bam_pairs().iter() {
+            // if a.id.as_str() != "CHAMPION" {
+            //     continue;
+            // }
+            if let Err(e) = SomaticPipe::initialize(&a.id, Config::default()).map(|mut p| if p.should_run() {
+                if let Err(e) = p.run() {
+                    error!("{e}");
+                }
+            }) {
+                error!("{e}");
+            }
+        }
+        Ok(())
+        // let id = "HENAUX";
+        // SomaticPipe::initialize(id, Config::default())?.run()
     }
 
     #[test]

+ 12 - 9
src/variant/variant.rs

@@ -582,15 +582,18 @@ impl FromStr for Infos {
 impl fmt::Display for Infos {
     /// Formats the `Infos` as a semicolon-separated VCF-style INFO string.
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{}",
-            self.0
-                .iter()
-                .map(|e| e.to_string())
-                .collect::<Vec<String>>()
-                .join(";")
-        )
+        let items: Vec<_> = self
+            .0
+            .iter()
+            .filter(|info| !matches!(info, Info::Empty))
+            .map(ToString::to_string)
+            .collect();
+
+        if items.is_empty() {
+            write!(f, ".")
+        } else {
+            write!(f, "{}", items.join(";"))
+        }
     }
 }
 

+ 32 - 8
src/variant/variant_collection.rs

@@ -590,21 +590,45 @@ impl Variant {
 
     /// Merge all `Infos` from the list of `VcfVariant`s.
     pub fn merge_infos(&self) -> Infos {
-        let mut seen_keys = HashSet::new();
-        let mut merged = Vec::new();
+    use std::collections::HashSet;
+    use log::warn;
 
-        for vcf in self.vcf_variants.iter() {
-            for info in &vcf.infos.0 {
-                let key = info.key();
-                if seen_keys.insert(key.to_string()) {
-                    merged.push(info.clone());
+    let mut seen_keys = HashSet::new();
+    let mut merged = Vec::new();
+    let mut end_info: Option<(u32, &VcfVariant)> = None;
+
+    for vcf in self.vcf_variants.iter() {
+        for info in &vcf.infos.0 {
+            let key = info.key().to_string();
+
+            if key == "END" {
+                if let Info::END(e) = info {
+                    end_info = Some((*e, vcf));
                 }
+            } else if seen_keys.insert(key) {
+                merged.push(info.clone());
             }
         }
+    }
 
-        Infos(merged)
+    if let Some((end_pos, vcf)) = end_info {
+        let pos = vcf.position.position + 1;
+        if end_pos >= pos {
+            merged.push(Info::END(end_pos));
+        } else {
+            warn!(
+                "Invalid INFO/END={} < POS={} at {}:{} – skipping END field.",
+                end_pos,
+                pos,
+                vcf.position.contig(),
+                pos
+            );
+        }
     }
 
+    Infos(merged)
+}
+
     pub fn merge_formats(&self) -> Formats {
         let mut seen_keys = HashSet::new();
         let mut merged = Vec::new();

Some files were not shown because too many files changed in this diff