@@ -1,15 +1,24 @@
-use std::{collections::{HashMap, HashSet}, fmt, fs::{self, File}, io::{BufReader, Read}, os::unix::fs::MetadataExt, path::Path};
+use std::{
+    collections::{HashMap, HashSet},
+    fmt,
+    fs::{self, File},
+    io::{BufReader, Read},
+    os::unix::fs::MetadataExt,
+    path::Path,
+};
 
 use anyhow::Context;
 use chrono::{DateTime, TimeZone, Utc};
 use glob::glob;
-use csv::ReaderBuilder;
 use log::{info, warn};
 use serde::{Deserialize, Serialize};
 
-use crate::helpers::{find_files, list_directories};
+use crate::{
+    collection::minknow::{parse_pore_activity_from_reader, parse_throughput_from_reader},
+    helpers::{find_files, list_directories},
+};
 
-use super::minknow::MinKnowSampleSheet;
+use super::minknow::{MinKnowSampleSheet, PoreStateEntry, ThroughputEntry};
 
-/// A collection of `IdInput` records, with utility methods
-/// for loading, saving, deduplication, and construction from TSV.
+/// A collection of `IdInput` records, with utility methods
+/// for loading, saving, and deduplication.
@@ -19,6 +28,33 @@ pub struct IdsInput {
     pub data: Vec<IdInput>,
 }
 
+/// A unique sample identifier from sequencing metadata.
+///
+/// Uniqueness is defined by the combination of all fields.
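+///
+/// # Example
+/// A minimal construction sketch (`ignore`d in doc tests; field values are
+/// illustrative):
+/// ```ignore
+/// let id = IdInput {
+///     case_id: "P001".to_string(),
+///     sample_type: "tumoral".to_string(),
+///     barcode: "NB01".to_string(),
+///     flow_cell_id: "FAB12345".to_string(),
+/// };
+/// ```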
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
+pub struct IdInput {
+    /// Case or patient ID.
+    pub case_id: String,
+    /// Time point or sample type.
+    pub sample_type: String,
+    /// Barcode number (sequencing index).
+    pub barcode: String,
+    /// Flow cell identifier.
+    pub flow_cell_id: String,
+}
+
 impl IdsInput {
     /// Load `IdsInput` from a JSON file.
     ///
@@ -42,7 +66,14 @@ impl IdsInput {
     /// Returns an error if the file cannot be created or written.
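+    ///
+    /// # Example
+    /// A round-trip sketch (`ignore`d in doc tests; paths are illustrative):
+    /// ```ignore
+    /// let ids = IdsInput::load_json("ids.json")?;
+    /// ids.save_json("ids.pretty.json")?;
+    /// ```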
     pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
         let f = File::create(path)?;
-        serde_json::to_writer(f, self)?;
+        serde_json::to_writer_pretty(f, self)?;
         Ok(())
     }
 
@@ -54,33 +78,7 @@ impl IdsInput {
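+        // `HashSet::insert` returns false for already-seen items, so `retain` keeps only first occurrences.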
         self.data.retain(|item| unique.insert(item.clone()));
     }
 
-    /// Load `IdsInput` from a TSV file using corrected flowcell names.
-    ///
-    /// This method internally deduplicates the data.
-    ///
-    /// # Arguments
-    /// * `path` - Path to the TSV file.
-    ///
-    /// # Errors
-    /// Returns an error if loading or parsing fails.
-    pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
-        let inputs = load_flowcells_corrected_names(path)?;
-        let data = inputs
-            .iter()
-            .map(|line| IdInput {
-                id: line.id.to_string(),
-                time_point: line.sample_type.to_string(),
-                barcode: line.barcode_number.to_string(),
-                flow_cell: line.flow_cell.to_string(),
-                run: line.run.to_string(),
-            })
-            .collect();
-
-        let mut res = Self { data };
-        res.dedup();
-        Ok(res)
-    }
-
     /// Add a new `IdInput` and deduplicate the collection.
     ///
     /// # Arguments
@@ -91,102 +88,6 @@ impl IdsInput {
     }
 }
 
-/// A unique sample identifier from sequencing metadata.
-///
-/// Uniqueness is defined by the combination of all fields.
-#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash)]
-pub struct IdInput {
-    /// Sample or patient ID.
-    pub id: String,
-    /// Time point or sample type.
-    pub time_point: String,
-    /// Barcode number (sequencing index).
-    pub barcode: String,
-    /// Flow cell identifier.
-    pub flow_cell: String,
-    /// Run identifier.
-    pub run: String,
-}
-
-/// Represents a single record describing a barcode-flowcell pairing,
-/// including original and corrected metadata.
-///
-/// This struct is typically deserialized from a TSV file and used to map
-/// `.pod5` files to metadata like corrected flowcell names and experimental time points.
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct FCLine {
-    /// Unique identifier for the sample or barcode group (e.g., "P001X03").
-    pub id: String,
-
-    /// Sample type associated with this record (e.g., "normal", "tumoral").
-    pub sample_type: String,
-
-    /// The barcode number (e.g., "NB01", "NB02").
-    pub barcode_number: String,
-
-    /// Original flowcell name as found in the raw `.pod5` metadata.
-    pub flow_cell: String,
-
-    /// Sequencing run name this flowcell belongs to (e.g., "20240101_FAB123").
-    pub run: String,
-
-    /// Original path to data (can be absolute or relative).
-    pub path: String,
-
-    /// Corrected flowcell name used to resolve naming inconsistencies.
-    pub ref_flow_cell: String,
-}
-
-/// Loads corrected flowcell metadata from a tab-delimited file.
-///
-/// This function parses a TSV file where each row is deserialized into an `FCLine`.
-/// It also normalizes some fields (e.g., lowercases `sample_type`, uppercases `id`)
-/// for consistency in downstream processing.
-///
-/// # Arguments
-/// - `file_path`: Path to the TSV file containing flowcell correction data.
-///
-/// # Returns
-/// A vector of `FCLine` records, one per line in the file.
-///
-/// # Errors
-/// Returns an error if the file cannot be opened or if any line fails to deserialize.
-///
-/// # Expected Format (TSV with header)
-/// ```text
-/// id       sample_type  barcode_number  flow_cell  run     path           ref_flow_cell
-/// P001X03  tumoral      NB01            FC123      RUN123  /path/to/data  FC123_CORR
-/// ```
-///
-/// # Example
-/// ```
-/// let fc_lines = load_flowcells_corrected_names("flowcells.tsv")?;
-/// assert!(!fc_lines.is_empty());
-/// ```
-pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
-    let file = File::open(file_path)?;
-
-    let mut rdr = ReaderBuilder::new()
-        .delimiter(b'\t')
-        .has_headers(true)
-        .from_reader(file);
-
-    let mut records = Vec::new();
-    for result in rdr.deserialize() {
-        let mut record: FCLine = result?;
-
-        // formating
-        record.sample_type = record.sample_type.to_lowercase();
-        record.id = record.id.to_uppercase();
-
-        records.push(record);
-    }
-
-    Ok(records)
-}
-
-
-
-/// Container for a deduplicated and enriched collection of flowcells (`FlowCel`).
+/// Container for a deduplicated and enriched collection of flowcells (`FlowCell`).
 ///
 /// `FlowCells` represents the aggregated result of scanning multiple sources:
@@ -231,7 +132,7 @@ pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCL
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct FlowCells {
     /// A collection of parsed flowcell metadata records.
-    pub flow_cells: Vec<FlowCel>,
+    pub flow_cells: Vec<FlowCell>,
 }
 
 impl FlowCells {
@@ -241,8 +142,8 @@ impl FlowCells {
     /// - a precomputed archive (JSON),
     /// - and a dynamic scan of local run directories.
     ///
-    /// The result is deduplicated by `flowcell_id`, and enriched with case-level annotations
-    /// from an `IdsInput` file based on `sample_id` and `experiment_id`.
+    /// The result is deduplicated by `flow_cell_id`, and enriched with case-level annotations
+    /// from an `IdsInput` file.
     ///
     /// # Deduplication Logic
     /// If a flowcell appears in both sources, the one with the more recent `modified` timestamp is retained.
@@ -251,15 +152,15 @@ impl FlowCells {
         inputs_path: &str,
         archive_store_path: &str,
     ) -> anyhow::Result<Self> {
-        let mut merged_map: HashMap<String, FlowCel> = HashMap::new();
+        let mut merged_map: HashMap<String, FlowCell> = HashMap::new();
 
-        // Load from archive if present
+        // Load Vec<FlowCell> from archive if present
         if Path::new(archive_store_path).exists() {
             let file = File::open(archive_store_path)?;
-            let archived: Vec<FlowCel> = serde_json::from_reader(BufReader::new(file))?;
+            let archived: Vec<FlowCell> = serde_json::from_reader(BufReader::new(file))?;
             for fc in archived {
-                merged_map.insert(fc.flowcell_id.clone(), fc);
+                merged_map.insert(fc.sample_sheet.flow_cell_id.clone(), fc);
             }
         }
 
@@ -272,14 +173,21 @@ impl FlowCells {
                     sample_sheet_path.display()
                 )
             })?;
+
             let dir_str = dir.to_string_lossy().to_string();
 
-            let (sample_sheet, files) = scan_local(&dir_str)?;
-            let fc = FlowCel::new(sample_sheet, FlowCellLocation::Local(dir_str), files)?;
+            let (sample_sheet, pore_activity, throughput, files) = scan_local(&dir_str)?;
+            let fc = FlowCell::new(
+                sample_sheet,
+                pore_activity,
+                throughput,
+                FlowCellLocation::Local(dir_str),
+                files,
+            )?;
 
-            // Dedup by flowcell_id, retain most recently modified
+            // Dedup by flow_cell_id, retaining the most recently modified entry
             merged_map
-                .entry(fc.flowcell_id.clone())
+                .entry(fc.sample_sheet.flow_cell_id.clone())
                 .and_modify(|existing| {
                     if fc.modified > existing.modified {
                         *existing = fc.clone();
@@ -288,15 +196,14 @@ impl FlowCells {
                 .or_insert(fc);
         }
 
-        // Load input metadata and annotate flowcells
+        // Load IdsInput and annotate flowcells
         let inputs = IdsInput::load_json(inputs_path)?;
         for fc in merged_map.values_mut() {
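+            // Attach every `IdInput` whose `flow_cell_id` matches this flowcell.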
             fc.cases = inputs
                 .data
                 .iter()
-                .filter(|info| {
-                    info.flow_cell == fc.run.sample_id && info.run == fc.run.experiment_id
-                })
+                .filter(|info| info.flow_cell_id == fc.sample_sheet.flow_cell_id)
                 .cloned()
                 .collect();
         }
@@ -341,11 +247,17 @@ impl FlowCells {
     ///
     /// # See Also
     /// - [`scan_archive`]
-    /// - [`FlowCel`]
+    /// - [`FlowCell`]
     /// - [`FlowCellLocation::Archived`]
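+    ///
+    /// # Example
+    /// A usage sketch (`ignore`d in doc tests; paths are illustrative):
+    /// ```ignore
+    /// FlowCells::update_archive_from_scan("/archives/runs", "flowcells.json")?;
+    /// ```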
     pub fn update_archive_from_scan(archive_path: &str, save_path: &str) -> anyhow::Result<()> {
         // Load existing archive, if any
-        let mut all: Vec<FlowCel> = if Path::new(save_path).exists() {
+        let mut all: Vec<FlowCell> = if Path::new(save_path).exists() {
             let file = File::open(save_path)?;
             serde_json::from_reader(BufReader::new(file))?
         } else {
@@ -356,19 +262,22 @@ impl FlowCells {
         let pattern = format!("{archive_path}/*.tar");
 
         // Scan all .tar archives
-        let res: Vec<FlowCel> = glob(&pattern)?
+        let res: Vec<FlowCell> = glob(&pattern)?
             .filter_map(Result::ok)
            .filter_map(|path| {
                 let archive_str = path.to_string_lossy();
-                let (sample_sheet, files) = match scan_archive(&archive_str) {
-                    Ok(r) => r,
-                    Err(e) => {
-                        warn!("Failed to scan archive {}: {e}", archive_str);
-                        return None;
-                    }
-                };
-                match FlowCel::new(
+                let (sample_sheet, pore_activity, throughput, files) =
+                    match scan_archive(&archive_str) {
+                        Ok(r) => r,
+                        Err(e) => {
+                            warn!("Failed to scan archive {}: {e}", archive_str);
+                            return None;
+                        }
+                    };
+                match FlowCell::new(
                     sample_sheet,
+                    pore_activity,
+                    throughput,
                     FlowCellLocation::Archived(archive_path.to_string()),
                     files,
                 ) {
@@ -383,11 +292,19 @@ impl FlowCells {
 
         // Merge, deduplicate, and write updated archive
         all.extend(res);
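+        // Sort first: `dedup_by_key` only removes adjacent duplicates.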
-        all.sort_by(|a, b| a.flowcell_id.cmp(&b.flowcell_id));
-        all.dedup_by_key(|v| v.flowcell_id.clone());
+        all.sort_by(|a, b| {
+            a.sample_sheet
+                .flow_cell_id
+                .cmp(&b.sample_sheet.flow_cell_id)
+        });
+        all.dedup_by_key(|v| v.sample_sheet.flow_cell_id.clone());
 
         let n_final = all.len();
-        info!("{} new archive(s) discovered.", n_final.saturating_sub(n_before));
+        info!(
+            "{} new archive(s) discovered.",
+            n_final.saturating_sub(n_before)
+        );
 
         let json = serde_json::to_string_pretty(&all)
             .map_err(|e| anyhow::anyhow!("Can't convert into json.\n{e}"))?;
@@ -402,28 +318,45 @@ impl FlowCells {
 /// Represents a fully described flowcell unit, including experimental metadata,
 /// physical location (local or archived), sample sheet data, and associated pod5 files.
 ///
-/// A `FlowCel` object serves as the central unit in the data model for sample aggregation
+/// A `FlowCell` object serves as the central unit in the data model for sample aggregation
 /// and downstream processing.
 ///
 /// # Fields
-/// - `flowcell_id`: A compound identifier, typically formatted as `{experiment_id}/{sample_id}`.
+/// - `sample_sheet`: The original MinKNOW sample sheet metadata (`MinKnowSampleSheet`).
 /// - `experiment`: Experiment type inferred from `.pod5` files (see `FlowCellExperiment`).
 /// - `location`: Whether the flowcell was loaded from a local directory or archive store.
 /// - `modified`: Last modification timestamp among `.pod5` files.
-/// - `run`: The original MinKNOW sample sheet metadata (`MinKnowSampleSheet`).
 /// - `pod5_size`: Total size (in bytes) of `.pod5` files.
 /// - `n_pod5`: Number of `.pod5` files found.
 /// - `cases`: List of sample/case-level annotations associated with this flowcell (from `IdsInput`).
+/// - `pore_activity`: Optional pore activity entries parsed from a MinKNOW `pore_activity` report.
+/// - `throughput`: Optional throughput entries parsed from a MinKNOW `throughput` report.
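+///
+/// # Example
+/// A construction sketch mirroring the scan pipeline (`ignore`d in doc tests;
+/// the directory path is illustrative):
+/// ```ignore
+/// let dir = "/data/run1".to_string();
+/// let (sample_sheet, pore_activity, throughput, files) = scan_local(&dir)?;
+/// let fc = FlowCell::new(
+///     sample_sheet,
+///     pore_activity,
+///     throughput,
+///     FlowCellLocation::Local(dir),
+///     files,
+/// )?;
+/// ```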
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct FlowCel {
-    pub flowcell_id: String,
+pub struct FlowCell {
+    pub sample_sheet: MinKnowSampleSheet,
     pub experiment: FlowCellExperiment,
     pub location: FlowCellLocation,
     pub modified: DateTime<Utc>,
-    pub run: MinKnowSampleSheet,
     pub pod5_size: usize,
     pub n_pod5: usize,
     pub cases: Vec<IdInput>,
+    pub pore_activity: Option<Vec<PoreStateEntry>>,
+    pub throughput: Option<Vec<ThroughputEntry>>,
 }
 
 /// Describes the physical origin of a flowcell when loaded.
@@ -451,7 +367,8 @@ impl fmt::Display for FlowCellLocation {
         )
     }
 }
-impl FlowCel {
+
+impl FlowCell {
-    /// Constructs a new `FlowCel` from a sample sheet and associated file list.
+    /// Constructs a new `FlowCell` from a sample sheet and associated file list.
     ///
     /// This method aggregates information from a parsed `MinKnowSampleSheet` and the
@@ -480,10 +397,12 @@ impl FlowCel {
     /// ```
     pub fn new(
         sample_sheet: MinKnowSampleSheet,
+        pore_activity: Option<Vec<PoreStateEntry>>,
+        throughput: Option<Vec<ThroughputEntry>>,
         location: FlowCellLocation,
         files: Vec<(String, u64, DateTime<Utc>)>,
     ) -> anyhow::Result<Self> {
-        let flowcell_id = format!("{}/{}", sample_sheet.experiment_id, sample_sheet.sample_id);
+        let flowcell_id = sample_sheet.flow_cell_id.clone();
 
         // Filter .pod5 files
         let pod5s: Vec<_> = files
@@ -514,14 +433,15 @@ impl FlowCel {
         );
 
         Ok(Self {
-            flowcell_id,
             experiment,
             location,
             modified,
-            run: sample_sheet,
+            sample_sheet,
             pod5_size,
             n_pod5,
             cases: Vec::new(),
+            pore_activity,
+            throughput,
         })
     }
 }
@@ -619,7 +539,12 @@ impl FlowCellExperiment {
 /// - `String`: file path
 /// - `u64`: file size in bytes
 /// - `DateTime<Utc>`: last modification time (UTC)
-type ExperimentData = (MinKnowSampleSheet, Vec<(String, u64, DateTime<Utc>)>);
+type ExperimentData = (
+    MinKnowSampleSheet,
+    Option<Vec<PoreStateEntry>>,
+    Option<Vec<ThroughputEntry>>,
+    Vec<(String, u64, DateTime<Utc>)>,
+);
 
 /// Scans a local directory for MinKNOW experiment files and metadata.
 ///
@@ -652,6 +577,8 @@ type ExperimentData = (MinKnowSampleSheet, Vec<(String, u64, DateTime<Utc>)>);
 pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
     let mut result = Vec::new();
     let mut sample_sheet: Option<String> = None;
+    let mut pore_activity = None;
+    let mut throughput = None;
 
     for entry in glob(&format!("{}/**/*", dir))? {
-        let file = entry.context("Failed to read an entry from the tar archive")?;
+        let file = entry.context("Failed to read an entry from the directory scan")?;
@@ -667,8 +594,26 @@ pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
 
         let path = file.to_string_lossy().into_owned();
 
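+        // Parse MinKNOW report files (pore activity, throughput) as the walk encounters them.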
-        if path.contains("sample_sheet") {
+        if path.contains("pore_activity") {
+            let mut reader =
+                File::open(&file).with_context(|| format!("Failed to open: {}", file.display()))?;
+            pore_activity = Some(parse_pore_activity_from_reader(&mut reader).with_context(
+                || {
+                    format!(
+                        "Failed to parse pore activity data from: {}",
+                        file.display()
+                    )
+                },
+            )?);
+        } else if path.contains("sample_sheet") {
             sample_sheet = Some(path.clone());
+        } else if path.contains("throughput") {
+            let mut reader =
+                File::open(&file).with_context(|| format!("Failed to open: {}", file.display()))?;
+            throughput = Some(parse_throughput_from_reader(&mut reader).with_context(|| {
+                format!("Failed to parse throughput data from: {}", file.display())
+            })?);
         }
 
         result.push((path, size, modified_utc));
@@ -678,7 +622,7 @@ pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
     let sample_sheet = MinKnowSampleSheet::from_path(&sample_sheet)
         .context(anyhow::anyhow!("Can't parse sample sheet data"))?;
 
-    Ok((sample_sheet, result))
+    Ok((sample_sheet, pore_activity, throughput, result))
 }
 
 /// Scans a `.tar` archive containing a MinKNOW sequencing experiment.
@@ -717,8 +661,12 @@ pub fn scan_archive(tar_path: &str) -> anyhow::Result<ExperimentData> {
 
     let mut archive = tar::Archive::new(file);
     let mut result = Vec::new();
+
     let mut sample_sheet: Option<String> = None;
 
+    let mut throughput = None;
+    let mut pore_activity = None;
+
     // Iterate through the entries in the archive
     for entry in archive.entries_with_seek()? {
         let mut file = entry.context("Failed to read an entry from the tar archive")?;
@@ -737,11 +685,22 @@ pub fn scan_archive(tar_path: &str) -> anyhow::Result<ExperimentData> {
             .to_string_lossy()
             .into_owned();
 
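+        // Report entries are parsed straight from the tar entry's reader.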
-        if path.contains("sample_sheet") {
+        if path.contains("pore_activity") {
+            pore_activity = Some(
+                parse_pore_activity_from_reader(&mut file)
+                    .with_context(|| format!("Failed to read pore_activity data: {path}"))?,
+            );
+        } else if path.contains("sample_sheet") {
             let mut buffer = String::new();
             file.read_to_string(&mut buffer)
                 .context("Failed to read file contents as string")?;
             sample_sheet = Some(buffer);
+        } else if path.contains("throughput") {
+            throughput = Some(
+                parse_throughput_from_reader(&mut file)
+                    .with_context(|| format!("Failed to read throughput data: {path}"))?,
+            );
         }
 
         result.push((path, size, modified_utc));
@@ -756,5 +714,5 @@ pub fn scan_archive(tar_path: &str) -> anyhow::Result<ExperimentData> {
     let sample_sheet: MinKnowSampleSheet = data
         .try_into()
         .map_err(|e| anyhow::anyhow!("Can't parse sample sheet.\n{e}"))?;
-    Ok((sample_sheet, result))
+    Ok((sample_sheet, pore_activity, throughput, result))
 }