Browse Source

Flowcells cross cases

Thomas 8 months ago
parent
commit
df61133efb
1 changed files with 135 additions and 87 deletions
  1. 135 87
      src/collection/flowcells.rs

+ 135 - 87
src/collection/flowcells.rs

@@ -45,45 +45,27 @@ pub struct IdInput {
 
 impl IdsInput {
     /// Load `IdsInput` from a JSON file.
-    ///
-    /// # Arguments
-    /// * `path` - Path to a JSON file containing an array of `IdInput`.
-    ///
-    /// # Errors
-    /// Returns an error if the file cannot be opened or parsed.
-    pub fn load_json(path: &str) -> anyhow::Result<Self> {
-        let f = File::open(path)?;
-        let s: Self = serde_json::from_reader(f)?;
-        Ok(s)
+    pub fn load_json<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
+        let file = File::open(&path)?;
+        Ok(serde_json::from_reader(file)?)
     }
 
-    /// Save `IdsInput` to a JSON file.
-    ///
-    /// # Arguments
-    /// * `path` - Destination file path.
-    ///
-    /// # Errors
-    /// Returns an error if the file cannot be created or written.
-    pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
-        let f = File::create(path)?;
-        serde_json::to_writer_pretty(f, self)?;
+    /// Save `IdsInput` to a JSON file (pretty-printed).
+    pub fn save_json<P: AsRef<Path>>(&self, path: P) -> anyhow::Result<()> {
+        let file = File::create(&path)?;
+        serde_json::to_writer_pretty(file, self)?;
         Ok(())
     }
 
-    /// Remove duplicate `IdInput` entries.
-    ///
-    /// Keeps the first occurrence of each unique `IdInput`.
+    /// Remove duplicate `IdInput` entries, retaining the first occurrence.
     pub fn dedup(&mut self) {
-        let mut unique = HashSet::new();
-        self.data.retain(|item| unique.insert(item.clone()));
+        let mut seen = HashSet::new();
+        self.data.retain(|item| seen.insert(item.clone()));
     }
 
-    /// Add a new `IdInput` and deduplicate the collection.
-    ///
-    /// # Arguments
-    /// * `values` - A new `IdInput` record.
-    pub fn add_input(&mut self, values: IdInput) {
-        self.data.push(values);
+    /// Add a new `IdInput` and deduplicate.
+    pub fn add_input(&mut self, entry: IdInput) {
+        self.data.push(entry);
         self.dedup();
     }
 }
@@ -136,62 +118,146 @@ pub struct FlowCells {
 }
 
 impl FlowCells {
-    /// Loads and merges `FlowCel` objects from both archive and local filesystem, deduplicating by `flowcell_id`.
+    /// Load and merge flowcells from archive and local directories, then annotate with `IdsInput`.
     ///
-    /// This function combines flowcells from:
-    /// - a precomputed archive (JSON),
-    /// - and a dynamic scan of local run directories.
+    /// # Arguments
+    /// * `local_run_dir` - Root directory containing local flowcell run folders.
+    /// * `inputs_path` - Path to a JSON file with [`IdInput`] annotations.
+    /// * `archive_store_path` - Path to a cached JSON file of archived flowcells.
     ///
-    /// The result is deduplicated by `flow_cell_id`, and enriched with case-level annotations
-    /// from an `IdsInput` file.
+    /// # Returns
+    /// A deduplicated and annotated [`FlowCells`] collection.
     ///
-    /// # Deduplication Logic
-    /// If a flowcell appears in both sources, the one with the more recent `modified` timestamp is retained.
+    /// # Errors
+    /// Returns an error if any input file cannot be read or parsed, or if local scanning fails.
     pub fn load(
         local_run_dir: &str,
         inputs_path: &str,
         archive_store_path: &str,
     ) -> anyhow::Result<Self> {
-        let mut merged_map: HashMap<String, FlowCell> = HashMap::new();
+        let mut instance = Self {
+            flow_cells: Vec::new(),
+        };
 
-        // Load Vec<FlowCell> from archive if present
-        if Path::new(archive_store_path).exists() {
-            let file = File::open(archive_store_path)
-                .with_context(|| format!("Failed to open File: {archive_store_path}"))?;
-            let archived: Vec<FlowCell> = serde_json::from_reader(BufReader::new(file))
-                .with_context(|| format!("Failed to parse FlowCells from: {archive_store_path}"))?;
+        instance.load_from_archive(archive_store_path)?;
+        instance.load_from_local(local_run_dir)?;
+        instance.annotate_with_inputs(inputs_path)?;
 
-            for fc in archived {
-                merged_map.insert(fc.sample_sheet.flow_cell_id.clone(), fc);
-            }
+        Ok(instance)
+    }
+
+    /// Load flowcells from a cached archive and merge into `self.flow_cells`.
+    ///
+    /// Retains only the most recently modified entry for each `flowcell_id`.
+    ///
+    /// # Arguments
+    /// * `archive_path` - Path to archive JSON file.
+    ///
+    /// # Errors
+    /// Returns an error if the file cannot be read or parsed.
+    fn load_from_archive(&mut self, archive_path: &str) -> anyhow::Result<()> {
+        if !Path::new(archive_path).exists() {
+            return Ok(());
         }
 
-        // Scan local sample_sheets
-        let sample_sheets = find_files(&format!("{local_run_dir}/**/sample_sheet*"))?;
-        for sample_sheet_path in sample_sheets {
-            let dir = sample_sheet_path.parent().ok_or_else(|| {
-                anyhow::anyhow!(
-                    "Failed to get directory from path: {}",
-                    sample_sheet_path.display()
-                )
-            })?;
+        let file = File::open(archive_path)
+            .with_context(|| format!("Failed to open archive file: {archive_path}"))?;
+        let archived: Vec<FlowCell> = serde_json::from_reader(BufReader::new(file))
+            .with_context(|| format!("Failed to parse FlowCells from: {archive_path}"))?;
+
+        self.insert_flowcells(archived);
+        Ok(())
+    }
+
+    /// Scan local run directories for flowcells and merge into `self.flow_cells`.
+    ///
+    /// Uses `scan_local()` to parse local metadata and selects the most recently modified
+    /// version if duplicates exist.
+    ///
+    /// # Arguments
+    /// * `local_dir` - Root directory containing flowcell runs.
+    ///
+    /// # Errors
+    /// Returns an error if scanning fails or if directory layout is invalid.
+    fn load_from_local(&mut self, local_dir: &str) -> anyhow::Result<()> {
+        let sample_sheets = find_files(&format!("{local_dir}/**/sample_sheet*"))?;
+
+        for path in sample_sheets {
+            let dir = path
+                .parent()
+                .ok_or_else(|| anyhow::anyhow!("Invalid sample_sheet path: {}", path.display()))?;
+            let dir_str = dir.to_string_lossy();
 
-            let dir_str = dir.to_string_lossy().to_string();
+            let (sheet, pore, throughput, files) = scan_local(&dir_str)
+                .with_context(|| format!("Failed to scan local dir: {dir_str}"))?;
 
-            let (sample_sheet, pore_activity, throughput, files) = scan_local(&dir_str)
-                .with_context(|| format!("Failed to scan local run dir: {dir_str}"))?;
             let fc = FlowCell::new(
-                sample_sheet,
-                pore_activity,
+                sheet,
+                pore,
                 throughput,
-                FlowCellLocation::Local(dir_str.clone()),
+                FlowCellLocation::Local(dir_str.to_string()),
                 files,
             )
-            .with_context(|| format!("Failed to load FlowCell from dir: {dir_str}"))?;
+            .with_context(|| format!("Failed to create FlowCell from dir: {dir_str}"))?;
 
-            // Dedup by flowcell_id, retain most recently modified
-            merged_map
-                .entry(fc.sample_sheet.flow_cell_id.clone())
+            self.insert_flowcells(vec![fc]);
+        }
+
+        Ok(())
+    }
+
+    /// Annotate flowcells with matching `IdInput` records from JSON.
+    ///
+    /// Assigns to `FlowCell.cases` all matching `IdInput`s by `flow_cell_id`.
+    ///
+    /// # Arguments
+    /// * `inputs_path` - Path to JSON file containing a list of `IdInput` records.
+    ///
+    /// # Errors
+    /// Returns an error if the file is unreadable or malformed.
+    fn annotate_with_inputs(&mut self, inputs_path: &str) -> anyhow::Result<()> {
+        if !Path::new(inputs_path).exists() {
+            warn!("IdsInput file not found at {}", inputs_path);
+            return Ok(());
+        }
+
+        let inputs = IdsInput::load_json(inputs_path)
+            .with_context(|| format!("Failed to load IdsInput from: {inputs_path}"))?;
+
+        self.cross_cases(inputs)
+            .with_context(|| format!("Failed to cross with IdsInput from: {inputs_path}"))?;
+
+        Ok(())
+    }
+
+    pub fn cross_cases(&mut self, inputs: IdsInput) -> anyhow::Result<()> {
+        for fc in &mut self.flow_cells {
+            fc.cases = inputs
+                .data
+                .iter()
+                .filter(|id| id.flow_cell_id == fc.sample_sheet.flow_cell_id)
+                .cloned()
+                .collect();
+        }
+
+        Ok(())
+    }
+
+    /// Insert new flowcells into the current collection, deduplicating by `flowcell_id`.
+    ///
+    /// For duplicates, retains the flowcell with the newer `.modified` timestamp.
+    ///
+    /// # Arguments
+    /// * `new_cells` - A vector of parsed `FlowCell` instances.
+    fn insert_flowcells(&mut self, new_cells: Vec<FlowCell>) {
+        let mut map: HashMap<String, FlowCell> = self
+            .flow_cells
+            .drain(..)
+            .map(|fc| (fc.sample_sheet.flow_cell_id.clone(), fc))
+            .collect();
+
+        for fc in new_cells {
+            map.entry(fc.sample_sheet.flow_cell_id.clone())
                 .and_modify(|existing| {
                     if fc.modified > existing.modified {
                         *existing = fc.clone();
@@ -200,25 +266,7 @@ impl FlowCells {
                 .or_insert(fc);
         }
 
-        // Load IdsInput and annotate flowcells
-        if Path::new(inputs_path).exists() {
-            let inputs = IdsInput::load_json(inputs_path)
-                .with_context(|| format!("Failed to load json from: {inputs_path}"))?;
-            for fc in merged_map.values_mut() {
-                fc.cases = inputs
-                    .data
-                    .iter()
-                    .filter(|info| info.flow_cell_id == fc.sample_sheet.flow_cell_id)
-                    .cloned()
-                    .collect();
-            }
-        } else {
-            warn!("No inputs json...");
-        }
-
-        Ok(Self {
-            flow_cells: merged_map.into_values().collect(),
-        })
+        self.flow_cells = map.into_values().collect();
     }
 
     /// Updates a JSON archive of `FlowCel` objects by scanning `.tar` archives in a directory.