Thomas 6 months ago
parent commit ff1b5435aa
1 changed file with 48 additions and 94 deletions
src/collection/flowcells.rs

@@ -288,109 +288,63 @@ impl FlowCells {
         self.flow_cells = map.into_values().collect();
     }
 
-    /// Updates a JSON archive of `FlowCel` objects by scanning `.tar` archives in a directory.
-    ///
-    /// This function is used to **discover new archived flowcells** by scanning all `.tar` files
-    /// in a given directory, parsing their contents using [`scan_archive`] and [`FlowCel::new`],
-    /// and then appending the results to an existing JSON file (if present).
-    /// Flowcells are **deduplicated** by `flowcell_id`, and the updated result is saved back to disk.
-    ///
-    /// # Arguments
-    /// - `archive_path`: Path to a directory containing `.tar` archives produced by MinKNOW.
-    /// - `save_path`: Path to a JSON file where the deduplicated list of `FlowCel` objects will be saved.
-    ///
-    /// # Behavior
-    /// - If `save_path` exists, the function loads existing flowcells from it.
-    /// - Then it scans all `.tar` files in `archive_path`, one by one:
-    ///     - Extracts `sample_sheet` and `.pod5` file metadata using [`scan_archive`]
-    ///     - Builds a new [`FlowCel`] using [`FlowCel::new`] with location `FlowCellLocation::Archived(...)`
-    ///     - Logs and skips entries that fail to parse
-    /// - All new flowcells are added to the existing list and deduplicated.
-    /// - The updated list is sorted and written back to `save_path`.
-    ///
-    /// # Deduplication
-    /// - Flowcells are deduplicated using `.dedup_by_key(|fc| fc.flowcell_id.clone())`.
-    /// - The last encountered entry is kept if duplicates exist.
-    ///
-    /// # Returns
-    /// - `Ok(())` if scanning and update succeeds.
-    /// - `Err` if the archive directory, `.tar` files, or save path cannot be processed.
-    ///
-    /// # Example
-    /// ```
-    /// update_archive_from_scan("archives/", "flowcell_cache.json")?;
-    /// ```
-    ///
-    /// # See Also
-    /// - [`scan_archive`]
-    /// - [`FlowCell`]
-    /// - [`FlowCellLocation::Archived`]
-    pub fn update_archive_from_scan(archive_path: &str, save_path: &str) -> anyhow::Result<()> {
-        // Load existing archive, if any
-        let mut all: Vec<FlowCell> = if Path::new(save_path).exists() {
-            let file = File::open(save_path)?;
-            serde_json::from_reader(BufReader::new(file))
-                .map_err(|e| anyhow::anyhow!("Failed to parse: {save_path}.\n\t{e}"))?
-        } else {
-            Vec::new()
-        };
-
-        let n_before = all.len();
-        let pattern = format!("{archive_path}/*.tar");
-
-        // Scan all .tar archives
-        let res: Vec<FlowCell> = glob(&pattern)?
-            .filter_map(Result::ok)
-            .filter_map(|path| {
-                let archive_str = path.to_string_lossy();
-                let (sample_sheet, pore_activity, throughput, files) =
-                    match scan_archive(&archive_str) {
-                        Ok(r) => r,
+    /// Discovers new archived flowcells in `archive_dir` and updates the JSON archive at `store_path`.
+    ///
+    /// - Loads the existing archive at `store_path` (gzip-aware) into `self`, if present.
+    /// - Recursively scans **all** `.tar` files under `archive_dir`.
+    /// - For each archive, runs `scan_archive`, builds a `FlowCell` with
+    ///   `FlowCellLocation::Archived(archive_dir)`, and merges it via `insert_flowcells`.
+    /// - Logs how many new flowcells were added.
+    /// - Saves back to `store_path` (gzip-compressed).
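+    ///
+    /// # Example
+    /// A minimal usage sketch; the paths are illustrative and the
+    /// constructor is assumed, so the snippet is not compiled as a doctest:
+    /// ```ignore
+    /// let mut flow_cells = FlowCells::default(); // hypothetical constructor
+    /// flow_cells.update_archive_from_scan("archives/", "flowcells.json.gz")?;
+    /// ```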
+    pub fn update_archive_from_scan(
+        &mut self,
+        archive_dir: &str,
+        store_path: &str,
+    ) -> anyhow::Result<()> {
+        // 1) Load whatever’s already in the archive (if present)
+        self.load_from_archive(store_path)
+            .with_context(|| format!("Loading existing archive from {}", store_path))?;
+
+        let before = self.flow_cells.len();
+        info!("Scanning '{}' for .tar archives…", archive_dir);
+
+        // 2) Find & process every .tar (recursively, via the `**` glob)
+        let pattern = format!("{}/**/*.tar", archive_dir);
+        let entries =
+            glob(&pattern).with_context(|| format!("Invalid glob pattern: {}", &pattern))?;
+        for entry in entries.filter_map(Result::ok) {
+            let path = entry.to_string_lossy();
+            match scan_archive(&path) {
+                Ok((sheet, pore, throughput, files)) => {
+                    match FlowCell::new(
+                        sheet,
+                        pore,
+                        throughput,
+                        FlowCellLocation::Archived(archive_dir.to_string()),
+                        files,
+                    ) {
+                        Ok(fc) => {
+                            // merge & dedupe
+                            self.insert_flowcells(vec![fc]);
+                        }
                         Err(e) => {
-                            warn!("Failed to scan archive {}: {e}", archive_str);
-                            return None;
+                            warn!("Failed to build FlowCell from '{}': {}", path, e);
                         }
-                    };
-                match FlowCell::new(
-                    sample_sheet,
-                    pore_activity,
-                    throughput,
-                    FlowCellLocation::Archived(archive_path.to_string()),
-                    files,
-                ) {
-                    Ok(fc) => Some(fc),
-                    Err(e) => {
-                        warn!("Failed to create FlowCel from {}: {e}", archive_str);
-                        None
                     }
                 }
-            })
-            .collect();
-
-        // Merge, deduplicate, and write updated archive
-        all.extend(res);
-        all.sort_by(|a, b| {
-            a.sample_sheet
-                .flow_cell_id
-                .cmp(&b.sample_sheet.flow_cell_id)
-        });
-        all.dedup_by_key(|v| v.sample_sheet.flow_cell_id.clone());
-
-        let n_final = all.len();
-        info!(
-            "{} new archive(s) discovered.",
-            n_final.saturating_sub(n_before)
-        );
+                Err(e) => warn!("Failed to scan archive '{}': {}", path, e),
+            }
+        }
 
-        let json = serde_json::to_string_pretty(&all)
-            .map_err(|e| anyhow::anyhow!("Can't convert into json.\n{e}"))?;
+        let added = self.flow_cells.len().saturating_sub(before);
+        info!("Discovered {} new archived flowcell(s).", added);
 
-        fs::write(save_path, json)
-            .map_err(|e| anyhow::anyhow!("Can't write file: {save_path}.\n{e}"))?;
+        // 3) Save the merged list back
+        self.save_to_archive(store_path)
+            .with_context(|| format!("Saving updated archive to {}", store_path))?;
 
         Ok(())
     }
-
     /// Sorts `flow_cells` in-place from oldest (`modified` smallest) to newest.
     pub fn sort_by_modified(&mut self) {
         self.flow_cells.sort_by_key(|fc| fc.modified);