Ver Fonte

Pod5sFlowCell

Thomas há 3 meses atrás
pai
commit
1f42f2b8ef

+ 7 - 10
src/callers/clairs.rs

@@ -1,21 +1,18 @@
 use crate::{
-    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, Initialize, ShouldRun, Version},
-    commands::bcftools::{bcftools_concat, bcftools_keep_pass, BcftoolsConfig},
-    config::Config,
-    helpers::{is_file_older, remove_dir_if_exists, temp_file_path},
-    io::vcf::read_vcf,
-    runners::{run_wait, DockerRun, Run},
-    variant::{
+    annotation::{Annotation, Annotations, Caller, CallerCat, Sample}, collection::vcf::Vcf, commands::bcftools::{BcftoolsConfig, bcftools_concat, bcftools_keep_pass}, config::Config, helpers::{is_file_older, remove_dir_if_exists, temp_file_path}, io::vcf::read_vcf, pipes::{Initialize, ShouldRun, Version}, runners::{DockerRun, Run, run_wait}, variant::{
         variant::{Label, Variants},
         variant_collection::VariantCollection,
-    },
+    }
 };
 use anyhow::{Context, Ok};
 use log::{debug, info, warn};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use regex::Regex;
-use std::{fs, path::Path, process::{Command, Stdio}};
+use std::{
+    fs,
+    path::Path,
+    process::{Command, Stdio},
+};
 
 /// A pipeline runner for executing ClairS on paired tumor and normal samples.
 ///

+ 2 - 1
src/callers/deep_somatic.rs

@@ -11,11 +11,12 @@ use regex::Regex;
 
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, Initialize, ShouldRun, Version},
+    collection::vcf::Vcf,
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
     io::vcf::read_vcf,
+    pipes::{Initialize, ShouldRun, Version},
     runners::{run_wait, DockerRun, Run},
     variant::{
         variant::{Label, Variants},

+ 8 - 3
src/callers/deep_variant.rs

@@ -2,15 +2,20 @@ use anyhow::Context;
 use log::{debug, info};
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use regex::Regex;
-use std::{fs, path::Path, process::{Command, Stdio}};
+use std::{
+    fs,
+    path::Path,
+    process::{Command, Stdio},
+};
 
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, InitializeSolo, ShouldRun, Version},
+    collection::vcf::Vcf,
     commands::bcftools::{bcftools_keep_pass, BcftoolsConfig},
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
     io::vcf::read_vcf,
+    pipes::{InitializeSolo, ShouldRun, Version},
     runners::{run_wait, DockerRun, Run},
     variant::{
         variant::{Label, Variants},
@@ -230,7 +235,7 @@ impl Variants for DeepVariant {
         let variants = read_vcf(&vcf_passed)
             .map_err(|e| anyhow::anyhow!("Failed to read DeepVariant VCF {}.\n{e}", vcf_passed))?;
         variants.par_iter().for_each(|v| {
-            annotations.insert_update(v.hash(), &[caller.clone()]);
+            annotations.insert_update(v.hash(), std::slice::from_ref(&caller));
         });
         info!("{}, {} variants loaded.", caller, variants.len());
         Ok(VariantCollection {

+ 4 - 3
src/callers/nanomonsv.rs

@@ -10,12 +10,13 @@ use log::{debug, error, info, warn};
 
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, Initialize, InitializeSolo, ShouldRun, Version},
-    commands::bcftools::{bcftools_concat, bcftools_keep_pass, BcftoolsConfig},
+    collection::vcf::Vcf,
+    commands::bcftools::{BcftoolsConfig, bcftools_concat, bcftools_keep_pass},
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
     io::vcf::read_vcf,
-    runners::{run_wait, CommandRun, Run, RunReport},
+    pipes::{Initialize, InitializeSolo, ShouldRun, Version},
+    runners::{CommandRun, Run, RunReport, run_wait},
     variant::{
         variant::{Label, Variants},
         variant_collection::VariantCollection,

+ 2 - 1
src/callers/savana.rs

@@ -1,6 +1,6 @@
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, Initialize, ShouldRun, Version},
+    collection::vcf::Vcf,
     commands::{
         bcftools::{bcftools_keep_pass, BcftoolsConfig},
         longphase::{LongphaseConfig, LongphaseHap, LongphasePhase},
@@ -8,6 +8,7 @@ use crate::{
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
     io::{readers::get_gz_reader, vcf::read_vcf},
+    pipes::{Initialize, ShouldRun, Version},
     positions::{num_to_contig, GenomeRange},
     runners::{run_wait, CommandRun, Run},
     variant::{

+ 4 - 4
src/callers/severus.rs

@@ -1,14 +1,15 @@
 use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
-    collection::{vcf::Vcf, Initialize, InitializeSolo, ShouldRun, Version},
+    collection::vcf::Vcf,
     commands::{
-        bcftools::{bcftools_keep_pass_precise, BcftoolsConfig},
+        bcftools::{BcftoolsConfig, bcftools_keep_pass_precise},
         longphase::LongphasePhase,
     },
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
     io::vcf::read_vcf,
-    runners::{run_wait, CommandRun, Run},
+    pipes::{Initialize, InitializeSolo, ShouldRun, Version},
+    runners::{CommandRun, Run, run_wait},
     variant::{
         variant::{Label, Variants},
         variant_collection::VariantCollection,
@@ -174,7 +175,6 @@ impl Run for Severus {
     }
 }
 
-
 impl CallerCat for Severus {
     /// Returns the annotation category for Severus calls.
     ///

+ 7 - 936
src/collection/mod.rs

@@ -1,936 +1,7 @@
-use std::{
-    collections::HashMap,
-    fmt,
-    fs::{self, metadata},
-    path::{Path, PathBuf},
-    thread,
-    time::SystemTime,
-};
-
-use anyhow::Context;
-use chrono::{DateTime, Utc};
-use glob::glob;
-use log::{error, info, warn};
-use modbases::{Dmr, ModBasesCollection, ModType};
-use pandora_lib_variants::variants::Variants;
-
-use self::{bam::BamCollection, /* pod5::Pod5Collection, */ vcf::VcfCollection};
-use crate::{
-    callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::NanomonSV},
-    /* collection::pod5::FlowCellCase, */
-    commands::{
-        dorado::Dorado as BasecallAlign,
-        modkit::{bed_methyl, dmr_c_mrd_diag, ModkitConfig},
-    },
-    config::Config,
-    functions::{
-        assembler::{Assembler, AssemblerConfig},
-        variants::{RunVariantsAgg, VariantsConfig},
-    },
-    runners::Run,
-    scan::scan::par_whole_scan,
-};
-
-// pub mod bam;
-// pub mod modbases;
-// pub mod pod5;
-// pub mod vcf;
-// pub mod flowcells;
-// pub mod minknow;
-// pub mod run;
-
-#[derive(Debug, Clone)]
-pub struct CollectionsConfig {
-    pub pod_dir: String,
-    pub corrected_fc_path: String,
-    pub result_dir: String,
-    pub dict_file: String,
-    pub min_diag_cov: f32,
-    pub min_mrd_cov: f32,
-    pub id_black_list: Vec<String>,
-}
-
-impl Default for CollectionsConfig {
-    fn default() -> Self {
-        Self {
-            pod_dir: "/data/run_data".to_string(),
-            corrected_fc_path: "/data/flow_cells.tsv".to_string(),
-            result_dir: "/data/longreads_basic_pipe".to_string(),
-            dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
-            min_diag_cov: 11.0,
-            min_mrd_cov: 10.0,
-            id_black_list: Vec::default(),
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct Collections {
-    pub config: CollectionsConfig,
-    // pub pod5: Pod5Collection,
-    pub bam: BamCollection,
-    pub vcf: VcfCollection,
-    pub modbases: ModBasesCollection,
-    pub tasks: Vec<CollectionsTasks>,
-}
-
-impl Collections {
-    pub fn new(config: CollectionsConfig) -> anyhow::Result<Self> {
-        let CollectionsConfig {
-            pod_dir,
-            corrected_fc_path,
-            result_dir,
-            ..
-        } = &config;
-        // let pod5 = Pod5Collection::new(pod_dir, corrected_fc_path, result_dir)?;
-        let bam = BamCollection::new(result_dir);
-        let vcf = VcfCollection::new(result_dir);
-        let modbases = ModBasesCollection::new(result_dir);
-
-        Ok(Self {
-            // pod5: Pod5Collection::default(),
-            bam,
-            vcf,
-            modbases,
-            tasks: Vec::new(),
-            config,
-        })
-    }
-
-    pub fn todo(&mut self) -> anyhow::Result<()> {
-        info!("Looking for base calling tasks...");
-
-        let mut tasks = Vec::new();
-        let mut to_demux = Vec::new();
-        for run in self.pod5.runs.iter() {
-            for fc in run.flowcells.iter() {
-                let acq_id = fc.pod5_info.acquisition_id.clone();
-                for case in fc.cases.iter() {
-                    let bams_ids: Vec<String> = self
-                        .bam
-                        .get(&case.id, &case.time_point)
-                        .iter()
-                        .flat_map(|b| {
-                            b.composition
-                                .iter()
-                                .map(|c| c.0.clone())
-                                .collect::<Vec<String>>()
-                        })
-                        .filter(|id| *id == acq_id)
-                        .collect();
-                    if bams_ids.is_empty() {
-                        match fc.pod5_type {
-                            pod5::Pod5Type::Raw => to_demux.push(case.clone()),
-                            pod5::Pod5Type::Demuxed => {
-                                tasks.push(CollectionsTasks::Align(case.clone()))
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        // Group for muxed and push task with all cases
-        let mut grouped: HashMap<PathBuf, Vec<FlowCellCase>> = HashMap::new();
-        for case in to_demux {
-            grouped.entry(case.pod_dir.clone()).or_default().push(case);
-        }
-        grouped
-            .into_values()
-            .for_each(|data| tasks.push(CollectionsTasks::DemuxAlign(data)));
-
-        // de novo
-        // tasks.extend(self.todo_assembler()?);
-
-        // Remove VCF anterior to BAM
-        // let vcf_by_id = self.vcf.group_by_id();
-        // vcf_by_id.iter().for_each(|(id, vcfs)| {
-        //     if let (Some(diag), Some(mrd)) = (
-        //         self.bam.get(id, "diag").first(),
-        //         self.bam.get(id, "mrd").first(),
-        //     ) {
-        //         let diag_modified = diag.modified;
-        //         let mrd_modified = mrd.modified;
-        //         let mut rm_paths: Vec<&Path> = vcfs
-        //             .iter()
-        //             .flat_map(|vcf| {
-        //                 let vcf_mod: DateTime<Utc> = vcf
-        //                     .file_metadata
-        //                     .modified()
-        //                     .expect("Can't read VCF modified time.")
-        //                     .into();
-        //
-        //                 // For somatic caller erase if one bam (diag or mrd) is more recent.
-        //                 if vcf.caller != "DeepVariant" {
-        //                     if vcf_mod < diag_modified || vcf_mod < mrd_modified {
-        //                         vec![vcf.path.parent().unwrap()]
-        //                     } else {
-        //                         vec![]
-        //                     }
-        //                 } else if (vcf.time_point == "diag" && vcf_mod < diag_modified)
-        //                     || (vcf.time_point == "mrd" && vcf_mod < mrd_modified)
-        //                 {
-        //                     vec![vcf.path.parent().unwrap()]
-        //                 } else {
-        //                     vec![]
-        //                 }
-        //             })
-        //             .collect();
-        //         rm_paths.sort();
-        //         rm_paths.dedup();
-        //         rm_paths
-        //             .iter()
-        //             .for_each(|p| fs::remove_dir_all(p).expect("Can't erase caller dir."))
-        //     }
-        // });
-
-        // Variant calling
-        info!("Looking for variant calling tasks...");
-        // self.bam.bams.iter().map(|b| b.id.clone()).for_each(|id| {
-        //     if let (Some(diag), Some(mrd)) = (
-        //         self.bam.get(&id, "diag").first(),
-        //         self.bam.get(&id, "mrd").first(),
-        //     ) {
-        //         if let (Some(diag_cramino), Some(mrd_cramino)) = (&diag.cramino, &mrd.cramino) {
-        //             if diag_cramino.mean_coverage >= self.config.min_diag_cov.into()
-        //                 && mrd_cramino.mean_coverage >= self.config.min_mrd_cov.into()
-        //                 && !self.config.id_black_list.contains(&id)
-        //             {
-        //                 let caller_time: Vec<(&str, &str)> = vcf_by_id
-        //                     .iter()
-        //                     .filter(|(i, _)| *i == id)
-        //                     .flat_map(|(_, vcfs)| {
-        //                         vcfs.iter()
-        //                             .map(|vcf| (vcf.caller.as_str(), vcf.time_point.as_str()))
-        //                     })
-        //                     .collect();
-        //
-        //                 if !caller_time.contains(&("clairs", "diag"))
-        //                     || !caller_time.contains(&("clairs_indel", "diag"))
-        //                 {
-        //                     tasks.push(CollectionsTasks::ClairS {
-        //                         id: id.to_string(),
-        //                         diag_bam: diag.path.to_str().unwrap().to_string(),
-        //                         mrd_bam: mrd.path.to_str().unwrap().to_string(),
-        //                         config: ClairSConfig::default(),
-        //                     });
-        //                 }
-        //                 if !caller_time.contains(&("DeepVariant", "diag")) {
-        //                     tasks.push(CollectionsTasks::DeepVariant {
-        //                         id: id.to_string(),
-        //                         time_point: "diag".to_string(),
-        //                         bam: diag.path.to_str().unwrap().to_string(),
-        //                         config: DeepVariantConfig::default(),
-        //                     });
-        //                 }
-        //                 if !caller_time.contains(&("DeepVariant", "mrd")) {
-        //                     tasks.push(CollectionsTasks::DeepVariant {
-        //                         id: id.to_string(),
-        //                         time_point: "mrd".to_string(),
-        //                         bam: mrd.path.to_str().unwrap().to_string(),
-        //                         config: DeepVariantConfig::default(),
-        //                     });
-        //                 }
-        //                 if !caller_time.contains(&("nanomonsv", "diag")) {
-        //                     tasks.push(CollectionsTasks::NanomonSV {
-        //                         id: id.to_string(),
-        //                         diag_bam: diag.path.to_str().unwrap().to_string(),
-        //                         mrd_bam: mrd.path.to_str().unwrap().to_string(),
-        //                         config: NanomonSVConfig::default(),
-        //                     });
-        //                 }
-        //             }
-        //         }
-        //     }
-        // });
-
-        // Tasks dedup
-        let mut hs = HashMap::new();
-        tasks.into_iter().for_each(|t| {
-            hs.insert(t.to_string(), t);
-        });
-        self.tasks = hs.into_values().collect();
-
-        // Variants DeepVariant
-        // self.tasks.extend(self.todo_deepvariants());
-
-        // Variants ClairS
-        // self.tasks.extend(self.todo_clairs());
-
-        // Variants Nanomonsv
-        // self.tasks.extend(self.todo_nanomonsv());
-
-        // Variants aggregation
-        // self.tasks.extend(self.todo_variants_agg()?);
-
-        // ModPileup
-        // self.tasks.extend(self.todo_mod_pileup());
-
-        // DMR C diag vs mrd
-        // self.tasks.extend(self.todo_dmr_c_diag_mrd());
-
-        // Tasks sorting
-        self.tasks.sort_by_cached_key(|task| task.get_order());
-
-        Ok(())
-    }
-
-    pub fn tasks_dedup(&mut self) {
-        let mut hs = HashMap::new();
-        self.tasks.clone().into_iter().for_each(|t| {
-            hs.insert(t.to_string(), t);
-        });
-        self.tasks = hs.into_values().collect();
-    }
-
-    pub fn todo_bam_count(&mut self, config: &Config) -> anyhow::Result<()> {
-        // Whole scan
-        for wgs_bam in self
-            .bam
-            .by_id_completed(self.config.min_diag_cov, self.config.min_mrd_cov)
-        {
-            let id = wgs_bam.id.as_str();
-
-            let count_dir = match wgs_bam.time_point.as_str() {
-                "diag" => config.tumoral_dir_count(id),
-                "mrd" => config.normal_dir_count(id),
-                _ => anyhow::bail!("Unknown bam time point {}", wgs_bam.time_point),
-            };
-
-            if PathBuf::from(&count_dir).exists() {
-                let dir_mod: DateTime<Utc> = fs::metadata(&count_dir)?.modified()?.into();
-                if wgs_bam.modified > dir_mod {
-                    fs::remove_dir_all(&count_dir)?;
-                }
-            }
-
-            if !PathBuf::from(&count_dir).exists() {
-                self.tasks.push(CollectionsTasks::CountBam {
-                    bam_path: wgs_bam.path.to_string_lossy().to_string(),
-                    count_dir,
-                    config: config.clone(),
-                });
-            }
-        }
-        Ok(())
-    }
-
-    // No pair needed
-    pub fn todo_assembler(&self) -> anyhow::Result<Vec<CollectionsTasks>> {
-        let mut tasks = Vec::new();
-        let config = AssemblerConfig::default();
-        for b in &self.bam.bams {
-            let assemblies_dir = format!(
-                "{}/{}/{}/{}",
-                config.result_dir, b.id, b.time_point, config.output_dir_name
-            );
-
-            if !Path::new(&assemblies_dir).exists() {
-                tasks.push(CollectionsTasks::Assemble {
-                    id: b.id.clone(),
-                    time_point: b.time_point.clone(),
-                    config: config.clone(),
-                });
-                continue;
-            }
-
-            let pattern = format!("{assemblies_dir}/*/*.bam");
-            let mut mtimes: Vec<SystemTime> = glob(&pattern)?
-                .filter_map(|entry| entry.ok())
-                .filter_map(|path| metadata(path).ok()?.modified().ok())
-                .collect();
-
-            if mtimes.is_empty() {
-                tasks.push(CollectionsTasks::Assemble {
-                    id: b.id.clone(),
-                    time_point: b.time_point.clone(),
-                    config: config.clone(),
-                });
-                continue;
-            }
-            mtimes.sort_unstable();
-            mtimes.dedup();
-            let max_mtime: DateTime<Utc> =
-                mtimes.last().context("No modified time")?.to_owned().into();
-            if b.modified > max_mtime {
-                tasks.push(CollectionsTasks::Assemble {
-                    id: b.id.clone(),
-                    time_point: b.time_point.clone(),
-                    config: config.clone(),
-                });
-            }
-        }
-
-        Ok(tasks)
-    }
-
-    pub fn todo_deepvariants(&self) -> Vec<CollectionsTasks> {
-        self.bam
-            .bams
-            .iter()
-            .filter_map(|b| {
-                if self.vcf.vcfs.iter().any(|v| {
-                    v.caller == "DeepVariant"
-                        && v.id == b.id
-                        && v.time == b.time_point
-                        && v.modified().unwrap_or_default() > b.modified
-                }) {
-                    None
-                } else {
-                    Some(CollectionsTasks::DeepVariant {
-                        id: b.id.clone(),
-                        time_point: b.time_point.clone(),
-                        bam: b.path.to_string_lossy().to_string(),
-                    })
-                }
-            })
-            .collect()
-    }
-
-    pub fn todo_clairs(&self) -> Vec<CollectionsTasks> {
-        self.bam_pairs()
-            .iter()
-            .filter_map(|(diag, mrd)| {
-                if self.vcf.vcfs.iter().any(|v| {
-                    v.caller == "clairs"
-                        && v.id == diag.id
-                        && v.time == diag.time_point
-                        && (v.modified().unwrap_or_default() > diag.modified
-                            || v.modified().unwrap_or_default() > mrd.modified)
-                }) {
-                    None
-                } else {
-                    Some(CollectionsTasks::ClairS {
-                        id: diag.id.clone(),
-                        diag_bam: diag.path.to_string_lossy().to_string(),
-                        mrd_bam: mrd.path.to_string_lossy().to_string(),
-                    })
-                }
-            })
-            .collect()
-    }
-    pub fn run_clairs(&self) -> anyhow::Result<()> {
-        for task in self.todo_clairs() {
-            match task.run() {
-                Ok(_) => info!("done"),
-                Err(e) => warn!("{e}"),
-            }
-        }
-        Ok(())
-    }
-
-    pub fn todo_nanomonsv(&self) -> Vec<CollectionsTasks> {
-        self.bam_pairs()
-            .iter()
-            .filter_map(|(diag, mrd)| {
-                if self.vcf.vcfs.iter().any(|v| {
-                    v.caller == "nanomonsv"
-                        && v.id == diag.id
-                        && v.time == diag.time_point
-                        && (v.modified().unwrap_or_default() > diag.modified
-                            || v.modified().unwrap_or_default() > mrd.modified)
-                }) {
-                    None
-                } else {
-                    Some(CollectionsTasks::NanomonSV {
-                        id: diag.id.clone(),
-                    })
-                }
-            })
-            .collect()
-    }
-    pub fn todo_mod_pileup(&self) -> Vec<CollectionsTasks> {
-        let config = ModkitConfig::default();
-        self.bam
-            .bams
-            .iter()
-            .filter_map(|b| {
-                if self.modbases.modbases.iter().any(|mb| {
-                    mb.id == b.id && mb.time_point == b.time_point && mb.pileup_modif > b.modified
-                }) {
-                    None
-                } else {
-                    Some(CollectionsTasks::ModPileup {
-                        bam: b.path.clone(),
-                        config: config.clone(),
-                    })
-                }
-            })
-            .collect()
-    }
-
-    pub fn todo_dmr_c_diag_mrd(&self) -> Vec<CollectionsTasks> {
-        let config = ModkitConfig::default();
-        self.bam
-            .ids()
-            .iter()
-            .filter_map(|id| {
-                if let Ok((diag, mrd)) = self.modbases.get_diag_mrd(id, ModType::Mod5mC5hmC) {
-                    let dmr: Vec<&Dmr> = diag
-                        .dmr_files
-                        .iter()
-                        .filter(|dmr| dmr.base == "C" && dmr.vs == "mrd")
-                        .collect();
-
-                    if dmr.len() == 1 {
-                        let dmr = dmr.first().unwrap();
-                        if let Ok(metadata) = dmr.path.metadata() {
-                            if let Ok(modif) = metadata.modified() {
-                                let m: DateTime<Utc> = modif.into();
-                                if diag.pileup_modif > m || mrd.pileup_modif > m {
-                                    return Some(CollectionsTasks::DMRCDiagMrd {
-                                        id: id.clone(),
-                                        config: config.clone(),
-                                    });
-                                }
-                            }
-                        }
-                        None
-                    } else {
-                        Some(CollectionsTasks::DMRCDiagMrd {
-                            id: id.clone(),
-                            config: config.clone(),
-                        })
-                    }
-                } else {
-                    None
-                }
-            })
-            .collect()
-    }
-
-    /// Generates pairs of diagnostic and MRD BAM files.
-    ///
-    /// This function performs the following steps:
-    /// 1. Extracts and deduplicates IDs from all BAM files.
-    /// 2. For each unique ID, attempts to find a pair of BAM files:
-    ///    - One labeled as "diag" (diagnostic)
-    ///    - One labeled as "mrd" (minimal residual disease)
-    /// 3. Returns pairs where both "diag" and "mrd" BAMs are found.
-    ///
-    /// # Returns
-    ///
-    /// * `Vec<(bam::Bam, bam::Bam)>` - A vector of tuples, each containing a pair of BAM files
-    ///   (diagnostic and MRD) for a unique ID.
-    ///
-    pub fn bam_pairs(&self) -> Vec<(bam::WGSBam, bam::WGSBam)> {
-        let mut ids: Vec<String> = self.bam.bams.iter().map(|b| b.id.clone()).collect();
-        ids.sort();
-        ids.dedup();
-
-        ids.iter()
-            .filter_map(|id| {
-                match (
-                    self.bam.get(id, "diag").first(),
-                    self.bam.get(id, "mrd").first(),
-                ) {
-                    (Some(&diag), Some(&mrd)) => Some((diag.clone(), mrd.clone())),
-                    _ => None,
-                }
-            })
-            .collect()
-    }
-
-    /// Aggregates variant tasks based on BAM pairs and VCF files.
-    ///
-    /// This function performs the following operations:
-    /// 1. Iterates through BAM pairs (DIAG/MRD).
-    /// 2. Checks for the existence of a _constit.bytes.gz file for each pair.
-    /// 3. If the file exists, compares its modification time with VCF files.
-    /// 4. Creates variant tasks if the file is older than one of VCF or if it doesn't exist.
-    ///
-    /// # Arguments
-    ///
-    /// * `self` - The struct instance containing BAM pairs and VCF information.
-    ///
-    /// # Returns
-    ///
-    /// * `anyhow::Result<Vec<CollectionsTasks>>` - A Result containing a vector of `CollectionsTasks::Variants`
-    ///   if successful, or an error if file metadata cannot be accessed.
-    // pub fn todo_variants_agg(&self) -> anyhow::Result<Vec<CollectionsTasks>> {
-    //     let mut tasks = Vec::new();
-    //     let config = VariantsConfig::default();
-    //     let vcfs_ids = self.vcf.group_by_id();
-    //     for pair in &self.bam_pairs() {
-    //         if self.config.id_black_list.contains(&pair.0.id) {
-    //             continue;
-    //         }
-    //         let const_path = format!(
-    //             "{}/{}/diag/{}_constit.bytes.gz",
-    //             &config.result_dir, &pair.0.id, &pair.0.id
-    //         );
-    //         let constit = Path::new(&const_path);
-    //
-    //         if constit.exists() {
-    //             let vcfs: Vec<_> = vcfs_ids.iter().filter(|(id, _)| id == &pair.0.id).collect();
-    //             if let Some((_, vcfs)) = vcfs.first() {
-    //                 let mtime = constit
-    //                     .metadata()
-    //                     .context(format!("Can't access file metadata {const_path}."))?
-    //                     .mtime();
-    //                 let n_new = vcfs
-    //                     .iter()
-    //                     .filter(|vcf| mtime < vcf.file_metadata.mtime())
-    //                     .count();
-    //                 if n_new > 0 {
-    //                     tasks.push(CollectionsTasks::SomaticVariants {
-    //                         id: pair.0.id.clone(),
-    //                         config: config.clone(),
-    //                     });
-    //                 }
-    //             }
-    //         } else {
-    //             tasks.push(CollectionsTasks::SomaticVariants {
-    //                 id: pair.0.id.clone(),
-    //                 config: config.clone(),
-    //             });
-    //         }
-    //     }
-    //     Ok(tasks)
-    // }
-
-    /// Runs all tasks in the collection.
-    ///
-    /// This method attempts to execute each task in the collection.
-    ///
-    /// # Returns
-    ///
-    /// Returns `Ok(())` if the process completes without any critical errors, even if
-    /// individual tasks fail.
-    ///
-    /// # Errors
-    ///
-    /// This function will return an error if:
-    /// - Fetching todo tasks fails when the initial task list is empty.
-    /// - Any critical error occurs during the execution process.
-    ///
-    /// Note that individual task failures do not cause this method to return an error.
-    pub fn run(&mut self) -> anyhow::Result<()> {
-        if self.tasks.is_empty() {
-            self.todo().context("Failed to fetch todo tasks")?;
-            if self.tasks.is_empty() {
-                info!("No tasks to run");
-                return Ok(());
-            }
-        }
-
-        let n_tasks = self.tasks.len();
-        warn!("{n_tasks} tasks to run");
-
-        let mut completed_tasks = Vec::new();
-
-        for (i, task) in self.tasks.iter().enumerate() {
-            warn!("Running task {}/{}", i + 1, n_tasks);
-            info!("{task}");
-
-            match task.clone().run() {
-                Ok(_) => {
-                    info!("Task completed successfully");
-                    completed_tasks.push(i);
-                }
-                Err(err) => error!("Task failed: {}", err),
-            }
-        }
-
-        // Remove completed tasks
-        for &index in completed_tasks.iter().rev() {
-            self.tasks.remove(index);
-        }
-
-        info!(
-            "{} tasks completed, {} tasks remaining",
-            completed_tasks.len(),
-            self.tasks.len()
-        );
-
-        Ok(())
-    }
-
-    pub fn run_deepvariant(&mut self) -> anyhow::Result<()> {
-        let tasks = self.todo_deepvariants();
-
-        let n_tasks = tasks.len();
-        warn!("{n_tasks} tasks to run");
-
-        for (i, tasks_chunk) in tasks.chunks_exact(2).enumerate() {
-            match tasks_chunk {
-                [a, b] => {
-                    warn!("Running task {}/{} and {}/{n_tasks}", i + 1, n_tasks, i + 2);
-                    info!("{a} and {b}");
-
-                    let a = if let CollectionsTasks::DeepVariant {
-                        id,
-                        time_point,
-                        bam,
-                        ..
-                    } = a
-                    {
-                        CollectionsTasks::DeepVariant {
-                            id: id.to_string(),
-                            time_point: time_point.to_string(),
-                            bam: bam.to_string(),
-                        }
-                    } else {
-                        anyhow::bail!("Err")
-                    };
-
-                    let b = if let CollectionsTasks::DeepVariant {
-                        id,
-                        time_point,
-                        bam,
-                        ..
-                    } = b
-                    {
-                        CollectionsTasks::DeepVariant {
-                            id: id.to_string(),
-                            time_point: time_point.to_string(),
-                            bam: bam.to_string(),
-                        }
-                    } else {
-                        anyhow::bail!("Err");
-                    };
-
-                    let handle1 = thread::spawn(|| a.run());
-                    let handle2 = thread::spawn(|| b.run());
-                    let _ = handle1.join().unwrap();
-                    let _ = handle2.join().unwrap();
-                }
-                [a] => {
-                    info!("Single task: ({})", a);
-                    let _ = a.clone().run();
-                }
-                _ => (),
-            }
-        }
-
-        Ok(())
-    }
-}
-
-#[derive(Clone, Debug)]
-pub enum CollectionsTasks {
-    Align(FlowCellCase),
-    DemuxAlign(Vec<FlowCellCase>),
-    CountBam {
-        bam_path: String,
-        count_dir: String,
-        config: Config,
-    },
-    Assemble {
-        id: String,
-        time_point: String,
-        config: AssemblerConfig,
-    },
-    ModPileup {
-        bam: PathBuf,
-        config: ModkitConfig,
-    },
-    DMRCDiagMrd {
-        id: String,
-        config: ModkitConfig,
-    },
-    DeepVariant {
-        id: String,
-        time_point: String,
-        bam: String,
-    },
-    ClairS {
-        id: String,
-        diag_bam: String,
-        mrd_bam: String,
-    },
-    NanomonSV {
-        id: String,
-    },
-    SomaticVariants {
-        id: String,
-        config: VariantsConfig,
-    },
-}
-
-impl CollectionsTasks {
-    pub fn run(self) -> anyhow::Result<()> {
-        match self {
-            CollectionsTasks::Align(case) => {
-                BasecallAlign::init(case.clone(), Config::default())?.run_pipe()
-            }
-            CollectionsTasks::DemuxAlign(cases) => {
-                BasecallAlign::from_mux(cases, Config::default())
-            }
-            CollectionsTasks::ModPileup { bam, config } => bed_methyl(bam, &config),
-            CollectionsTasks::DeepVariant { id, time_point, .. } => {
-                DeepVariant::initialize(&id, &time_point, Config::default())?.run()
-            }
-            CollectionsTasks::ClairS { id, .. } => {
-                ClairS::initialize(&id, Config::default())?.run()
-            }
-            CollectionsTasks::NanomonSV { id, .. } => {
-                NanomonSV::initialize(&id, Config::default())?.run()
-            }
-            CollectionsTasks::CountBam {
-                bam_path,
-                count_dir,
-                config,
-            } => par_whole_scan(&count_dir, &bam_path, &config),
-            CollectionsTasks::SomaticVariants { id, config } => {
-                RunVariantsAgg::new(id, config).run()
-            }
-            CollectionsTasks::Assemble {
-                id,
-                time_point,
-                config,
-            } => Assembler::new(id, time_point, config).run(),
-            CollectionsTasks::DMRCDiagMrd { id, config } => dmr_c_mrd_diag(&id, &config),
-        }
-    }
-
-    pub fn get_order(&self) -> u8 {
-        match self {
-            CollectionsTasks::Align(_) => 0,
-            CollectionsTasks::DemuxAlign(_) => 1,
-            CollectionsTasks::ModPileup { .. } => 2,
-            CollectionsTasks::DMRCDiagMrd { .. } => 3,
-            CollectionsTasks::CountBam { .. } => 4,
-            CollectionsTasks::Assemble { .. } => 5,
-            CollectionsTasks::DeepVariant { .. } => 6,
-            CollectionsTasks::ClairS { .. } => 7,
-            CollectionsTasks::NanomonSV { .. } => 8,
-            CollectionsTasks::SomaticVariants { .. } => 9,
-        }
-    }
-}
-
-// Implement Display for CollectionsTasks
-impl fmt::Display for CollectionsTasks {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        use CollectionsTasks::*;
-
-        match self {
-            Align(case) => write!(
-                f,
-                "Alignment task for: {} {} {} {}",
-                case.id,
-                case.time_point,
-                case.barcode,
-                case.pod_dir.display()
-            ),
-            DemuxAlign(cases) => write!(
-                f,
-                "Demultiplex and alignment task for: {}",
-                cases
-                    .iter()
-                    .map(|c| format!("{} {} {}", c.id, c.time_point, c.barcode))
-                    .collect::<Vec<String>>()
-                    .join(", ")
-            ),
-            DeepVariant {
-                id,
-                time_point,
-                bam,
-                ..
-            } => {
-                write!(
-                    f,
-                    "DeepVariant calling task for {} {}, from bam: {}",
-                    id, time_point, bam
-                )
-            }
-            ClairS {
-                id,
-                diag_bam,
-                mrd_bam,
-                ..
-            } => {
-                write!(
-                    f,
-                    "ClairS calling task for {}, with diag_bam: {}, mrd_bam: {}",
-                    id, diag_bam, mrd_bam
-                )
-            }
-            NanomonSV { id } => {
-                write!(f, "NanomonSV calling task for {id}")
-            }
-            CountBam {
-                bam_path,
-                count_dir,
-                ..
-            } => write!(f, "Whole bam count for bam: {bam_path} into {count_dir}"),
-            SomaticVariants { id, .. } => write!(f, "Variants aggregation for {}", id),
-            Assemble { id, time_point, .. } => {
-                write!(f, "De novo assemblage for {} {}", id, time_point)
-            }
-            ModPileup { bam, .. } => write!(f, "ModPileup for {}", bam.display()),
-            DMRCDiagMrd { id, .. } => write!(f, "DMR C methylation diag vs mrd for {id}"),
-        }
-    }
-}
-
-pub fn run_tasks(config: CollectionsConfig) -> anyhow::Result<()> {
-    let mut last_n = Vec::with_capacity(3);
-    let mut consecutive_same_count = 0;
-
-    loop {
-        let mut collection =
-            Collections::new(config.clone()).context("Failed to create new Collections")?;
-        collection.todo().context("Failed to get todo tasks")?;
-        collection
-            .tasks
-            .iter()
-            .for_each(|t| info!("Planned task: {t}"));
-
-        let n_tasks = collection.tasks.len();
-
-        if n_tasks == 0 {
-            info!("All results are up to date");
-            break;
-        }
-
-        if last_n.len() >= 2 && last_n.iter().rev().take(2).all(|&x| x == n_tasks) {
-            consecutive_same_count += 1;
-            if consecutive_same_count >= 2 {
-                error!("Tasks are not progressing");
-                break;
-            }
-        } else {
-            consecutive_same_count = 0;
-        }
-
-        last_n.push(n_tasks);
-        if last_n.len() > 3 {
-            last_n.remove(0);
-        }
-
-        collection.run().context("Failed to run collection tasks")?;
-    }
-
-    Ok(())
-}
-
-pub trait Initialize: Sized {
-    fn initialize(id: &str, config: Config) -> anyhow::Result<Self>;
-}
-
-pub trait InitializeSolo: Sized {
-    fn initialize(id: &str, time: &str, config: Config) -> anyhow::Result<Self>;
-}
-
-pub trait ShouldRun {
-    fn should_run(&self) -> bool;
-}
-
-pub trait Version {
-    fn version(config: &Config) -> anyhow::Result<String>;
-}
-
-pub trait LoadVariants {
-    fn load_variants(&self) -> anyhow::Result<Variants>;
-}
-
-pub fn exists_all(paths: Vec<&str>) -> anyhow::Result<()> {
-    for path in paths.iter() {
-        if !Path::new(path).exists() {
-            anyhow::bail!("{path} should exist")
-        }
-    }
-    Ok(())
-}
+pub mod bam;
+pub mod flowcells;
+pub mod minknow;
+pub mod modbases;
+pub mod pod5;
+pub mod run;
+pub mod vcf;

+ 240 - 626
src/collection/pod5.rs

@@ -1,682 +1,296 @@
-use anyhow::{anyhow, Context, Result};
-use chrono::{DateTime, Utc};
-use csv::ReaderBuilder;
-use glob::glob;
-use hashbrown::HashMap;
-use log::{info, warn};
-use rayon::prelude::*;
-use serde::{Deserialize, Serialize};
 use std::{
-    fmt::Display,
-    fs::{self, File, Metadata},
-    os::unix::fs::MetadataExt,
+    fmt,
+    fs::File,
+    io::{BufReader, Write},
     path::{Path, PathBuf},
 };
-use crate::io::pod5_infos::Pod5Info;
-
-/// Represents a collection of Pod5 sequencing runs and associated metadata.
-///
-/// A `Pod5Collection` groups multiple sequencing runs (`Run`), each consisting of
-/// one or more flow cells. It is initialized by scanning a directory of `.pod5` files,
-/// optionally mapping flow cell names to corrected identifiers, and assigning BAM and
-/// `.pod5` directories.
-///
-/// # Fields
-/// - `importation_date`: Timestamp of when this collection was created.
-/// - `runs`: List of runs with associated flow cells and metadata.
-/// - `bam_dir`: Directory containing BAM files.
-/// - `pod5_dir`: Directory containing `.pod5` files.
-#[derive(Debug, Default)]
-pub struct Pod5Collection {
-    pub importation_date: DateTime<Utc>,
-    pub runs: Vec<Run>,
-    pub bam_dir: String,
-    pub pod5_dir: String,
-}
 
-impl Pod5Collection {
-    /// Constructs a new `Pod5Collection` by scanning the given `.pod5` directory,
-    /// applying corrected flowcell naming, and grouping data by run.
-    ///
-    /// # Arguments
-    /// - `pod5_dir`: Path to directory containing `.pod5` files.
-    /// - `corrected_fc_path`: Path to file with corrected flowcell mappings.
-    /// - `bam_dir`: Path to directory containing BAM files.
-    ///
-    /// # Returns
-    /// - `Ok(Pod5Collection)` if the data is consistent and valid.
-    /// - `Err(anyhow::Error)` if listing, parsing, or validation fails.
-    pub fn new(pod5_dir: &str, corrected_fc_path: &str, bam_dir: &str) -> Result<Self> {
-        // Load pod5 files
-        let pod5_files = list_pod_files(pod5_dir)?;
-        info!("n pod5 {}", pod5_files.len());
-
-        // Group pod5 files by run-flowcell key
-        let mut grouped: HashMap<String, Vec<Pod5>> = HashMap::new();
-        for pod in pod5_files {
-            let key = format!("{}••{}", pod.run_name, pod.flowcell_name);
-            grouped.entry(key).or_default().push(pod);
-        }
-
-        // Load corrected flowcell mapping
-        let corrected_fc = load_flowcells_corrected_names(corrected_fc_path)?;
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
 
-        // Construct FlowCells in parallel from Pod5 groups
-        let flowcells: Vec<FlowCell> = grouped
-            .into_values()
-            .par_bridge()
-            .map(|group| FlowCell::new(group, &corrected_fc))
-            .collect::<Result<Vec<_>>>()?;
+use crate::{helpers::{human_size, list_files_with_ext}, io::pod5_infos::Pod5Info};
 
-        // Group FlowCells by run_name (sequential step)
-        let mut runs_map: HashMap<String, Vec<FlowCell>> = HashMap::new();
-        for fc in flowcells {
-            runs_map.entry(fc.run_name.clone()).or_default().push(fc);
-        }
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Pod5 {
+    pub name: String,
+    pub file_size: u64,
+    pub path: PathBuf,
+    pub acquisition_id: String,
+    pub acquisition_start_time: DateTime<Utc>,
+    pub adc_max: i16,
+    pub adc_min: i16,
+    pub experiment_name: String,
+    pub flow_cell_id: String,
+    pub flow_cell_product_code: String,
+    pub protocol_name: String,
+    pub protocol_run_id: String,
+    pub protocol_start_time: DateTime<Utc>,
+    pub sample_id: String,
+    pub sample_rate: u16,
+    pub sequencing_kit: String,
+    pub sequencer_position: String,
+    pub sequencer_position_type: String,
+    pub software: String,
+    pub system_name: String,
+    pub system_type: String,
+}
 
-        // Convert each run group into a Run
-        let runs: Vec<Run> = runs_map
-            .into_values()
-            .map(|fcs| Run {
-                run_name: fcs[0].run_name.clone(),
-                flowcells: fcs,
-            })
-            .collect();
+impl Pod5 {
+    /// Construct a `Pod5` from a filesystem path.
+    ///
+    /// This loads the metadata using `Pod5Info::from_pod5` and fills the
+    /// corresponding fields in `Pod5`.
+    pub fn from_path<P: AsRef<Path>>(path: P) -> std::io::Result<Self> {
+        let path_ref = path.as_ref();
+        let info = Pod5Info::from_pod5(path_ref.to_str().unwrap());
+        let file_size = std::fs::metadata(path_ref)?.len();
 
         Ok(Self {
-            importation_date: Utc::now(),
-            runs,
-            bam_dir: bam_dir.to_string(),
-            pod5_dir: pod5_dir.to_string(),
+            name: path_ref
+                .file_name()
+                .and_then(|s| s.to_str())
+                .unwrap_or("")
+                .to_string(),
+            file_size,
+            path: PathBuf::from(path_ref),
+
+            acquisition_id: info.acquisition_id,
+            acquisition_start_time: info.acquisition_start_time,
+            adc_max: info.adc_max,
+            adc_min: info.adc_min,
+            experiment_name: info.experiment_name,
+            flow_cell_id: info.flow_cell_id,
+            flow_cell_product_code: info.flow_cell_product_code,
+            protocol_name: info.protocol_name,
+            protocol_run_id: info.protocol_run_id,
+            protocol_start_time: info.protocol_start_time,
+            sample_id: info.sample_id,
+            sample_rate: info.sample_rate,
+            sequencing_kit: info.sequencing_kit,
+            sequencer_position: info.sequencer_position,
+            sequencer_position_type: info.sequencer_position_type,
+            software: info.software,
+            system_name: info.system_name,
+            system_type: info.system_type,
         })
     }
-
-    pub fn print_info(&self) {
-        self.runs.iter().for_each(|run| {
-            run.flowcells.iter().for_each(|fc| {
-                let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
-                let n_files = fc.pod5.len();
-                let dates: Vec<DateTime<Utc>> = fc
-                    .pod5
-                    .iter()
-                    .map(|p| p.file_metadata.modified().unwrap().into())
-                    .collect();
-                let from = dates.iter().min().unwrap();
-                let to = dates.iter().max().unwrap();
-                let s = [
-                    run.run_name.clone(),
-                    from.to_string(),
-                    to.to_string(),
-                    n_files.to_string(),
-                    total_size.to_string(),
-                    fc.flowcell_name.to_string(),
-                    fc.pod5_type.to_string(),
-                    fc.pod5_info.acquisition_id.clone(),
-                    format!("{:?}", fc.cases),
-                ]
-                .join("\t");
-                println!("{s}");
-            });
-        });
-    }
-
-    /// Returns a sorted and deduplicated list of all unique `FlowCellCase` IDs in the collection.
-    pub fn ids(&self) -> Vec<String> {
-        let mut ids: Vec<String> = self
-            .runs
-            .iter()
-            .flat_map(|r| r.flowcells.iter())
-            .flat_map(|f| f.cases.iter().map(|c| c.id.clone()))
-            .collect();
-
-        ids.sort_unstable(); // faster than sort()
-        ids.dedup();
-        ids
-    }
 }
 
-/// Represents a sequencing run, which may contain multiple flowcells.
-///
-/// A `Run` groups flowcells that were processed together during a sequencing event
-/// (e.g., a MinION or PromethION run). It serves as a logical grouping for downstream analysis.
-///
-/// # Fields
-/// - `run_name`: Unique identifier for the sequencing run (e.g., "20240301_RUN42").
-/// - `flowcells`: List of `FlowCell` objects associated with this run.
 #[derive(Debug)]
-pub struct Run {
-    /// Name of the sequencing run.
-    pub run_name: String,
-
-    /// Flowcells that belong to this run.
-    pub flowcells: Vec<FlowCell>,
+pub struct Pod5sFlowCell {
+    pub flow_cell_id: String,
+    pub pod5s: Vec<Pod5>,
 }
 
-/// Represents a flowcell and its associated metadata, cases, and `.pod5` files.
-///
-/// A `FlowCell` encapsulates all relevant information needed to track,
-/// identify, and process a physical flowcell, including its corrected name,
-/// acquisition metadata, and associated `.pod5` files.
-///
-/// # Fields
-/// - `flowcell_name`: Original name of the flowcell as found in `.pod5` files.
-/// - `corrected_name`: Normalized or corrected version of the flowcell name (if available).
-/// - `cases`: Associated cases (`FlowCellCase`) for this flowcell, usually representing samples or barcodes.
-/// - `run_name`: Name of the sequencing run this flowcell belongs to.
-/// - `pod5_type`: Whether the `.pod5` files are raw or demultiplexed (`Pod5Type`).
-/// - `pod5_info`: Metadata extracted from one representative `.pod5` file (`Pod5Info`).
-/// - `pod5`: All `.pod5` file entries associated with this flowcell.
-#[derive(Debug, Clone)]
-pub struct FlowCell {
-    /// Original flowcell name (e.g., "FCX123").
-    pub flowcell_name: String,
-
-    /// Corrected flowcell name, if normalization was applied.
-    pub corrected_name: String,
-
-    /// Sample/barcode-level associations for this flowcell.
-    pub cases: Vec<FlowCellCase>,
-
-    /// The sequencing run this flowcell belongs to.
-    pub run_name: String,
+impl Pod5sFlowCell {
+    /// Load all `.pod5` files from a directory and build a collection.
+    ///
+    /// The directory is scanned using `list_files_with_ext`.  
+    /// Each file is parsed using `Pod5::from_path`.
+    pub fn load_from_dir<P: AsRef<Path>>(dir: P) -> anyhow::Result<Self> {
+        let pod_paths = list_files_with_ext(dir.as_ref(), "pod5")?;
+        if pod_paths.is_empty() {
+            anyhow::bail!("No .pod5 files found in directory");
+        }
 
-    /// Type of pod5 data: raw or demuxed.
-    pub pod5_type: Pod5Type,
+        let mut pod5s = Vec::with_capacity(pod_paths.len());
+        let mut flow_cell_id: Option<String> = None;
 
-    /// Metadata extracted from a `.pod5` file, including acquisition ID.
-    pub pod5_info: Pod5Info,
+        for p in pod_paths {
+            let pod = Pod5::from_path(&p)?;
 
-    /// The list of `.pod5` files linked to this flowcell.
-    pub pod5: Vec<Pod5>,
-}
+            match &flow_cell_id {
+                None => {
+                    // First pod defines the flowcell
+                    flow_cell_id = Some(pod.flow_cell_id.clone());
+                }
+                Some(expected_id) => {
+                    if &pod.flow_cell_id != expected_id {
+                        anyhow::bail!(format!(
+                            "Mixed flow cells in directory: expected '{}', found '{}' (file: {})",
+                            expected_id,
+                            pod.flow_cell_id,
+                            pod.path.display(),
+                        ));
+                    }
+                }
+            }
 
-impl FlowCell {
-    /// Constructs a new `FlowCell` from a non-empty vector of `Pod5` entries
-    /// and a list of corrected flowcell mappings.
-    ///
-    /// Ensures that all entries in the vector share the same `run_name`, `flowcell_name`, and `pod5_type`.
-    ///
-    /// # Arguments
-    /// - `pods`: A non-empty vector of `Pod5` entries (moved, not cloned).
-    /// - `corrected_fc`: Reference to a list of `FCLine` entries for resolving corrected names.
-    ///
-    /// # Errors
-    /// Returns an error if:
-    /// - `pods` is empty
-    /// - `.pod5` path is invalid UTF-8
-    /// - inconsistent metadata across pod5 entries
-    /// - multiple corrected names are found
-    /// - parent directory resolution fails
-    pub fn new(pods: Vec<Pod5>, corrected_fc: &[FCLine]) -> anyhow::Result<Self> {
-        let first = pods.first().context("Empty pod5 list for FlowCell")?;
-
-        let flowcell_name = &first.flowcell_name;
-        let run_name = &first.run_name;
-        let pod5_type = &first.pod5_type;
-
-        // Consistency check
-        let inconsistent = pods.iter().any(|p| {
-            p.flowcell_name != *flowcell_name
-                || p.run_name != *run_name
-                || p.pod5_type != *pod5_type
-        });
-        if inconsistent {
-            return Err(anyhow!(
-                "Inconsistent pod5 metadata: all entries must share the same run_name, flowcell_name, and pod5_type"
-            ));
+            pod5s.push(pod);
         }
 
-        // Extract and validate .pod5 path
-        let path_str = first.path.to_str().context("Invalid UTF-8 in pod5 path")?;
-        let pod5_info = Pod5Info::from_pod5(path_str);
-
-        // Select corrected entries for this flowcell
-        let matched_fc_lines: Vec<_> = corrected_fc
-            .iter()
-            .filter(|e| e.flow_cell == *flowcell_name)
-            .cloned()
-            .collect();
-
-        // Resolve unique corrected name
-        let corrected_name = {
-            let mut names: Vec<_> = matched_fc_lines
-                .iter()
-                .map(|e| e.ref_flow_cell.clone())
-                .filter(|s| !s.is_empty())
-                .collect();
-            names.dedup();
-
-            match names.len() {
-                0 => String::new(),
-                1 => names[0].clone(),
-                _ => {
-                    return Err(anyhow!(
-                        "Multiple corrected names for flow cell '{}': {:?}",
-                        flowcell_name,
-                        names
-                    ));
-                }
-            }
-        };
-
-        // Cache parent directories
-        let raw_parent = first.path.parent().context("Missing parent for RAW pod5")?;
-        let demuxed_grandparent = raw_parent
-            .parent()
-            .context("Invalid directory structure for DEMUXED pod5")?;
-
-        // Build case list
-        let cases = matched_fc_lines
-            .iter()
-            .map(|e| {
-                let pod_dir = match pod5_type {
-                    Pod5Type::Raw => raw_parent.to_path_buf(),
-                    Pod5Type::Demuxed => {
-                        let mut bc_dir = demuxed_grandparent.to_path_buf();
-                        bc_dir.push(format!("barcode{}", e.barcode_number.replace("NB", "")));
-                        bc_dir
-                    }
-                };
-                Ok(FlowCellCase {
-                    id: e.id.clone(),
-                    time_point: e.sample_type.clone(),
-                    barcode: e.barcode_number.clone(),
-                    pod_dir,
-                })
-            })
-            .collect::<Result<_>>()?;
+        let flow_cell_id = flow_cell_id.ok_or(anyhow::anyhow!("No pod5 files loaded"))?;
 
         Ok(Self {
-            flowcell_name: flowcell_name.clone(),
-            corrected_name,
-            cases,
-            run_name: run_name.clone(),
-            pod5_type: pod5_type.clone(),
-            pod5_info,
-            pod5: pods, // Already moved
+            flow_cell_id,
+            pod5s,
         })
     }
-}
-
-/// Represents the type of `.pod5` file: either raw or demultiplexed.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Pod5Type {
-    /// Raw `.pod5` files directly from acquisition.
-    Raw,
-    /// Demultiplexed `.pod5` files, post-processed by barcoding.
-    Demuxed,
-}
-
-impl Display for Pod5Type {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let s = match self {
-            Pod5Type::Raw => "raw",
-            Pod5Type::Demuxed => "demuxed",
-        };
-        f.write_str(s)
-    }
-}
-
-/// Configuration for interpreting file paths when parsing `.pod5` files.
-#[derive(Debug, Clone)]
-pub struct Pod5Config {
-    /// Base directory (prefix to strip from full paths).
-    pub base_dir: String,
-    /// Substring used to detect "raw" pod5 files.
-    pub type_raw: String,
-    /// Substring used to detect "demuxed" pod5 files.
-    pub type_demuxed: String,
-    /// Index (in path components) where `run_name` is expected.
-    pub run_dir_n: u8,
-    /// Index (in path components) where `flowcell_name` is expected.
-    pub flowcell_dir_n: u8,
-}
 
-impl Default for Pod5Config {
-    fn default() -> Self {
-        Self {
-            base_dir: "/data/run_data".to_string(),
-            type_raw: "/pod5/".to_string(),
-            type_demuxed: "/pod5_pass/".to_string(),
-            run_dir_n: 0,
-            flowcell_dir_n: 1,
-        }
+    /// Save the collection as JSON to the given path.
+    ///
+    /// The output is a single JSON array containing all `Pod5` entries.
+    /// Existing files are overwritten.
+    pub fn save_to_json<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn std::error::Error>> {
+        let mut f = File::create(path)?;
+        let data = serde_json::to_vec_pretty(&self.pod5s)?;
+        f.write_all(&data)?;
+        Ok(())
     }
-}
 
-/// Represents a `.pod5` file and its associated metadata and location info.
-///
-/// Used as the base object for flowcell and run aggregation.
-#[derive(Debug, Clone)]
-pub struct Pod5 {
-    /// Full path to the `.pod5` file.
-    pub path: PathBuf,
+    /// Load a collection from a JSON file.
+    ///
+    /// The file must contain a JSON array of `Pod5` objects.
+    pub fn load_from_json<P: AsRef<Path>>(path: P) -> Result<Self, Box<dyn std::error::Error>> {
+        let f = File::open(path)?;
+        let reader = BufReader::new(f);
 
-    /// Whether the file is raw or demultiplexed.
-    pub pod5_type: Pod5Type,
+        // Expect JSON array of Pod5
+        let loaded: Vec<Pod5> = serde_json::from_reader(reader)?;
 
-    /// Name of the sequencing run this file belongs to.
-    pub run_name: String,
+        if loaded.is_empty() {
+            return Err("JSON contains no Pod5 entries".into());
+        }
 
-    /// Name of the flowcell associated with this file.
-    pub flowcell_name: String,
+        let mut pod5s = Vec::with_capacity(loaded.len());
+        let mut flow_cell_id: Option<String> = None;
 
-    /// Filesystem metadata (e.g., size, modified time).
-    pub file_metadata: Metadata,
-}
+        for pod in loaded {
+            match &flow_cell_id {
+                None => {
+                    // First pod defines the flowcell
+                    flow_cell_id = Some(pod.flow_cell_id.clone());
+                }
+                Some(expected_id) => {
+                    if &pod.flow_cell_id != expected_id {
+                        return Err(format!(
+                            "Mixed flow cells in JSON: expected '{}', found '{}' (file: {})",
+                            expected_id,
+                            pod.flow_cell_id,
+                            pod.path.display(),
+                        )
+                        .into());
+                    }
+                }
+            }
 
-impl Pod5 {
-    /// Constructs a `Pod5` instance from a file path, using a `Pod5Config` to infer type and extract metadata.
-    ///
-    /// # Arguments
-    /// - `path`: Path to the `.pod5` file.
-    /// - `config`: Configuration used to interpret the path structure.
-    ///
-    /// # Returns
-    /// - `Ok(Pod5)` if type and components can be extracted.
-    /// - `Err` if path is malformed, missing components, or type is unrecognized.
-    pub fn from_path(path: impl AsRef<Path>, config: &Pod5Config) -> Result<Self> {
-        let path = path.as_ref();
-        let path_str = path
-            .to_str()
-            .context(format!("Can't convert path to UTF-8 string: {:?}", path))?;
-
-        // Determine Pod5 type by pattern matching
-        let pod5_type = if path_str.contains(&config.type_raw) {
-            Pod5Type::Raw
-        } else if path_str.contains(&config.type_demuxed) {
-            Pod5Type::Demuxed
-        } else {
-            return Err(anyhow!(
-                "Unable to determine pod5 type from path: {}",
-                path_str
-            ));
-        };
-
-        // Extract metadata from filesystem
-        let file_metadata =
-            fs::metadata(path).with_context(|| format!("Failed to get metadata for {:?}", path))?;
-
-        // Strip base_dir and split into components
-        let relative_path = path_str.strip_prefix(&config.base_dir).unwrap_or(path_str); // fallback to full path if base_dir is not a prefix
-
-        let components: Vec<&str> = relative_path.split('/').filter(|c| !c.is_empty()).collect();
-
-        // Extract run_name and flowcell_name from path components
-        let run_name = components
-            .get(config.run_dir_n as usize)
-            .context("Missing run_name in path")?
-            .to_string();
-
-        let flowcell_name = components
-            .get(config.flowcell_dir_n as usize)
-            .context("Missing flowcell_name in path")?
-            .to_string();
+            pod5s.push(pod);
+        }
 
         Ok(Self {
-            path: path.to_path_buf(),
-            pod5_type,
-            run_name,
-            flowcell_name,
-            file_metadata,
+            flow_cell_id: flow_cell_id.expect("flow_cell_id must be set"),
+            pod5s,
         })
     }
-}
+    /// Compute summary statistics for the collection.
+    pub fn stats(&self) -> Pod5FlowCellStats {
+        if self.pod5s.is_empty() {
+            return Pod5FlowCellStats {
+                flow_cell_id: self.flow_cell_id.clone(),
+                count: 0,
+                total_size: 0,
+                min_acq: None,
+                max_acq: None,
+                min_protocol: None,
+                max_protocol: None,
+                avg_sample_rate: None,
+            };
+        }
 
-/// Recursively scans a directory for `.pod5` files and parses them into `Pod5` objects.
-///
-/// This function uses glob-based search to find all `.pod5` files under the given directory
-/// (including subdirectories), then filters out unwanted paths (e.g., `pod5_fail/`, `pod5_skip/`)
-/// and attempts to parse each remaining file using `Pod5::from_path`.
-///
-/// Any file that fails to parse is skipped with a warning.
-///
-/// # Arguments
-/// - `dir`: Path to the root directory to search (absolute or relative).
-///
-/// # Returns
-/// - `Ok(Vec<Pod5>)` on success, with all successfully parsed `.pod5` files.
-/// - `Err(anyhow::Error)` if path parsing fails (e.g., invalid UTF-8).
-///
-/// # Errors
-/// - Fails early if the glob pattern itself is invalid.
-/// - Skips over files that fail to parse, but logs warnings.
-///
-/// # Notes
-/// - Directories containing `/pod5_fail/` or `/pod5_skip/` are excluded.
-/// - The glob pattern used is `{dir}/**/*.pod5`.
-///
-/// # Example
-/// ```
-/// let pod_files = list_pod_files("/data/pods")?;
-/// for pod in pod_files {
-///     println!("{}", pod.path.display());
-/// }
-/// ```
-pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
-    let pattern = format!("{}/**/*.pod5", dir);
-    let mut pod_files = Vec::new();
-
-    let conf = Pod5Config {
-        base_dir: if dir.ends_with('/') {
-            dir.to_string()
-        } else {
-            format!("{dir}/")
-        },
-        ..Pod5Config::default()
-    };
-
-    for entry in glob(&pattern).expect("Failed to read glob pattern") {
-        match entry {
-            Ok(path) => {
-                let p = path.to_str().context("Can't parse path to string {path}")?;
-                if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
-                    continue;
-                }
-                match Pod5::from_path(&path, &conf) {
-                    Ok(pod5) => pod_files.push(pod5),
-                    Err(e) => warn!("{e}"),
-                }
-            }
-            Err(e) => warn!("Error: {:?}", e),
+        let count = self.pod5s.len();
+        let total_size = self.pod5s.iter().map(|p| p.file_size).sum();
+
+        let (min_acq, max_acq) = self.pod5s.iter().map(|p| p.acquisition_start_time).fold(
+            (
+                self.pod5s[0].acquisition_start_time,
+                self.pod5s[0].acquisition_start_time,
+            ),
+            |(minv, maxv), t| (minv.min(t), maxv.max(t)),
+        );
+
+        let (min_protocol, max_protocol) = self.pod5s.iter().map(|p| p.protocol_start_time).fold(
+            (
+                self.pod5s[0].protocol_start_time,
+                self.pod5s[0].protocol_start_time,
+            ),
+            |(minv, maxv), t| (minv.min(t), maxv.max(t)),
+        );
+
+        let avg_sample_rate =
+            Some(self.pod5s.iter().map(|p| p.sample_rate as f64).sum::<f64>() / count as f64);
+
+        Pod5FlowCellStats {
+            flow_cell_id: self.flow_cell_id.clone(),
+            count,
+            total_size,
+            min_acq: Some(min_acq),
+            max_acq: Some(max_acq),
+            min_protocol: Some(min_protocol),
+            max_protocol: Some(max_protocol),
+            avg_sample_rate,
         }
     }
-    Ok(pod_files)
 }
 
-// impl FlowCell {
-//     pub fn cases_pod5_dir(&self) -> Vec<PathBuf> {
-//         match self.pod5_type {
-//             Pod5Type::Raw => {
-//                 let p = self.pod5.first().unwrap();
-//                 vec![p.path.parent().unwrap().to_path_buf()]
-//             },
-//             Pod5Type::Demuxed => {
-//                 self.cases.iter().map(|c| {
-//                     let str_barcode = format!("barcode{}", c.barcode);
-//                 })
-//             },
-//         }
-//     }
-// }
-#[derive(Debug, Clone, Default)]
-pub struct FlowCellCase {
-    pub id: String,
-    pub time_point: String,
-    pub barcode: String,
-    pub pod_dir: PathBuf,
-    // pub basecalled: Option<bool>,
-}
-
-// #[derive(Debug, Serialize, Deserialize, Clone)]
-// pub struct IdsInput {
-//     pub data: Vec<IdInput>,
-// }
-//
-// #[derive(Debug, Serialize, Deserialize, Clone)]
-// pub struct IdInput {
-//     pub id: String,
-//     pub time_point: String,
-//     pub barcode: String,
-//     pub flow_cell: String,
-//     pub run: String,
-// }
-//
-// // Implement PartialEq and Eq for IdInput
-// impl PartialEq for IdInput {
-//     fn eq(&self, other: &Self) -> bool {
-//         self.id == other.id
-//             && self.time_point == other.time_point
-//             && self.barcode == other.barcode
-//             && self.flow_cell == other.flow_cell
-//             && self.run == other.run
-//     }
-// }
-//
-// impl Eq for IdInput {}
-//
-// // Implement Hash for IdInput
-// impl Hash for IdInput {
-//     fn hash<H: Hasher>(&self, state: &mut H) {
-//         self.id.hash(state);
-//         self.time_point.hash(state);
-//         self.barcode.hash(state);
-//         self.flow_cell.hash(state);
-//         self.run.hash(state);
-//     }
-// }
-//
-// impl IdsInput {
-//     pub fn load_json(path: &str) -> anyhow::Result<Self> {
-//         let f = File::open(path)?;
-//         let s: Self = serde_json::from_reader(f)?;
-//         Ok(s)
-//     }
-//
-//     pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
-//         let f = File::create(path)?;
-//         serde_json::to_writer(f, self)?;
-//         Ok(())
-//     }
-//
-//     pub fn dedup(&mut self) {
-//         let mut unique = HashSet::new();
-//         self.data.retain(|item| unique.insert(item.clone()));
-//     }
-//
-//     pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
-//         let inputs = load_flowcells_corrected_names(path)?;
-//         let data = inputs
-//             .iter()
-//             .map(|line| IdInput {
-//                 id: line.id.to_string(),
-//                 time_point: line.sample_type.to_string(),
-//                 barcode: line.barcode_number.to_string(),
-//                 flow_cell: line.flow_cell.to_string(),
-//                 run: line.run.to_string(),
-//             })
-//             .collect();
-//
-//         let mut res = Self { data };
-//         res.dedup();
-//         Ok(res)
-//     }
-//
-//     pub fn add_input(&mut self, values: IdInput) {
-//         self.data.push(values);
-//         self.dedup();
-//     }
-// }
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct Pod5Run {
-    pub protocol_run_id: String,
-    pub position_id: String,
+#[derive(Debug, Clone)]
+pub struct Pod5FlowCellStats {
     pub flow_cell_id: String,
-    pub id: String,
-    pub time_point: String,
-    pub barcode_number: String,
-    pub flow_cell: String,
-    pub run: String,
-    pub last_pod_dir: (DateTime<Utc>, String),
-    pub archives: Vec<(String, DateTime<Utc>, String)>,
+    pub count: usize,
+    pub total_size: u64,
+    pub min_acq: Option<DateTime<Utc>>,
+    pub max_acq: Option<DateTime<Utc>>,
+    pub min_protocol: Option<DateTime<Utc>>,
+    pub max_protocol: Option<DateTime<Utc>>,
+    pub avg_sample_rate: Option<f64>,
 }
 
-/// Loads corrected flowcell metadata from a tab-delimited file.
-///
-/// This function parses a TSV file where each row is deserialized into an `FCLine`.
-/// It also normalizes some fields (e.g., lowercases `sample_type`, uppercases `id`)
-/// for consistency in downstream processing.
-///
-/// # Arguments
-/// - `file_path`: Path to the TSV file containing flowcell correction data.
-///
-/// # Returns
-/// A vector of `FCLine` records, one per line in the file.
-///
-/// # Errors
-/// Returns an error if the file cannot be opened or if any line fails to deserialize.
-///
-/// # Expected Format (TSV with header)
-/// ```text
-/// id    sample_type    barcode_number    flow_cell    run_path    ref_flow_cell
-/// P001X03    tumoral    NB01    FC123    RUN123    /path/to/data    FC123_CORR
-/// ```
-///
-/// # Example
-/// ```
-/// let fc_lines = load_flowcells_corrected_names("flowcells.tsv")?;
-/// assert!(!fc_lines.is_empty());
-/// ```
-pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
-    let file = File::open(file_path)?;
-
-    let mut rdr = ReaderBuilder::new()
-        .delimiter(b'\t')
-        .has_headers(true)
-        .from_reader(file);
-
-    let mut records = Vec::new();
-    for result in rdr.deserialize() {
-        let mut record: FCLine = result?;
-
-        // formating
-        record.sample_type = record.sample_type.to_lowercase();
-        record.id = record.id.to_uppercase();
-
-        records.push(record);
-    }
+impl fmt::Display for Pod5FlowCellStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "Pod5 Flow Cell Stats")?;
+        writeln!(f, "---------------------")?;
+        writeln!(f, "Flow Cell ID: {}", self.flow_cell_id)?;
+        writeln!(f, "Count: {}", self.count)?;
+        writeln!(f, "Total Size: {} ({} bytes)", human_size(self.total_size), self.total_size)?;
 
-    Ok(records)
-}
+        if let Some(t) = self.min_acq {
+            writeln!(f, "Acquisition Start (min): {}", t)?;
+        }
+        if let Some(t) = self.max_acq {
+            writeln!(f, "Acquisition Start (max): {}", t)?;
+        }
 
+        if let Some(t) = self.min_protocol {
+            writeln!(f, "Protocol Start (min): {}", t)?;
+        }
+        if let Some(t) = self.max_protocol {
+            writeln!(f, "Protocol Start (max): {}", t)?;
+        }
 
-/// Represents a single record describing a barcode-flowcell pairing,
-/// including original and corrected metadata.
-///
-/// This struct is typically deserialized from a TSV file and used to map
-/// `.pod5` files to metadata like corrected flowcell names and experimental time points.
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct FCLine {
-    /// Unique identifier for the sample or barcode group (e.g., "P001X03").
-    pub id: String,
+        if let Some(avg) = self.avg_sample_rate {
+            writeln!(f, "Average Sample Rate: {:.2}", avg)?;
+        }
 
-    /// Sample type associated with this record (e.g., "normal", "tumoral").
-    pub sample_type: String,
+        Ok(())
+    }
+}
 
-    /// The barcode number (e.g., "NB01", "NB02").
-    pub barcode_number: String,
+#[cfg(test)]
+mod tests {
+    use crate::helpers::test_init;
 
-    /// Original flowcell name as found in the raw `.pod5` metadata.
-    pub flow_cell: String,
+    use super::*;
 
-    /// Sequencing run name this flowcell belongs to (e.g., "20240101_FAB123").
-    pub run: String,
+    #[test]
+    fn load_pod5s() -> anyhow::Result<()> {
+        test_init();
 
-    /// Original path to data (can be absolute or relative).
-    pub path: String,
+        let dir = "/mnt/beegfs02/scratch/t_steimle/prom_runs/A/20251117_0915_P2I-00461-A_PBI55810_22582b29/pod5_recovered";
 
-    /// Corrected flowcell name used to resolve naming inconsistencies.
-    pub ref_flow_cell: String,
-}
+        let flow_cell = Pod5sFlowCell::load_from_dir(dir)?;
+        let stats = flow_cell.stats();
+
+        println!("{stats}");
 
+        Ok(())
+    }
+}

+ 682 - 0
src/collection/pod5_old.rs

@@ -0,0 +1,682 @@
+use anyhow::{anyhow, Context, Result};
+use chrono::{DateTime, Utc};
+use csv::ReaderBuilder;
+use glob::glob;
+use hashbrown::HashMap;
+use log::{info, warn};
+use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
+use std::{
+    fmt::Display,
+    fs::{self, File, Metadata},
+    os::unix::fs::MetadataExt,
+    path::{Path, PathBuf},
+};
+use crate::io::pod5_infos::Pod5Info;
+
+/// Represents a collection of Pod5 sequencing runs and associated metadata.
+///
+/// A `Pod5Collection` groups multiple sequencing runs (`Run`), each consisting of
+/// one or more flow cells. It is initialized by scanning a directory of `.pod5` files,
+/// optionally mapping flow cell names to corrected identifiers, and assigning BAM and
+/// `.pod5` directories.
+///
+/// # Fields
+/// - `importation_date`: Timestamp of when this collection was created.
+/// - `runs`: List of runs with associated flow cells and metadata.
+/// - `bam_dir`: Directory containing BAM files.
+/// - `pod5_dir`: Directory containing `.pod5` files.
+#[derive(Debug, Default)]
+pub struct Pod5Collection {
+    pub importation_date: DateTime<Utc>,
+    pub runs: Vec<Run>,
+    pub bam_dir: String,
+    pub pod5_dir: String,
+}
+
+impl Pod5Collection {
+    /// Constructs a new `Pod5Collection` by scanning the given `.pod5` directory,
+    /// applying corrected flowcell naming, and grouping data by run.
+    ///
+    /// # Arguments
+    /// - `pod5_dir`: Path to directory containing `.pod5` files.
+    /// - `corrected_fc_path`: Path to file with corrected flowcell mappings.
+    /// - `bam_dir`: Path to directory containing BAM files.
+    ///
+    /// # Returns
+    /// - `Ok(Pod5Collection)` if the data is consistent and valid.
+    /// - `Err(anyhow::Error)` if listing, parsing, or validation fails.
+    pub fn new(pod5_dir: &str, corrected_fc_path: &str, bam_dir: &str) -> Result<Self> {
+        // Load pod5 files
+        let pod5_files = list_pod_files(pod5_dir)?;
+        info!("n pod5 {}", pod5_files.len());
+
+        // Group pod5 files by run-flowcell key
+        let mut grouped: HashMap<String, Vec<Pod5>> = HashMap::new();
+        for pod in pod5_files {
+            let key = format!("{}••{}", pod.run_name, pod.flowcell_name);
+            grouped.entry(key).or_default().push(pod);
+        }
+
+        // Load corrected flowcell mapping
+        let corrected_fc = load_flowcells_corrected_names(corrected_fc_path)?;
+
+        // Construct FlowCells in parallel from Pod5 groups
+        let flowcells: Vec<FlowCell> = grouped
+            .into_values()
+            .par_bridge()
+            .map(|group| FlowCell::new(group, &corrected_fc))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Group FlowCells by run_name (sequential step)
+        let mut runs_map: HashMap<String, Vec<FlowCell>> = HashMap::new();
+        for fc in flowcells {
+            runs_map.entry(fc.run_name.clone()).or_default().push(fc);
+        }
+
+        // Convert each run group into a Run
+        let runs: Vec<Run> = runs_map
+            .into_values()
+            .map(|fcs| Run {
+                run_name: fcs[0].run_name.clone(),
+                flowcells: fcs,
+            })
+            .collect();
+
+        Ok(Self {
+            importation_date: Utc::now(),
+            runs,
+            bam_dir: bam_dir.to_string(),
+            pod5_dir: pod5_dir.to_string(),
+        })
+    }
+
+    pub fn print_info(&self) {
+        self.runs.iter().for_each(|run| {
+            run.flowcells.iter().for_each(|fc| {
+                let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
+                let n_files = fc.pod5.len();
+                let dates: Vec<DateTime<Utc>> = fc
+                    .pod5
+                    .iter()
+                    .map(|p| p.file_metadata.modified().unwrap().into())
+                    .collect();
+                let from = dates.iter().min().unwrap();
+                let to = dates.iter().max().unwrap();
+                let s = [
+                    run.run_name.clone(),
+                    from.to_string(),
+                    to.to_string(),
+                    n_files.to_string(),
+                    total_size.to_string(),
+                    fc.flowcell_name.to_string(),
+                    fc.pod5_type.to_string(),
+                    fc.pod5_info.acquisition_id.clone(),
+                    format!("{:?}", fc.cases),
+                ]
+                .join("\t");
+                println!("{s}");
+            });
+        });
+    }
+
+    /// Returns a sorted and deduplicated list of all unique `FlowCellCase` IDs in the collection.
+    pub fn ids(&self) -> Vec<String> {
+        let mut ids: Vec<String> = self
+            .runs
+            .iter()
+            .flat_map(|r| r.flowcells.iter())
+            .flat_map(|f| f.cases.iter().map(|c| c.id.clone()))
+            .collect();
+
+        ids.sort_unstable(); // faster than sort()
+        ids.dedup();
+        ids
+    }
+}
+
+/// Represents a sequencing run, which may contain multiple flowcells.
+///
+/// A `Run` groups flowcells that were processed together during a sequencing event
+/// (e.g., a MinION or PromethION run). It serves as a logical grouping for downstream analysis.
+///
+/// # Fields
+/// - `run_name`: Unique identifier for the sequencing run (e.g., "20240301_RUN42").
+/// - `flowcells`: List of `FlowCell` objects associated with this run.
+#[derive(Debug)]
+pub struct Run {
+    /// Name of the sequencing run.
+    pub run_name: String,
+
+    /// Flowcells that belong to this run.
+    pub flowcells: Vec<FlowCell>,
+}
+
+/// Represents a flowcell and its associated metadata, cases, and `.pod5` files.
+///
+/// A `FlowCell` encapsulates all relevant information needed to track,
+/// identify, and process a physical flowcell, including its corrected name,
+/// acquisition metadata, and associated `.pod5` files.
+///
+/// # Fields
+/// - `flowcell_name`: Original name of the flowcell as found in `.pod5` files.
+/// - `corrected_name`: Normalized or corrected version of the flowcell name (if available).
+/// - `cases`: Associated cases (`FlowCellCase`) for this flowcell, usually representing samples or barcodes.
+/// - `run_name`: Name of the sequencing run this flowcell belongs to.
+/// - `pod5_type`: Whether the `.pod5` files are raw or demultiplexed (`Pod5Type`).
+/// - `pod5_info`: Metadata extracted from one representative `.pod5` file (`Pod5Info`).
+/// - `pod5`: All `.pod5` file entries associated with this flowcell.
+#[derive(Debug, Clone)]
+pub struct FlowCell {
+    /// Original flowcell name (e.g., "FCX123").
+    pub flowcell_name: String,
+
+    /// Corrected flowcell name, if normalization was applied.
+    pub corrected_name: String,
+
+    /// Sample/barcode-level associations for this flowcell.
+    pub cases: Vec<FlowCellCase>,
+
+    /// The sequencing run this flowcell belongs to.
+    pub run_name: String,
+
+    /// Type of pod5 data: raw or demuxed.
+    pub pod5_type: Pod5Type,
+
+    /// Metadata extracted from a `.pod5` file, including acquisition ID.
+    pub pod5_info: Pod5Info,
+
+    /// The list of `.pod5` files linked to this flowcell.
+    pub pod5: Vec<Pod5>,
+}
+
+impl FlowCell {
+    /// Constructs a new `FlowCell` from a non-empty vector of `Pod5` entries
+    /// and a list of corrected flowcell mappings.
+    ///
+    /// Ensures that all entries in the vector share the same `run_name`, `flowcell_name`, and `pod5_type`.
+    ///
+    /// # Arguments
+    /// - `pods`: A non-empty vector of `Pod5` entries (moved, not cloned).
+    /// - `corrected_fc`: Reference to a list of `FCLine` entries for resolving corrected names.
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - `pods` is empty
+    /// - `.pod5` path is invalid UTF-8
+    /// - inconsistent metadata across pod5 entries
+    /// - multiple corrected names are found
+    /// - parent directory resolution fails
+    pub fn new(pods: Vec<Pod5>, corrected_fc: &[FCLine]) -> anyhow::Result<Self> {
+        let first = pods.first().context("Empty pod5 list for FlowCell")?;
+
+        let flowcell_name = &first.flowcell_name;
+        let run_name = &first.run_name;
+        let pod5_type = &first.pod5_type;
+
+        // Consistency check
+        let inconsistent = pods.iter().any(|p| {
+            p.flowcell_name != *flowcell_name
+                || p.run_name != *run_name
+                || p.pod5_type != *pod5_type
+        });
+        if inconsistent {
+            return Err(anyhow!(
+                "Inconsistent pod5 metadata: all entries must share the same run_name, flowcell_name, and pod5_type"
+            ));
+        }
+
+        // Extract and validate .pod5 path
+        let path_str = first.path.to_str().context("Invalid UTF-8 in pod5 path")?;
+        let pod5_info = Pod5Info::from_pod5(path_str);
+
+        // Select corrected entries for this flowcell
+        let matched_fc_lines: Vec<_> = corrected_fc
+            .iter()
+            .filter(|e| e.flow_cell == *flowcell_name)
+            .cloned()
+            .collect();
+
+        // Resolve unique corrected name
+        let corrected_name = {
+            let mut names: Vec<_> = matched_fc_lines
+                .iter()
+                .map(|e| e.ref_flow_cell.clone())
+                .filter(|s| !s.is_empty())
+                .collect();
+            names.dedup();
+
+            match names.len() {
+                0 => String::new(),
+                1 => names[0].clone(),
+                _ => {
+                    return Err(anyhow!(
+                        "Multiple corrected names for flow cell '{}': {:?}",
+                        flowcell_name,
+                        names
+                    ));
+                }
+            }
+        };
+
+        // Cache parent directories
+        let raw_parent = first.path.parent().context("Missing parent for RAW pod5")?;
+        let demuxed_grandparent = raw_parent
+            .parent()
+            .context("Invalid directory structure for DEMUXED pod5")?;
+
+        // Build case list
+        let cases = matched_fc_lines
+            .iter()
+            .map(|e| {
+                let pod_dir = match pod5_type {
+                    Pod5Type::Raw => raw_parent.to_path_buf(),
+                    Pod5Type::Demuxed => {
+                        let mut bc_dir = demuxed_grandparent.to_path_buf();
+                        bc_dir.push(format!("barcode{}", e.barcode_number.replace("NB", "")));
+                        bc_dir
+                    }
+                };
+                Ok(FlowCellCase {
+                    id: e.id.clone(),
+                    time_point: e.sample_type.clone(),
+                    barcode: e.barcode_number.clone(),
+                    pod_dir,
+                })
+            })
+            .collect::<Result<_>>()?;
+
+        Ok(Self {
+            flowcell_name: flowcell_name.clone(),
+            corrected_name,
+            cases,
+            run_name: run_name.clone(),
+            pod5_type: pod5_type.clone(),
+            pod5_info,
+            pod5: pods, // Already moved
+        })
+    }
+}
+
+/// Represents the type of `.pod5` file: either raw or demultiplexed.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Pod5Type {
+    /// Raw `.pod5` files directly from acquisition.
+    Raw,
+    /// Demultiplexed `.pod5` files, post-processed by barcoding.
+    Demuxed,
+}
+
+impl Display for Pod5Type {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            Pod5Type::Raw => "raw",
+            Pod5Type::Demuxed => "demuxed",
+        };
+        f.write_str(s)
+    }
+}
+
+/// Configuration for interpreting file paths when parsing `.pod5` files.
+#[derive(Debug, Clone)]
+pub struct Pod5Config {
+    /// Base directory (prefix to strip from full paths).
+    pub base_dir: String,
+    /// Substring used to detect "raw" pod5 files.
+    pub type_raw: String,
+    /// Substring used to detect "demuxed" pod5 files.
+    pub type_demuxed: String,
+    /// Index (in path components) where `run_name` is expected.
+    pub run_dir_n: u8,
+    /// Index (in path components) where `flowcell_name` is expected.
+    pub flowcell_dir_n: u8,
+}
+
+impl Default for Pod5Config {
+    fn default() -> Self {
+        Self {
+            base_dir: "/data/run_data".to_string(),
+            type_raw: "/pod5/".to_string(),
+            type_demuxed: "/pod5_pass/".to_string(),
+            run_dir_n: 0,
+            flowcell_dir_n: 1,
+        }
+    }
+}
+
+/// Represents a `.pod5` file and its associated metadata and location info.
+///
+/// Used as the base object for flowcell and run aggregation.
+#[derive(Debug, Clone)]
+pub struct Pod5 {
+    /// Full path to the `.pod5` file.
+    pub path: PathBuf,
+
+    /// Whether the file is raw or demultiplexed.
+    pub pod5_type: Pod5Type,
+
+    /// Name of the sequencing run this file belongs to.
+    pub run_name: String,
+
+    /// Name of the flowcell associated with this file.
+    pub flowcell_name: String,
+
+    /// Filesystem metadata (e.g., size, modified time).
+    pub file_metadata: Metadata,
+}
+
+impl Pod5 {
+    /// Constructs a `Pod5` instance from a file path, using a `Pod5Config` to infer type and extract metadata.
+    ///
+    /// # Arguments
+    /// - `path`: Path to the `.pod5` file.
+    /// - `config`: Configuration used to interpret the path structure.
+    ///
+    /// # Returns
+    /// - `Ok(Pod5)` if type and components can be extracted.
+    /// - `Err` if path is malformed, missing components, or type is unrecognized.
+    pub fn from_path(path: impl AsRef<Path>, config: &Pod5Config) -> Result<Self> {
+        let path = path.as_ref();
+        let path_str = path
+            .to_str()
+            .context(format!("Can't convert path to UTF-8 string: {:?}", path))?;
+
+        // Determine Pod5 type by pattern matching
+        let pod5_type = if path_str.contains(&config.type_raw) {
+            Pod5Type::Raw
+        } else if path_str.contains(&config.type_demuxed) {
+            Pod5Type::Demuxed
+        } else {
+            return Err(anyhow!(
+                "Unable to determine pod5 type from path: {}",
+                path_str
+            ));
+        };
+
+        // Extract metadata from filesystem
+        let file_metadata =
+            fs::metadata(path).with_context(|| format!("Failed to get metadata for {:?}", path))?;
+
+        // Strip base_dir and split into components
+        let relative_path = path_str.strip_prefix(&config.base_dir).unwrap_or(path_str); // fallback to full path if base_dir is not a prefix
+
+        let components: Vec<&str> = relative_path.split('/').filter(|c| !c.is_empty()).collect();
+
+        // Extract run_name and flowcell_name from path components
+        let run_name = components
+            .get(config.run_dir_n as usize)
+            .context("Missing run_name in path")?
+            .to_string();
+
+        let flowcell_name = components
+            .get(config.flowcell_dir_n as usize)
+            .context("Missing flowcell_name in path")?
+            .to_string();
+
+        Ok(Self {
+            path: path.to_path_buf(),
+            pod5_type,
+            run_name,
+            flowcell_name,
+            file_metadata,
+        })
+    }
+}
+
+/// Recursively scans a directory for `.pod5` files and parses them into `Pod5` objects.
+///
+/// This function uses glob-based search to find all `.pod5` files under the given directory
+/// (including subdirectories), then filters out unwanted paths (e.g., `pod5_fail/`, `pod5_skip/`)
+/// and attempts to parse each remaining file using `Pod5::from_path`.
+///
+/// Any file that fails to parse is skipped with a warning.
+///
+/// # Arguments
+/// - `dir`: Path to the root directory to search (absolute or relative).
+///
+/// # Returns
+/// - `Ok(Vec<Pod5>)` on success, with all successfully parsed `.pod5` files.
+/// - `Err(anyhow::Error)` if path parsing fails (e.g., invalid UTF-8).
+///
+/// # Errors
+/// - Fails early if the glob pattern itself is invalid.
+/// - Skips over files that fail to parse, but logs warnings.
+///
+/// # Notes
+/// - Directories containing `/pod5_fail/` or `/pod5_skip/` are excluded.
+/// - The glob pattern used is `{dir}/**/*.pod5`.
+///
+/// # Example
+/// ```
+/// let pod_files = list_pod_files("/data/pods")?;
+/// for pod in pod_files {
+///     println!("{}", pod.path.display());
+/// }
+/// ```
+pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
+    let pattern = format!("{}/**/*.pod5", dir);
+    let mut pod_files = Vec::new();
+
+    let conf = Pod5Config {
+        base_dir: if dir.ends_with('/') {
+            dir.to_string()
+        } else {
+            format!("{dir}/")
+        },
+        ..Pod5Config::default()
+    };
+
+    for entry in glob(&pattern).expect("Failed to read glob pattern") {
+        match entry {
+            Ok(path) => {
+                let p = path.to_str().context("Can't parse path to string {path}")?;
+                if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
+                    continue;
+                }
+                match Pod5::from_path(&path, &conf) {
+                    Ok(pod5) => pod_files.push(pod5),
+                    Err(e) => warn!("{e}"),
+                }
+            }
+            Err(e) => warn!("Error: {:?}", e),
+        }
+    }
+    Ok(pod_files)
+}
+
+// impl FlowCell {
+//     pub fn cases_pod5_dir(&self) -> Vec<PathBuf> {
+//         match self.pod5_type {
+//             Pod5Type::Raw => {
+//                 let p = self.pod5.first().unwrap();
+//                 vec![p.path.parent().unwrap().to_path_buf()]
+//             },
+//             Pod5Type::Demuxed => {
+//                 self.cases.iter().map(|c| {
+//                     let str_barcode = format!("barcode{}", c.barcode);
+//                 })
+//             },
+//         }
+//     }
+// }
+#[derive(Debug, Clone, Default)]
+pub struct FlowCellCase {
+    pub id: String,
+    pub time_point: String,
+    pub barcode: String,
+    pub pod_dir: PathBuf,
+    // pub basecalled: Option<bool>,
+}
+
+// #[derive(Debug, Serialize, Deserialize, Clone)]
+// pub struct IdsInput {
+//     pub data: Vec<IdInput>,
+// }
+//
+// #[derive(Debug, Serialize, Deserialize, Clone)]
+// pub struct IdInput {
+//     pub id: String,
+//     pub time_point: String,
+//     pub barcode: String,
+//     pub flow_cell: String,
+//     pub run: String,
+// }
+//
+// // Implement PartialEq and Eq for IdInput
+// impl PartialEq for IdInput {
+//     fn eq(&self, other: &Self) -> bool {
+//         self.id == other.id
+//             && self.time_point == other.time_point
+//             && self.barcode == other.barcode
+//             && self.flow_cell == other.flow_cell
+//             && self.run == other.run
+//     }
+// }
+//
+// impl Eq for IdInput {}
+//
+// // Implement Hash for IdInput
+// impl Hash for IdInput {
+//     fn hash<H: Hasher>(&self, state: &mut H) {
+//         self.id.hash(state);
+//         self.time_point.hash(state);
+//         self.barcode.hash(state);
+//         self.flow_cell.hash(state);
+//         self.run.hash(state);
+//     }
+// }
+//
+// impl IdsInput {
+//     pub fn load_json(path: &str) -> anyhow::Result<Self> {
+//         let f = File::open(path)?;
+//         let s: Self = serde_json::from_reader(f)?;
+//         Ok(s)
+//     }
+//
+//     pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
+//         let f = File::create(path)?;
+//         serde_json::to_writer(f, self)?;
+//         Ok(())
+//     }
+//
+//     pub fn dedup(&mut self) {
+//         let mut unique = HashSet::new();
+//         self.data.retain(|item| unique.insert(item.clone()));
+//     }
+//
+//     pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
+//         let inputs = load_flowcells_corrected_names(path)?;
+//         let data = inputs
+//             .iter()
+//             .map(|line| IdInput {
+//                 id: line.id.to_string(),
+//                 time_point: line.sample_type.to_string(),
+//                 barcode: line.barcode_number.to_string(),
+//                 flow_cell: line.flow_cell.to_string(),
+//                 run: line.run.to_string(),
+//             })
+//             .collect();
+//
+//         let mut res = Self { data };
+//         res.dedup();
+//         Ok(res)
+//     }
+//
+//     pub fn add_input(&mut self, values: IdInput) {
+//         self.data.push(values);
+//         self.dedup();
+//     }
+// }
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Pod5Run {
+    pub protocol_run_id: String,
+    pub position_id: String,
+    pub flow_cell_id: String,
+    pub id: String,
+    pub time_point: String,
+    pub barcode_number: String,
+    pub flow_cell: String,
+    pub run: String,
+    pub last_pod_dir: (DateTime<Utc>, String),
+    pub archives: Vec<(String, DateTime<Utc>, String)>,
+}
+
+/// Loads corrected flowcell metadata from a tab-delimited file.
+///
+/// This function parses a TSV file where each row is deserialized into an `FCLine`.
+/// It also normalizes some fields (e.g., lowercases `sample_type`, uppercases `id`)
+/// for consistency in downstream processing.
+///
+/// # Arguments
+/// - `file_path`: Path to the TSV file containing flowcell correction data.
+///
+/// # Returns
+/// A vector of `FCLine` records, one per line in the file.
+///
+/// # Errors
+/// Returns an error if the file cannot be opened or if any line fails to deserialize.
+///
+/// # Expected Format (TSV with header)
+/// ```text
+/// id    sample_type    barcode_number    flow_cell    run    path    ref_flow_cell
+/// P001X03    tumoral    NB01    FC123    RUN123    /path/to/data    FC123_CORR
+/// ```
+///
+/// # Example
+/// ```
+/// let fc_lines = load_flowcells_corrected_names("flowcells.tsv")?;
+/// assert!(!fc_lines.is_empty());
+/// ```
+pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
+    let file = File::open(file_path)?;
+
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(true)
+        .from_reader(file);
+
+    let mut records = Vec::new();
+    for result in rdr.deserialize() {
+        let mut record: FCLine = result?;
+
+        // formatting
+        record.sample_type = record.sample_type.to_lowercase();
+        record.id = record.id.to_uppercase();
+
+        records.push(record);
+    }
+
+    Ok(records)
+}
+
+
+/// Represents a single record describing a barcode-flowcell pairing,
+/// including original and corrected metadata.
+///
+/// This struct is typically deserialized from a TSV file and used to map
+/// `.pod5` files to metadata like corrected flowcell names and experimental time points.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FCLine {
+    /// Unique identifier for the sample or barcode group (e.g., "P001X03").
+    pub id: String,
+
+    /// Sample type associated with this record (e.g., "normal", "tumoral").
+    pub sample_type: String,
+
+    /// The barcode number (e.g., "NB01", "NB02").
+    pub barcode_number: String,
+
+    /// Original flowcell name as found in the raw `.pod5` metadata.
+    pub flow_cell: String,
+
+    /// Sequencing run name this flowcell belongs to (e.g., "20240101_FAB123").
+    pub run: String,
+
+    /// Original path to data (can be absolute or relative).
+    pub path: String,
+
+    /// Corrected flowcell name used to resolve naming inconsistencies.
+    pub ref_flow_cell: String,
+}
+

+ 551 - 556
src/commands/dorado.rs

@@ -1,19 +1,13 @@
 use std::{
-    fs::{self, File},
+    fs::{self},
     path::{Path, PathBuf},
-    time::SystemTime,
 };
 
 use anyhow::Context;
-use duct::cmd;
-use log::{debug, info, warn};
-use uuid::Uuid;
 
 use crate::{
-    collection::{bam::bam_composition, flowcells::FlowCell, pod5::FlowCellCase},
     commands::{Command, SlurmParams},
     config::Config,
-    helpers::find_unique_file,
     io::pod5_infos::Pod5Info,
     slurm_helpers::max_gpu_per_node,
 };
@@ -151,7 +145,8 @@ impl Command for DoradoBasecall {
 /// ```
 impl super::SlurmRunner for DoradoBasecall {
     fn slurm_args(&self) -> Vec<String> {
-        let (gpu, n) = if let (Some(h100_av), Some(a100_av)) = (max_gpu_per_node("h100"), max_gpu_per_node("a100"))
+        let (gpu, n) = if let (Some(h100_av), Some(a100_av)) =
+            (max_gpu_per_node("h100"), max_gpu_per_node("a100"))
         {
             let (gpu, n) = if h100_av >= a100_av {
                 ("h100", h100_av)
@@ -328,551 +323,551 @@ pub struct DoradoParams {
     pub samtools_sort_threads: u16,
 }
 
-pub struct Dorado {
-    config: Config,
-    case: FlowCellCase,
-    case_dir: String,
-    time_dir: String,
-    bam: String,
-    start_time: SystemTime,
-    end_time: SystemTime,
-    is_done: bool,
-}
-
-impl Dorado {
-    pub fn init(case: FlowCellCase, config: Config) -> anyhow::Result<Self> {
-        let data_dir = &config.result_dir;
-        let case_dir = format!("{}/{}", data_dir, case.id);
-        let time_dir = format!("{}/{}", case_dir, case.time_point);
-        let bam = format!("{}/{}_{}_hs1.bam", time_dir, case.id, case.time_point);
-        debug!("Dorado init with config: {config:#?}");
-        info!("Final BAM file: {bam}");
-
-        Ok(Self {
-            config,
-            start_time: SystemTime::now(),
-            end_time: SystemTime::now(),
-            is_done: false,
-            case_dir,
-            time_dir,
-            bam,
-            case,
-        })
-    }
-
-    // ------------------------------------------------------------------
-    // Small helper to actually execute a shell command
-    // ------------------------------------------------------------------
-    fn run_shell(cmdline: &str) -> anyhow::Result<()> {
-        info!("Running: {cmdline}");
-        cmd!("bash", "-c", cmdline)
-            .run()
-            .map_err(|e| anyhow::anyhow!("Failed to run: {cmdline}\n\t{}", e.to_string()))?;
-        Ok(())
-    }
-
-    // ------------------------------------------------------------------
-    // Command builders (return strings)
-    // ------------------------------------------------------------------
-
-    /// minimap2 index creation (returns None if index already exists)
-    fn create_reference_mmi_cmd(&self) -> Option<String> {
-        if std::path::Path::new(&self.config.align.ref_mmi).exists() {
-            None
-        } else {
-            Some(format!(
-                "minimap2 -x map-ont -d {} {}",
-                self.config.align.ref_mmi, self.config.align.ref_fa
-            ))
-        }
-    }
-
-    /// Dorado + samtools pipeline for basecalling + alignment
-    fn basecall_align_cmd(&self, dorado_bin: &str) -> anyhow::Result<String> {
-        let pod_dir = &self.case.pod_dir;
-        let ref_fa = &self.config.align.ref_fa;
-        let bam = &self.bam;
-        let samtools = &self.config.align.samtools_bin;
-        let samtools_view_threads = self.config.align.samtools_view_threads;
-        let samtools_sort_threads = self.config.align.samtools_sort_threads;
-        let dorado_arg = self.config.align.dorado_basecall_arg.clone();
-
-        let pod_path = fs::read_dir(pod_dir)
-            .map_err(|e| anyhow::anyhow!("Failed to read pod5 dir: {}.\n\t{e}", pod_dir.display()))?
-            .filter_map(|p| p.ok())
-            .map(|p| p.path())
-            .filter(|p| p.extension().unwrap() == "pod5")
-            .take(1)
-            .collect::<Vec<PathBuf>>()
-            .pop()
-            .unwrap();
-
-        let sequencing_kit = Pod5Info::from_pod5(pod_path.to_str().unwrap())
-            .sequencing_kit
-            .to_uppercase();
-
-        let dorado = format!(
-            "{dorado_bin} basecaller --kit-name {sequencing_kit} {dorado_arg} {} --trim all --emit-moves --reference {ref_fa}",
-            pod_dir.display()
-        );
-        info!("Dorado command: {dorado}");
-
-        let samtools_view = format!("{samtools} view -h -@ {samtools_view_threads} -b /dev/stdin");
-        let samtools_sort =
-            format!("{samtools} sort -@ {samtools_sort_threads} /dev/stdin -o {bam}");
-
-        Ok(format!("{dorado} | {samtools_view} | {samtools_sort}"))
-    }
-
-    /// samtools index command
-    fn index_cmd(&self) -> String {
-        let t = self.config.align.samtools_view_threads.to_string();
-        format!(
-            "{} index -@ {t} {}",
-            self.config.align.samtools_bin, self.bam
-        )
-    }
-
-    /// cramino QC command
-    fn cramino_cmd(&self) -> String {
-        format!("cramino -t 150 --karyotype {}", self.bam)
-    }
-
-    /// modkit summary command
-    fn modkit_cmd(&self) -> String {
-        format!("modkit summary -t 50 {}", self.bam)
-    }
-
-    /// fastq export pipeline from BAM
-    fn create_fastq_cmd(&self) -> String {
-        let bam = &self.bam;
-        let fastq = format!(
-            "{}/{}/{}/{}_{}.fastq.gz",
-            self.case_dir, self.case.id, self.case.time_point, self.case.id, self.case.time_point
-        );
-        let samtools = format!("samtools fastq -@ 150 {bam}");
-        let crabz = format!("crabz -f bgzf - -o {fastq}");
-        format!("{samtools} | {crabz}")
-    }
-
-    /// samtools merge command used in `merge_bam`
-    fn merge_bam_cmd(&self, bam: &Path, into: &Path) -> String {
-        format!(
-            "{} merge -@ 160 -h {} {} {} {}",
-            self.config.align.samtools_bin,
-            bam.display(),
-            into.display(),
-            bam.display(),
-            into.display() // placeholder, real tmp path is managed outside
-        )
-    }
-
-    // mux basecall + samtools view into muxed.bam
-    fn from_mux_basecall_cmd(
-        config: &Config,
-        sequencing_kit: &str,
-        pod_dir: &str,
-        muxed_bam: &str,
-    ) -> String {
-        let dorado_bin = &config.align.dorado_bin;
-        let dorado_arg = &config.align.dorado_basecall_arg;
-        let ref_mmi = &config.align.ref_mmi;
-        let samtools_bin = &config.align.samtools_bin;
-        let samtools_view_threads = config.align.samtools_view_threads;
-
-        let dorado = format!(
-            "{dorado_bin} basecaller --kit-name {sequencing_kit} {dorado_arg} {pod_dir} --emit-moves --trim all --reference {ref_mmi}"
-        );
-        let samtools_view =
-            format!("{samtools_bin} view -h -@ {samtools_view_threads} -b -o {muxed_bam}");
-        format!("{dorado} | {samtools_view}")
-    }
-
-    /// samtools split command for demux
-    fn demux_cmd(config: &Config, muxed_bam: &str, tmp_demux_dir: &str) -> String {
-        format!(
-            "{} split -@ {} -f '{}/%*_%!.bam' {}",
-            config.align.samtools_bin, config.align.samtools_view_threads, tmp_demux_dir, muxed_bam
-        )
-    }
-
-    /// dorado aligner + samtools for realignment in from_mux
-    fn realign_cmd(
-        config: &Config,
-        sequencing_kit: &str,
-        barcode: &str,
-        bam: &str,
-        aligned_bam: &str,
-    ) -> String {
-        let dorado = format!(
-            "{} aligner --threads {} {} {}",
-            config.align.dorado_bin, config.align.dorado_aligner_threads, config.align.ref_fa, bam,
-        );
-        let samtools_view = format!(
-            "{} view -h -@ {} -b /dev/stdin",
-            config.align.samtools_bin, config.align.samtools_view_threads
-        );
-        let samtools_sort = format!(
-            "{} sort -@ {} /dev/stdin -o {}",
-            config.align.samtools_bin, config.align.samtools_sort_threads, aligned_bam
-        );
-        let _ = sequencing_kit; // not used here but kept for symmetry
-        format!("{dorado} | {samtools_view} | {samtools_sort}")
-    }
-
-    // ------------------------------------------------------------------
-    // Workflow methods that now *run* the commands
-    // ------------------------------------------------------------------
-
-    fn create_reference_mmi(&self) -> anyhow::Result<()> {
-        if let Some(cmdline) = self.create_reference_mmi_cmd() {
-            Self::run_shell(&cmdline)?;
-        }
-        Ok(())
-    }
-
-    fn create_directories(&self) -> anyhow::Result<()> {
-        if !std::path::Path::new(&self.case_dir).exists() {
-            info!("Creating directory {}", self.case_dir);
-            fs::create_dir(&self.case_dir)?;
-        }
-        if !std::path::Path::new(&self.time_dir).exists() {
-            info!("Creating directory {}", self.time_dir);
-            fs::create_dir(&self.time_dir)?;
-        }
-        Ok(())
-    }
-
-    fn basecall_align(&mut self, dorado_bin: &str) -> anyhow::Result<()> {
-        let pipe = self.basecall_align_cmd(dorado_bin)?;
-        Self::run_shell(&pipe)
-            .map_err(|e| anyhow::anyhow!("Failed to run pipe: {pipe}.\n\t{}", e.to_string()))
-    }
-
-    pub fn index(&self) -> anyhow::Result<()> {
-        let cmdline = self.index_cmd();
-        info!("Running samtools index for {}", self.bam);
-        Self::run_shell(&cmdline)
-    }
-
-    pub fn run_cramino(&self) -> anyhow::Result<()> {
-        let cramino_out = format!(
-            "{}/{}_{}_hs1_cramino.txt",
-            self.time_dir, self.case.id, self.case.time_point
-        );
-        info!("Quality control with cramino for BAM: {}", self.bam);
-        let cmdline = self.cramino_cmd();
-
-        let output = cmd!("bash", "-c", &cmdline)
-            .stdout_capture()
-            .unchecked()
-            .run()?;
-
-        fs::write(cramino_out, output.stdout)?;
-        Ok(())
-    }
-
-    pub fn run_modkit(&self) -> anyhow::Result<()> {
-        let mod_summary = format!(
-            "{}/{}_{}_5mC_5hmC_summary.txt",
-            self.time_dir, self.case.id, self.case.time_point
-        );
-        info!("Generating base modification summary for BAM: {}", self.bam);
-        let cmdline = self.modkit_cmd();
-
-        let output = cmd!("bash", "-c", &cmdline)
-            .stdout_capture()
-            .unchecked()
-            .run()?;
-
-        fs::write(mod_summary, output.stdout)?;
-        Ok(())
-    }
-
-    pub fn create_fastq(&self) -> anyhow::Result<()> {
-        let fastq = format!(
-            "{}/{}/{}/{}_{}.fastq.gz",
-            self.case_dir, self.case.id, self.case.time_point, self.case.id, self.case.time_point
-        );
-        if !std::path::Path::new(&fastq).exists() {
-            let pipe = self.create_fastq_cmd();
-            Self::run_shell(&pipe)?;
-        }
-        Ok(())
-    }
-
-    pub fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
-        let composition_a: Vec<String> = bam_composition(bam.to_string_lossy().as_ref(), 20000)?
-            .iter()
-            .map(|(i, _, _)| i.clone())
-            .collect();
-        let composition_b: Vec<String> = bam_composition(&self.bam, 20000)?
-            .iter()
-            .map(|(i, _, _)| i.clone())
-            .collect();
-        let n_id = composition_a
-            .iter()
-            .filter(|id| composition_b.contains(id))
-            .count();
-        if n_id > 0 {
-            warn!(
-                "{} is already merged, reads with the same run_id in the destination BAM.",
-                self.case.id
-            );
-            return Ok(());
-        }
-
-        let into = PathBuf::from(&self.bam);
-        let dir = into.parent().unwrap();
-
-        let original_file = into.file_name().unwrap().to_string_lossy().to_string();
-        let original_i = dir.join(format!("{original_file}.bai"));
-        if !original_i.exists() {
-            self.index()?;
-        }
-
-        let tmp_original_file = format!("{}.bam", Uuid::new_v4());
-        let tmp_original = dir.join(tmp_original_file.clone());
-        let tmp_original_i = dir.join(format!("{tmp_original_file}.bai"));
-
-        info!("Moving {} to {}", &into.display(), &tmp_original.display());
-        fs::rename(&into, &tmp_original)?;
-        info!(
-            "Moving {} to {}",
-            &original_i.display(),
-            &tmp_original_i.display()
-        );
-        fs::rename(original_i, tmp_original_i.clone())?;
-
-        // real merge command with the correct tmp path
-        let merge_cmdline = format!(
-            "{} merge -@ 160 -h {} {} {} {}",
-            self.config.align.samtools_bin,
-            bam.display(),
-            into.display(),
-            bam.display(),
-            tmp_original.display()
-        );
-        info!("Running {merge_cmdline}");
-        Self::run_shell(&merge_cmdline)?;
-
-        fs::remove_file(tmp_original)?;
-        fs::remove_file(tmp_original_i)?;
-        fs::remove_file(bam)?;
-
-        self.index()?;
-        Ok(())
-    }
-
-    pub fn from_mux(cases: Vec<FlowCellCase>, config: Config) -> anyhow::Result<()> {
-        // tmp dir
-        let tmp_dir = format!("{}/.{}", config.result_dir, Uuid::new_v4());
-        info!("Creating tmp dir {tmp_dir}");
-        fs::create_dir(&tmp_dir)?;
-
-        // basecalling into muxed.bam
-        let muxed_bam = format!("{tmp_dir}/muxed.bam");
-        let pod_dir = cases[0].pod_dir.display().to_string();
-
-        let muxed_pod_dir = &cases.first().unwrap().pod_dir;
-        let pod_path = fs::read_dir(muxed_pod_dir)
-            .map_err(|e| {
-                anyhow::anyhow!(
-                    "Failed to read pod5 dir: {}.\n\t{e}",
-                    muxed_pod_dir.display()
-                )
-            })?
-            .filter_map(|p| p.ok())
-            .map(|p| p.path())
-            .filter(|p| p.extension().unwrap() == "pod5")
-            .take(1)
-            .collect::<Vec<PathBuf>>()
-            .pop()
-            .unwrap();
-        let sequencing_kit = Pod5Info::from_pod5(pod_path.to_str().unwrap())
-            .sequencing_kit
-            .to_uppercase();
-
-        let basecall_pipe =
-            Self::from_mux_basecall_cmd(&config, &sequencing_kit, &pod_dir, &muxed_bam);
-        info!("Running: {basecall_pipe}");
-        Self::run_shell(&basecall_pipe)?;
-        info!("Basecalling ✅");
-
-        // demux
-        let tmp_demux_dir = format!("{tmp_dir}/demuxed");
-        fs::create_dir(&tmp_demux_dir)?;
-        let demux_cmdline = Self::demux_cmd(&config, &muxed_bam, &tmp_demux_dir);
-        info!("Demux from {sequencing_kit} into {tmp_demux_dir}");
-        info!("Running: {demux_cmdline}");
-        Self::run_shell(&demux_cmdline)?;
-        info!("Demux ✅");
-
-        for case in cases.iter() {
-            let barcode = case.barcode.replace("NB", "");
-            let bam = find_unique_file(
-                &tmp_demux_dir,
-                &format!("{sequencing_kit}_barcode{}.bam", barcode),
-            )?;
-
-            let aligned_bam = if !config.align.dorado_should_realign {
-                bam.clone()
-            } else {
-                let aligned_bam = format!(
-                    "{tmp_demux_dir}/{sequencing_kit}_barcode{}_aligned.bam",
-                    barcode
-                );
-                let pipe =
-                    Self::realign_cmd(&config, &sequencing_kit, &barcode, &bam, &aligned_bam);
-                info!("Running {pipe}");
-                Self::run_shell(&pipe)?;
-                info!("Alignement ✅");
-                aligned_bam.into()
-            };
-
-            let d = Dorado::init(case.clone(), config.clone())?;
-            d.create_directories()?;
-
-            if PathBuf::from(&d.bam).exists() {
-                info!("Merging");
-                d.merge_bam(&PathBuf::from(aligned_bam))?;
-            } else {
-                info!("Moving from {} to {}", bam, d.bam);
-                fs::rename(aligned_bam, d.bam.clone())?;
-                d.index()?;
-            }
-
-            d.run_cramino()?;
-            d.run_modkit()?;
-        }
-        fs::remove_dir_all(tmp_dir)?;
-
-        Ok(())
-    }
-
-    pub fn run_pipe(&mut self) -> anyhow::Result<()> {
-        let start_time = std::time::SystemTime::now();
-        self.start_time = start_time;
-
-        debug!("Running Dorado with config: {:#?}", self.config);
-        let dorado_bin = self.config.align.dorado_bin.clone();
-
-        self.create_reference_mmi()?;
-        self.create_directories()?;
-
-        info!(
-            "Reading {} pod5 from: {}",
-            self.case.time_point, self.config.pod_dir
-        );
-        let bam_path = std::path::Path::new(&self.bam);
-
-        if !bam_path.exists() {
-            info!("Creating new bam file");
-            self.basecall_align(&dorado_bin)?;
-            self.index()?;
-        } else {
-            let new_bam_path = bam_path
-                .parent()
-                .unwrap()
-                .join(format!("{}.bam", Uuid::new_v4()));
-            warn!("Creating new bam {}", new_bam_path.display());
-
-            let bam = self.bam.clone();
-            self.bam = new_bam_path.clone().to_string_lossy().to_string();
-            self.basecall_align(&dorado_bin)?;
-            self.bam.clone_from(&bam);
-            self.merge_bam(&new_bam_path)?;
-        }
-
-        self.run_cramino()?;
-        self.run_modkit()?;
-        // self.create_fastq()?;
-
-        let end_time = std::time::SystemTime::now();
-        self.end_time = end_time;
-        let execution_time = end_time.duration_since(start_time).unwrap().as_secs_f64();
-        info!(
-            "Dorado and Minimap2 execution time: {} seconds",
-            execution_time
-        );
-        self.is_done = true;
-
-        Ok(())
-    }
-
-    // from_flowcell stays mostly as-is; it just calls run_pipe/from_mux
-    pub fn from_flowcell(flowcell: &FlowCell, config: &Config) -> anyhow::Result<()> {
-        let tp_conv = |time_point: &str| -> String {
-            match time_point {
-                "normal" => config.normal_name.clone(),
-                "tumoral" => config.tumoral_name.clone(),
-                _ => panic!("Error time point name"),
-            }
-        };
-        use crate::collection::flowcells::FlowCellLocation::*;
-        let base_pod_dir = match &flowcell.location {
-            Local(_) => None,
-            Archived(pod_tar) => {
-                let file = File::open(pod_tar)
-                    .map_err(|e| anyhow::anyhow!("Failed to open tar file: {pod_tar}\n\t{e}"))?;
-                let mut archive = tar::Archive::new(file);
-                info!("Un-tar of archived {pod_tar}");
-                archive
-                    .unpack(&config.unarchive_tmp_dir)
-                    .map_err(|e| anyhow::anyhow!("Failed to un-tar: {pod_tar}\n\t{e}"))?;
-                info!("Un-tar of archived {pod_tar} Done.");
-
-                Some(config.unarchive_tmp_dir.to_string())
-            }
-        };
-
-        use crate::collection::flowcells::FlowCellExperiment::*;
-        match &flowcell.experiment {
-            WGSPod5Mux(pod_dir) => {
-                let pod_dir = if let Some(base_pod_dir) = &base_pod_dir {
-                    format!("{base_pod_dir}/{pod_dir}")
-                } else {
-                    pod_dir.clone()
-                };
-
-                let cases = flowcell
-                    .cases
-                    .iter()
-                    .map(|c| FlowCellCase {
-                        id: c.case_id.clone(),
-                        time_point: tp_conv(&c.sample_type),
-                        barcode: c.barcode.clone(),
-                        pod_dir: pod_dir.clone().into(),
-                    })
-                    .collect();
-                info!("Starting basecaller for muxed pod5: {cases:#?}");
-
-                Dorado::from_mux(cases, config.clone())?;
-            }
-            WGSPod5Demux(pod_dir) => {
-                let pod_dir = if let Some(base_pod_dir) = &base_pod_dir {
-                    format!("{base_pod_dir}/{pod_dir}")
-                } else {
-                    pod_dir.clone()
-                };
-
-                for c in flowcell.cases.iter() {
-                    let pod_dir = format!("{pod_dir}/barcode{}", c.barcode.replace("NB", ""));
-                    info!("Starting basecaller for demuxed pod5: {pod_dir}");
-                    let mut d = Dorado::init(
-                        FlowCellCase {
-                            id: c.case_id.clone(),
-                            time_point: tp_conv(&c.sample_type),
-                            barcode: c.barcode.clone(),
-                            pod_dir: pod_dir.into(),
-                        },
-                        config.clone(),
-                    )?;
-                    d.run_pipe()?;
-                }
-            }
-        }
-
-        Ok(())
-    }
-}
+// pub struct Dorado {
+//     config: Config,
+//     case: FlowCellCase,
+//     case_dir: String,
+//     time_dir: String,
+//     bam: String,
+//     start_time: SystemTime,
+//     end_time: SystemTime,
+//     is_done: bool,
+// }
+//
+// impl Dorado {
+//     pub fn init(case: FlowCellCase, config: Config) -> anyhow::Result<Self> {
+//         let data_dir = &config.result_dir;
+//         let case_dir = format!("{}/{}", data_dir, case.id);
+//         let time_dir = format!("{}/{}", case_dir, case.time_point);
+//         let bam = format!("{}/{}_{}_hs1.bam", time_dir, case.id, case.time_point);
+//         debug!("Dorado init with config: {config:#?}");
+//         info!("Final BAM file: {bam}");
+//
+//         Ok(Self {
+//             config,
+//             start_time: SystemTime::now(),
+//             end_time: SystemTime::now(),
+//             is_done: false,
+//             case_dir,
+//             time_dir,
+//             bam,
+//             case,
+//         })
+//     }
+//
+//     // ------------------------------------------------------------------
+//     // Small helper to actually execute a shell command
+//     // ------------------------------------------------------------------
+//     fn run_shell(cmdline: &str) -> anyhow::Result<()> {
+//         info!("Running: {cmdline}");
+//         cmd!("bash", "-c", cmdline)
+//             .run()
+//             .map_err(|e| anyhow::anyhow!("Failed to run: {cmdline}\n\t{}", e.to_string()))?;
+//         Ok(())
+//     }
+//
+//     // ------------------------------------------------------------------
+//     // Command builders (return strings)
+//     // ------------------------------------------------------------------
+//
+//     /// minimap2 index creation (returns None if index already exists)
+//     fn create_reference_mmi_cmd(&self) -> Option<String> {
+//         if std::path::Path::new(&self.config.align.ref_mmi).exists() {
+//             None
+//         } else {
+//             Some(format!(
+//                 "minimap2 -x map-ont -d {} {}",
+//                 self.config.align.ref_mmi, self.config.align.ref_fa
+//             ))
+//         }
+//     }
+//
+//     /// Dorado + samtools pipeline for basecalling + alignment
+//     fn basecall_align_cmd(&self, dorado_bin: &str) -> anyhow::Result<String> {
+//         let pod_dir = &self.case.pod_dir;
+//         let ref_fa = &self.config.align.ref_fa;
+//         let bam = &self.bam;
+//         let samtools = &self.config.align.samtools_bin;
+//         let samtools_view_threads = self.config.align.samtools_view_threads;
+//         let samtools_sort_threads = self.config.align.samtools_sort_threads;
+//         let dorado_arg = self.config.align.dorado_basecall_arg.clone();
+//
+//         let pod_path = fs::read_dir(pod_dir)
+//             .map_err(|e| anyhow::anyhow!("Failed to read pod5 dir: {}.\n\t{e}", pod_dir.display()))?
+//             .filter_map(|p| p.ok())
+//             .map(|p| p.path())
+//             .filter(|p| p.extension().unwrap() == "pod5")
+//             .take(1)
+//             .collect::<Vec<PathBuf>>()
+//             .pop()
+//             .unwrap();
+//
+//         let sequencing_kit = Pod5Info::from_pod5(pod_path.to_str().unwrap())
+//             .sequencing_kit
+//             .to_uppercase();
+//
+//         let dorado = format!(
+//             "{dorado_bin} basecaller --kit-name {sequencing_kit} {dorado_arg} {} --trim all --emit-moves --reference {ref_fa}",
+//             pod_dir.display()
+//         );
+//         info!("Dorado command: {dorado}");
+//
+//         let samtools_view = format!("{samtools} view -h -@ {samtools_view_threads} -b /dev/stdin");
+//         let samtools_sort =
+//             format!("{samtools} sort -@ {samtools_sort_threads} /dev/stdin -o {bam}");
+//
+//         Ok(format!("{dorado} | {samtools_view} | {samtools_sort}"))
+//     }
+//
+//     /// samtools index command
+//     fn index_cmd(&self) -> String {
+//         let t = self.config.align.samtools_view_threads.to_string();
+//         format!(
+//             "{} index -@ {t} {}",
+//             self.config.align.samtools_bin, self.bam
+//         )
+//     }
+//
+//     /// cramino QC command
+//     fn cramino_cmd(&self) -> String {
+//         format!("cramino -t 150 --karyotype {}", self.bam)
+//     }
+//
+//     /// modkit summary command
+//     fn modkit_cmd(&self) -> String {
+//         format!("modkit summary -t 50 {}", self.bam)
+//     }
+//
+//     /// fastq export pipeline from BAM
+//     fn create_fastq_cmd(&self) -> String {
+//         let bam = &self.bam;
+//         let fastq = format!(
+//             "{}/{}/{}/{}_{}.fastq.gz",
+//             self.case_dir, self.case.id, self.case.time_point, self.case.id, self.case.time_point
+//         );
+//         let samtools = format!("samtools fastq -@ 150 {bam}");
+//         let crabz = format!("crabz -f bgzf - -o {fastq}");
+//         format!("{samtools} | {crabz}")
+//     }
+//
+//     /// samtools merge command used in `merge_bam`
+//     fn merge_bam_cmd(&self, bam: &Path, into: &Path) -> String {
+//         format!(
+//             "{} merge -@ 160 -h {} {} {} {}",
+//             self.config.align.samtools_bin,
+//             bam.display(),
+//             into.display(),
+//             bam.display(),
+//             into.display() // placeholder, real tmp path is managed outside
+//         )
+//     }
+//
+//     // mux basecall + samtools view into muxed.bam
+//     fn from_mux_basecall_cmd(
+//         config: &Config,
+//         sequencing_kit: &str,
+//         pod_dir: &str,
+//         muxed_bam: &str,
+//     ) -> String {
+//         let dorado_bin = &config.align.dorado_bin;
+//         let dorado_arg = &config.align.dorado_basecall_arg;
+//         let ref_mmi = &config.align.ref_mmi;
+//         let samtools_bin = &config.align.samtools_bin;
+//         let samtools_view_threads = config.align.samtools_view_threads;
+//
+//         let dorado = format!(
+//             "{dorado_bin} basecaller --kit-name {sequencing_kit} {dorado_arg} {pod_dir} --emit-moves --trim all --reference {ref_mmi}"
+//         );
+//         let samtools_view =
+//             format!("{samtools_bin} view -h -@ {samtools_view_threads} -b -o {muxed_bam}");
+//         format!("{dorado} | {samtools_view}")
+//     }
+//
+//     /// samtools split command for demux
+//     fn demux_cmd(config: &Config, muxed_bam: &str, tmp_demux_dir: &str) -> String {
+//         format!(
+//             "{} split -@ {} -f '{}/%*_%!.bam' {}",
+//             config.align.samtools_bin, config.align.samtools_view_threads, tmp_demux_dir, muxed_bam
+//         )
+//     }
+//
+//     /// dorado aligner + samtools for realignment in from_mux
+//     fn realign_cmd(
+//         config: &Config,
+//         sequencing_kit: &str,
+//         barcode: &str,
+//         bam: &str,
+//         aligned_bam: &str,
+//     ) -> String {
+//         let dorado = format!(
+//             "{} aligner --threads {} {} {}",
+//             config.align.dorado_bin, config.align.dorado_aligner_threads, config.align.ref_fa, bam,
+//         );
+//         let samtools_view = format!(
+//             "{} view -h -@ {} -b /dev/stdin",
+//             config.align.samtools_bin, config.align.samtools_view_threads
+//         );
+//         let samtools_sort = format!(
+//             "{} sort -@ {} /dev/stdin -o {}",
+//             config.align.samtools_bin, config.align.samtools_sort_threads, aligned_bam
+//         );
+//         let _ = sequencing_kit; // not used here but kept for symmetry
+//         format!("{dorado} | {samtools_view} | {samtools_sort}")
+//     }
+//
+//     // ------------------------------------------------------------------
+//     // Workflow methods that now *run* the commands
+//     // ------------------------------------------------------------------
+//
+//     fn create_reference_mmi(&self) -> anyhow::Result<()> {
+//         if let Some(cmdline) = self.create_reference_mmi_cmd() {
+//             Self::run_shell(&cmdline)?;
+//         }
+//         Ok(())
+//     }
+//
+//     fn create_directories(&self) -> anyhow::Result<()> {
+//         if !std::path::Path::new(&self.case_dir).exists() {
+//             info!("Creating directory {}", self.case_dir);
+//             fs::create_dir(&self.case_dir)?;
+//         }
+//         if !std::path::Path::new(&self.time_dir).exists() {
+//             info!("Creating directory {}", self.time_dir);
+//             fs::create_dir(&self.time_dir)?;
+//         }
+//         Ok(())
+//     }
+//
+//     fn basecall_align(&mut self, dorado_bin: &str) -> anyhow::Result<()> {
+//         let pipe = self.basecall_align_cmd(dorado_bin)?;
+//         Self::run_shell(&pipe)
+//             .map_err(|e| anyhow::anyhow!("Failed to run pipe: {pipe}.\n\t{}", e.to_string()))
+//     }
+//
+//     pub fn index(&self) -> anyhow::Result<()> {
+//         let cmdline = self.index_cmd();
+//         info!("Running samtools index for {}", self.bam);
+//         Self::run_shell(&cmdline)
+//     }
+//
+//     pub fn run_cramino(&self) -> anyhow::Result<()> {
+//         let cramino_out = format!(
+//             "{}/{}_{}_hs1_cramino.txt",
+//             self.time_dir, self.case.id, self.case.time_point
+//         );
+//         info!("Quality control with cramino for BAM: {}", self.bam);
+//         let cmdline = self.cramino_cmd();
+//
+//         let output = cmd!("bash", "-c", &cmdline)
+//             .stdout_capture()
+//             .unchecked()
+//             .run()?;
+//
+//         fs::write(cramino_out, output.stdout)?;
+//         Ok(())
+//     }
+//
+//     pub fn run_modkit(&self) -> anyhow::Result<()> {
+//         let mod_summary = format!(
+//             "{}/{}_{}_5mC_5hmC_summary.txt",
+//             self.time_dir, self.case.id, self.case.time_point
+//         );
+//         info!("Generating base modification summary for BAM: {}", self.bam);
+//         let cmdline = self.modkit_cmd();
+//
+//         let output = cmd!("bash", "-c", &cmdline)
+//             .stdout_capture()
+//             .unchecked()
+//             .run()?;
+//
+//         fs::write(mod_summary, output.stdout)?;
+//         Ok(())
+//     }
+//
+//     pub fn create_fastq(&self) -> anyhow::Result<()> {
+//         let fastq = format!(
+//             "{}/{}/{}/{}_{}.fastq.gz",
+//             self.case_dir, self.case.id, self.case.time_point, self.case.id, self.case.time_point
+//         );
+//         if !std::path::Path::new(&fastq).exists() {
+//             let pipe = self.create_fastq_cmd();
+//             Self::run_shell(&pipe)?;
+//         }
+//         Ok(())
+//     }
+//
+//     pub fn merge_bam(&self, bam: &Path) -> anyhow::Result<()> {
+//         let composition_a: Vec<String> = bam_composition(bam.to_string_lossy().as_ref(), 20000)?
+//             .iter()
+//             .map(|(i, _, _)| i.clone())
+//             .collect();
+//         let composition_b: Vec<String> = bam_composition(&self.bam, 20000)?
+//             .iter()
+//             .map(|(i, _, _)| i.clone())
+//             .collect();
+//         let n_id = composition_a
+//             .iter()
+//             .filter(|id| composition_b.contains(id))
+//             .count();
+//         if n_id > 0 {
+//             warn!(
+//                 "{} is already merged, reads with the same run_id in the destination BAM.",
+//                 self.case.id
+//             );
+//             return Ok(());
+//         }
+//
+//         let into = PathBuf::from(&self.bam);
+//         let dir = into.parent().unwrap();
+//
+//         let original_file = into.file_name().unwrap().to_string_lossy().to_string();
+//         let original_i = dir.join(format!("{original_file}.bai"));
+//         if !original_i.exists() {
+//             self.index()?;
+//         }
+//
+//         let tmp_original_file = format!("{}.bam", Uuid::new_v4());
+//         let tmp_original = dir.join(tmp_original_file.clone());
+//         let tmp_original_i = dir.join(format!("{tmp_original_file}.bai"));
+//
+//         info!("Moving {} to {}", &into.display(), &tmp_original.display());
+//         fs::rename(&into, &tmp_original)?;
+//         info!(
+//             "Moving {} to {}",
+//             &original_i.display(),
+//             &tmp_original_i.display()
+//         );
+//         fs::rename(original_i, tmp_original_i.clone())?;
+//
+//         // real merge command with the correct tmp path
+//         let merge_cmdline = format!(
+//             "{} merge -@ 160 -h {} {} {} {}",
+//             self.config.align.samtools_bin,
+//             bam.display(),
+//             into.display(),
+//             bam.display(),
+//             tmp_original.display()
+//         );
+//         info!("Running {merge_cmdline}");
+//         Self::run_shell(&merge_cmdline)?;
+//
+//         fs::remove_file(tmp_original)?;
+//         fs::remove_file(tmp_original_i)?;
+//         fs::remove_file(bam)?;
+//
+//         self.index()?;
+//         Ok(())
+//     }
+//
+//     pub fn from_mux(cases: Vec<FlowCellCase>, config: Config) -> anyhow::Result<()> {
+//         // tmp dir
+//         let tmp_dir = format!("{}/.{}", config.result_dir, Uuid::new_v4());
+//         info!("Creating tmp dir {tmp_dir}");
+//         fs::create_dir(&tmp_dir)?;
+//
+//         // basecalling into muxed.bam
+//         let muxed_bam = format!("{tmp_dir}/muxed.bam");
+//         let pod_dir = cases[0].pod_dir.display().to_string();
+//
+//         let muxed_pod_dir = &cases.first().unwrap().pod_dir;
+//         let pod_path = fs::read_dir(muxed_pod_dir)
+//             .map_err(|e| {
+//                 anyhow::anyhow!(
+//                     "Failed to read pod5 dir: {}.\n\t{e}",
+//                     muxed_pod_dir.display()
+//                 )
+//             })?
+//             .filter_map(|p| p.ok())
+//             .map(|p| p.path())
+//             .filter(|p| p.extension().unwrap() == "pod5")
+//             .take(1)
+//             .collect::<Vec<PathBuf>>()
+//             .pop()
+//             .unwrap();
+//         let sequencing_kit = Pod5Info::from_pod5(pod_path.to_str().unwrap())
+//             .sequencing_kit
+//             .to_uppercase();
+//
+//         let basecall_pipe =
+//             Self::from_mux_basecall_cmd(&config, &sequencing_kit, &pod_dir, &muxed_bam);
+//         info!("Running: {basecall_pipe}");
+//         Self::run_shell(&basecall_pipe)?;
+//         info!("Basecalling ✅");
+//
+//         // demux
+//         let tmp_demux_dir = format!("{tmp_dir}/demuxed");
+//         fs::create_dir(&tmp_demux_dir)?;
+//         let demux_cmdline = Self::demux_cmd(&config, &muxed_bam, &tmp_demux_dir);
+//         info!("Demux from {sequencing_kit} into {tmp_demux_dir}");
+//         info!("Running: {demux_cmdline}");
+//         Self::run_shell(&demux_cmdline)?;
+//         info!("Demux ✅");
+//
+//         for case in cases.iter() {
+//             let barcode = case.barcode.replace("NB", "");
+//             let bam = find_unique_file(
+//                 &tmp_demux_dir,
+//                 &format!("{sequencing_kit}_barcode{}.bam", barcode),
+//             )?;
+//
+//             let aligned_bam = if !config.align.dorado_should_realign {
+//                 bam.clone()
+//             } else {
+//                 let aligned_bam = format!(
+//                     "{tmp_demux_dir}/{sequencing_kit}_barcode{}_aligned.bam",
+//                     barcode
+//                 );
+//                 let pipe =
+//                     Self::realign_cmd(&config, &sequencing_kit, &barcode, &bam, &aligned_bam);
+//                 info!("Running {pipe}");
+//                 Self::run_shell(&pipe)?;
+//                 info!("Alignement ✅");
+//                 aligned_bam.into()
+//             };
+//
+//             let d = Dorado::init(case.clone(), config.clone())?;
+//             d.create_directories()?;
+//
+//             if PathBuf::from(&d.bam).exists() {
+//                 info!("Merging");
+//                 d.merge_bam(&PathBuf::from(aligned_bam))?;
+//             } else {
+//                 info!("Moving from {} to {}", bam, d.bam);
+//                 fs::rename(aligned_bam, d.bam.clone())?;
+//                 d.index()?;
+//             }
+//
+//             d.run_cramino()?;
+//             d.run_modkit()?;
+//         }
+//         fs::remove_dir_all(tmp_dir)?;
+//
+//         Ok(())
+//     }
+//
+//     pub fn run_pipe(&mut self) -> anyhow::Result<()> {
+//         let start_time = std::time::SystemTime::now();
+//         self.start_time = start_time;
+//
+//         debug!("Running Dorado with config: {:#?}", self.config);
+//         let dorado_bin = self.config.align.dorado_bin.clone();
+//
+//         self.create_reference_mmi()?;
+//         self.create_directories()?;
+//
+//         info!(
+//             "Reading {} pod5 from: {}",
+//             self.case.time_point, self.config.pod_dir
+//         );
+//         let bam_path = std::path::Path::new(&self.bam);
+//
+//         if !bam_path.exists() {
+//             info!("Creating new bam file");
+//             self.basecall_align(&dorado_bin)?;
+//             self.index()?;
+//         } else {
+//             let new_bam_path = bam_path
+//                 .parent()
+//                 .unwrap()
+//                 .join(format!("{}.bam", Uuid::new_v4()));
+//             warn!("Creating new bam {}", new_bam_path.display());
+//
+//             let bam = self.bam.clone();
+//             self.bam = new_bam_path.clone().to_string_lossy().to_string();
+//             self.basecall_align(&dorado_bin)?;
+//             self.bam.clone_from(&bam);
+//             self.merge_bam(&new_bam_path)?;
+//         }
+//
+//         self.run_cramino()?;
+//         self.run_modkit()?;
+//         // self.create_fastq()?;
+//
+//         let end_time = std::time::SystemTime::now();
+//         self.end_time = end_time;
+//         let execution_time = end_time.duration_since(start_time).unwrap().as_secs_f64();
+//         info!(
+//             "Dorado and Minimap2 execution time: {} seconds",
+//             execution_time
+//         );
+//         self.is_done = true;
+//
+//         Ok(())
+//     }
+//
+//     // from_flowcell stays mostly as-is; it just calls run_pipe/from_mux
+//     pub fn from_flowcell(flowcell: &FlowCell, config: &Config) -> anyhow::Result<()> {
+//         let tp_conv = |time_point: &str| -> String {
+//             match time_point {
+//                 "normal" => config.normal_name.clone(),
+//                 "tumoral" => config.tumoral_name.clone(),
+//                 _ => panic!("Error time point name"),
+//             }
+//         };
+//         use crate::collection::flowcells::FlowCellLocation::*;
+//         let base_pod_dir = match &flowcell.location {
+//             Local(_) => None,
+//             Archived(pod_tar) => {
+//                 let file = File::open(pod_tar)
+//                     .map_err(|e| anyhow::anyhow!("Failed to open tar file: {pod_tar}\n\t{e}"))?;
+//                 let mut archive = tar::Archive::new(file);
+//                 info!("Un-tar of archived {pod_tar}");
+//                 archive
+//                     .unpack(&config.unarchive_tmp_dir)
+//                     .map_err(|e| anyhow::anyhow!("Failed to un-tar: {pod_tar}\n\t{e}"))?;
+//                 info!("Un-tar of archived {pod_tar} Done.");
+//
+//                 Some(config.unarchive_tmp_dir.to_string())
+//             }
+//         };
+//
+//         use crate::collection::flowcells::FlowCellExperiment::*;
+//         match &flowcell.experiment {
+//             WGSPod5Mux(pod_dir) => {
+//                 let pod_dir = if let Some(base_pod_dir) = &base_pod_dir {
+//                     format!("{base_pod_dir}/{pod_dir}")
+//                 } else {
+//                     pod_dir.clone()
+//                 };
+//
+//                 let cases = flowcell
+//                     .cases
+//                     .iter()
+//                     .map(|c| FlowCellCase {
+//                         id: c.case_id.clone(),
+//                         time_point: tp_conv(&c.sample_type),
+//                         barcode: c.barcode.clone(),
+//                         pod_dir: pod_dir.clone().into(),
+//                     })
+//                     .collect();
+//                 info!("Starting basecaller for muxed pod5: {cases:#?}");
+//
+//                 Dorado::from_mux(cases, config.clone())?;
+//             }
+//             WGSPod5Demux(pod_dir) => {
+//                 let pod_dir = if let Some(base_pod_dir) = &base_pod_dir {
+//                     format!("{base_pod_dir}/{pod_dir}")
+//                 } else {
+//                     pod_dir.clone()
+//                 };
+//
+//                 for c in flowcell.cases.iter() {
+//                     let pod_dir = format!("{pod_dir}/barcode{}", c.barcode.replace("NB", ""));
+//                     info!("Starting basecaller for demuxed pod5: {pod_dir}");
+//                     let mut d = Dorado::init(
+//                         FlowCellCase {
+//                             id: c.case_id.clone(),
+//                             time_point: tp_conv(&c.sample_type),
+//                             barcode: c.barcode.clone(),
+//                             pod_dir: pod_dir.into(),
+//                         },
+//                         config.clone(),
+//                     )?;
+//                     d.run_pipe()?;
+//                 }
+//             }
+//         }
+//
+//         Ok(())
+//     }
+// }

+ 1 - 1
src/commands/longphase.rs

@@ -1,5 +1,5 @@
 use crate::{
-    collection::{Initialize, InitializeSolo},
+    pipes::{Initialize, InitializeSolo},
     commands::bcftools::{bcftools_compress, bcftools_index},
     config::Config,
     helpers::path_prefix,

+ 1 - 1
src/commands/modkit.rs

@@ -5,7 +5,7 @@ use std::{
 use anyhow::Context;
 
 use crate::{
-    collection::InitializeSolo,
+    pipes::InitializeSolo,
     runners::{run_wait, CommandRun, Run},
 };
 

+ 22 - 4
src/helpers.rs

@@ -179,19 +179,19 @@ fn intersection<T: Ord + Clone>(vec1: &[T], vec2: &[T]) -> VectorIntersection<T>
                 // Add to common
                 result
                     .common
-                    .extend(std::iter::repeat(val.clone()).take(count1.min(count2)));
+                    .extend(std::iter::repeat_n(val.clone(), count1.min(count2)));
 
                 // Add excess to only_in_first or only_in_second
                 match count1.cmp(&count2) {
                     Ordering::Greater => {
                         result
                             .only_in_first
-                            .extend(std::iter::repeat(val.clone()).take(count1 - count2));
+                            .extend(std::iter::repeat_n(val.clone(), count1 - count2));
                     }
                     Ordering::Less => {
                         result
                             .only_in_second
-                            .extend(std::iter::repeat(val.clone()).take(count2 - count1));
+                            .extend(std::iter::repeat_n(val.clone(), count2 - count1));
                     }
                     Ordering::Equal => {
                         // No excess elements, do nothing
@@ -685,7 +685,7 @@ pub fn detect_repetition(s: &str) -> Repeat {
     }
 
     // Check for two-char block repetition
-    if len % 2 == 0 {
+    if len.is_multiple_of(2) {
         let mut iter = s.chars();
         let a = iter.next().unwrap();
         let b = iter.next().unwrap();
@@ -730,3 +730,21 @@ pub fn extract_barcode(name: &str) -> Option<u32> {
         .and_then(|m| m.as_str().parse::<u32>().ok())
 }
 
+pub fn human_size(bytes: u64) -> String {
+    const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];
+
+    let mut size = bytes as f64;
+    let mut unit = 0;
+
+    while size >= 1024.0 && unit < UNITS.len() - 1 {
+        size /= 1024.0;
+        unit += 1;
+    }
+
+    if unit == 0 {
+        format!("{} {}", bytes, UNITS[unit])
+    } else {
+        format!("{:.2} {}", size, UNITS[unit])
+    }
+}
+

+ 0 - 21
src/io/pod5_infos.rs

@@ -267,24 +267,3 @@ fn read_arrow_table(
     Ok(batches)
 }
 
-
-#[cfg(test)]
-mod tests {
-    use crate::helpers::{list_files_with_ext, test_init};
-    use crate::TEST_DIR;
-
-    use super::*;
-
-    #[test]
-    fn load_pod5s() -> anyhow::Result<()> {
-        test_init();
-
-        let pods = list_files_with_ext("/mnt/beegfs02/scratch/t_steimle/prom_runs/A/20251117_0915_P2I-00461-A_PBI55810_22582b29/pod5_recovered", "pod5")?;
-
-        if let Some(fpod_path) = pods.first() {
-            let i = Pod5Info::from_pod5(fpod_path.to_str().unwrap());
-            println!("{i:#?}");
-        }
-        Ok(())
-    }
-}

Diff do ficheiro suprimidas por serem muito extensas
+ 243 - 385
src/lib.rs


+ 33 - 0
src/pipes/mod.rs

@@ -1,3 +1,36 @@
+use std::path::Path;
+
+use crate::{config::Config};
+
 pub mod somatic;
 pub mod somatic_slurm;
 
+
+pub trait Initialize: Sized {
+    fn initialize(id: &str, config: Config) -> anyhow::Result<Self>;
+}
+
+pub trait InitializeSolo: Sized {
+    fn initialize(id: &str, time: &str, config: Config) -> anyhow::Result<Self>;
+}
+
+pub trait ShouldRun {
+    fn should_run(&self) -> bool;
+}
+
+pub trait Version {
+    fn version(config: &Config) -> anyhow::Result<String>;
+}
+
+// pub trait LoadVariants {
+//     fn load_variants(&self) -> anyhow::Result<Variants>;
+// }
+
+pub fn exists_all(paths: Vec<&str>) -> anyhow::Result<()> {
+    for path in paths.iter() {
+        if !Path::new(path).exists() {
+            anyhow::bail!("{path} should exist")
+        }
+    }
+    Ok(())
+}

+ 2 - 45
src/pipes/somatic.rs

@@ -1,8 +1,8 @@
 use crate::{
     annotation::is_gnomad_and_constit_alt,
-    collection::ShouldRun,
     create_should_run_normal_tumoral, init_solo_callers_normal_tumoral,
     io::bed::read_bed,
+    pipes::{InitializeSolo, Initialize, ShouldRun},
     positions::GenomeRange,
     scan::scan::SomaticScan,
     variant::{
@@ -26,7 +26,6 @@ use crate::{
         clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
         savana::Savana, severus::Severus,
     },
-    collection::{Initialize, InitializeSolo},
     config::Config,
     create_should_run, init_somatic_callers,
     runners::Run,
@@ -198,46 +197,6 @@ impl ShouldRun for SomaticPipe {
                 .map_or(true, |ts| ts > res_meta)
         })
     }
-
-    // fn should_run(&self) -> bool {
-    //     let tumoral_dir = self.config.tumoral_dir(&self.id);
-    //     let path_str = format!("{}/{}_somatic_variants.bit", tumoral_dir, self.id);
-    //     let path = Path::new(&path_str);
-    //
-    //     let normal_bam = self.config.normal_bam(&self.id);
-    //
-    //     if path.exists() {
-    //         if let Ok(Ok(res_metadata)) = path.metadata().map(|r| r.modified()) {
-    //             if let Ok(Ok(normal_bam_m)) =
-    //                 Path::new(&normal_bam).metadata().map(|r| r.modified())
-    //             {
-    //                 if normal_bam_m > res_metadata {
-    //                     return true;
-    //                 }
-    //
-    //                 let tumoral_bam = self.config.tumoral_bam(&self.id);
-    //
-    //                 if let Ok(Ok(tumoral_bam_m)) =
-    //                     Path::new(&tumoral_bam).metadata().map(|r| r.modified())
-    //                 {
-    //                     if tumoral_bam_m > res_metadata {
-    //                         return true;
-    //                     } else {
-    //                         return false;
-    //                     }
-    //                 } else {
-    //                     return true;
-    //                 }
-    //             } else {
-    //                 return true;
-    //             }
-    //         } else {
-    //             return true;
-    //         }
-    //     } else {
-    //         return true;
-    //     }
-    // }
 }
 
 impl Run for SomaticPipe {
@@ -351,7 +310,6 @@ impl Run for SomaticPipe {
                 "{stats_dir}/{id}_annotations_02_post_germline.json"
             ))?;
 
-
         // Remove deletions stretch
         // info!("Removing deletions stretchs:");
         // variants_collections.iter_mut().for_each(|coll| {
@@ -820,8 +778,7 @@ impl SomaticPipeStats {
         let mut germlines_callers: Vec<String> = with_germline
             .iter()
             .flat_map(|(_, r)| {
-                r.iter()
-                    .map(|(k, _)| k.to_string())
+                r.keys().map(|k| k.to_string())
                     .collect::<Vec<String>>()
             })
             .collect();

+ 1 - 1
src/scan/scan.rs

@@ -10,11 +10,11 @@ use rayon::{
 };
 use rust_htslib::bam::IndexedReader;
 
-use crate::collection::{Initialize, ShouldRun};
 use crate::helpers::is_file_older;
 use crate::io::writers::get_gz_writer;
 use crate::math::filter_outliers_modified_z_score_with_indices;
 
+use crate::pipes::{Initialize, ShouldRun};
 use crate::runners::Run;
 use crate::variant::variant::Label;
 use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};

+ 5 - 2
src/variant/variant.rs

@@ -1,7 +1,7 @@
 use crate::{
     annotation::Annotations,
-    collection::ShouldRun,
     helpers::{estimate_shannon_entropy, mean, Hash128},
+    pipes::ShouldRun,
     positions::{GenomePosition, GetGenomePosition, VcfPosition},
     runners::Run,
     variant::variant_collection::VariantCollection,
@@ -269,7 +269,10 @@ impl VcfVariant {
                             AlterationCategory::TRL
                         } else if bnd_desc.a_sens != bnd_desc.b_sens {
                             AlterationCategory::DELINV
-                        } else if bnd_desc.a_sens && bnd_desc.a_position > bnd_desc.b_position && !self.alternative.to_string().contains("N") {
+                        } else if bnd_desc.a_sens
+                            && bnd_desc.a_position > bnd_desc.b_position
+                            && !self.alternative.to_string().contains("N")
+                        {
                             AlterationCategory::DUP
                         } else {
                             AlterationCategory::DEL

Alguns ficheiros não foram mostrados porque muitos ficheiros mudaram neste diff