ソースを参照

IdsInput Pod5s

Thomas 10 ヶ月 前
コミット
e1e6476df8
3 ファイル変更876 行追加14 行削除
  1. 566 0
      :a
  2. 297 12
      src/collection/pod5.rs
  3. 13 2
      src/lib.rs

+ 566 - 0
:a

@@ -0,0 +1,566 @@
+use anyhow::{anyhow, Context, Result};
+use chrono::{DateTime, Utc};
+use csv::ReaderBuilder;
+use glob::glob;
+use hashbrown::HashMap;
+use log::{info, warn};
+use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
+use std::{
+    fmt::Display,
+    fs::{self, File, Metadata},
+    io::{self, BufRead},
+    os::unix::fs::MetadataExt,
+    path::PathBuf,
+};
+
+use crate::io::pod5_infos::Pod5Info;
+
+/// A single `.pod5` file together with the information `Pod5::from_path`
+/// derives from its location on disk.
+#[derive(Debug, Clone)]
+pub struct Pod5 {
+    /// Path of the file.
+    pub path: PathBuf,
+    /// Raw or demuxed; `Pod5::from_path` decides this from the path.
+    pub pod5_type: Pod5Type,
+    /// Path component selected by `Pod5Config::run_dir_n`.
+    pub run_name: String,
+    /// Path component selected by `Pod5Config::flowcell_dir_n`.
+    pub flowcell_name: String,
+    /// Filesystem metadata; `Pod5::from_path` reads it when building the entry.
+    pub file_metadata: Metadata,
+}
+
+/// Kind of pod5 file. `Pod5::from_path` picks the variant from the path
+/// markers configured in `Pod5Config`.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Pod5Type {
+    /// The path contains `Pod5Config::type_raw`.
+    Raw,
+    /// The path contains `Pod5Config::type_demuxed`.
+    Demuxed,
+}
+
+// Text form of the type: `raw` or `demuxed`.
+impl Display for Pod5Type {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Pod5Type::Raw => f.write_str("raw"),
+            Pod5Type::Demuxed => f.write_str("demuxed"),
+        }
+    }
+}
+
+/// Settings used by `Pod5::from_path` to interpret the path of a pod5 file.
+#[derive(Debug, Clone)]
+pub struct Pod5Config {
+    /// Text removed from the path before it is split into components.
+    pub base_dir: String,
+    /// Substring whose presence in the path marks a raw pod5 file.
+    pub type_raw: String,
+    /// Substring whose presence in the path marks a demuxed pod5 file.
+    pub type_demuxed: String,
+    /// Index of the path component used as the run name.
+    pub run_dir_n: u8,
+    /// Index of the path component used as the flow cell name.
+    pub flowcell_dir_n: u8,
+}
+
+impl Default for Pod5Config {
+    /// Default layout: files under `/data/run_data`, the run name as first
+    /// path component and the flow cell name as second one.
+    fn default() -> Self {
+        let base_dir = String::from("/data/run_data");
+        let type_raw = String::from("/pod5/");
+        let type_demuxed = String::from("/pod5_pass/");
+
+        Self {
+            base_dir,
+            type_raw,
+            type_demuxed,
+            run_dir_n: 0,
+            flowcell_dir_n: 1,
+        }
+    }
+}
+
+impl Pod5 {
+    /// Builds a [`Pod5`] entry from the path of a `.pod5` file.
+    ///
+    /// The type is [`Pod5Type::Raw`] when the path contains `config.type_raw`,
+    /// otherwise [`Pod5Type::Demuxed`] when it contains `config.type_demuxed`.
+    /// The run and flow cell names are the path components found at indexes
+    /// `config.run_dir_n` and `config.flowcell_dir_n` once `config.base_dir`
+    /// has been removed from the path.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the path is not valid UTF-8, contains neither type marker,
+    /// its metadata can't be read, or it has too few components.
+    pub fn from_path(path: &PathBuf, config: &Pod5Config) -> Result<Self> {
+        // `context` uses its message verbatim, so the path has to be formatted
+        // explicitly to show up in the error.
+        let s = path
+            .to_str()
+            .with_context(|| format!("Can't convert PathBuf to str {path:?}"))?;
+        let pod5_type = if s.contains(&config.type_raw) {
+            Pod5Type::Raw
+        } else if s.contains(&config.type_demuxed) {
+            Pod5Type::Demuxed
+        } else {
+            return Err(anyhow!("Can't find the pod5 type {s}"));
+        };
+
+        let file_metadata =
+            fs::metadata(path).with_context(|| format!("Can't read metadata of {s}"))?;
+
+        let sr = s.replace(&config.base_dir, "");
+        let components: Vec<&str> = sr.split('/').filter(|c| !c.is_empty()).collect();
+
+        let run_name = components
+            .get(usize::from(config.run_dir_n))
+            .with_context(|| format!("Can't get run_name from {s}"))?
+            .to_string();
+        let flowcell_name = components
+            .get(usize::from(config.flowcell_dir_n))
+            .with_context(|| format!("Can't get flowcell_name from {s}"))?
+            .to_string();
+
+        Ok(Self {
+            path: path.to_path_buf(),
+            pod5_type,
+            run_name,
+            flowcell_name,
+            file_metadata,
+        })
+    }
+}
+
+/// Recursively lists the `.pod5` files found under `dir`.
+///
+/// Files located in a `pod5_fail` or `pod5_skip` directory are ignored. Files
+/// that can't be turned into a [`Pod5`] and unreadable glob entries are
+/// logged as warnings and skipped.
+///
+/// # Errors
+///
+/// Fails when the glob pattern built from `dir` is invalid or when a matched
+/// path is not valid UTF-8.
+pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
+    let pattern = format!("{}/**/*.pod5", dir);
+
+    let conf = Pod5Config {
+        base_dir: if dir.ends_with('/') {
+            dir.to_string()
+        } else {
+            format!("{dir}/")
+        },
+        ..Pod5Config::default()
+    };
+
+    // An invalid pattern is reported to the caller instead of panicking.
+    let entries =
+        glob(&pattern).with_context(|| format!("Failed to read glob pattern {pattern}"))?;
+
+    let mut pod_files = Vec::new();
+    for entry in entries {
+        let path = match entry {
+            Ok(path) => path,
+            Err(e) => {
+                warn!("Error: {:?}", e);
+                continue;
+            }
+        };
+
+        // `context` does not interpolate its message; format the path explicitly.
+        let p = path
+            .to_str()
+            .with_context(|| format!("Can't parse path to string {path:?}"))?;
+        if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
+            continue;
+        }
+
+        match Pod5::from_path(&path, &conf) {
+            Ok(pod5) => pod_files.push(pod5),
+            Err(e) => warn!("{e}"),
+        }
+    }
+    Ok(pod_files)
+}
+
+/// A run and the flow cells that share its `run_name`.
+#[derive(Debug)]
+pub struct Run {
+    /// Name of the run.
+    pub run_name: String,
+    /// Flow cells grouped under this run.
+    pub flowcells: Vec<FlowCell>,
+}
+
+/// The pod5 files of one run/flow cell pair, as assembled by
+/// `Pod5Collection::new`.
+#[derive(Debug, Clone)]
+pub struct FlowCell {
+    /// Flow cell name taken from the first pod5 file of the group.
+    pub flowcell_name: String,
+    /// `ref_flow_cell` of the matching table lines; empty when there is none.
+    pub corrected_name: String,
+    /// One case per table line whose `flow_cell` equals `flowcell_name`.
+    pub cases: Vec<FlowCellCase>,
+    /// Run name taken from the first pod5 file of the group.
+    pub run_name: String,
+    /// Type of the first pod5 file of the group.
+    pub pod5_type: Pod5Type,
+    /// Information read from the first pod5 file of the group.
+    pub pod5_info: Pod5Info,
+    /// All pod5 files of the group.
+    pub pod5: Vec<Pod5>,
+}
+
+// impl FlowCell {
+//     pub fn cases_pod5_dir(&self) -> Vec<PathBuf> {
+//         match self.pod5_type {
+//             Pod5Type::Raw => {
+//                 let p = self.pod5.first().unwrap();
+//                 vec![p.path.parent().unwrap().to_path_buf()]
+//             },
+//             Pod5Type::Demuxed => {
+//                 self.cases.iter().map(|c| {
+//                     let str_barcode = format!("barcode{}", c.barcode);
+//                 })
+//             },
+//         }
+//     }
+// }
+
+/// Every run found under a pod5 directory; built by `Pod5Collection::new`.
+#[derive(Debug, Default)]
+pub struct Pod5Collection {
+    /// Time at which the collection was built.
+    pub importation_date: DateTime<Utc>,
+    /// Runs, each holding its flow cells.
+    pub runs: Vec<Run>,
+    /// BAM directory, stored as given to `new`.
+    pub bam_dir: String,
+    /// Directory that was scanned for pod5 files.
+    pub pod5_dir: String,
+}
+
+/// One case (id / time point / barcode) attached to a flow cell.
+#[derive(Debug, Clone, Default)]
+pub struct FlowCellCase {
+    /// Copied from `FCLine::id`.
+    pub id: String,
+    /// Copied from `FCLine::time_point`.
+    pub time_point: String,
+    /// Copied from `FCLine::barcode_number`.
+    pub barcode: String,
+    /// Directory holding the pod5 files of the case: the directory of the
+    /// first pod5 file for raw data, its sibling `barcode<N>` directory for
+    /// demuxed data.
+    pub pod_dir: PathBuf,
+    // pub basecalled: Option<bool>,
+}
+
+impl Pod5Collection {
+    /// Scans `pod5_dir` for pod5 files and organises them into runs and flow
+    /// cells.
+    ///
+    /// Files are grouped by run name and flow cell name. For every group, the
+    /// lines of the table at `corrected_fc_path` whose `flow_cell` equals the
+    /// flow cell name provide the corrected name and the cases. `bam_dir` is
+    /// only stored in the returned collection.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the files can't be listed, the table can't be loaded, a
+    /// flow cell has more than one corrected name, or a pod5 path has no
+    /// usable parent directory.
+    pub fn new(pod5_dir: &str, corrected_fc_path: &str, bam_dir: &str) -> Result<Self> {
+        let pod5 = list_pod_files(pod5_dir)?;
+        info!("n pod5 {}", pod5.len());
+
+        let mut fc: HashMap<String, Vec<Pod5>> = HashMap::new();
+        for pod in pod5 {
+            let k = format!("{}-{}", pod.run_name, pod.flowcell_name);
+            fc.entry(k).or_default().push(pod);
+        }
+
+        let corrected_fc = load_flowcells_corrected_names(corrected_fc_path)?;
+        // Errors are returned from the parallel map rather than raised as
+        // panics inside the worker threads.
+        let flow_cells = fc
+            .par_values()
+            .map(|v| -> Result<FlowCell> {
+                let first = v.first().context("Flow cell group without pod5 file")?;
+                let first_path = first
+                    .path
+                    .to_str()
+                    .with_context(|| format!("Can't convert PathBuf to str {:?}", first.path))?;
+                let pod5_info = Pod5Info::from_pod5(first_path);
+                let flowcell_name = first.flowcell_name.clone();
+
+                // Borrow the matching lines; only the needed fields are copied below.
+                let sel: Vec<&FCLine> = corrected_fc
+                    .iter()
+                    .filter(|e| e.flow_cell == flowcell_name)
+                    .collect();
+
+                let mut corrected_name: Vec<&str> = sel
+                    .iter()
+                    .map(|e| e.ref_flow_cell.as_str())
+                    .filter(|e| !e.is_empty())
+                    .collect();
+                corrected_name.dedup();
+
+                if corrected_name.len() > 1 {
+                    return Err(anyhow!(
+                        "Multiple corrected flow_cells for {}/{}: {:?}",
+                        first.run_name,
+                        flowcell_name,
+                        corrected_name
+                    ));
+                }
+                let corrected_name = corrected_name
+                    .first()
+                    .map(|name| name.to_string())
+                    .unwrap_or_default();
+
+                let pod_parent = first
+                    .path
+                    .parent()
+                    .with_context(|| format!("No parent directory for {first_path}"))?;
+
+                let cases = sel
+                    .iter()
+                    .map(|e| -> Result<FlowCellCase> {
+                        let pod_dir = match first.pod5_type {
+                            Pod5Type::Raw => pod_parent.to_path_buf(),
+                            Pod5Type::Demuxed => {
+                                // Demuxed files live in `<dir>/barcodeNN/`; point to
+                                // the barcode directory of this case.
+                                let mut bc_dir = pod_parent
+                                    .parent()
+                                    .with_context(|| {
+                                        format!("No barcode root directory for {first_path}")
+                                    })?
+                                    .to_path_buf();
+                                bc_dir
+                                    .push(format!("barcode{}", e.barcode_number.replace("NB", "")));
+                                bc_dir
+                            }
+                        };
+
+                        Ok(FlowCellCase {
+                            id: e.id.clone(),
+                            time_point: e.time_point.clone(),
+                            barcode: e.barcode_number.clone(),
+                            pod_dir,
+                        })
+                    })
+                    .collect::<Result<Vec<FlowCellCase>>>()?;
+
+                Ok(FlowCell {
+                    flowcell_name,
+                    corrected_name,
+                    cases,
+                    run_name: first.run_name.clone(),
+                    pod5_type: first.pod5_type.clone(),
+                    pod5_info,
+                    pod5: v.to_vec(),
+                })
+            })
+            .collect::<Result<Vec<FlowCell>>>()?;
+
+        let mut runs: HashMap<String, Vec<FlowCell>> = HashMap::new();
+        for fc in flow_cells {
+            runs.entry(fc.run_name.clone()).or_default().push(fc);
+        }
+
+        // The map key is the run name of every flow cell in the group, so the
+        // grouped vector can be moved into the run without cloning it.
+        let runs: Vec<Run> = runs
+            .into_iter()
+            .map(|(run_name, flowcells)| Run {
+                run_name,
+                flowcells,
+            })
+            .collect();
+
+        Ok(Self {
+            importation_date: Utc::now(),
+            runs,
+            bam_dir: bam_dir.to_string(),
+            pod5_dir: pod5_dir.to_string(),
+        })
+    }
+
+    /// Prints one tab-separated line per flow cell to stdout: run name,
+    /// oldest and newest modification date of its pod5 files, number of
+    /// files, total size, flow cell name, pod5 type, acquisition id and the
+    /// debug representation of the cases.
+    ///
+    /// Panics when a modification time is unavailable or when a flow cell
+    /// has no pod5 file.
+    pub fn print_info(&self) {
+        for run in &self.runs {
+            for fc in &run.flowcells {
+                let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
+                let n_files = fc.pod5.len();
+
+                let mut dates: Vec<DateTime<Utc>> = Vec::with_capacity(n_files);
+                for p in &fc.pod5 {
+                    dates.push(p.file_metadata.modified().unwrap().into());
+                }
+                let from = dates.iter().min().unwrap();
+                let to = dates.iter().max().unwrap();
+
+                let fields = [
+                    run.run_name.clone(),
+                    from.to_string(),
+                    to.to_string(),
+                    n_files.to_string(),
+                    total_size.to_string(),
+                    fc.flowcell_name.to_string(),
+                    fc.pod5_type.to_string(),
+                    fc.pod5_info.acquisition_id.clone(),
+                    format!("{:?}", fc.cases),
+                ];
+                println!("{}", fields.join("\t"));
+            }
+        }
+    }
+
+    // pub fn check_local(&self) -> anyhow::Result<()> {
+    //     let mut res = Vec::new();
+    //     for run in self.runs.iter() {
+    //         for fc in run.flowcells.iter() {
+    //             for c in fc.cases.iter() {
+    //                 let bases_called = if let Some(b) = c.basecalled {
+    //                     if b {
+    //                         "✅".to_string()
+    //                     } else {
+    //                         "❌".to_string()
+    //                     }
+    //                 } else {
+    //                     "❌".to_string()
+    //                 };
+    //
+    //                 let s = [
+    //                     c.id.to_string(),
+    //                     c.time_point.to_string(),
+    //                     c.barcode.to_string(),
+    //                     run.run_name.clone(),
+    //                     fc.flowcell_name.to_string(),
+    //                     fc.pod5_type.to_string(),
+    //                     fc.pod5_info.acquisition_id.clone(),
+    //                     bases_called,
+    //                 ]
+    //                 .join("\t");
+    //                 res.push(s);
+    //             }
+    //         }
+    //     }
+    //     res.sort();
+    //     println!("{}", res.join("\n"));
+    //     Ok(())
+    // }
+
+    // pub fn fc_done(&self) {
+    //     for run in self.runs.iter() {
+    //         for fc in run.flowcells.iter() {
+    //             let n_called = fc
+    //                 .cases
+    //                 .iter()
+    //                 .filter(|c| if let Some(b) = c.basecalled { b } else { false })
+    //                 .count();
+    //             if n_called != 0 && n_called == fc.cases.len() {
+    //                 let s = [
+    //                     format!("{}/{}", run.run_name, fc.flowcell_name),
+    //                     fc.pod5_info.acquisition_id.to_string(),
+    //                     format!("{:#?}", fc.cases),
+    //                 ]
+    //                 .join("\t");
+    //                 println!("{s}");
+    //             }
+    //         }
+    //     }
+    // }
+
+    // pub fn todo(&self) {
+    //     let run_dir = &self.pod5_dir;
+    //     for run in self.runs.iter() {
+    //         for fc in run.flowcells.iter() {
+    //             let to_call: Vec<_> = fc
+    //                 .cases
+    //                 .iter()
+    //                 .filter(|c| if let Some(b) = c.basecalled { !b } else { true })
+    //                 .collect();
+    //
+    //             if !to_call.is_empty() {
+    //                 if fc.pod5_type == Pod5Type::Raw && to_call.len() != fc.cases.len() {
+    //                     println!("No solution for: {}/{}", run.run_name, fc.flowcell_name);
+    //                 } else {
+    //                     match fc.pod5_type {
+    //                         Pod5Type::Raw => {
+    //                             let cases: Vec<String> = to_call
+    //                                 .iter()
+    //                                 .map(|c| {
+    //                                     let bc = c.barcode.replace("NB", "");
+    //                                     let tp = c.time_point.to_lowercase();
+    //                                     [bc, c.id.to_string(), tp].join(" ")
+    //                                 })
+    //                                 .collect();
+    //                             println!(
+    //                                 "from_mux.sh {}/{}/{} {}",
+    //                                 run_dir,
+    //                                 run.run_name,
+    //                                 fc.flowcell_name,
+    //                                 cases.join(" ")
+    //                             );
+    //                         }
+    //                         Pod5Type::Demuxed => to_call.iter().for_each(|c| {
+    //                             let bc = c.barcode.replace("NB", "");
+    //                             let tp = c.time_point.to_lowercase();
+    //                             let bam = format!(
+    //                                 "{}/{}/{}/{}_{}_hs1.bam",
+    //                                 self.bam_dir, c.id, c.time_point, c.id, c.time_point
+    //                             );
+    //                             if PathBuf::from(bam).exists() {
+    //                                 let pod_dir: Vec<String> = fc
+    //                                     .pod5
+    //                                     .iter()
+    //                                     .filter(|p| {
+    //                                         p.path.contains(&format!("barcode{}", bc.clone()))
+    //                                     })
+    //                                     .take(1)
+    //                                     .map(|p| p.path.to_string())
+    //                                     .collect();
+    //
+    //                                 let pod_dir = pod_dir.first().unwrap();
+    //                                 let mut pod_dir = PathBuf::from(pod_dir);
+    //                                 pod_dir.pop();
+    //
+    //                                 // TODO sheduler
+    //                                 println!(
+    //                                     "complete_bam.sh {} {} {}",
+    //                                     c.id,
+    //                                     tp,
+    //                                     pod_dir.to_string_lossy()
+    //                                 )
+    //                             } else {
+    //                                 let pod_dir: Vec<String> = fc
+    //                                     .pod5
+    //                                     .iter()
+    //                                     .filter(|p| {
+    //                                         p.path.contains(&format!("barcode{}", bc.clone()))
+    //                                     })
+    //                                     .take(1)
+    //                                     .map(|p| p.path.to_string())
+    //                                     .collect();
+    //
+    //                                 let pod_dir = pod_dir.first().unwrap();
+    //                                 let mut pod_dir = PathBuf::from(pod_dir);
+    //                                 pod_dir.pop();
+    //
+    //                                 println!(
+    //                                     "dorado.sh {} {} {}",
+    //                                     c.id,
+    //                                     tp,
+    //                                     pod_dir.to_string_lossy()
+    //                                 )
+    //                             }
+    //                         }),
+    //                     };
+    //                 }
+    //             }
+    //         }
+    //     }
+    // }
+
+    /// Returns the case ids of all flow cells of all runs, sorted and
+    /// without duplicates.
+    pub fn ids(&self) -> Vec<String> {
+        let mut ids = Vec::new();
+        for run in &self.runs {
+            for flowcell in &run.flowcells {
+                ids.extend(flowcell.cases.iter().map(|case| case.id.clone()));
+            }
+        }
+        ids.sort();
+        ids.dedup();
+        ids
+    }
+}
+
+/// One row of the tab-separated table read by
+/// `load_flowcells_corrected_names`.
+#[derive(Debug, Deserialize, Clone)]
+pub struct FCLine {
+    /// Case identifier; upper-cased when the table is loaded.
+    pub id: String,
+    /// Time point; lower-cased when the table is loaded.
+    pub time_point: String,
+    /// Barcode; `Pod5Collection::new` strips `NB` from it to build the
+    /// `barcode<N>` directory name.
+    pub barcode_number: String,
+    /// Flow cell name, matched against the flow cell name found in pod5 paths.
+    pub flow_cell: String,
+    pub run: String,
+    pub path: String,
+    /// Corrected flow cell name; may be empty.
+    pub ref_flow_cell: String,
+}
+
+/// Loads the tab-separated table (with a header line) at `file_path`.
+///
+/// Each record is normalised while loading: `time_point` is lower-cased and
+/// `id` is upper-cased.
+///
+/// # Errors
+///
+/// Fails when the file can't be opened or when a record can't be
+/// deserialised into an [`FCLine`]; the message names the file and the record.
+pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
+    let file = File::open(file_path).with_context(|| format!("Can't open file: {file_path}"))?;
+
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(true)
+        .from_reader(file);
+
+    let mut records = Vec::new();
+    for (i, result) in rdr.deserialize().enumerate() {
+        let mut record: FCLine = result
+            .with_context(|| format!("Can't parse record {} of {file_path}", i + 1))?;
+
+        // formatting
+        record.time_point = record.time_point.to_lowercase();
+        record.id = record.id.to_uppercase();
+
+        records.push(record);
+    }
+
+    Ok(records)
+}
+
+/// Data row of a MinKNOW sample sheet. The fields follow the column order of
+/// the CSV header checked by `MinKnowSampleSheet::from_path`.
+#[derive(Debug, Serialize, Deserialize)]
+struct MinKnowSampleSheet {
+    pub protocol_run_id: String,
+    pub position_id: String,
+    pub flow_cell_id: String,
+    pub sample_id: String,
+    pub experiment_id: String,
+    pub flow_cell_product_code: String,
+    pub kit: String,
+}
+
+impl TryFrom<&str> for MinKnowSampleSheet {
+    type Error = anyhow::Error;
+
+    /// Parses one comma-separated line made of exactly seven cells, in the
+    /// order of the struct fields. Any other number of cells is an error.
+    fn try_from(value: &str) -> anyhow::Result<Self> {
+        let cells: Vec<&str> = value.split(",").collect();
+
+        match cells.as_slice() {
+            [protocol_run_id, position_id, flow_cell_id, sample_id, experiment_id, flow_cell_product_code, kit] => {
+                Ok(Self {
+                    protocol_run_id: protocol_run_id.to_string(),
+                    position_id: position_id.to_string(),
+                    flow_cell_id: flow_cell_id.to_string(),
+                    sample_id: sample_id.to_string(),
+                    experiment_id: experiment_id.to_string(),
+                    flow_cell_product_code: flow_cell_product_code.to_string(),
+                    kit: kit.to_string(),
+                })
+            }
+            _ => Err(anyhow::anyhow!(
+                "Number of cells not equal to definition. {value}"
+            )),
+        }
+    }
+}
+
+impl MinKnowSampleSheet {
+    /// Reads the MinKNOW sample sheet at `path`.
+    ///
+    /// The first line must be the expected CSV header; the second line is
+    /// parsed as the data row. Any further line is ignored.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file can't be opened or read, when the header differs
+    /// from the expected one, when the data row is missing, or when it can't
+    /// be parsed.
+    pub fn from_path(path: &str) -> anyhow::Result<Self> {
+        // `?` can't turn a `String` into an `anyhow::Error`, so the errors are
+        // built with `anyhow!`.
+        let file =
+            File::open(path).map_err(|e| anyhow::anyhow!("Can't open file: {path}\n{e}"))?;
+        let mut lines = io::BufReader::new(file).lines();
+
+        // Header and data row are read one after the other. The previous
+        // single loop returned an error as soon as it had seen a valid header.
+        let header = lines
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("Wrong MinKnow sample sheet format."))?
+            .map_err(|e| anyhow::anyhow!("Error parsing line 1 of {path}\n\t{e}"))?;
+        if header
+            != "protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit"
+        {
+            return Err(anyhow::anyhow!(
+                "File header doesn't correspond to MinKnow sample sheet: {header}"
+            ));
+        }
+
+        let data = lines
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("Wrong MinKnow sample sheet format."))?
+            .map_err(|e| anyhow::anyhow!("Error parsing line 2 of {path}\n\t{e}"))?;
+        data.as_str().try_into()
+    }
+}

+ 297 - 12
src/collection/pod5.rs

@@ -2,16 +2,19 @@ use anyhow::{anyhow, Context, Result};
 use chrono::{DateTime, Utc};
 use csv::ReaderBuilder;
 use glob::glob;
+use hashbrown::HashMap;
 use log::{info, warn};
-use serde::Deserialize;
+use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
 use std::{
+    collections::HashSet,
     fmt::Display,
     fs::{self, File, Metadata},
+    hash::{Hash, Hasher},
+    io::{self, BufRead},
     os::unix::fs::MetadataExt,
     path::PathBuf,
 };
-use hashbrown::HashMap;
-use rayon::prelude::*;
 
 use crate::io::pod5_infos::Pod5Info;
 
@@ -226,14 +229,14 @@ impl Pod5Collection {
                     .iter()
                     .map(|e| {
                         let pod_dir = match first.pod5_type {
-                            Pod5Type::Raw => {
-                                first.path.parent().unwrap().to_path_buf()
-                            }
+                            Pod5Type::Raw => first.path.parent().unwrap().to_path_buf(),
                             Pod5Type::Demuxed => {
-                                let mut bc_dir = first.path.parent().unwrap().parent().unwrap().to_path_buf();
-                                bc_dir.push(format!("barcode{}", e.barcode_number.replace("NB", "")));
+                                let mut bc_dir =
+                                    first.path.parent().unwrap().parent().unwrap().to_path_buf();
+                                bc_dir
+                                    .push(format!("barcode{}", e.barcode_number.replace("NB", "")));
                                 bc_dir
-                            },
+                            }
                         };
 
                         FlowCellCase {
@@ -464,8 +467,7 @@ impl Pod5Collection {
                 r.flowcells
                     .iter()
                     .flat_map(|f| {
-                        f
-                            .cases
+                        f.cases
                             .iter()
                             .map(|c| c.id.clone())
                             .collect::<Vec<String>>()
@@ -479,7 +481,7 @@ impl Pod5Collection {
     }
 }
 
-#[derive(Debug, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct FCLine {
     pub id: String,
     pub time_point: String,
@@ -511,3 +513,286 @@ pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCL
 
     Ok(records)
 }
+
+/// Data row of a MinKNOW sample sheet. The fields follow the column order of
+/// the CSV header checked by `MinKnowSampleSheet::from_path`.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct MinKnowSampleSheet {
+    pub protocol_run_id: String,
+    pub position_id: String,
+    pub flow_cell_id: String,
+    /// Compared with `IdInput::flow_cell` when matching inputs.
+    pub sample_id: String,
+    /// Compared with `IdInput::run` when matching inputs.
+    pub experiment_id: String,
+    pub flow_cell_product_code: String,
+    pub kit: String,
+}
+
+impl TryFrom<&str> for MinKnowSampleSheet {
+    type Error = anyhow::Error;
+
+    /// Parses one comma-separated line made of exactly seven cells, in the
+    /// order of the struct fields. Any other number of cells is an error.
+    fn try_from(value: &str) -> anyhow::Result<Self> {
+        let cells: Vec<&str> = value.split(",").collect();
+
+        match cells.as_slice() {
+            [protocol_run_id, position_id, flow_cell_id, sample_id, experiment_id, flow_cell_product_code, kit] => {
+                Ok(Self {
+                    protocol_run_id: protocol_run_id.to_string(),
+                    position_id: position_id.to_string(),
+                    flow_cell_id: flow_cell_id.to_string(),
+                    sample_id: sample_id.to_string(),
+                    experiment_id: experiment_id.to_string(),
+                    flow_cell_product_code: flow_cell_product_code.to_string(),
+                    kit: kit.to_string(),
+                })
+            }
+            _ => Err(anyhow::anyhow!(
+                "Number of cells not equal to definition. {value}"
+            )),
+        }
+    }
+}
+
+impl MinKnowSampleSheet {
+    /// Reads the MinKNOW sample sheet at `path`: the first line must be the
+    /// expected CSV header and the second line is parsed as the data row.
+    /// Lines after the second one are not read.
+    ///
+    /// Returns an error when the file can't be opened or read, when the
+    /// header is missing or different, or when the data row is missing or
+    /// can't be parsed.
+    pub fn from_path(path: &str) -> anyhow::Result<Self> {
+        use std::fs::File;
+        use std::io::{self, BufRead};
+
+        const EXPECTED_HEADER: &str =
+            "protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit";
+
+        let file =
+            File::open(path).map_err(|e| anyhow::anyhow!("Can't open file: {path}\n\t{e}"))?;
+        let mut lines = io::BufReader::new(file).lines();
+
+        // First line: the header.
+        let header_line = match lines.next() {
+            Some(read) => read.map_err(|e| anyhow::anyhow!("Error reading header line: {e}"))?,
+            None => return Err(anyhow::anyhow!("File is empty or missing a header.")),
+        };
+        if header_line != EXPECTED_HEADER {
+            return Err(anyhow::anyhow!(
+                "File header doesn't correspond to MinKnow sample sheet: {header_line}"
+            ));
+        }
+
+        // Second line: the data row.
+        let data_line = match lines.next() {
+            Some(read) => read.map_err(|e| anyhow::anyhow!("Error reading data line: {e}"))?,
+            None => {
+                return Err(anyhow::anyhow!(
+                    "File doesn't contain the expected second line (data row)."
+                ))
+            }
+        };
+        data_line.as_str().try_into()
+    }
+}
+
+/// List of [`IdInput`] entries; read from and written to JSON by
+/// `IdsInput::load_json` and `IdsInput::save_json`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IdsInput {
+    /// The entries.
+    pub data: Vec<IdInput>,
+}
+
+/// One case entry (id / time point / barcode) with the flow cell and run it
+/// belongs to.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct IdInput {
+    pub id: String,
+    pub time_point: String,
+    pub barcode: String,
+    /// Compared with `MinKnowSampleSheet::sample_id` when matching runs.
+    pub flow_cell: String,
+    /// Compared with `MinKnowSampleSheet::experiment_id` when matching runs.
+    pub run: String,
+}
+
+// Two inputs are equal when all five of their fields are equal.
+impl PartialEq for IdInput {
+    fn eq(&self, other: &Self) -> bool {
+        let lhs = (
+            &self.id,
+            &self.time_point,
+            &self.barcode,
+            &self.flow_cell,
+            &self.run,
+        );
+        let rhs = (
+            &other.id,
+            &other.time_point,
+            &other.barcode,
+            &other.flow_cell,
+            &other.run,
+        );
+        lhs == rhs
+    }
+}
+
+// Marker impl: all fields are `String`, so the field-wise comparison is a
+// full equivalence relation.
+impl Eq for IdInput {}
+
+// Hashes the five fields in declaration order, the same fields that are
+// compared for equality.
+impl Hash for IdInput {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let fields = [
+            &self.id,
+            &self.time_point,
+            &self.barcode,
+            &self.flow_cell,
+            &self.run,
+        ];
+        for field in fields.iter() {
+            field.hash(state);
+        }
+    }
+}
+
+impl IdsInput {
+    /// Loads the entries from the JSON file at `path`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file can't be opened or isn't valid JSON for this type.
+    pub fn load_json(path: &str) -> anyhow::Result<Self> {
+        let f = File::open(path).map_err(|e| anyhow::anyhow!("Can't open file: {path}\n\t{e}"))?;
+        // serde_json issues many small reads; buffer them.
+        let s: Self = serde_json::from_reader(io::BufReader::new(f))
+            .map_err(|e| anyhow::anyhow!("Can't parse JSON file: {path}\n\t{e}"))?;
+        Ok(s)
+    }
+
+    /// Writes the entries as JSON to `path`, replacing any existing file.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the file can't be created or written.
+    pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
+        use std::io::Write as _;
+
+        let f =
+            File::create(path).map_err(|e| anyhow::anyhow!("Can't create file: {path}\n\t{e}"))?;
+        let mut writer = io::BufWriter::new(f);
+        serde_json::to_writer(&mut writer, self)?;
+        // Flush explicitly: a write error raised while dropping the buffer
+        // would be silently lost.
+        writer.flush()?;
+        Ok(())
+    }
+
+    /// Removes duplicated entries, keeping the first occurrence of each and
+    /// the original order.
+    pub fn dedup(&mut self) {
+        let mut unique = HashSet::new();
+        self.data.retain(|item| unique.insert(item.clone()));
+    }
+
+    /// Builds the entries from the tab-separated table read by
+    /// `load_flowcells_corrected_names`, then removes duplicates.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the table can't be loaded.
+    pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
+        // The rows are owned here, so their fields are moved rather than copied.
+        let data = load_flowcells_corrected_names(path)?
+            .into_iter()
+            .map(|line| IdInput {
+                id: line.id,
+                time_point: line.time_point,
+                barcode: line.barcode_number,
+                flow_cell: line.flow_cell,
+                run: line.run,
+            })
+            .collect();
+
+        let mut res = Self { data };
+        res.dedup();
+        Ok(res)
+    }
+
+    /// Appends `values`, then removes any duplicated entry.
+    pub fn add_input(&mut self, values: IdInput) {
+        self.data.push(values);
+        self.dedup();
+    }
+}
+
+/// A local sample sheet joined with one matching [`IdInput`]; built by
+/// `Pod5s::load_from_local`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Pod5Run {
+    /// From the sample sheet.
+    pub protocol_run_id: String,
+    /// From the sample sheet.
+    pub position_id: String,
+    /// From the sample sheet.
+    pub flow_cell_id: String,
+    /// From the input entry.
+    pub id: String,
+    /// From the input entry.
+    pub time_point: String,
+    /// From the input entry (`IdInput::barcode`).
+    pub barcode_number: String,
+    /// From the input entry.
+    pub flow_cell: String,
+    /// From the input entry.
+    pub run: String,
+    /// Time of the scan and the directory that was scanned.
+    pub last_pod_dir: (DateTime<Utc>, String),
+    /// Created empty by `load_from_local`.
+    /// NOTE(review): meaning of the tuple members isn't visible here; confirm.
+    pub archives: Vec<(String, DateTime<Utc>, String)>,
+}
+
+/// A sample sheet associated with an archive.
+/// NOTE(review): nothing in this file fills this type yet; field semantics
+/// are presumed from their names and should be confirmed.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Pod5Archived {
+    pub archive_id: String,
+    pub last_seen: DateTime<Utc>,
+    pub run: MinKnowSampleSheet,
+}
+
+/// Sample sheets found locally, the known inputs, and the runs obtained by
+/// joining both; built by `Pod5s::load_from_local`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Pod5s {
+    /// Sample sheets parsed from the local run directory.
+    pub locals: Vec<MinKnowSampleSheet>,
+    /// Left empty by `load_from_local`.
+    pub archived: Vec<Pod5Archived>,
+    /// Inputs loaded from the JSON file.
+    pub inputs: IdsInput,
+    /// One entry per (sample sheet, matching input) pair.
+    pub runs: Vec<Pod5Run>,
+}
+
+impl Pod5s {
+    /// Loads the sample sheets matching
+    /// `{local_run_dir}/*/*/*/sample_sheet_*.csv` and joins them with the
+    /// inputs stored in the JSON file at `inputs_path`.
+    ///
+    /// An input matches a sample sheet when its `flow_cell` equals the
+    /// sheet's `sample_id` and its `run` equals the sheet's `experiment_id`;
+    /// one [`Pod5Run`] is created per matching pair. Sample sheets that can't
+    /// be read or parsed are logged as warnings and skipped.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the glob pattern is invalid or the inputs can't be loaded.
+    pub fn load_from_local(local_run_dir: &str, inputs_path: &str) -> anyhow::Result<Self> {
+        let pattern = format!("{local_run_dir}/*/*/*/sample_sheet_*.csv");
+        // An invalid pattern is reported to the caller instead of panicking.
+        let locals: Vec<MinKnowSampleSheet> = glob(&pattern)
+            .map_err(|e| anyhow::anyhow!("Failed to read glob pattern {pattern}: {e}"))?
+            .par_bridge()
+            .filter_map(|entry| {
+                let path = match entry {
+                    Ok(path) => path,
+                    Err(e) => {
+                        warn!("Error: {:?}", e);
+                        return None;
+                    }
+                };
+                // A path that isn't valid UTF-8 is skipped like any other
+                // unreadable sample sheet.
+                let path_str = match path.to_str() {
+                    Some(path_str) => path_str,
+                    None => {
+                        warn!("Skipping non UTF-8 path {path:?}");
+                        return None;
+                    }
+                };
+                match MinKnowSampleSheet::from_path(path_str) {
+                    Ok(sample_sheet) => Some(sample_sheet),
+                    Err(e) => {
+                        warn!("{e}");
+                        None
+                    }
+                }
+            })
+            .collect();
+
+        let inputs = IdsInput::load_json(inputs_path)?;
+
+        // A single timestamp for the whole scan.
+        let scanned_at = Utc::now();
+        let runs: Vec<Pod5Run> = locals
+            .iter()
+            .flat_map(|local_run| {
+                inputs
+                    .data
+                    .iter()
+                    .filter(move |info| {
+                        info.flow_cell == local_run.sample_id && info.run == local_run.experiment_id
+                    })
+                    .map(move |info| Pod5Run {
+                        protocol_run_id: local_run.protocol_run_id.to_string(),
+                        position_id: local_run.position_id.to_string(),
+                        flow_cell_id: local_run.flow_cell_id.to_string(),
+                        id: info.id.to_string(),
+                        time_point: info.time_point.to_string(),
+                        barcode_number: info.barcode.to_string(),
+                        flow_cell: info.flow_cell.to_string(),
+                        run: info.run.to_string(),
+                        last_pod_dir: (scanned_at, local_run_dir.to_string()),
+                        archives: Vec::new(),
+                    })
+            })
+            .collect();
+
+        Ok(Self {
+            locals,
+            archived: Vec::new(),
+            inputs,
+            runs,
+        })
+    }
+
+    /// Returns a copy of every local sample sheet that has no matching
+    /// input, or fewer/more inputs than `is_lacking_inputs` expects. A
+    /// warning is logged for each returned sheet.
+    pub fn runs_without_input(&self) -> Vec<MinKnowSampleSheet> {
+        let mut flagged = Vec::new();
+        for local_run in &self.locals {
+            let input_count = self.count_matching_inputs(local_run);
+
+            if input_count == 0 {
+                warn!(
+                    "No input information for {}/{}",
+                    local_run.experiment_id, local_run.sample_id
+                );
+            } else if self.is_lacking_inputs(local_run, input_count) {
+                warn!(
+                    "Lacking input information for {}/{}",
+                    local_run.experiment_id, local_run.sample_id
+                );
+            } else {
+                continue;
+            }
+            flagged.push(local_run.clone());
+        }
+        flagged
+    }
+
+    /// Counts the inputs whose `flow_cell` equals the sheet's `sample_id`
+    /// and whose `run` equals the sheet's `experiment_id`.
+    pub fn count_matching_inputs(&self, local_run: &MinKnowSampleSheet) -> usize {
+        let mut count = 0;
+        for info in &self.inputs.data {
+            if info.flow_cell == local_run.sample_id && info.run == local_run.experiment_id {
+                count += 1;
+            }
+        }
+        count
+    }
+
+    /// Tells whether `input_count` differs from the number of inputs
+    /// expected from the sheet's `sample_id`.
+    ///
+    /// The `sample_id` is split on `_` and `-`. When the number of parts is
+    /// a multiple of three, one input is expected per group of three parts;
+    /// otherwise nothing is expected and `false` is returned.
+    pub fn is_lacking_inputs(&self, local_run: &MinKnowSampleSheet, input_count: usize) -> bool {
+        let n_parts = local_run.sample_id.split(&['_', '-']).count();
+        n_parts % 3 == 0 && n_parts / 3 != input_count
+    }
+}

+ 13 - 2
src/lib.rs

@@ -43,7 +43,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}}, collection::{bam, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado};
+    use crate::{callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}}, collection::{bam, pod5::{IdsInput, MinKnowSampleSheet, Pod5s}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -684,7 +684,7 @@ mod tests {
         init();
         let id = "ADJAGBA";
         let config = Config::default();
-        let path = format!("{}/{id}/diag//somatic_variants.json.gz", config.result_dir);
+        let path = format!("{}/{id}/diag/somatic_variants.json.gz", config.result_dir);
         let variants = variant_collection::Variants::load_from_json(&path)?;
         println!("n variants {}", variants.data.len());
 
@@ -692,4 +692,15 @@ mod tests {
         println!("VEP: {n_vep}");
         Ok(())
     }
+
+    // Loads the sample sheets found under `/data/run_data` together with the
+    // inputs of `/data/inputs_ids.json`, then prints the number of joined
+    // runs and the sample sheets reported by `runs_without_input`.
+    // Fails when loading returns an error (e.g. the inputs file is missing).
+    #[test]
+    fn load_sample_sheet() -> anyhow::Result<()> {
+        init();
+
+        let pod5s = Pod5s::load_from_local("/data/run_data", "/data/inputs_ids.json")?;
+        let res = pod5s.runs_without_input();
+        println!("runs: {}", pod5s.runs.len());
+        println!("lacking info runs: {:#?}", res);
+        Ok(())
+    }
 }