|
|
@@ -1,566 +0,0 @@
|
|
|
-use anyhow::{anyhow, Context, Result};
|
|
|
-use chrono::{DateTime, Utc};
|
|
|
-use csv::ReaderBuilder;
|
|
|
-use glob::glob;
|
|
|
-use hashbrown::HashMap;
|
|
|
-use log::{info, warn};
|
|
|
-use rayon::prelude::*;
|
|
|
-use serde::{Deserialize, Serialize};
|
|
|
-use std::{
|
|
|
- fmt::Display,
|
|
|
- fs::{self, File, Metadata},
|
|
|
- io::{self, BufRead},
|
|
|
- os::unix::fs::MetadataExt,
|
|
|
- path::PathBuf,
|
|
|
-};
|
|
|
-
|
|
|
-use crate::io::pod5_infos::Pod5Info;
|
|
|
-
|
|
|
-#[derive(Debug, Clone)]
|
|
|
-pub struct Pod5 {
|
|
|
- pub path: PathBuf,
|
|
|
- pub pod5_type: Pod5Type,
|
|
|
- pub run_name: String,
|
|
|
- pub flowcell_name: String,
|
|
|
- pub file_metadata: Metadata,
|
|
|
-}
|
|
|
-
|
|
|
/// Kind of directory a `.pod5` file was found in.
#[derive(Debug, Clone, PartialEq)]
pub enum Pod5Type {
    /// The path contains the raw marker (`Pod5Config::type_raw`).
    Raw,
    /// The path contains the demultiplexed marker (`Pod5Config::type_demuxed`).
    Demuxed,
}
|
|
|
-
|
|
|
-impl Display for Pod5Type {
|
|
|
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
|
- let s = match self {
|
|
|
- Pod5Type::Raw => "raw",
|
|
|
- Pod5Type::Demuxed => "demuxed",
|
|
|
- };
|
|
|
- f.write_str(s)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
/// Settings that drive how a `.pod5` path is interpreted.
#[derive(Debug, Clone)]
pub struct Pod5Config {
    /// Directory prefix removed from a path before it is split into components.
    pub base_dir: String,
    /// Substring that marks a path as holding raw pod5 files.
    pub type_raw: String,
    /// Substring that marks a path as holding demultiplexed pod5 files.
    pub type_demuxed: String,
    /// Zero-based index of the path component used as the run name.
    pub run_dir_n: u8,
    /// Zero-based index of the path component used as the flow cell name.
    pub flowcell_dir_n: u8,
}
|
|
|
-
|
|
|
-impl Default for Pod5Config {
|
|
|
- fn default() -> Self {
|
|
|
- Self {
|
|
|
- base_dir: "/data/run_data".to_string(),
|
|
|
- type_raw: "/pod5/".to_string(),
|
|
|
- type_demuxed: "/pod5_pass/".to_string(),
|
|
|
- run_dir_n: 0,
|
|
|
- flowcell_dir_n: 1,
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-impl Pod5 {
|
|
|
- pub fn from_path(path: &PathBuf, config: &Pod5Config) -> Result<Self> {
|
|
|
- let s = path
|
|
|
- .to_str()
|
|
|
- .context("Can't convert PathBuf to str {path:?}")?;
|
|
|
- let pod5_type = if s.contains(&config.type_raw) {
|
|
|
- Pod5Type::Raw
|
|
|
- } else if s.contains(&config.type_demuxed) {
|
|
|
- Pod5Type::Demuxed
|
|
|
- } else {
|
|
|
- return Err(anyhow!("Can't find the pod5 type {s}"));
|
|
|
- };
|
|
|
-
|
|
|
- let file_metadata = fs::metadata(path)?;
|
|
|
-
|
|
|
- let sr = s.replace(&config.base_dir, "");
|
|
|
- let components: Vec<&str> = sr.split('/').filter(|c| !c.is_empty()).collect();
|
|
|
-
|
|
|
- let run_name = components
|
|
|
- .get(config.run_dir_n as usize)
|
|
|
- .context("Can't get run_name")?
|
|
|
- .to_string();
|
|
|
- let flowcell_name = components
|
|
|
- .get(config.flowcell_dir_n as usize)
|
|
|
- .context("Can't get flowcell_name")?
|
|
|
- .to_string();
|
|
|
-
|
|
|
- Ok(Self {
|
|
|
- path: path.to_path_buf(),
|
|
|
- pod5_type,
|
|
|
- run_name,
|
|
|
- flowcell_name,
|
|
|
- file_metadata,
|
|
|
- })
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
|
|
|
- let pattern = format!("{}/**/*.pod5", dir);
|
|
|
- let mut pod_files = Vec::new();
|
|
|
-
|
|
|
- let conf = Pod5Config {
|
|
|
- base_dir: if dir.ends_with('/') {
|
|
|
- dir.to_string()
|
|
|
- } else {
|
|
|
- format!("{dir}/")
|
|
|
- },
|
|
|
- ..Pod5Config::default()
|
|
|
- };
|
|
|
-
|
|
|
- for entry in glob(&pattern).expect("Failed to read glob pattern") {
|
|
|
- match entry {
|
|
|
- Ok(path) => {
|
|
|
- let p = path.to_str().context("Can't parse path to string {path}")?;
|
|
|
- if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
|
|
|
- continue;
|
|
|
- }
|
|
|
- match Pod5::from_path(&path, &conf) {
|
|
|
- Ok(pod5) => pod_files.push(pod5),
|
|
|
- Err(e) => warn!("{e}"),
|
|
|
- }
|
|
|
- }
|
|
|
- Err(e) => warn!("Error: {:?}", e),
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(pod_files)
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug)]
|
|
|
-pub struct Run {
|
|
|
- pub run_name: String,
|
|
|
- pub flowcells: Vec<FlowCell>,
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug, Clone)]
|
|
|
-pub struct FlowCell {
|
|
|
- pub flowcell_name: String,
|
|
|
- pub corrected_name: String,
|
|
|
- pub cases: Vec<FlowCellCase>,
|
|
|
- pub run_name: String,
|
|
|
- pub pod5_type: Pod5Type,
|
|
|
- pub pod5_info: Pod5Info,
|
|
|
- pub pod5: Vec<Pod5>,
|
|
|
-}
|
|
|
-
|
|
|
-// impl FlowCell {
|
|
|
-// pub fn cases_pod5_dir(&self) -> Vec<PathBuf> {
|
|
|
-// match self.pod5_type {
|
|
|
-// Pod5Type::Raw => {
|
|
|
-// let p = self.pod5.first().unwrap();
|
|
|
-// vec![p.path.parent().unwrap().to_path_buf()]
|
|
|
-// },
|
|
|
-// Pod5Type::Demuxed => {
|
|
|
-// self.cases.iter().map(|c| {
|
|
|
-// let str_barcode = format!("barcode{}", c.barcode);
|
|
|
-// })
|
|
|
-// },
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-
|
|
|
-#[derive(Debug, Default)]
|
|
|
-pub struct Pod5Collection {
|
|
|
- pub importation_date: DateTime<Utc>,
|
|
|
- pub runs: Vec<Run>,
|
|
|
- pub bam_dir: String,
|
|
|
- pub pod5_dir: String,
|
|
|
-}
|
|
|
-
|
|
|
/// One case (sample) sequenced on a flow cell.
#[derive(Debug, Clone, Default)]
pub struct FlowCellCase {
    /// Case identifier.
    pub id: String,
    /// Time point of the sample.
    pub time_point: String,
    /// Barcode number as written in the corrected-names table.
    pub barcode: String,
    /// Directory holding the pod5 files of this case.
    pub pod_dir: PathBuf,
    // pub basecalled: Option<bool>,
}
|
|
|
-
|
|
|
-impl Pod5Collection {
|
|
|
- pub fn new(pod5_dir: &str, corrected_fc_path: &str, bam_dir: &str) -> Result<Self> {
|
|
|
- let pod5 = list_pod_files(pod5_dir)?;
|
|
|
- info!("n pod5 {}", pod5.len());
|
|
|
-
|
|
|
- let mut fc: HashMap<String, Vec<Pod5>> = HashMap::new();
|
|
|
- for pod in pod5 {
|
|
|
- let k = format!("{}-{}", pod.run_name, pod.flowcell_name);
|
|
|
- fc.entry(k).or_default().push(pod);
|
|
|
- }
|
|
|
-
|
|
|
- let corrected_fc = load_flowcells_corrected_names(corrected_fc_path)?;
|
|
|
- let flow_cells: Vec<FlowCell> = fc
|
|
|
- .par_values()
|
|
|
- .map(|v| {
|
|
|
- let first = &v[0];
|
|
|
- let pod5_info = Pod5Info::from_pod5(first.path.to_str().unwrap());
|
|
|
- let flowcell_name = first.flowcell_name.clone();
|
|
|
-
|
|
|
- let sel: Vec<FCLine> = corrected_fc
|
|
|
- .iter()
|
|
|
- .filter(|e| e.flow_cell == flowcell_name)
|
|
|
- .cloned()
|
|
|
- .collect();
|
|
|
-
|
|
|
- let mut corrected_name: Vec<String> = sel
|
|
|
- .clone()
|
|
|
- .into_iter()
|
|
|
- .map(|e| e.ref_flow_cell)
|
|
|
- .filter(|e| !e.is_empty())
|
|
|
- .collect();
|
|
|
- corrected_name.dedup();
|
|
|
-
|
|
|
- if corrected_name.len() > 1 {
|
|
|
- panic!("Multiple corrected flow_cells for {v:?}");
|
|
|
- }
|
|
|
-
|
|
|
- let corrected_name = if !corrected_name.is_empty() {
|
|
|
- corrected_name.first().unwrap().to_string()
|
|
|
- } else {
|
|
|
- "".to_string()
|
|
|
- };
|
|
|
-
|
|
|
- let cases: Vec<FlowCellCase> = sel
|
|
|
- .iter()
|
|
|
- .map(|e| {
|
|
|
- let pod_dir = match first.pod5_type {
|
|
|
- Pod5Type::Raw => first.path.parent().unwrap().to_path_buf(),
|
|
|
- Pod5Type::Demuxed => {
|
|
|
- let mut bc_dir =
|
|
|
- first.path.parent().unwrap().parent().unwrap().to_path_buf();
|
|
|
- bc_dir
|
|
|
- .push(format!("barcode{}", e.barcode_number.replace("NB", "")));
|
|
|
- bc_dir
|
|
|
- }
|
|
|
- };
|
|
|
-
|
|
|
- FlowCellCase {
|
|
|
- id: e.id.clone(),
|
|
|
- time_point: e.time_point.clone(),
|
|
|
- barcode: e.barcode_number.clone(),
|
|
|
- pod_dir,
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- FlowCell {
|
|
|
- flowcell_name,
|
|
|
- corrected_name,
|
|
|
- cases,
|
|
|
- run_name: first.run_name.clone(),
|
|
|
- pod5_type: first.pod5_type.clone(),
|
|
|
- pod5_info,
|
|
|
- pod5: v.to_vec(),
|
|
|
- }
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- let mut runs = HashMap::new();
|
|
|
- for fc in flow_cells {
|
|
|
- runs.entry(fc.run_name.clone())
|
|
|
- .or_insert_with(Vec::new)
|
|
|
- .push(fc);
|
|
|
- }
|
|
|
-
|
|
|
- let runs: Vec<Run> = runs
|
|
|
- .into_values()
|
|
|
- .map(|v| Run {
|
|
|
- run_name: v[0].run_name.clone(),
|
|
|
- flowcells: v.to_vec(),
|
|
|
- })
|
|
|
- .collect();
|
|
|
-
|
|
|
- Ok(Self {
|
|
|
- importation_date: Utc::now(),
|
|
|
- runs,
|
|
|
- bam_dir: bam_dir.to_string(),
|
|
|
- pod5_dir: pod5_dir.to_string(),
|
|
|
- })
|
|
|
- }
|
|
|
-
|
|
|
- pub fn print_info(&self) {
|
|
|
- self.runs.iter().for_each(|run| {
|
|
|
- run.flowcells.iter().for_each(|fc| {
|
|
|
- let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
|
|
|
- let n_files = fc.pod5.len();
|
|
|
- let dates: Vec<DateTime<Utc>> = fc
|
|
|
- .pod5
|
|
|
- .iter()
|
|
|
- .map(|p| p.file_metadata.modified().unwrap().into())
|
|
|
- .collect();
|
|
|
- let from = dates.iter().min().unwrap();
|
|
|
- let to = dates.iter().max().unwrap();
|
|
|
- let s = [
|
|
|
- run.run_name.clone(),
|
|
|
- from.to_string(),
|
|
|
- to.to_string(),
|
|
|
- n_files.to_string(),
|
|
|
- total_size.to_string(),
|
|
|
- fc.flowcell_name.to_string(),
|
|
|
- fc.pod5_type.to_string(),
|
|
|
- fc.pod5_info.acquisition_id.clone(),
|
|
|
- format!("{:?}", fc.cases),
|
|
|
- ]
|
|
|
- .join("\t");
|
|
|
- println!("{s}");
|
|
|
- });
|
|
|
- });
|
|
|
- }
|
|
|
-
|
|
|
- // pub fn check_local(&self) -> anyhow::Result<()> {
|
|
|
- // let mut res = Vec::new();
|
|
|
- // for run in self.runs.iter() {
|
|
|
- // for fc in run.flowcells.iter() {
|
|
|
- // for c in fc.cases.iter() {
|
|
|
- // let bases_called = if let Some(b) = c.basecalled {
|
|
|
- // if b {
|
|
|
- // "✅".to_string()
|
|
|
- // } else {
|
|
|
- // "❌".to_string()
|
|
|
- // }
|
|
|
- // } else {
|
|
|
- // "❌".to_string()
|
|
|
- // };
|
|
|
- //
|
|
|
- // let s = [
|
|
|
- // c.id.to_string(),
|
|
|
- // c.time_point.to_string(),
|
|
|
- // c.barcode.to_string(),
|
|
|
- // run.run_name.clone(),
|
|
|
- // fc.flowcell_name.to_string(),
|
|
|
- // fc.pod5_type.to_string(),
|
|
|
- // fc.pod5_info.acquisition_id.clone(),
|
|
|
- // bases_called,
|
|
|
- // ]
|
|
|
- // .join("\t");
|
|
|
- // res.push(s);
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // res.sort();
|
|
|
- // println!("{}", res.join("\n"));
|
|
|
- // Ok(())
|
|
|
- // }
|
|
|
-
|
|
|
- // pub fn fc_done(&self) {
|
|
|
- // for run in self.runs.iter() {
|
|
|
- // for fc in run.flowcells.iter() {
|
|
|
- // let n_called = fc
|
|
|
- // .cases
|
|
|
- // .iter()
|
|
|
- // .filter(|c| if let Some(b) = c.basecalled { b } else { false })
|
|
|
- // .count();
|
|
|
- // if n_called != 0 && n_called == fc.cases.len() {
|
|
|
- // let s = [
|
|
|
- // format!("{}/{}", run.run_name, fc.flowcell_name),
|
|
|
- // fc.pod5_info.acquisition_id.to_string(),
|
|
|
- // format!("{:#?}", fc.cases),
|
|
|
- // ]
|
|
|
- // .join("\t");
|
|
|
- // println!("{s}");
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
-
|
|
|
- // pub fn todo(&self) {
|
|
|
- // let run_dir = &self.pod5_dir;
|
|
|
- // for run in self.runs.iter() {
|
|
|
- // for fc in run.flowcells.iter() {
|
|
|
- // let to_call: Vec<_> = fc
|
|
|
- // .cases
|
|
|
- // .iter()
|
|
|
- // .filter(|c| if let Some(b) = c.basecalled { !b } else { true })
|
|
|
- // .collect();
|
|
|
- //
|
|
|
- // if !to_call.is_empty() {
|
|
|
- // if fc.pod5_type == Pod5Type::Raw && to_call.len() != fc.cases.len() {
|
|
|
- // println!("No solution for: {}/{}", run.run_name, fc.flowcell_name);
|
|
|
- // } else {
|
|
|
- // match fc.pod5_type {
|
|
|
- // Pod5Type::Raw => {
|
|
|
- // let cases: Vec<String> = to_call
|
|
|
- // .iter()
|
|
|
- // .map(|c| {
|
|
|
- // let bc = c.barcode.replace("NB", "");
|
|
|
- // let tp = c.time_point.to_lowercase();
|
|
|
- // [bc, c.id.to_string(), tp].join(" ")
|
|
|
- // })
|
|
|
- // .collect();
|
|
|
- // println!(
|
|
|
- // "from_mux.sh {}/{}/{} {}",
|
|
|
- // run_dir,
|
|
|
- // run.run_name,
|
|
|
- // fc.flowcell_name,
|
|
|
- // cases.join(" ")
|
|
|
- // );
|
|
|
- // }
|
|
|
- // Pod5Type::Demuxed => to_call.iter().for_each(|c| {
|
|
|
- // let bc = c.barcode.replace("NB", "");
|
|
|
- // let tp = c.time_point.to_lowercase();
|
|
|
- // let bam = format!(
|
|
|
- // "{}/{}/{}/{}_{}_hs1.bam",
|
|
|
- // self.bam_dir, c.id, c.time_point, c.id, c.time_point
|
|
|
- // );
|
|
|
- // if PathBuf::from(bam).exists() {
|
|
|
- // let pod_dir: Vec<String> = fc
|
|
|
- // .pod5
|
|
|
- // .iter()
|
|
|
- // .filter(|p| {
|
|
|
- // p.path.contains(&format!("barcode{}", bc.clone()))
|
|
|
- // })
|
|
|
- // .take(1)
|
|
|
- // .map(|p| p.path.to_string())
|
|
|
- // .collect();
|
|
|
- //
|
|
|
- // let pod_dir = pod_dir.first().unwrap();
|
|
|
- // let mut pod_dir = PathBuf::from(pod_dir);
|
|
|
- // pod_dir.pop();
|
|
|
- //
|
|
|
- // // TODO sheduler
|
|
|
- // println!(
|
|
|
- // "complete_bam.sh {} {} {}",
|
|
|
- // c.id,
|
|
|
- // tp,
|
|
|
- // pod_dir.to_string_lossy()
|
|
|
- // )
|
|
|
- // } else {
|
|
|
- // let pod_dir: Vec<String> = fc
|
|
|
- // .pod5
|
|
|
- // .iter()
|
|
|
- // .filter(|p| {
|
|
|
- // p.path.contains(&format!("barcode{}", bc.clone()))
|
|
|
- // })
|
|
|
- // .take(1)
|
|
|
- // .map(|p| p.path.to_string())
|
|
|
- // .collect();
|
|
|
- //
|
|
|
- // let pod_dir = pod_dir.first().unwrap();
|
|
|
- // let mut pod_dir = PathBuf::from(pod_dir);
|
|
|
- // pod_dir.pop();
|
|
|
- //
|
|
|
- // println!(
|
|
|
- // "dorado.sh {} {} {}",
|
|
|
- // c.id,
|
|
|
- // tp,
|
|
|
- // pod_dir.to_string_lossy()
|
|
|
- // )
|
|
|
- // }
|
|
|
- // }),
|
|
|
- // };
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
-
|
|
|
- pub fn ids(&self) -> Vec<String> {
|
|
|
- let mut ids: Vec<String> = self
|
|
|
- .runs
|
|
|
- .iter()
|
|
|
- .flat_map(|r| {
|
|
|
- r.flowcells
|
|
|
- .iter()
|
|
|
- .flat_map(|f| {
|
|
|
- f.cases
|
|
|
- .iter()
|
|
|
- .map(|c| c.id.clone())
|
|
|
- .collect::<Vec<String>>()
|
|
|
- })
|
|
|
- .collect::<Vec<String>>()
|
|
|
- })
|
|
|
- .collect();
|
|
|
- ids.sort();
|
|
|
- ids.dedup();
|
|
|
- ids
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug, Deserialize, Clone)]
|
|
|
-pub struct FCLine {
|
|
|
- pub id: String,
|
|
|
- pub time_point: String,
|
|
|
- pub barcode_number: String,
|
|
|
- pub flow_cell: String,
|
|
|
- pub run: String,
|
|
|
- pub path: String,
|
|
|
- pub ref_flow_cell: String,
|
|
|
-}
|
|
|
-
|
|
|
-pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
|
|
|
- let file = File::open(file_path)?;
|
|
|
-
|
|
|
- let mut rdr = ReaderBuilder::new()
|
|
|
- .delimiter(b'\t')
|
|
|
- .has_headers(true)
|
|
|
- .from_reader(file);
|
|
|
-
|
|
|
- let mut records = Vec::new();
|
|
|
- for result in rdr.deserialize() {
|
|
|
- let mut record: FCLine = result?;
|
|
|
-
|
|
|
- // formating
|
|
|
- record.time_point = record.time_point.to_lowercase();
|
|
|
- record.id = record.id.to_uppercase();
|
|
|
-
|
|
|
- records.push(record);
|
|
|
- }
|
|
|
-
|
|
|
- Ok(records)
|
|
|
-}
|
|
|
-
|
|
|
-#[derive(Debug, Serialize, Deserialize)]
|
|
|
-struct MinKnowSampleSheet {
|
|
|
- pub protocol_run_id: String,
|
|
|
- pub position_id: String,
|
|
|
- pub flow_cell_id: String,
|
|
|
- pub sample_id: String,
|
|
|
- pub experiment_id: String,
|
|
|
- pub flow_cell_product_code: String,
|
|
|
- pub kit: String,
|
|
|
-}
|
|
|
-
|
|
|
-impl TryFrom<&str> for MinKnowSampleSheet {
|
|
|
- type Error = anyhow::Error;
|
|
|
-
|
|
|
- fn try_from(value: &str) -> anyhow::Result<Self> {
|
|
|
- let cells: Vec<&str> = value.split(",").collect();
|
|
|
- if cells.len() != 7 {
|
|
|
- return Err(anyhow::anyhow!(
|
|
|
- "Number of cells not equal to definition. {value}"
|
|
|
- ));
|
|
|
- }
|
|
|
-
|
|
|
- Ok(Self {
|
|
|
- protocol_run_id: cells[0].to_string(),
|
|
|
- position_id: cells[1].to_string(),
|
|
|
- flow_cell_id: cells[2].to_string(),
|
|
|
- sample_id: cells[3].to_string(),
|
|
|
- experiment_id: cells[4].to_string(),
|
|
|
- flow_cell_product_code: cells[5].to_string(),
|
|
|
- kit: cells[6].to_string(),
|
|
|
- })
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-impl MinKnowSampleSheet {
|
|
|
- pub fn from_path(path: &str) -> anyhow::Result<Self> {
|
|
|
- let file = File::open(path).map_err(|e| format!("Can't open file: {path}\n{e}"))?;
|
|
|
- let reader = io::BufReader::new(file);
|
|
|
- for (i, line) in reader.lines().enumerate() {
|
|
|
- let line = line.map_err(|e| format!("Error parsing line: {line:?}\n\t{e}"))?;
|
|
|
- if i == 0 && line != "protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit" {
|
|
|
- return Err(anyhow::anyhow!("File header doesnt correspond to MinKnwo sample sheet: {line}"));
|
|
|
- } else if i == 1 {
|
|
|
- return Ok(line.as_str().try_into()?);
|
|
|
- } else {
|
|
|
- return Err(anyhow::anyhow!("Wrong MinKnow sample sheet format."));
|
|
|
- }
|
|
|
- }
|
|
|
- return Err(anyhow::anyhow!("Wrong MinKnow sample sheet format."));
|
|
|
-
|
|
|
- }
|
|
|
-}
|