pod5.rs 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681
  1. use anyhow::{anyhow, Context, Result};
  2. use chrono::{DateTime, Utc};
  3. use csv::ReaderBuilder;
  4. use glob::glob;
  5. use hashbrown::HashMap;
  6. use log::{info, warn};
  7. use rayon::prelude::*;
  8. use serde::{Deserialize, Serialize};
  9. use std::{
  10. fmt::Display,
  11. fs::{self, File, Metadata},
  12. os::unix::fs::MetadataExt,
  13. path::{Path, PathBuf},
  14. };
  15. use crate::io::pod5_infos::Pod5Info;
  16. /// Represents a collection of Pod5 sequencing runs and associated metadata.
  17. ///
  18. /// A `Pod5Collection` groups multiple sequencing runs (`Run`), each consisting of
  19. /// one or more flow cells. It is initialized by scanning a directory of `.pod5` files,
  20. /// optionally mapping flow cell names to corrected identifiers, and assigning BAM and
  21. /// `.pod5` directories.
  22. ///
  23. /// # Fields
  24. /// - `importation_date`: Timestamp of when this collection was created.
  25. /// - `runs`: List of runs with associated flow cells and metadata.
  26. /// - `bam_dir`: Directory containing BAM files.
  27. /// - `pod5_dir`: Directory containing `.pod5` files.
  28. #[derive(Debug, Default)]
  29. pub struct Pod5Collection {
  30. pub importation_date: DateTime<Utc>,
  31. pub runs: Vec<Run>,
  32. pub bam_dir: String,
  33. pub pod5_dir: String,
  34. }
  35. impl Pod5Collection {
  36. /// Constructs a new `Pod5Collection` by scanning the given `.pod5` directory,
  37. /// applying corrected flowcell naming, and grouping data by run.
  38. ///
  39. /// # Arguments
  40. /// - `pod5_dir`: Path to directory containing `.pod5` files.
  41. /// - `corrected_fc_path`: Path to file with corrected flowcell mappings.
  42. /// - `bam_dir`: Path to directory containing BAM files.
  43. ///
  44. /// # Returns
  45. /// - `Ok(Pod5Collection)` if the data is consistent and valid.
  46. /// - `Err(anyhow::Error)` if listing, parsing, or validation fails.
  47. pub fn new(pod5_dir: &str, corrected_fc_path: &str, bam_dir: &str) -> Result<Self> {
  48. // Load pod5 files
  49. let pod5_files = list_pod_files(pod5_dir)?;
  50. info!("n pod5 {}", pod5_files.len());
  51. // Group pod5 files by run-flowcell key
  52. let mut grouped: HashMap<String, Vec<Pod5>> = HashMap::new();
  53. for pod in pod5_files {
  54. let key = format!("{}••{}", pod.run_name, pod.flowcell_name);
  55. grouped.entry(key).or_default().push(pod);
  56. }
  57. // Load corrected flowcell mapping
  58. let corrected_fc = load_flowcells_corrected_names(corrected_fc_path)?;
  59. // Construct FlowCells in parallel from Pod5 groups
  60. let flowcells: Vec<FlowCell> = grouped
  61. .into_values()
  62. .par_bridge()
  63. .map(|group| FlowCell::new(group, &corrected_fc))
  64. .collect::<Result<Vec<_>>>()?;
  65. // Group FlowCells by run_name (sequential step)
  66. let mut runs_map: HashMap<String, Vec<FlowCell>> = HashMap::new();
  67. for fc in flowcells {
  68. runs_map.entry(fc.run_name.clone()).or_default().push(fc);
  69. }
  70. // Convert each run group into a Run
  71. let runs: Vec<Run> = runs_map
  72. .into_values()
  73. .map(|fcs| Run {
  74. run_name: fcs[0].run_name.clone(),
  75. flowcells: fcs,
  76. })
  77. .collect();
  78. Ok(Self {
  79. importation_date: Utc::now(),
  80. runs,
  81. bam_dir: bam_dir.to_string(),
  82. pod5_dir: pod5_dir.to_string(),
  83. })
  84. }
  85. pub fn print_info(&self) {
  86. self.runs.iter().for_each(|run| {
  87. run.flowcells.iter().for_each(|fc| {
  88. let total_size: u64 = fc.pod5.iter().map(|p| p.file_metadata.size()).sum();
  89. let n_files = fc.pod5.len();
  90. let dates: Vec<DateTime<Utc>> = fc
  91. .pod5
  92. .iter()
  93. .map(|p| p.file_metadata.modified().unwrap().into())
  94. .collect();
  95. let from = dates.iter().min().unwrap();
  96. let to = dates.iter().max().unwrap();
  97. let s = [
  98. run.run_name.clone(),
  99. from.to_string(),
  100. to.to_string(),
  101. n_files.to_string(),
  102. total_size.to_string(),
  103. fc.flowcell_name.to_string(),
  104. fc.pod5_type.to_string(),
  105. fc.pod5_info.acquisition_id.clone(),
  106. format!("{:?}", fc.cases),
  107. ]
  108. .join("\t");
  109. println!("{s}");
  110. });
  111. });
  112. }
  113. /// Returns a sorted and deduplicated list of all unique `FlowCellCase` IDs in the collection.
  114. pub fn ids(&self) -> Vec<String> {
  115. let mut ids: Vec<String> = self
  116. .runs
  117. .iter()
  118. .flat_map(|r| r.flowcells.iter())
  119. .flat_map(|f| f.cases.iter().map(|c| c.id.clone()))
  120. .collect();
  121. ids.sort_unstable(); // faster than sort()
  122. ids.dedup();
  123. ids
  124. }
  125. }
  126. /// Represents a sequencing run, which may contain multiple flowcells.
  127. ///
  128. /// A `Run` groups flowcells that were processed together during a sequencing event
  129. /// (e.g., a MinION or PromethION run). It serves as a logical grouping for downstream analysis.
  130. ///
  131. /// # Fields
  132. /// - `run_name`: Unique identifier for the sequencing run (e.g., "20240301_RUN42").
  133. /// - `flowcells`: List of `FlowCell` objects associated with this run.
  134. #[derive(Debug)]
  135. pub struct Run {
  136. /// Name of the sequencing run.
  137. pub run_name: String,
  138. /// Flowcells that belong to this run.
  139. pub flowcells: Vec<FlowCell>,
  140. }
  141. /// Represents a flowcell and its associated metadata, cases, and `.pod5` files.
  142. ///
  143. /// A `FlowCell` encapsulates all relevant information needed to track,
  144. /// identify, and process a physical flowcell, including its corrected name,
  145. /// acquisition metadata, and associated `.pod5` files.
  146. ///
  147. /// # Fields
  148. /// - `flowcell_name`: Original name of the flowcell as found in `.pod5` files.
  149. /// - `corrected_name`: Normalized or corrected version of the flowcell name (if available).
  150. /// - `cases`: Associated cases (`FlowCellCase`) for this flowcell, usually representing samples or barcodes.
  151. /// - `run_name`: Name of the sequencing run this flowcell belongs to.
  152. /// - `pod5_type`: Whether the `.pod5` files are raw or demultiplexed (`Pod5Type`).
  153. /// - `pod5_info`: Metadata extracted from one representative `.pod5` file (`Pod5Info`).
  154. /// - `pod5`: All `.pod5` file entries associated with this flowcell.
  155. #[derive(Debug, Clone)]
  156. pub struct FlowCell {
  157. /// Original flowcell name (e.g., "FCX123").
  158. pub flowcell_name: String,
  159. /// Corrected flowcell name, if normalization was applied.
  160. pub corrected_name: String,
  161. /// Sample/barcode-level associations for this flowcell.
  162. pub cases: Vec<FlowCellCase>,
  163. /// The sequencing run this flowcell belongs to.
  164. pub run_name: String,
  165. /// Type of pod5 data: raw or demuxed.
  166. pub pod5_type: Pod5Type,
  167. /// Metadata extracted from a `.pod5` file, including acquisition ID.
  168. pub pod5_info: Pod5Info,
  169. /// The list of `.pod5` files linked to this flowcell.
  170. pub pod5: Vec<Pod5>,
  171. }
  172. impl FlowCell {
  173. /// Constructs a new `FlowCell` from a non-empty vector of `Pod5` entries
  174. /// and a list of corrected flowcell mappings.
  175. ///
  176. /// Ensures that all entries in the vector share the same `run_name`, `flowcell_name`, and `pod5_type`.
  177. ///
  178. /// # Arguments
  179. /// - `pods`: A non-empty vector of `Pod5` entries (moved, not cloned).
  180. /// - `corrected_fc`: Reference to a list of `FCLine` entries for resolving corrected names.
  181. ///
  182. /// # Errors
  183. /// Returns an error if:
  184. /// - `pods` is empty
  185. /// - `.pod5` path is invalid UTF-8
  186. /// - inconsistent metadata across pod5 entries
  187. /// - multiple corrected names are found
  188. /// - parent directory resolution fails
  189. pub fn new(pods: Vec<Pod5>, corrected_fc: &[FCLine]) -> anyhow::Result<Self> {
  190. let first = pods.first().context("Empty pod5 list for FlowCell")?;
  191. let flowcell_name = &first.flowcell_name;
  192. let run_name = &first.run_name;
  193. let pod5_type = &first.pod5_type;
  194. // Consistency check
  195. let inconsistent = pods.iter().any(|p| {
  196. p.flowcell_name != *flowcell_name
  197. || p.run_name != *run_name
  198. || p.pod5_type != *pod5_type
  199. });
  200. if inconsistent {
  201. return Err(anyhow!(
  202. "Inconsistent pod5 metadata: all entries must share the same run_name, flowcell_name, and pod5_type"
  203. ));
  204. }
  205. // Extract and validate .pod5 path
  206. let path_str = first.path.to_str().context("Invalid UTF-8 in pod5 path")?;
  207. let pod5_info = Pod5Info::from_pod5(path_str);
  208. // Select corrected entries for this flowcell
  209. let matched_fc_lines: Vec<_> = corrected_fc
  210. .iter()
  211. .filter(|e| e.flow_cell == *flowcell_name)
  212. .cloned()
  213. .collect();
  214. // Resolve unique corrected name
  215. let corrected_name = {
  216. let mut names: Vec<_> = matched_fc_lines
  217. .iter()
  218. .map(|e| e.ref_flow_cell.clone())
  219. .filter(|s| !s.is_empty())
  220. .collect();
  221. names.dedup();
  222. match names.len() {
  223. 0 => String::new(),
  224. 1 => names[0].clone(),
  225. _ => {
  226. return Err(anyhow!(
  227. "Multiple corrected names for flow cell '{}': {:?}",
  228. flowcell_name,
  229. names
  230. ));
  231. }
  232. }
  233. };
  234. // Cache parent directories
  235. let raw_parent = first.path.parent().context("Missing parent for RAW pod5")?;
  236. let demuxed_grandparent = raw_parent
  237. .parent()
  238. .context("Invalid directory structure for DEMUXED pod5")?;
  239. // Build case list
  240. let cases = matched_fc_lines
  241. .iter()
  242. .map(|e| {
  243. let pod_dir = match pod5_type {
  244. Pod5Type::Raw => raw_parent.to_path_buf(),
  245. Pod5Type::Demuxed => {
  246. let mut bc_dir = demuxed_grandparent.to_path_buf();
  247. bc_dir.push(format!("barcode{}", e.barcode_number.replace("NB", "")));
  248. bc_dir
  249. }
  250. };
  251. Ok(FlowCellCase {
  252. id: e.id.clone(),
  253. time_point: e.sample_type.clone(),
  254. barcode: e.barcode_number.clone(),
  255. pod_dir,
  256. })
  257. })
  258. .collect::<Result<_>>()?;
  259. Ok(Self {
  260. flowcell_name: flowcell_name.clone(),
  261. corrected_name,
  262. cases,
  263. run_name: run_name.clone(),
  264. pod5_type: pod5_type.clone(),
  265. pod5_info,
  266. pod5: pods, // Already moved
  267. })
  268. }
  269. }
  270. /// Represents the type of `.pod5` file: either raw or demultiplexed.
  271. #[derive(Debug, Clone, PartialEq)]
  272. pub enum Pod5Type {
  273. /// Raw `.pod5` files directly from acquisition.
  274. Raw,
  275. /// Demultiplexed `.pod5` files, post-processed by barcoding.
  276. Demuxed,
  277. }
  278. impl Display for Pod5Type {
  279. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  280. let s = match self {
  281. Pod5Type::Raw => "raw",
  282. Pod5Type::Demuxed => "demuxed",
  283. };
  284. f.write_str(s)
  285. }
  286. }
  287. /// Configuration for interpreting file paths when parsing `.pod5` files.
  288. #[derive(Debug, Clone)]
  289. pub struct Pod5Config {
  290. /// Base directory (prefix to strip from full paths).
  291. pub base_dir: String,
  292. /// Substring used to detect "raw" pod5 files.
  293. pub type_raw: String,
  294. /// Substring used to detect "demuxed" pod5 files.
  295. pub type_demuxed: String,
  296. /// Index (in path components) where `run_name` is expected.
  297. pub run_dir_n: u8,
  298. /// Index (in path components) where `flowcell_name` is expected.
  299. pub flowcell_dir_n: u8,
  300. }
  301. impl Default for Pod5Config {
  302. fn default() -> Self {
  303. Self {
  304. base_dir: "/data/run_data".to_string(),
  305. type_raw: "/pod5/".to_string(),
  306. type_demuxed: "/pod5_pass/".to_string(),
  307. run_dir_n: 0,
  308. flowcell_dir_n: 1,
  309. }
  310. }
  311. }
  312. /// Represents a `.pod5` file and its associated metadata and location info.
  313. ///
  314. /// Used as the base object for flowcell and run aggregation.
  315. #[derive(Debug, Clone)]
  316. pub struct Pod5 {
  317. /// Full path to the `.pod5` file.
  318. pub path: PathBuf,
  319. /// Whether the file is raw or demultiplexed.
  320. pub pod5_type: Pod5Type,
  321. /// Name of the sequencing run this file belongs to.
  322. pub run_name: String,
  323. /// Name of the flowcell associated with this file.
  324. pub flowcell_name: String,
  325. /// Filesystem metadata (e.g., size, modified time).
  326. pub file_metadata: Metadata,
  327. }
  328. impl Pod5 {
  329. /// Constructs a `Pod5` instance from a file path, using a `Pod5Config` to infer type and extract metadata.
  330. ///
  331. /// # Arguments
  332. /// - `path`: Path to the `.pod5` file.
  333. /// - `config`: Configuration used to interpret the path structure.
  334. ///
  335. /// # Returns
  336. /// - `Ok(Pod5)` if type and components can be extracted.
  337. /// - `Err` if path is malformed, missing components, or type is unrecognized.
  338. pub fn from_path(path: impl AsRef<Path>, config: &Pod5Config) -> Result<Self> {
  339. let path = path.as_ref();
  340. let path_str = path
  341. .to_str()
  342. .context(format!("Can't convert path to UTF-8 string: {:?}", path))?;
  343. // Determine Pod5 type by pattern matching
  344. let pod5_type = if path_str.contains(&config.type_raw) {
  345. Pod5Type::Raw
  346. } else if path_str.contains(&config.type_demuxed) {
  347. Pod5Type::Demuxed
  348. } else {
  349. return Err(anyhow!(
  350. "Unable to determine pod5 type from path: {}",
  351. path_str
  352. ));
  353. };
  354. // Extract metadata from filesystem
  355. let file_metadata =
  356. fs::metadata(path).with_context(|| format!("Failed to get metadata for {:?}", path))?;
  357. // Strip base_dir and split into components
  358. let relative_path = path_str.strip_prefix(&config.base_dir).unwrap_or(path_str); // fallback to full path if base_dir is not a prefix
  359. let components: Vec<&str> = relative_path.split('/').filter(|c| !c.is_empty()).collect();
  360. // Extract run_name and flowcell_name from path components
  361. let run_name = components
  362. .get(config.run_dir_n as usize)
  363. .context("Missing run_name in path")?
  364. .to_string();
  365. let flowcell_name = components
  366. .get(config.flowcell_dir_n as usize)
  367. .context("Missing flowcell_name in path")?
  368. .to_string();
  369. Ok(Self {
  370. path: path.to_path_buf(),
  371. pod5_type,
  372. run_name,
  373. flowcell_name,
  374. file_metadata,
  375. })
  376. }
  377. }
  378. /// Recursively scans a directory for `.pod5` files and parses them into `Pod5` objects.
  379. ///
  380. /// This function uses glob-based search to find all `.pod5` files under the given directory
  381. /// (including subdirectories), then filters out unwanted paths (e.g., `pod5_fail/`, `pod5_skip/`)
  382. /// and attempts to parse each remaining file using `Pod5::from_path`.
  383. ///
  384. /// Any file that fails to parse is skipped with a warning.
  385. ///
  386. /// # Arguments
  387. /// - `dir`: Path to the root directory to search (absolute or relative).
  388. ///
  389. /// # Returns
  390. /// - `Ok(Vec<Pod5>)` on success, with all successfully parsed `.pod5` files.
  391. /// - `Err(anyhow::Error)` if path parsing fails (e.g., invalid UTF-8).
  392. ///
  393. /// # Errors
  394. /// - Fails early if the glob pattern itself is invalid.
  395. /// - Skips over files that fail to parse, but logs warnings.
  396. ///
  397. /// # Notes
  398. /// - Directories containing `/pod5_fail/` or `/pod5_skip/` are excluded.
  399. /// - The glob pattern used is `{dir}/**/*.pod5`.
  400. ///
  401. /// # Example
  402. /// ```
  403. /// let pod_files = list_pod_files("/data/pods")?;
  404. /// for pod in pod_files {
  405. /// println!("{}", pod.path.display());
  406. /// }
  407. /// ```
  408. pub fn list_pod_files(dir: &str) -> Result<Vec<Pod5>> {
  409. let pattern = format!("{}/**/*.pod5", dir);
  410. let mut pod_files = Vec::new();
  411. let conf = Pod5Config {
  412. base_dir: if dir.ends_with('/') {
  413. dir.to_string()
  414. } else {
  415. format!("{dir}/")
  416. },
  417. ..Pod5Config::default()
  418. };
  419. for entry in glob(&pattern).expect("Failed to read glob pattern") {
  420. match entry {
  421. Ok(path) => {
  422. let p = path.to_str().context("Can't parse path to string {path}")?;
  423. if p.contains("/pod5_fail/") || p.contains("/pod5_skip/") {
  424. continue;
  425. }
  426. match Pod5::from_path(&path, &conf) {
  427. Ok(pod5) => pod_files.push(pod5),
  428. Err(e) => warn!("{e}"),
  429. }
  430. }
  431. Err(e) => warn!("Error: {:?}", e),
  432. }
  433. }
  434. Ok(pod_files)
  435. }
  436. // impl FlowCell {
  437. // pub fn cases_pod5_dir(&self) -> Vec<PathBuf> {
  438. // match self.pod5_type {
  439. // Pod5Type::Raw => {
  440. // let p = self.pod5.first().unwrap();
  441. // vec![p.path.parent().unwrap().to_path_buf()]
  442. // },
  443. // Pod5Type::Demuxed => {
  444. // self.cases.iter().map(|c| {
  445. // let str_barcode = format!("barcode{}", c.barcode);
  446. // })
  447. // },
  448. // }
  449. // }
  450. // }
  451. #[derive(Debug, Clone, Default)]
  452. pub struct FlowCellCase {
  453. pub id: String,
  454. pub time_point: String,
  455. pub barcode: String,
  456. pub pod_dir: PathBuf,
  457. // pub basecalled: Option<bool>,
  458. }
  459. // #[derive(Debug, Serialize, Deserialize, Clone)]
  460. // pub struct IdsInput {
  461. // pub data: Vec<IdInput>,
  462. // }
  463. //
  464. // #[derive(Debug, Serialize, Deserialize, Clone)]
  465. // pub struct IdInput {
  466. // pub id: String,
  467. // pub time_point: String,
  468. // pub barcode: String,
  469. // pub flow_cell: String,
  470. // pub run: String,
  471. // }
  472. //
  473. // // Implement PartialEq and Eq for IdInput
  474. // impl PartialEq for IdInput {
  475. // fn eq(&self, other: &Self) -> bool {
  476. // self.id == other.id
  477. // && self.time_point == other.time_point
  478. // && self.barcode == other.barcode
  479. // && self.flow_cell == other.flow_cell
  480. // && self.run == other.run
  481. // }
  482. // }
  483. //
  484. // impl Eq for IdInput {}
  485. //
  486. // // Implement Hash for IdInput
  487. // impl Hash for IdInput {
  488. // fn hash<H: Hasher>(&self, state: &mut H) {
  489. // self.id.hash(state);
  490. // self.time_point.hash(state);
  491. // self.barcode.hash(state);
  492. // self.flow_cell.hash(state);
  493. // self.run.hash(state);
  494. // }
  495. // }
  496. //
  497. // impl IdsInput {
  498. // pub fn load_json(path: &str) -> anyhow::Result<Self> {
  499. // let f = File::open(path)?;
  500. // let s: Self = serde_json::from_reader(f)?;
  501. // Ok(s)
  502. // }
  503. //
  504. // pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
  505. // let f = File::create(path)?;
  506. // serde_json::to_writer(f, self)?;
  507. // Ok(())
  508. // }
  509. //
  510. // pub fn dedup(&mut self) {
  511. // let mut unique = HashSet::new();
  512. // self.data.retain(|item| unique.insert(item.clone()));
  513. // }
  514. //
  515. // pub fn load_from_tsv(path: &str) -> anyhow::Result<Self> {
  516. // let inputs = load_flowcells_corrected_names(path)?;
  517. // let data = inputs
  518. // .iter()
  519. // .map(|line| IdInput {
  520. // id: line.id.to_string(),
  521. // time_point: line.sample_type.to_string(),
  522. // barcode: line.barcode_number.to_string(),
  523. // flow_cell: line.flow_cell.to_string(),
  524. // run: line.run.to_string(),
  525. // })
  526. // .collect();
  527. //
  528. // let mut res = Self { data };
  529. // res.dedup();
  530. // Ok(res)
  531. // }
  532. //
  533. // pub fn add_input(&mut self, values: IdInput) {
  534. // self.data.push(values);
  535. // self.dedup();
  536. // }
  537. // }
  538. #[derive(Debug, Serialize, Deserialize, Clone)]
  539. pub struct Pod5Run {
  540. pub protocol_run_id: String,
  541. pub position_id: String,
  542. pub flow_cell_id: String,
  543. pub id: String,
  544. pub time_point: String,
  545. pub barcode_number: String,
  546. pub flow_cell: String,
  547. pub run: String,
  548. pub last_pod_dir: (DateTime<Utc>, String),
  549. pub archives: Vec<(String, DateTime<Utc>, String)>,
  550. }
  551. /// Loads corrected flowcell metadata from a tab-delimited file.
  552. ///
  553. /// This function parses a TSV file where each row is deserialized into an `FCLine`.
  554. /// It also normalizes some fields (e.g., lowercases `sample_type`, uppercases `id`)
  555. /// for consistency in downstream processing.
  556. ///
  557. /// # Arguments
  558. /// - `file_path`: Path to the TSV file containing flowcell correction data.
  559. ///
  560. /// # Returns
  561. /// A vector of `FCLine` records, one per line in the file.
  562. ///
  563. /// # Errors
  564. /// Returns an error if the file cannot be opened or if any line fails to deserialize.
  565. ///
  566. /// # Expected Format (TSV with header)
  567. /// ```text
  568. /// id sample_type barcode_number flow_cell run_path ref_flow_cell
  569. /// P001X03 tumoral NB01 FC123 RUN123 /path/to/data FC123_CORR
  570. /// ```
  571. ///
  572. /// # Example
  573. /// ```
  574. /// let fc_lines = load_flowcells_corrected_names("flowcells.tsv")?;
  575. /// assert!(!fc_lines.is_empty());
  576. /// ```
  577. pub fn load_flowcells_corrected_names(file_path: &str) -> anyhow::Result<Vec<FCLine>> {
  578. let file = File::open(file_path)?;
  579. let mut rdr = ReaderBuilder::new()
  580. .delimiter(b'\t')
  581. .has_headers(true)
  582. .from_reader(file);
  583. let mut records = Vec::new();
  584. for result in rdr.deserialize() {
  585. let mut record: FCLine = result?;
  586. // formating
  587. record.sample_type = record.sample_type.to_lowercase();
  588. record.id = record.id.to_uppercase();
  589. records.push(record);
  590. }
  591. Ok(records)
  592. }
  593. /// Represents a single record describing a barcode-flowcell pairing,
  594. /// including original and corrected metadata.
  595. ///
  596. /// This struct is typically deserialized from a TSV file and used to map
  597. /// `.pod5` files to metadata like corrected flowcell names and experimental time points.
  598. #[derive(Debug, Serialize, Deserialize, Clone)]
  599. pub struct FCLine {
  600. /// Unique identifier for the sample or barcode group (e.g., "P001X03").
  601. pub id: String,
  602. /// Sample type associated with this record (e.g., "normal", "tumoral").
  603. pub sample_type: String,
  604. /// The barcode number (e.g., "NB01", "NB02").
  605. pub barcode_number: String,
  606. /// Original flowcell name as found in the raw `.pod5` metadata.
  607. pub flow_cell: String,
  608. /// Sequencing run name this flowcell belongs to (e.g., "20240101_FAB123").
  609. pub run: String,
  610. /// Original path to data (can be absolute or relative).
  611. pub path: String,
  612. /// Corrected flowcell name used to resolve naming inconsistencies.
  613. pub ref_flow_cell: String,
  614. }