// minknow.rs
  1. use std::str::FromStr;
  2. use anyhow::Context;
  3. use serde::{Deserialize, Serialize};
  4. /// Represents a single entry from a MinKNOW sample sheet CSV file.
  5. ///
  6. /// This structure captures the metadata associated with a sample flowcell
  7. /// as defined by the Oxford Nanopore MinKNOW software.
  8. ///
  9. /// Expected format (CSV):
  10. /// ```text
  11. /// protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit
  12. /// 0ef3f65c-aa2b-4936-b49b-e55d361e9d85,1F,PBC97196,Projet_143,Projet_143,FLO-PRO114M,
  13. /// ```
  14. #[derive(Debug, Serialize, Deserialize, Clone)]
  15. pub struct MinKnowSampleSheet {
  16. /// Unique identifier for the protocol run.
  17. pub protocol_run_id: String,
  18. /// Identifier for the flowcell position (e.g., port or device slot).
  19. pub position_id: String,
  20. /// Flowcell barcode or identifier (e.g., FAB123).
  21. pub flow_cell_id: String,
  22. /// Sample ID associated with this run.
  23. pub sample_id: String,
  24. /// Experiment ID assigned in the sample sheet.
  25. pub experiment_id: String,
  26. /// Product code for the flowcell (e.g., FLO-MIN106).
  27. pub flow_cell_product_code: String,
  28. /// Kit identifier used for sample preparation (e.g., SQK-LSK109).
  29. pub kit: String,
  30. }
  31. impl TryFrom<&str> for MinKnowSampleSheet {
  32. type Error = anyhow::Error;
  33. /// Attempts to parse a single comma-separated line (CSV row) into a `MinKnowSampleSheet`.
  34. ///
  35. /// # Arguments
  36. /// - `value`: A CSV-formatted string representing a single data row (excluding the header).
  37. ///
  38. /// # Returns
  39. /// - `Ok(MinKnowSampleSheet)` if parsing succeeds.
  40. /// - `Err` if the row does not contain exactly 7 fields.
  41. ///
  42. /// # Example
  43. /// ```
  44. /// let row = "1234-ABCD,ABC123,FAB001,SAMPLE001,EXP001,FCP001,KIT001";
  45. /// let sheet = MinKnowSampleSheet::try_from(row)?;
  46. /// assert_eq!(sheet.sample_id, "SAMPLE001");
  47. /// ```
  48. fn try_from(value: &str) -> anyhow::Result<Self> {
  49. let cells: Vec<&str> = value.split(',').collect();
  50. if cells.len() != 7 {
  51. return Err(anyhow::anyhow!(
  52. "Number of fields is not equal to 7: {value}"
  53. ));
  54. }
  55. Ok(Self {
  56. protocol_run_id: cells[0].to_string(),
  57. position_id: cells[1].to_string(),
  58. flow_cell_id: cells[2].to_string(),
  59. sample_id: cells[3].to_string(),
  60. experiment_id: cells[4].to_string(),
  61. flow_cell_product_code: cells[5].to_string(),
  62. kit: cells[6].to_string(),
  63. })
  64. }
  65. }
  66. impl MinKnowSampleSheet {
  67. /// Loads a `MinKnowSampleSheet` from a file path containing a 2-line CSV:
  68. /// a header and a single data row.
  69. ///
  70. /// # Arguments
  71. /// - `path`: Path to the MinKNOW sample sheet file.
  72. ///
  73. /// # Returns
  74. /// - `Ok(MinKnowSampleSheet)` if the file is well-formed.
  75. /// - `Err` if the file is missing, malformed, or has an invalid header.
  76. ///
  77. /// # Expected Format
  78. /// The file must contain:
  79. /// - A single header line:
  80. /// `"protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit"`
  81. /// - One data line corresponding to a sample.
  82. ///
  83. /// # Errors
  84. /// - If the file is missing, empty, or contains malformed data.
  85. ///
  86. /// # Example
  87. /// ```
  88. /// let sheet = MinKnowSampleSheet::from_path("samplesheet.csv")?;
  89. /// println!("Sample ID: {}", sheet.sample_id);
  90. /// ```
  91. pub fn from_path(path: &str) -> anyhow::Result<Self> {
  92. use std::fs::File;
  93. use std::io::{self, BufRead};
  94. let file =
  95. File::open(path).map_err(|e| anyhow::anyhow!("Can't open file: {path}\n\t{e}"))?;
  96. let reader = io::BufReader::new(file);
  97. let mut lines = reader.lines();
  98. // Validate header line
  99. if let Some(header_line) = lines.next() {
  100. let header_line =
  101. header_line.map_err(|e| anyhow::anyhow!("Error reading header line: {e}"))?;
  102. if header_line
  103. != "protocol_run_id,position_id,flow_cell_id,sample_id,experiment_id,flow_cell_product_code,kit"
  104. {
  105. return Err(anyhow::anyhow!(
  106. "File header doesn't match MinKnow sample sheet format: {header_line}"
  107. ));
  108. }
  109. } else {
  110. return Err(anyhow::anyhow!("File is empty or missing a header."));
  111. }
  112. // Parse the data row
  113. if let Some(data_line) = lines.next() {
  114. let data_line =
  115. data_line.map_err(|e| anyhow::anyhow!("Error reading data line: {e}"))?;
  116. return data_line.as_str().try_into();
  117. }
  118. Err(anyhow::anyhow!(
  119. "File doesn't contain the expected second line (data row)."
  120. ))
  121. }
  122. }
  123. /// Loads Nanopore channel state entries from a CSV file.
  124. ///
  125. /// This function parses a CSV file that contains time-series data for
  126. /// individual sequencing channels, including their status (`adapter`, `strand`, etc.),
  127. /// time since experiment start, and duration in samples.
  128. ///
  129. /// The CSV file is expected to have the following headers:
  130. /// - `Channel`
  131. /// - `State`
  132. /// - `Experiment Time (minutes)`
  133. /// - `State Time (samples)`
  134. ///
  135. /// The `State` column is deserialized into a `NanoporeChannelStatus` enum.
  136. ///
  137. /// # Arguments
  138. ///
  139. /// * `path` - Path to the CSV file containing the channel state data.
  140. ///
  141. /// # Returns
  142. ///
  143. /// A `Result` containing:
  144. /// - `Ok(Vec<ChannelStateEntry>)` on success
  145. /// - `Err(Box<dyn Error>)` if the file can't be opened or parsed
  146. ///
  147. /// # Errors
  148. ///
  149. /// This function will return an error if:
  150. /// - The file cannot be opened
  151. /// - The CSV is malformed or missing expected headers
  152. /// - A status value cannot be parsed into `NanoporeChannelStatus`
  153. ///
  154. /// # Example
  155. ///
  156. /// ```rust
  157. /// let entries = load_channel_states("nanopore_data.csv")?;
  158. /// for entry in entries {
  159. /// println!("{:?}", entry);
  160. /// }
  161. /// ```
  162. ///
  163. /// # Dependencies
  164. ///
  165. /// Requires the [`csv`](https://docs.rs/csv), [`serde`](https://docs.rs/serde), and [`serde_derive`](https://docs.rs/serde_derive) crates.
  166. ///
  167. /// # See Also
  168. ///
  169. /// - [`ChannelStateEntry`]
  170. /// - [`NanoporeChannelStatus`]
  171. pub fn parse_pore_activity_from_reader<R: std::io::Read>(
  172. r: &mut R,
  173. ) -> anyhow::Result<Vec<PoreStateEntry>> {
  174. let reader = std::io::BufReader::new(r);
  175. let mut rdr = csv::ReaderBuilder::new()
  176. .delimiter(b',')
  177. .has_headers(true)
  178. .from_reader(reader);
  179. let mut records = Vec::new();
  180. for result in rdr.deserialize() {
  181. let record: PoreStateEntry = result?;
  182. records.push(record);
  183. }
  184. Ok(records)
  185. }
  186. /// One record from a Nanopore pore activity CSV file.
  187. ///
  188. /// This structure represents the state of a single sequencing channel at a given
  189. /// timepoint in the experiment. Each entry includes the channel number, the current
  190. /// pore state (e.g., `adapter`, `strand`, etc.), the experiment time in minutes, and
  191. /// the duration the channel has been in that state (in raw sample counts).
  192. #[derive(Debug, Serialize, Deserialize, Clone)]
  193. pub struct PoreStateEntry {
  194. /// Current status of the pore in this channel.
  195. ///
  196. /// Values include: `adapter`, `strand`, `unavailable`, etc.
  197. #[serde(rename = "Channel State", deserialize_with = "deserialize_status")]
  198. pub state: NanoporeChannelStatus,
  199. /// Time since the start of the experiment, in minutes.
  200. #[serde(rename = "Experiment Time (minutes)")]
  201. pub experiment_time_minutes: f32,
  202. /// Duration the channel has been in this state, in raw sample units.
  203. #[serde(rename = "State Time (samples)")]
  204. pub state_time_samples: u64,
  205. }
  206. /// Represents the current status of a Nanopore sequencing channel.
  207. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)]
  208. pub enum NanoporeChannelStatus {
  209. /// An adapter is detected in the pore, but sequencing hasn't started yet.
  210. Adapter,
  211. /// Channel has been disabled by the system or user.
  212. Disabled,
  213. /// Channel is reserved or blocked, typically during transitions like muxing.
  214. Locked,
  215. /// More than one nanopore detected in the channel, which is undesirable.
  216. Multiple,
  217. /// No nanopore detected in the channel.
  218. NoPore,
  219. /// Channel is flagged for manual reset due to an error or inactivity.
  220. PendingManualReset,
  221. /// Channel is about to undergo a multiplexer change.
  222. PendingMuxChange,
  223. /// A functional nanopore is detected and ready to start sequencing.
  224. Pore,
  225. /// Signal is saturated, often due to blockage or abnormal activity.
  226. Saturated,
  227. /// Channel is actively sequencing a strand of DNA or RNA.
  228. Strand,
  229. /// Channel is not accessible or not reporting status.
  230. Unavailable,
  231. /// Channel is currently undergoing unblocking to restore functionality.
  232. Unblocking,
  233. /// Channel is in an undefined or unclassified state.
  234. Unclassified,
  235. /// Channel is unclassified following a reset attempt.
  236. UnclassifiedFollowingReset,
  237. /// Unknown status with a negative signal or pattern.
  238. UnknownNegative,
  239. /// Unknown status with a positive signal or pattern.
  240. UnknownPositive,
  241. /// No signal detected; the channel appears inactive or disconnected.
  242. Zero,
  243. }
  244. impl FromStr for NanoporeChannelStatus {
  245. type Err = String;
  246. fn from_str(s: &str) -> Result<Self, Self::Err> {
  247. use NanoporeChannelStatus::*;
  248. match s.trim().to_lowercase().as_str() {
  249. "adapter" => Ok(Adapter),
  250. "disabled" => Ok(Disabled),
  251. "locked" => Ok(Locked),
  252. "multiple" => Ok(Multiple),
  253. "no_pore" => Ok(NoPore),
  254. "pending_manual_reset" => Ok(PendingManualReset),
  255. "pending_mux_change" => Ok(PendingMuxChange),
  256. "pore" => Ok(Pore),
  257. "saturated" => Ok(Saturated),
  258. "strand" => Ok(Strand),
  259. "unavailable" => Ok(Unavailable),
  260. "unblocking" => Ok(Unblocking),
  261. "unclassified" => Ok(Unclassified),
  262. "unclassified_following_reset" => Ok(UnclassifiedFollowingReset),
  263. "unknown_negative" => Ok(UnknownNegative),
  264. "unknown_positive" => Ok(UnknownPositive),
  265. "zero" => Ok(Zero),
  266. _ => Err(format!("Unknown channel status: {}", s)),
  267. }
  268. }
  269. }
  270. // Used by serde to parse the status field
  271. fn deserialize_status<'de, D>(deserializer: D) -> Result<NanoporeChannelStatus, D::Error>
  272. where
  273. D: serde::Deserializer<'de>,
  274. {
  275. let s = String::deserialize(deserializer)?;
  276. NanoporeChannelStatus::from_str(&s).map_err(serde::de::Error::custom)
  277. }
  278. /// Represents a single timepoint of flowcell throughput metrics from a Nanopore sequencing run.
  279. ///
  280. /// Each record summarizes various metrics such as the number of reads,
  281. /// basecalled reads, raw samples, and throughput estimates at a given minute
  282. /// of the experiment.
  283. #[derive(Debug, Serialize, Deserialize, Clone)]
  284. pub struct ThroughputEntry {
  285. /// Time since the start of the experiment, in minutes.
  286. #[serde(rename = "Experiment Time (minutes)")]
  287. pub experiment_time_minutes: u32,
  288. /// Total number of reads detected.
  289. #[serde(rename = "Reads")]
  290. pub reads: u64,
  291. /// Number of reads that passed basecalling filters.
  292. #[serde(rename = "Basecalled Reads Passed")]
  293. pub basecalled_reads_passed: u64,
  294. /// Number of reads that failed basecalling filters.
  295. #[serde(rename = "Basecalled Reads Failed")]
  296. pub basecalled_reads_failed: u64,
  297. /// Number of reads skipped during basecalling.
  298. #[serde(rename = "Basecalled Reads Skipped")]
  299. pub basecalled_reads_skipped: u64,
  300. /// Number of raw signal samples selected for processing.
  301. #[serde(rename = "Selected Raw Samples")]
  302. pub selected_raw_samples: u64,
  303. /// Number of events (e.g., current transitions) selected.
  304. #[serde(rename = "Selected Events")]
  305. pub selected_events: u64,
  306. /// Estimated number of base pairs sequenced.
  307. #[serde(rename = "Estimated Bases")]
  308. pub estimated_bases: u64,
  309. /// Actual number of basecalled base pairs.
  310. #[serde(rename = "Basecalled Bases")]
  311. pub basecalled_bases: u64,
  312. /// Number of raw signal samples used for basecalling.
  313. #[serde(rename = "Basecalled Samples")]
  314. pub basecalled_samples: u64,
  315. }
  316. /// Loads flowcell throughput statistics from a CSV file.
  317. ///
  318. /// This function reads per-minute summary statistics from a CSV file
  319. /// generated by a Nanopore sequencing run. These metrics include the number
  320. /// of reads, basecalled results, and estimated throughput.
  321. ///
  322. /// The CSV file must have the following headers:
  323. /// - `Experiment Time (minutes)`
  324. /// - `Reads`
  325. /// - `Basecalled Reads Passed`
  326. /// - `Basecalled Reads Failed`
  327. /// - `Basecalled Reads Skipped`
  328. /// - `Selected Raw Samples`
  329. /// - `Selected Events`
  330. /// - `Estimated Bases`
  331. /// - `Basecalled Bases`
  332. /// - `Basecalled Samples`
  333. ///
  334. /// # Arguments
  335. ///
  336. /// * `reader` - Any reader that implements `Read`, such as a file, buffer, or
  337. /// decompressed tar entry, containing CSV-formatted throughput metrics.
  338. ///
  339. /// # Returns
  340. ///
  341. /// A `Result` containing:
  342. /// - `Ok(Vec<ThroughputEntry>)` if parsing succeeds
  343. /// - `Err(anyhow::Error)` if an I/O or CSV deserialization error occurs
  344. ///
  345. pub fn parse_throughput_from_reader<R: std::io::Read>(
  346. r: &mut R,
  347. ) -> anyhow::Result<Vec<ThroughputEntry>> {
  348. let reader = std::io::BufReader::new(r);
  349. let mut csv_reader = csv::ReaderBuilder::new()
  350. .delimiter(b',')
  351. .has_headers(true)
  352. .from_reader(reader);
  353. let mut records = Vec::new();
  354. for result in csv_reader.deserialize() {
  355. let record: ThroughputEntry = result.context("CSV deserialization failed")?;
  356. records.push(record);
  357. }
  358. Ok(records)
  359. }