소스 검색

Pod5Info update

Thomas 3 달 전
부모
커밋
a8cfb4395c
3개의 변경된 파일, 110개의 추가 및 130개의 삭제
  1. 21 13
      src/collection/pod5.rs
  2. 2 7
      src/commands/dorado.rs
  3. 87 110
      src/io/pod5_infos.rs

+ 21 - 13
src/collection/pod5.rs

@@ -1,7 +1,5 @@
 use std::{
-    collections::HashSet,
-    fmt, fs,
-    path::{Path, PathBuf},
+    collections::HashSet, fmt, fs, path::{Path, PathBuf}
 };
 
 use chrono::{DateTime, Utc};
@@ -45,12 +43,18 @@ impl Pod5 {
     /// corresponding fields in `Pod5`.
     pub fn from_path<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
         let path_ref = path.as_ref();
-        let path_str = path_ref.to_str().ok_or_else(|| {
-            anyhow::anyhow!("Path contains invalid UTF-8: {}", path_ref.display())
-        })?;
-
-        let info = Pod5Info::from_pod5(path_str);
-        let file_size = std::fs::metadata(path_ref)?.len();
+        
+        // Convert path to string, returning an error if it contains invalid UTF-8
+        let path_str = path_ref
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("Path contains invalid UTF-8: {}", path_ref.display()))?;
+        
+        // Pod5Info::from_pod5 now returns Result
+        let info = Pod5Info::from_pod5(path_str)?;
+        
+        let file_size = std::fs::metadata(path_ref)
+            .map_err(|e| anyhow::anyhow!("Failed to read metadata for '{}': {}", path_ref.display(), e))?
+            .len();
 
         Ok(Self {
             name: path_ref
@@ -100,8 +104,9 @@ impl Pod5sRun {
     /// Each file is parsed using `Pod5::from_path`.
     pub fn load_from_dir<P: AsRef<Path>>(dir: P) -> anyhow::Result<Self> {
         let pod_paths = list_files_with_ext(dir.as_ref(), "pod5")?;
+        
         if pod_paths.is_empty() {
-            anyhow::bail!("No .pod5 files found in directory");
+            anyhow::bail!("No .pod5 files found in directory: {}", dir.as_ref().display());
         }
 
         let mut pod5s = Vec::with_capacity(pod_paths.len());
@@ -109,14 +114,16 @@ impl Pod5sRun {
         let mut sequencing_kit: Option<String> = None;
         let mut run_id: Option<String> = None;
 
-        for p in pod_paths {
-            let pod = Pod5::from_path(&p)?;
+        for p in pod_paths.iter() {
+            let pod = Pod5::from_path(&p)
+                .map_err(|e| anyhow::anyhow!("Failed to parse POD5 file '{}': {:#}", p.display(), e))?;
+            
             // run_id uniqueness check
             match &run_id {
                 None => run_id = Some(pod.protocol_run_id.clone()),
                 Some(exp) if &pod.protocol_run_id != exp => {
                     anyhow::bail!(
-                        "Mixed sequencing kits: expected '{}', found '{}' (file: {})",
+                        "Mixed run IDs: expected '{}', found '{}' (file: {})",
                         exp,
                         pod.protocol_run_id,
                         pod.path.display()
@@ -376,6 +383,7 @@ mod tests {
         let dir = "/mnt/beegfs02/scratch/t_steimle/prom_runs/A/20251117_0915_P2I-00461-A_PBI55810_22582b29/pod5_recovered";
         let saved_runs = "~/data/seq_runs_cases.json";
 
+
         let flow_cell = Pod5sRun::load_from_dir(dir)?;
         println!("{:#?}", flow_cell.pod5s.first());
         let stats = flow_cell.stats();

+ 2 - 7
src/commands/dorado.rs

@@ -6,10 +6,7 @@ use std::{
 use anyhow::Context;
 
 use crate::{
-    commands::{Command, SlurmParams},
-    config::Config,
-    io::pod5_infos::Pod5Info,
-    slurm_helpers::max_gpu_per_node,
+    collection::pod5::Pod5, commands::{Command, SlurmParams}, config::Config, io::pod5_infos::Pod5Info, slurm_helpers::max_gpu_per_node
 };
 
 /// Run Dorado basecalling on a directory of POD5 files.
@@ -81,9 +78,7 @@ impl Command for DoradoBasecall {
             .find(|p| p.extension().and_then(|e| e.to_str()) == Some("pod5"))
             .context("No .pod5 file found")?;
 
-        self.sequencing_kit = Pod5Info::from_pod5(&pod_path.to_string_lossy())
-            .sequencing_kit
-            .to_uppercase();
+        self.sequencing_kit = Pod5::from_path(&pod_path)?.sequencing_kit.to_uppercase();
 
         Ok(())
     }

+ 87 - 110
src/io/pod5_infos.rs

@@ -38,26 +38,40 @@ pub struct Pod5Info {
 }
 
 impl Pod5Info {
-    pub fn from_pod5(file_path: &str) -> Self {
-        let mut file = File::open(file_path).unwrap();
-        let _end = file.seek(SeekFrom::End(0)).unwrap();
-        file.seek(SeekFrom::Current(-32)).unwrap(); // Signature + Section marker + 8 bytes for footer length
-        let mut buffer = [0; 8]; // Buffer for 8 bytes
-
-        file.read_exact(&mut buffer).unwrap(); // Read 8 bytes
+    /// Read Pod5 metadata from a file, returning Result instead of panicking
+    pub fn from_pod5(file_path: &str) -> anyhow::Result<Self> {
+        let mut file = File::open(file_path)
+            .map_err(|e| anyhow::anyhow!("Failed to open POD5 file '{}': {}", file_path, e))?;
+        
+        let end = file.seek(SeekFrom::End(0))
+            .map_err(|e| anyhow::anyhow!("Failed to seek to end of POD5 file '{}': {}", file_path, e))?;
+        
+        if end < 32 {
+            anyhow::bail!("POD5 file '{}' is too small ({} bytes), expected at least 32 bytes", file_path, end);
+        }
+        
+        file.seek(SeekFrom::Current(-32))
+            .map_err(|e| anyhow::anyhow!("Failed to seek in POD5 file '{}': {}", file_path, e))?;
+        
+        let mut buffer = [0; 8];
+        file.read_exact(&mut buffer)
+            .map_err(|e| anyhow::anyhow!("Failed to read footer length from POD5 file '{}': {}", file_path, e))?;
 
-        // Convert bytes to little-endian i64
         let value = i64::from_le_bytes(buffer);
+        
+        if value <= 0 || value as u64 > end {
+            anyhow::bail!("Invalid footer length in POD5 file '{}': {} (file size: {})", file_path, value, end);
+        }
 
-        // Seek to the footer position
-        file.seek(SeekFrom::Current(-(8 + value))).unwrap();
+        file.seek(SeekFrom::Current(-(8 + value)))
+            .map_err(|e| anyhow::anyhow!("Failed to seek to footer in POD5 file '{}': {}", file_path, e))?;
 
-        // Read the footer data
         let mut buf = vec![0; value as usize];
-        file.read_exact(&mut buf).unwrap();
+        file.read_exact(&mut buf)
+            .map_err(|e| anyhow::anyhow!("Failed to read footer data from POD5 file '{}': {}", file_path, e))?;
 
-        // Deserialize the FlatBuffer
-        let footer = root_as_footer(&buf).unwrap();
+        let footer = root_as_footer(&buf)
+            .map_err(|e| anyhow::anyhow!("Failed to parse footer in POD5 file '{}': {:?}", file_path, e))?;
 
         let mut acquisition_id = String::new();
         let mut acquisition_start_time = Utc::now();
@@ -81,144 +95,107 @@ impl Pod5Info {
         if let Some(contents) = footer.contents() {
             for content in contents.iter() {
                 if let ContentType::RunInfoTable = content.content_type() {
-                    // println!("{content:#?}");
                     let batch = read_arrow_table(
                         file_path,
                         content.offset() as u64,
                         content.length() as u64,
                     )
-                    .unwrap();
+                    .map_err(|e| anyhow::anyhow!("Failed to read run info table from POD5 file '{}': {}", file_path, e))?;
+
+                    if batch.is_empty() {
+                        continue;
+                    }
+
                     let schema = batch[0].schema();
                     for column in 0..batch[0].num_columns() {
                         let array: ArrayRef = batch[0].column(column).clone();
-
-                        // Print column name and values
                         let column_name = schema.field(column).name().to_string();
-                        // println!("Column: {}", column_name);
 
-                        // Match the type of the array to extract values
                         match array.data_type() {
                             arrow::datatypes::DataType::Int16 => {
-                                let int_array =
-                                    array.as_any().downcast_ref::<Int16Array>().unwrap();
-                                for i in 0..int_array.len() {
-                                    // println!("{}: i16,", column_name);
-                                    match column_name.as_str() {
-                                        "adc_max" => adc_max = int_array.value(i),
-                                        "adc_min" => adc_min = int_array.value(i),
-                                        _ => (),
+                                if let Some(int_array) = array.as_any().downcast_ref::<Int16Array>() {
+                                    for i in 0..int_array.len() {
+                                        match column_name.as_str() {
+                                            "adc_max" => adc_max = int_array.value(i),
+                                            "adc_min" => adc_min = int_array.value(i),
+                                            _ => (),
+                                        }
                                     }
-
-                                    // println!("{}", int_array.value(i));
                                 }
                             }
                             arrow::datatypes::DataType::UInt16 => {
-                                let int_array =
-                                    array.as_any().downcast_ref::<UInt16Array>().unwrap();
-                                for i in 0..int_array.len() {
-                                    // println!("{}: u16,", column_name);
-                                    if let "sample_rate" = column_name.as_str() {
-                                        sample_rate = int_array.value(i)
+                                if let Some(int_array) = array.as_any().downcast_ref::<UInt16Array>() {
+                                    for i in 0..int_array.len() {
+                                        if let "sample_rate" = column_name.as_str() {
+                                            sample_rate = int_array.value(i)
+                                        }
                                     }
                                 }
                             }
-
-                            // arrow::datatypes::DataType::Int32 => {
-                            //     let int_array =
-                            //         array.as_any().downcast_ref::<Int32Array>().unwrap();
-                            //     for i in 0..int_array.len() {
-                            //         println!("{}: i32,", column_name);
-                            //
-                            //         // println!("{}", int_array.value(i));
-                            //     }
-                            // }
-                            // arrow::datatypes::DataType::UInt32 => {
-                            //     let int_array =
-                            //         array.as_any().downcast_ref::<UInt32Array>().unwrap();
-                            //     for i in 0..int_array.len() {
-                            //         println!("{}: u32,", column_name);
-                            //
-                            //         // println!("{}", int_array.value(i));
-                            //     }
-                            // }
-                            // arrow::datatypes::DataType::Float64 => {
-                            //     let float_array =
-                            //         array.as_any().downcast_ref::<Float64Array>().unwrap();
-                            //     for i in 0..float_array.len() {
-                            //         println!("{}: f64,", column_name);
-                            //
-                            //         // println!("{}", float_array.value(i));
-                            //     }
-                            // }
                             arrow::datatypes::DataType::Utf8 => {
-                                let string_array =
-                                    array.as_any().downcast_ref::<StringArray>().unwrap();
-                                let string_array: Vec<String> = string_array
-                                    .iter()
-                                    .flat_map(|v| match v {
-                                        Some(v) => vec![v.to_string()],
-                                        None => vec![],
-                                    })
-                                    .collect();
+                                if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
+                                    let string_array: Vec<String> = string_array
+                                        .iter()
+                                        .flat_map(|v| match v {
+                                            Some(v) => vec![v.to_string()],
+                                            None => vec![],
+                                        })
+                                        .collect();
 
-                                let value = string_array.join(" ");
+                                    let value = string_array.join(" ");
 
-                                match column_name.as_str() {
-                                    "acquisition_id" => acquisition_id = value,
-                                    "experiment_name" => experiment_name = value,
-                                    "flow_cell_id" => flow_cell_id = value,
-                                    "flow_cell_product_code" => flow_cell_product_code = value,
-                                    "protocol_name" => protocol_name = value,
-                                    "protocol_run_id" => protocol_run_id = value,
-                                    "sample_id" => sample_id = value,
-                                    "sequencing_kit" => sequencing_kit = value,
-                                    "sequencer_position" => sequencer_position = value,
-                                    "sequencer_position_type" => sequencer_position_type = value,
-                                    "software" => software = value,
-                                    "system_name" => system_name = value,
-                                    "system_type" => system_type = value,
-                                    _ => (),
+                                    match column_name.as_str() {
+                                        "acquisition_id" => acquisition_id = value,
+                                        "experiment_name" => experiment_name = value,
+                                        "flow_cell_id" => flow_cell_id = value,
+                                        "flow_cell_product_code" => flow_cell_product_code = value,
+                                        "protocol_name" => protocol_name = value,
+                                        "protocol_run_id" => protocol_run_id = value,
+                                        "sample_id" => sample_id = value,
+                                        "sequencing_kit" => sequencing_kit = value,
+                                        "sequencer_position" => sequencer_position = value,
+                                        "sequencer_position_type" => sequencer_position_type = value,
+                                        "software" => software = value,
+                                        "system_name" => system_name = value,
+                                        "system_type" => system_type = value,
+                                        _ => (),
+                                    }
                                 }
-                                // println!("{}: String,", column_name);
-                                // println!("{}", string_array.join(" "));
                             }
                             arrow::datatypes::DataType::Timestamp(
                                 arrow::datatypes::TimeUnit::Millisecond,
                                 Some(timezone),
                             ) => {
                                 if &timezone.to_string() == "UTC" {
-                                    let timestamp_array = array
+                                    if let Some(timestamp_array) = array
                                         .as_any()
                                         .downcast_ref::<TimestampMillisecondArray>()
-                                        .unwrap();
-                                    for i in 0..timestamp_array.len() {
-                                        let timestamp = timestamp_array.value(i);
-                                        let datetime: DateTime<Utc> =
-                                            Utc.timestamp_millis_opt(timestamp).unwrap();
-                                        // println!("{}: DateTime<Utc>,", column_name);
-
-                                        match column_name.as_str() {
-                                            "acquisition_start_time" => {
-                                                acquisition_start_time = datetime
+                                    {
+                                        for i in 0..timestamp_array.len() {
+                                            let timestamp = timestamp_array.value(i);
+                                            if let Some(datetime) = Utc.timestamp_millis_opt(timestamp).single() {
+                                                match column_name.as_str() {
+                                                    "acquisition_start_time" => {
+                                                        acquisition_start_time = datetime
+                                                    }
+                                                    "protocol_start_time" => protocol_start_time = datetime,
+                                                    _ => (),
+                                                }
                                             }
-                                            "protocol_start_time" => protocol_start_time = datetime,
-                                            _ => (),
                                         }
-
-                                        // println!("{}", datetime.to_rfc3339());
                                     }
                                 }
                             }
-
                             _ => {
-                                // println!("Unsupported data type: {:?}", array.data_type());
+                                // Unsupported data type, skip
                             }
                         }
                     }
                 }
             }
         }
-        Pod5Info {
+
+        Ok(Pod5Info {
             acquisition_id,
             acquisition_start_time,
             adc_max,
@@ -237,7 +214,7 @@ impl Pod5Info {
             software,
             system_name,
             system_type,
-        }
+        })
     }
 }