|
|
@@ -38,26 +38,40 @@ pub struct Pod5Info {
|
|
|
}
|
|
|
|
|
|
impl Pod5Info {
|
|
|
- pub fn from_pod5(file_path: &str) -> Self {
|
|
|
- let mut file = File::open(file_path).unwrap();
|
|
|
- let _end = file.seek(SeekFrom::End(0)).unwrap();
|
|
|
- file.seek(SeekFrom::Current(-32)).unwrap(); // Signature + Section marker + 8 bytes for footer length
|
|
|
- let mut buffer = [0; 8]; // Buffer for 8 bytes
|
|
|
-
|
|
|
- file.read_exact(&mut buffer).unwrap(); // Read 8 bytes
|
|
|
+ /// Reads run-info metadata from a POD5 file via its footer; returns an error (rather than panicking) if the file cannot be opened, seeked, read, or parsed.
|
|
|
+ pub fn from_pod5(file_path: &str) -> anyhow::Result<Self> {
|
|
|
+ let mut file = File::open(file_path)
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to open POD5 file '{}': {}", file_path, e))?;
|
|
|
+
|
|
|
+ let end = file.seek(SeekFrom::End(0))
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to seek to end of POD5 file '{}': {}", file_path, e))?;
|
|
|
+
|
|
|
+ if end < 32 {
|
|
|
+ anyhow::bail!("POD5 file '{}' is too small ({} bytes), expected at least 32 bytes", file_path, end);
|
|
|
+ }
|
|
|
+
|
|
|
+ file.seek(SeekFrom::Current(-32))
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to seek in POD5 file '{}': {}", file_path, e))?;
|
|
|
+
|
|
|
+ let mut buffer = [0; 8];
|
|
|
+ file.read_exact(&mut buffer)
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to read footer length from POD5 file '{}': {}", file_path, e))?;
|
|
|
|
|
|
- // Convert bytes to little-endian i64
|
|
|
let value = i64::from_le_bytes(buffer);
|
|
|
+
|
|
|
+ if value <= 0 || value as u64 > end {
|
|
|
+ anyhow::bail!("Invalid footer length in POD5 file '{}': {} (file size: {})", file_path, value, end);
|
|
|
+ }
|
|
|
|
|
|
- // Seek to the footer position
|
|
|
- file.seek(SeekFrom::Current(-(8 + value))).unwrap();
|
|
|
+ file.seek(SeekFrom::Current(-(8 + value)))
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to seek to footer in POD5 file '{}': {}", file_path, e))?;
|
|
|
|
|
|
- // Read the footer data
|
|
|
let mut buf = vec![0; value as usize];
|
|
|
- file.read_exact(&mut buf).unwrap();
|
|
|
+ file.read_exact(&mut buf)
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to read footer data from POD5 file '{}': {}", file_path, e))?;
|
|
|
|
|
|
- // Deserialize the FlatBuffer
|
|
|
- let footer = root_as_footer(&buf).unwrap();
|
|
|
+ let footer = root_as_footer(&buf)
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to parse footer in POD5 file '{}': {:?}", file_path, e))?;
|
|
|
|
|
|
let mut acquisition_id = String::new();
|
|
|
let mut acquisition_start_time = Utc::now();
|
|
|
@@ -81,144 +95,107 @@ impl Pod5Info {
|
|
|
if let Some(contents) = footer.contents() {
|
|
|
for content in contents.iter() {
|
|
|
if let ContentType::RunInfoTable = content.content_type() {
|
|
|
- // println!("{content:#?}");
|
|
|
let batch = read_arrow_table(
|
|
|
file_path,
|
|
|
content.offset() as u64,
|
|
|
content.length() as u64,
|
|
|
)
|
|
|
- .unwrap();
|
|
|
+ .map_err(|e| anyhow::anyhow!("Failed to read run info table from POD5 file '{}': {}", file_path, e))?;
|
|
|
+
|
|
|
+ if batch.is_empty() {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
let schema = batch[0].schema();
|
|
|
for column in 0..batch[0].num_columns() {
|
|
|
let array: ArrayRef = batch[0].column(column).clone();
|
|
|
-
|
|
|
- // Print column name and values
|
|
|
let column_name = schema.field(column).name().to_string();
|
|
|
- // println!("Column: {}", column_name);
|
|
|
|
|
|
- // Match the type of the array to extract values
|
|
|
match array.data_type() {
|
|
|
arrow::datatypes::DataType::Int16 => {
|
|
|
- let int_array =
|
|
|
- array.as_any().downcast_ref::<Int16Array>().unwrap();
|
|
|
- for i in 0..int_array.len() {
|
|
|
- // println!("{}: i16,", column_name);
|
|
|
- match column_name.as_str() {
|
|
|
- "adc_max" => adc_max = int_array.value(i),
|
|
|
- "adc_min" => adc_min = int_array.value(i),
|
|
|
- _ => (),
|
|
|
+ if let Some(int_array) = array.as_any().downcast_ref::<Int16Array>() {
|
|
|
+ for i in 0..int_array.len() {
|
|
|
+ match column_name.as_str() {
|
|
|
+ "adc_max" => adc_max = int_array.value(i),
|
|
|
+ "adc_min" => adc_min = int_array.value(i),
|
|
|
+ _ => (),
|
|
|
+ }
|
|
|
}
|
|
|
-
|
|
|
- // println!("{}", int_array.value(i));
|
|
|
}
|
|
|
}
|
|
|
arrow::datatypes::DataType::UInt16 => {
|
|
|
- let int_array =
|
|
|
- array.as_any().downcast_ref::<UInt16Array>().unwrap();
|
|
|
- for i in 0..int_array.len() {
|
|
|
- // println!("{}: u16,", column_name);
|
|
|
- if let "sample_rate" = column_name.as_str() {
|
|
|
- sample_rate = int_array.value(i)
|
|
|
+ if let Some(int_array) = array.as_any().downcast_ref::<UInt16Array>() {
|
|
|
+ for i in 0..int_array.len() {
|
|
|
+ if let "sample_rate" = column_name.as_str() {
|
|
|
+ sample_rate = int_array.value(i)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- // arrow::datatypes::DataType::Int32 => {
|
|
|
- // let int_array =
|
|
|
- // array.as_any().downcast_ref::<Int32Array>().unwrap();
|
|
|
- // for i in 0..int_array.len() {
|
|
|
- // println!("{}: i32,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", int_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
- // arrow::datatypes::DataType::UInt32 => {
|
|
|
- // let int_array =
|
|
|
- // array.as_any().downcast_ref::<UInt32Array>().unwrap();
|
|
|
- // for i in 0..int_array.len() {
|
|
|
- // println!("{}: u32,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", int_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
- // arrow::datatypes::DataType::Float64 => {
|
|
|
- // let float_array =
|
|
|
- // array.as_any().downcast_ref::<Float64Array>().unwrap();
|
|
|
- // for i in 0..float_array.len() {
|
|
|
- // println!("{}: f64,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", float_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
arrow::datatypes::DataType::Utf8 => {
|
|
|
- let string_array =
|
|
|
- array.as_any().downcast_ref::<StringArray>().unwrap();
|
|
|
- let string_array: Vec<String> = string_array
|
|
|
- .iter()
|
|
|
- .flat_map(|v| match v {
|
|
|
- Some(v) => vec![v.to_string()],
|
|
|
- None => vec![],
|
|
|
- })
|
|
|
- .collect();
|
|
|
+ if let Some(string_array) = array.as_any().downcast_ref::<StringArray>() {
|
|
|
+ let string_array: Vec<String> = string_array
|
|
|
+ .iter()
|
|
|
+ .flat_map(|v| match v {
|
|
|
+ Some(v) => vec![v.to_string()],
|
|
|
+ None => vec![],
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
|
|
|
- let value = string_array.join(" ");
|
|
|
+ let value = string_array.join(" ");
|
|
|
|
|
|
- match column_name.as_str() {
|
|
|
- "acquisition_id" => acquisition_id = value,
|
|
|
- "experiment_name" => experiment_name = value,
|
|
|
- "flow_cell_id" => flow_cell_id = value,
|
|
|
- "flow_cell_product_code" => flow_cell_product_code = value,
|
|
|
- "protocol_name" => protocol_name = value,
|
|
|
- "protocol_run_id" => protocol_run_id = value,
|
|
|
- "sample_id" => sample_id = value,
|
|
|
- "sequencing_kit" => sequencing_kit = value,
|
|
|
- "sequencer_position" => sequencer_position = value,
|
|
|
- "sequencer_position_type" => sequencer_position_type = value,
|
|
|
- "software" => software = value,
|
|
|
- "system_name" => system_name = value,
|
|
|
- "system_type" => system_type = value,
|
|
|
- _ => (),
|
|
|
+ match column_name.as_str() {
|
|
|
+ "acquisition_id" => acquisition_id = value,
|
|
|
+ "experiment_name" => experiment_name = value,
|
|
|
+ "flow_cell_id" => flow_cell_id = value,
|
|
|
+ "flow_cell_product_code" => flow_cell_product_code = value,
|
|
|
+ "protocol_name" => protocol_name = value,
|
|
|
+ "protocol_run_id" => protocol_run_id = value,
|
|
|
+ "sample_id" => sample_id = value,
|
|
|
+ "sequencing_kit" => sequencing_kit = value,
|
|
|
+ "sequencer_position" => sequencer_position = value,
|
|
|
+ "sequencer_position_type" => sequencer_position_type = value,
|
|
|
+ "software" => software = value,
|
|
|
+ "system_name" => system_name = value,
|
|
|
+ "system_type" => system_type = value,
|
|
|
+ _ => (),
|
|
|
+ }
|
|
|
}
|
|
|
- // println!("{}: String,", column_name);
|
|
|
- // println!("{}", string_array.join(" "));
|
|
|
}
|
|
|
arrow::datatypes::DataType::Timestamp(
|
|
|
arrow::datatypes::TimeUnit::Millisecond,
|
|
|
Some(timezone),
|
|
|
) => {
|
|
|
if &timezone.to_string() == "UTC" {
|
|
|
- let timestamp_array = array
|
|
|
+ if let Some(timestamp_array) = array
|
|
|
.as_any()
|
|
|
.downcast_ref::<TimestampMillisecondArray>()
|
|
|
- .unwrap();
|
|
|
- for i in 0..timestamp_array.len() {
|
|
|
- let timestamp = timestamp_array.value(i);
|
|
|
- let datetime: DateTime<Utc> =
|
|
|
- Utc.timestamp_millis_opt(timestamp).unwrap();
|
|
|
- // println!("{}: DateTime<Utc>,", column_name);
|
|
|
-
|
|
|
- match column_name.as_str() {
|
|
|
- "acquisition_start_time" => {
|
|
|
- acquisition_start_time = datetime
|
|
|
+ {
|
|
|
+ for i in 0..timestamp_array.len() {
|
|
|
+ let timestamp = timestamp_array.value(i);
|
|
|
+ if let Some(datetime) = Utc.timestamp_millis_opt(timestamp).single() {
|
|
|
+ match column_name.as_str() {
|
|
|
+ "acquisition_start_time" => {
|
|
|
+ acquisition_start_time = datetime
|
|
|
+ }
|
|
|
+ "protocol_start_time" => protocol_start_time = datetime,
|
|
|
+ _ => (),
|
|
|
+ }
|
|
|
}
|
|
|
- "protocol_start_time" => protocol_start_time = datetime,
|
|
|
- _ => (),
|
|
|
}
|
|
|
-
|
|
|
- // println!("{}", datetime.to_rfc3339());
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
_ => {
|
|
|
- // println!("Unsupported data type: {:?}", array.data_type());
|
|
|
+ // Unsupported data type, skip
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- Pod5Info {
|
|
|
+
|
|
|
+ Ok(Pod5Info {
|
|
|
acquisition_id,
|
|
|
acquisition_start_time,
|
|
|
adc_max,
|
|
|
@@ -237,7 +214,7 @@ impl Pod5Info {
|
|
|
software,
|
|
|
system_name,
|
|
|
system_type,
|
|
|
- }
|
|
|
+ })
|
|
|
}
|
|
|
}
|
|
|
|