|
|
@@ -3,9 +3,7 @@ use std::{
|
|
|
io::{Read, Seek, SeekFrom},
|
|
|
};
|
|
|
|
|
|
-use arrow::array::{
|
|
|
- ArrayRef, Int16Array, StringArray, TimestampMillisecondArray, UInt16Array,
|
|
|
-};
|
|
|
+use arrow::array::{ArrayRef, Int16Array, StringArray, TimestampMillisecondArray, UInt16Array};
|
|
|
use arrow::{array::RecordBatch, ipc::reader::FileReader};
|
|
|
use chrono::TimeZone;
|
|
|
use chrono::{DateTime, Utc};
|
|
|
@@ -74,171 +72,147 @@ impl Pod5Info {
|
|
|
let mut system_name = String::new();
|
|
|
let mut system_type = String::new();
|
|
|
|
|
|
- // Update variables with actual values
|
|
|
- // acquisition_id = "acquisition_123".to_string();
|
|
|
- // acquisition_start_time = Utc::now();
|
|
|
- // adc_max = 32767;
|
|
|
- // adc_min = -32768;
|
|
|
- // experiment_name = "Experiment XYZ".to_string();
|
|
|
- // flow_cell_id = "FC123456".to_string();
|
|
|
- // flow_cell_product_code = "FCPROD123".to_string();
|
|
|
- // protocol_name = "Protocol ABC".to_string();
|
|
|
- // protocol_run_id = "protocol_run_456".to_string();
|
|
|
- // protocol_start_time = Utc::now();
|
|
|
- // sample_id = "sample_789".to_string();
|
|
|
- // sample_rate = 44100;
|
|
|
- // sequencing_kit = "SEQKIT123".to_string();
|
|
|
- // sequencer_position = "Position A1".to_string();
|
|
|
- // sequencer_position_type = "Type B".to_string();
|
|
|
- // software = "Software v1.0".to_string();
|
|
|
- // system_name = "System Name".to_string();
|
|
|
- // system_type = "System Type".to_string();
|
|
|
- //
|
|
|
if let Some(contents) = footer.contents() {
|
|
|
for content in contents.iter() {
|
|
|
- match content.content_type() {
|
|
|
- ContentType::RunInfoTable => {
|
|
|
- // println!("{content:#?}");
|
|
|
- let batch = read_arrow_table(
|
|
|
- file_path,
|
|
|
- content.offset() as u64,
|
|
|
- content.length() as u64,
|
|
|
- )
|
|
|
- .unwrap();
|
|
|
- let schema = batch[0].schema();
|
|
|
- for column in 0..batch[0].num_columns() {
|
|
|
- let array: ArrayRef = batch[0].column(column).clone();
|
|
|
-
|
|
|
- // Print column name and values
|
|
|
- let column_name = schema.field(column).name().to_string();
|
|
|
- // println!("Column: {}", column_name);
|
|
|
+ if let ContentType::RunInfoTable = content.content_type() {
|
|
|
+ // println!("{content:#?}");
|
|
|
+ let batch = read_arrow_table(
|
|
|
+ file_path,
|
|
|
+ content.offset() as u64,
|
|
|
+ content.length() as u64,
|
|
|
+ )
|
|
|
+ .unwrap();
|
|
|
+ let schema = batch[0].schema();
|
|
|
+ for column in 0..batch[0].num_columns() {
|
|
|
+ let array: ArrayRef = batch[0].column(column).clone();
|
|
|
|
|
|
- // Match the type of the array to extract values
|
|
|
- match array.data_type() {
|
|
|
- arrow::datatypes::DataType::Int16 => {
|
|
|
- let int_array =
|
|
|
- array.as_any().downcast_ref::<Int16Array>().unwrap();
|
|
|
- for i in 0..int_array.len() {
|
|
|
- // println!("{}: i16,", column_name);
|
|
|
- match column_name.as_str() {
|
|
|
- "adc_max" => adc_max = int_array.value(i),
|
|
|
- "adc_min" => adc_min = int_array.value(i),
|
|
|
- _ => (),
|
|
|
- }
|
|
|
+ // Print column name and values
|
|
|
+ let column_name = schema.field(column).name().to_string();
|
|
|
+ // println!("Column: {}", column_name);
|
|
|
|
|
|
- // println!("{}", int_array.value(i));
|
|
|
+ // Match the type of the array to extract values
|
|
|
+ match array.data_type() {
|
|
|
+ arrow::datatypes::DataType::Int16 => {
|
|
|
+ let int_array =
|
|
|
+ array.as_any().downcast_ref::<Int16Array>().unwrap();
|
|
|
+ for i in 0..int_array.len() {
|
|
|
+ // println!("{}: i16,", column_name);
|
|
|
+ match column_name.as_str() {
|
|
|
+ "adc_max" => adc_max = int_array.value(i),
|
|
|
+ "adc_min" => adc_min = int_array.value(i),
|
|
|
+ _ => (),
|
|
|
}
|
|
|
+
|
|
|
+ // println!("{}", int_array.value(i));
|
|
|
}
|
|
|
- arrow::datatypes::DataType::UInt16 => {
|
|
|
- let int_array =
|
|
|
- array.as_any().downcast_ref::<UInt16Array>().unwrap();
|
|
|
- for i in 0..int_array.len() {
|
|
|
- // println!("{}: u16,", column_name);
|
|
|
- match column_name.as_str() {
|
|
|
- "sample_rate" => sample_rate = int_array.value(i),
|
|
|
- _ => (),
|
|
|
- }
|
|
|
+ }
|
|
|
+ arrow::datatypes::DataType::UInt16 => {
|
|
|
+ let int_array =
|
|
|
+ array.as_any().downcast_ref::<UInt16Array>().unwrap();
|
|
|
+ for i in 0..int_array.len() {
|
|
|
+ // println!("{}: u16,", column_name);
|
|
|
+ if let "sample_rate" = column_name.as_str() {
|
|
|
+ sample_rate = int_array.value(i)
|
|
|
}
|
|
|
}
|
|
|
+ }
|
|
|
|
|
|
- // arrow::datatypes::DataType::Int32 => {
|
|
|
- // let int_array =
|
|
|
- // array.as_any().downcast_ref::<Int32Array>().unwrap();
|
|
|
- // for i in 0..int_array.len() {
|
|
|
- // println!("{}: i32,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", int_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
- // arrow::datatypes::DataType::UInt32 => {
|
|
|
- // let int_array =
|
|
|
- // array.as_any().downcast_ref::<UInt32Array>().unwrap();
|
|
|
- // for i in 0..int_array.len() {
|
|
|
- // println!("{}: u32,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", int_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
- // arrow::datatypes::DataType::Float64 => {
|
|
|
- // let float_array =
|
|
|
- // array.as_any().downcast_ref::<Float64Array>().unwrap();
|
|
|
- // for i in 0..float_array.len() {
|
|
|
- // println!("{}: f64,", column_name);
|
|
|
- //
|
|
|
- // // println!("{}", float_array.value(i));
|
|
|
- // }
|
|
|
- // }
|
|
|
- arrow::datatypes::DataType::Utf8 => {
|
|
|
- let string_array =
|
|
|
- array.as_any().downcast_ref::<StringArray>().unwrap();
|
|
|
- let string_array: Vec<String> = string_array
|
|
|
- .iter()
|
|
|
- .flat_map(|v| match v {
|
|
|
- Some(v) => vec![v.to_string()],
|
|
|
- None => vec![],
|
|
|
- })
|
|
|
- .collect();
|
|
|
+ // arrow::datatypes::DataType::Int32 => {
|
|
|
+ // let int_array =
|
|
|
+ // array.as_any().downcast_ref::<Int32Array>().unwrap();
|
|
|
+ // for i in 0..int_array.len() {
|
|
|
+ // println!("{}: i32,", column_name);
|
|
|
+ //
|
|
|
+ // // println!("{}", int_array.value(i));
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // arrow::datatypes::DataType::UInt32 => {
|
|
|
+ // let int_array =
|
|
|
+ // array.as_any().downcast_ref::<UInt32Array>().unwrap();
|
|
|
+ // for i in 0..int_array.len() {
|
|
|
+ // println!("{}: u32,", column_name);
|
|
|
+ //
|
|
|
+ // // println!("{}", int_array.value(i));
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // arrow::datatypes::DataType::Float64 => {
|
|
|
+ // let float_array =
|
|
|
+ // array.as_any().downcast_ref::<Float64Array>().unwrap();
|
|
|
+ // for i in 0..float_array.len() {
|
|
|
+ // println!("{}: f64,", column_name);
|
|
|
+ //
|
|
|
+ // // println!("{}", float_array.value(i));
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ arrow::datatypes::DataType::Utf8 => {
|
|
|
+ let string_array =
|
|
|
+ array.as_any().downcast_ref::<StringArray>().unwrap();
|
|
|
+ let string_array: Vec<String> = string_array
|
|
|
+ .iter()
|
|
|
+ .flat_map(|v| match v {
|
|
|
+ Some(v) => vec![v.to_string()],
|
|
|
+ None => vec![],
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
|
|
|
- let value = string_array.join(" ");
|
|
|
+ let value = string_array.join(" ");
|
|
|
|
|
|
- match column_name.as_str() {
|
|
|
- "acquisition_id" => acquisition_id = value,
|
|
|
- "experiment_name" => experiment_name = value,
|
|
|
- "flow_cell_id" => flow_cell_id = value,
|
|
|
- "flow_cell_product_code" => flow_cell_product_code = value,
|
|
|
- "protocol_name" => protocol_name = value,
|
|
|
- "protocol_run_id" => protocol_run_id = value,
|
|
|
- "sample_id" => sample_id = value,
|
|
|
- "sequencing_kit" => sequencing_kit = value,
|
|
|
- "sequencer_position" => sequencer_position = value,
|
|
|
- "sequencer_position_type" => {
|
|
|
- sequencer_position_type = value
|
|
|
- }
|
|
|
- "software" => software = value,
|
|
|
- "system_name" => system_name = value,
|
|
|
- "system_type" => system_type = value,
|
|
|
- _ => (),
|
|
|
+ match column_name.as_str() {
|
|
|
+ "acquisition_id" => acquisition_id = value,
|
|
|
+ "experiment_name" => experiment_name = value,
|
|
|
+ "flow_cell_id" => flow_cell_id = value,
|
|
|
+ "flow_cell_product_code" => flow_cell_product_code = value,
|
|
|
+ "protocol_name" => protocol_name = value,
|
|
|
+ "protocol_run_id" => protocol_run_id = value,
|
|
|
+ "sample_id" => sample_id = value,
|
|
|
+ "sequencing_kit" => sequencing_kit = value,
|
|
|
+ "sequencer_position" => sequencer_position = value,
|
|
|
+ "sequencer_position_type" => {
|
|
|
+ sequencer_position_type = value
|
|
|
}
|
|
|
- // println!("{}: String,", column_name);
|
|
|
- // println!("{}", string_array.join(" "));
|
|
|
+ "software" => software = value,
|
|
|
+ "system_name" => system_name = value,
|
|
|
+ "system_type" => system_type = value,
|
|
|
+ _ => (),
|
|
|
}
|
|
|
- arrow::datatypes::DataType::Timestamp(
|
|
|
- arrow::datatypes::TimeUnit::Millisecond,
|
|
|
- Some(timezone),
|
|
|
- ) => {
|
|
|
- if &timezone.to_string() == "UTC" {
|
|
|
- let timestamp_array = array
|
|
|
- .as_any()
|
|
|
- .downcast_ref::<TimestampMillisecondArray>()
|
|
|
- .unwrap();
|
|
|
- for i in 0..timestamp_array.len() {
|
|
|
- let timestamp = timestamp_array.value(i);
|
|
|
- let datetime: DateTime<Utc> =
|
|
|
- Utc.timestamp_millis_opt(timestamp).unwrap();
|
|
|
- // println!("{}: DateTime<Utc>,", column_name);
|
|
|
+ // println!("{}: String,", column_name);
|
|
|
+ // println!("{}", string_array.join(" "));
|
|
|
+ }
|
|
|
+ arrow::datatypes::DataType::Timestamp(
|
|
|
+ arrow::datatypes::TimeUnit::Millisecond,
|
|
|
+ Some(timezone),
|
|
|
+ ) => {
|
|
|
+ if &timezone.to_string() == "UTC" {
|
|
|
+ let timestamp_array = array
|
|
|
+ .as_any()
|
|
|
+ .downcast_ref::<TimestampMillisecondArray>()
|
|
|
+ .unwrap();
|
|
|
+ for i in 0..timestamp_array.len() {
|
|
|
+ let timestamp = timestamp_array.value(i);
|
|
|
+ let datetime: DateTime<Utc> =
|
|
|
+ Utc.timestamp_millis_opt(timestamp).unwrap();
|
|
|
+ // println!("{}: DateTime<Utc>,", column_name);
|
|
|
|
|
|
- match column_name.as_str() {
|
|
|
- "acquisition_start_time" => {
|
|
|
- acquisition_start_time = datetime
|
|
|
- }
|
|
|
- "protocol_start_time" => {
|
|
|
- protocol_start_time = datetime
|
|
|
- }
|
|
|
- _ => (),
|
|
|
+ match column_name.as_str() {
|
|
|
+ "acquisition_start_time" => {
|
|
|
+ acquisition_start_time = datetime
|
|
|
}
|
|
|
-
|
|
|
- // println!("{}", datetime.to_rfc3339());
|
|
|
+ "protocol_start_time" => {
|
|
|
+ protocol_start_time = datetime
|
|
|
+ }
|
|
|
+ _ => (),
|
|
|
}
|
|
|
+
|
|
|
+ // println!("{}", datetime.to_rfc3339());
|
|
|
}
|
|
|
}
|
|
|
+ }
|
|
|
|
|
|
- _ => {
|
|
|
- // println!("Unsupported data type: {:?}", array.data_type());
|
|
|
- }
|
|
|
+ _ => {
|
|
|
+ // println!("Unsupported data type: {:?}", array.data_type());
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- _ => (),
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -298,7 +272,7 @@ mod tests {
|
|
|
|
|
|
#[test]
|
|
|
fn it_works() {
|
|
|
- let file_path = "/data/run_data/20240620-CL/SAU-MRD-NB12_AUB-DIAG-NB13_PAR-DIAG-NB14/20240620_1532_1D_PAW61519_8071e9f4/pod5/PAW61519_8071e9f4_709b3710_0.pod5";
|
|
|
+ let file_path = "/data/run_data/20240326-CL/RICCO-MRD-NB03_MOREAU-DIAG-NB04/20240326_1355_1C_PAU37401_75b4c39d/pod5_pass/barcode03/PAU37401_pass_barcode03_75b4c39d_6af082bf_0.pod5";
|
|
|
let r = Pod5Info::from_pod5(file_path);
|
|
|
println!("{r:#?}");
|
|
|
}
|