Browse Source

Savana CNSegment

Thomas 9 months ago
parent
commit
ccb72720b4
2 changed files with 110 additions and 2 deletions
  1. 101 1
      src/callers/savana.rs
  2. 9 1
      src/lib.rs

+ 101 - 1
src/callers/savana.rs

@@ -18,7 +18,13 @@ use anyhow::Context;
 use itertools::Itertools;
 use log::info;
 use rayon::prelude::*;
-use std::{fs, io::BufRead, path::Path, str::FromStr};
+use serde::{Deserialize, Serialize};
+use std::{
+    fs::{self, File},
+    io::{self, BufRead},
+    path::Path,
+    str::FromStr,
+};
 
 #[derive(Debug)]
 pub struct Savana {
@@ -423,3 +429,97 @@ impl SavanaReadCounts {
             .collect()
     }
 }
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct SavanaCN {
+    pub segments: Vec<CNSegment>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct CNSegment {
+    pub chromosome: String,
+    pub start: u64,
+    pub end: u64,
+    pub segment_id: String,
+    pub bin_count: u32,
+    pub sum_of_bin_lengths: u64,
+    pub weight: f64,
+    pub copy_number: f64,
+    pub minor_allele_copy_number: Option<f64>,
+    pub mean_baf: Option<f64>,
+    pub no_het_snps: u32,
+}
+
+impl SavanaCN {
+    pub fn parse_file<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
+        let file = File::open(&path).context("Failed to open the file")?;
+        let reader = io::BufReader::new(file);
+
+        let mut segments = Vec::new();
+
+        for (index, line) in reader.lines().enumerate() {
+            let line = line.context("Failed to read line from file")?;
+
+            // Skip header line
+            if index == 0 {
+                continue;
+            }
+
+            let parts: Vec<&str> = line.split('\t').collect();
+            if parts.len() != 11 {
+                return Err(anyhow::anyhow!(
+                    "Invalid number of columns at line {} (expected 11, got {})",
+                    index + 1,
+                    parts.len()
+                ));
+            }
+
+            // Helper closure for parsing with context
+            let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
+                field
+                    .parse()
+                    .context(format!("Failed to parse {name} at line {line_num}"))
+            };
+
+            let parse_field_u32 = |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
+                field
+                    .parse()
+                    .context(format!("Failed to parse {name} at line {line_num}"))
+            };
+
+            let line_num = index + 1;
+
+            let segment = CNSegment {
+                chromosome: parts[0].to_string(),
+                start: parse_field_u32(parts[1], "start", line_num)? as u64,
+                end: parse_field_u32(parts[2], "end", line_num)? as u64,
+                segment_id: parts[3].to_string(),
+                bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
+                sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
+                    as u64,
+                weight: parse_field(parts[6], "weight", line_num)?,
+                copy_number: parse_field(parts[7], "copy_number", line_num)?,
+                minor_allele_copy_number: if parts[8].is_empty() {
+                    None
+                } else {
+                    Some(parse_field(parts[8], "minor_allele_copy_number", line_num)?)
+                },
+                mean_baf: if parts[9].is_empty() {
+                    None
+                } else {
+                    Some(parse_field(parts[9], "mean_baf", line_num)?)
+                },
+                no_het_snps: if parts[10].is_empty() {
+                    0
+                } else {
+                    parse_field_u32(parts[10], "no_het_snps", line_num)?
+                },
+            };
+
+            segments.push(segment);
+        }
+
+        Ok(Self { segments })
+    }
+
+}

+ 9 - 1
src/lib.rs

@@ -43,7 +43,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, pipes::somatic::const_stats, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
+    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::CNSegment, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, pipes::somatic::const_stats, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -793,4 +793,12 @@ Ok(())
         });
         Ok(())
     }
+
+    #[test]
+    fn parse_savana_seg() {
+        init();
+        let r = CNSegment::parse_file("/data/longreads_basic_pipe/ADJAGBA/diag/savana/ADJAGBA_diag_hs1_HP_segmented_absolute_copy_number.tsv").unwrap();
+        println!("{} lines", r.len());
+        println!("{:#?}", r.first().unwrap());
+    }
 }