Browse Source

straglr somatic change

Thomas 20 hours ago
parent
commit
081a1b411b
7 changed files with 763 additions and 310 deletions
  1. 12 4
      pandora-config.example.toml
  2. 2 2
      src/callers/clairs.rs
  3. 7 4
      src/callers/mod.rs
  4. 87 39
      src/callers/nanomonsv.rs
  5. 446 95
      src/callers/straglr.rs
  6. 6 1
      src/config.rs
  7. 203 165
      src/io/straglr.rs

+ 12 - 4
pandora-config.example.toml

@@ -293,8 +293,12 @@ straglr_bin = "/home/t_steimle/.conda/envs/straglr_env/bin/straglr.py"
 # RepeatMasker Simple_repeat
 straglr_loci_bed = "/home/t_steimle/ref/hs1/simple_repeat_ucsc_hs1.bed"
 
-# Size of reported difference between normal and tumoral
-straglr_min_diff = 4
+# Minimum allele size difference in bp to report as changed between normal and tumoral
+straglr_min_size_diff = 4
+
+# Minimum read support required for an allele to be considered for 
+# change between normal and tumoral
+straglr_min_support_diff = 2
 
 # Minimum read support for STR genotyping.
 straglr_min_support = 2
@@ -370,7 +374,7 @@ modkit_summary_file = "{result_dir}/{id}/{time}/{id}_{time}_5mC_5hmC_summary.txt
 #######################################
 
 # Path to nanomonsv binary.
-nanomonsv_bin = "/home/prom/.local/bin/nanomonsv"
+nanomonsv_bin = "/home/t_steimle/.conda/envs/nanomonsv_env/bin/nanomonsv"
 
 # Paired nanomonsv output directory template.
 # {result_dir}, {id}, {time}
@@ -380,7 +384,7 @@ nanomonsv_output_dir = "{result_dir}/{id}/{time}/nanomonsv"
 nanomonsv_force = false
 
 # Threads for nanomonsv.
-nanomonsv_threads = 150
+nanomonsv_threads = 40
 
 # Paired nanomonsv PASSED VCF template.
 # {output_dir}, {id}
@@ -394,6 +398,10 @@ nanomonsv_solo_output_dir = "{result_dir}/{id}/{time}/nanomonsv-solo"
 # {output_dir}, {id}, {time}
 nanomonsv_solo_passed_vcf = "{output_dir}/{id}_{time}_nanomonsv-solo_PASSED.vcf.gz"
 
+# Path to simple repeat BED file for nanomonsv.
+# https://github.com/friend1ws/nanomonsv
+# Warning TBI index should exists
+nanomonsv_simple_repeat_bed = "/home/t_steimle/ref/hs1/human_chm13v2.0_simpleRepeat.bed.gz"
 
 #######################################
 # PromethION metadata

+ 2 - 2
src/callers/clairs.rs

@@ -536,8 +536,8 @@ impl ClairS {
         let passed_vcf = self.somatic_passed_vcf_path();
 
         // Filter PASS
-        let mut keep_pass =
-            BcftoolsKeepPass::from_config(&self.config, tmp_path.clone(), passed_vcf.clone());
+        let keep_pass =
+            BcftoolsKeepPass::from_config(&self.config, tmp_path, passed_vcf.clone());
 
         Ok(keep_pass)
     }

+ 7 - 4
src/callers/mod.rs

@@ -125,7 +125,7 @@
 use crate::{
     callers::{
         clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
-        savana::Savana, severus::Severus,
+        savana::Savana, severus::Severus, straglr::Straglr,
     },
     config::Config,
     pipes::{Initialize, InitializeSolo},
@@ -204,15 +204,15 @@ pub mod straglr;
 /// # Ok::<(), anyhow::Error>(())
 /// ```
 pub fn run_somatic_callers(id: &str, config: &Config) -> anyhow::Result<()> {
+    // ClairS - somatic SNV/indels with haplotype awareness
+    ClairS::initialize(id, config)?.run()?;
+
     // DeepVariant - germline variants for normal sample
     DeepVariant::initialize(id, &config.normal_name, config)?.run()?;
 
     // DeepVariant - germline variants for tumor sample
     DeepVariant::initialize(id, &config.tumoral_name, config)?.run()?;
 
-    // ClairS - somatic SNV/indels with haplotype awareness
-    ClairS::initialize(id, config)?.run()?;
-
     // Severus - structural variants and VNTRs
     Severus::initialize(id, config)?.run()?;
 
@@ -225,6 +225,9 @@ pub fn run_somatic_callers(id: &str, config: &Config) -> anyhow::Result<()> {
     // DeepSomatic - somatic SNV/indels
     DeepSomatic::initialize(id, config)?.run()?;
 
+    // Straglr - Short Tandem Repeat (STR) genotyper
+    Straglr::initialize(id, config)?.run()?;
+
     Ok(())
 }
 

+ 87 - 39
src/callers/nanomonsv.rs

@@ -69,6 +69,7 @@
 //! - [NanomonSV GitHub](https://github.com/friend1ws/nanomonsv)
 //! - [NanomonSV Paper](https://doi.org/10.1186/s13059-020-02175-y)
 use rayon::prelude::*;
+use regex::Regex;
 use std::{
     fs::{self},
     path::{Path, PathBuf},
@@ -82,7 +83,8 @@ use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::vcf::Vcf,
     commands::{
-        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner, bcftools::{BcftoolsConcat, BcftoolsKeepPass}
+        bcftools::{BcftoolsConcat, BcftoolsKeepPass},
+        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
     },
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists},
@@ -91,8 +93,8 @@ use crate::{
     run,
     runners::Run,
     variant::{
-        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
+        vcf_variant::{Label, Variants},
     },
 };
 
@@ -121,11 +123,17 @@ pub struct NanomonSV {
     job_args: Vec<String>,
     /// Number of threads for parallel execution
     threads: u8,
+    memory: Option<u16>,
 }
 
 impl JobCommand for NanomonSV {
     fn cmd(&self) -> String {
-        format!("{} {}", self.config.nanomonsv_bin, self.job_args.join(" "))
+        format!(
+            "source {conda_sh} && conda activate nanomonsv_env && {bin} {args}",
+            conda_sh = self.config.conda_sh,
+            bin = self.config.nanomonsv_bin,
+            args = self.job_args.join(" ")
+        )
     }
 }
 
@@ -133,10 +141,11 @@ impl LocalRunner for NanomonSV {}
 
 impl SlurmRunner for NanomonSV {
     fn slurm_args(&self) -> Vec<String> {
+        let mem = self.memory.unwrap_or(40);
         SlurmParams {
             job_name: Some(format!("nanomonsv_{}", self.id)),
             cpus_per_task: Some(self.threads as u32),
-            mem: Some("60G".into()),
+            mem: Some(format!("{mem}G")),
             partition: Some("shortq".into()),
             gres: None,
         }
@@ -145,10 +154,11 @@ impl SlurmRunner for NanomonSV {
 }
 impl SbatchRunner for NanomonSV {
     fn slurm_params(&self) -> SlurmParams {
+        let mem = self.memory.unwrap_or(40);
         SlurmParams {
             job_name: Some(format!("nanomonsv_{}", self.id)),
             cpus_per_task: Some(self.threads as u32),
-            mem: Some("60G".into()),
+            mem: Some(format!("{mem}G")),
             partition: Some("shortq".into()),
             gres: None,
         }
@@ -184,6 +194,7 @@ impl Initialize for NanomonSV {
             config: config.clone(),
             job_args: Vec::new(),
             threads: config.nanomonsv_threads,
+            memory: None,
         };
 
         if nanomonsv.config.nanomonsv_force {
@@ -251,38 +262,38 @@ impl Run for NanomonSV {
         }
 
         let diag_bam = self.config.tumoral_bam(&self.id);
-        let mrd_bam = self.config.normal_bam(&self.id);
+        let normal_bam = self.config.normal_bam(&self.id);
 
-        let diag_out_dir = self.config.nanomonsv_output_dir(&self.id, "diag");
-        let mrd_out_dir = self.config.nanomonsv_output_dir(&self.id, "mrd");
+        let diag_out_dir = self.config.nanomonsv_output_dir(&self.id, &self.config.tumoral_name);
+        let normal_out_dir = self.config.nanomonsv_output_dir(&self.id, &self.config.normal_name);
         let vcf_passed = self.config.nanomonsv_passed_vcf(&self.id);
 
         fs::create_dir_all(&diag_out_dir)?;
-        fs::create_dir_all(&mrd_out_dir)?;
+        fs::create_dir_all(&normal_out_dir)?;
 
         somatic_parse(
             &self.id,
             &diag_bam,
-            &mrd_bam,
+            &normal_bam,
             &diag_out_dir,
-            &mrd_out_dir,
+            &normal_out_dir,
             &self.config,
             &self.log_dir,
         )?;
 
-        let diag_out_prefix = format!("{}/{}_diag", diag_out_dir, self.id);
-        let mrd_out_prefix = format!("{}/{}_mrd", mrd_out_dir, self.id);
+        let diag_out_prefix = format!("{}/{}_{}", diag_out_dir, self.id, self.config.tumoral_name);
+        let normal_out_prefix = format!("{}/{}_{}", normal_out_dir, self.id, self.config.normal_name);
 
         let diag_result_vcf = format!("{diag_out_prefix}.nanomonsv.result.vcf");
-        let mrd_result_vcf = format!("{mrd_out_prefix}.nanomonsv.result.vcf");
+        let normal_result_vcf = format!("{normal_out_prefix}.nanomonsv.result.vcf");
 
-        if !Path::new(&mrd_result_vcf).exists() {
-            info!("Nanomonsv get from normal bam: {mrd_bam}.");
-            let report = nanomonsv_get(&mrd_bam, &mrd_out_prefix, None, None, &self.config)
+        if !Path::new(&normal_result_vcf).exists() {
+            info!("Nanomonsv get from normal bam: {normal_bam}.");
+            let report = nanomonsv_get(&normal_bam, &normal_out_prefix, None, None, &self.config)
                 .context(format!(
-                    "Error while running NanomonSV get for {mrd_result_vcf}"
+                    "Error while running NanomonSV get for {normal_result_vcf}"
                 ))?;
-            report.save_to_file(format!("{}/nanomonsv_get_mrd_", self.log_dir))?;
+            report.save_to_file(format!("{}/nanomonsv_get_{}_", self.log_dir, self.config.normal_name))?;
         } else {
             debug!(
                 "NanomonSV `get` results already exists for {} normal, skipping execution.",
@@ -295,14 +306,14 @@ impl Run for NanomonSV {
             let report = nanomonsv_get(
                 &diag_bam,
                 &diag_out_prefix,
-                Some(&mrd_bam),
-                Some(&mrd_out_prefix),
+                Some(&normal_bam),
+                Some(&normal_out_prefix),
                 &self.config,
             )
             .context(format!(
                 "Error while running NanomonSV get for {diag_result_vcf}"
             ))?;
-            report.save_to_file(format!("{}/nanomonsv_get_diag_", self.log_dir))?;
+            report.save_to_file(format!("{}/nanomonsv_get_{}_", self.log_dir, self.config.tumoral_name))?;
         } else {
             debug!(
                 "NanomonSV `get` results already exists for {} tumoral, skipping execution.",
@@ -338,10 +349,6 @@ impl CallerCat for NanomonSV {
 }
 
 impl Version for NanomonSV {
-    /// Retrieves the NanomonSV version by running `nanomonsv --version`.
-    ///
-    /// # Errors
-    /// Returns an error if command execution fails or "Version " not found in output.
     fn version(config: &Config) -> anyhow::Result<String> {
         let mut runner = NanomonSV {
             id: "version".to_string(),
@@ -349,21 +356,34 @@ impl Version for NanomonSV {
             config: config.clone(),
             job_args: vec!["--version".into()],
             threads: 1,
+            memory: Some(2),
         };
+
         let out = run!(&runner.config, &mut runner)
-            .context("Error while running `NanomonSV --version`")?;
-        let log = format!("{}{}", out.stdout, out.stderr);
-        let start = log
-            .find("stdout: nanomonsv ")
-            .context("Failed to find 'stdout: nanomonsv ' in the log")?;
-        let start_index = start + "stdout: nanomonsv ".len();
-        let end = log[start_index..]
-            .find('\n')
-            .context("Failed to find newline after 'stdout: nanomonsv '")?;
-        Ok(log[start_index..start_index + end]
-            .to_string()
-            .trim()
-            .to_string())
+            .context("Error while running `nanomonsv --version`")?;
+
+        // Prefer stdout; fallback to combined logs if needed.
+        let candidates = [
+            out.stdout.clone(),
+            out.stderr.clone(),
+            format!("{}{}", out.stdout, out.stderr),
+        ];
+
+        // Matches:
+        // "nanomonsv 0.8.0"
+        // (and tolerates leading junk)
+        let re = Regex::new(r"(?m)\bnanomonsv\s+v?([0-9]+(?:\.[0-9]+)*(?:[-+][0-9A-Za-z.\-]+)?)\b")
+            .context("Failed to compile nanomonsv version regex")?;
+
+        for text in candidates {
+            if let Some(caps) = re.captures(&text) {
+                return Ok(caps[1].to_string());
+            }
+        }
+
+        Err(anyhow::anyhow!(
+            "Failed to find nanomonsv version in command output"
+        ))
     }
 }
 
@@ -621,6 +641,7 @@ pub fn nanomonsv_parse(
         config: config.clone(),
         job_args: args,
         threads: config.nanomonsv_threads,
+        memory: Some(20),
     };
     run!(config, &mut runner)
 }
@@ -734,6 +755,8 @@ pub fn nanomonsv_get(
     args.extend(vec![
         "--process".into(),
         threads,
+        "--simple_repeat_bed ".into(),
+        config.nanomonsv_simple_repeat_bed.to_string(),
         out_prefix.to_string(),
         bam.to_string(),
         config.reference.clone(),
@@ -744,6 +767,7 @@ pub fn nanomonsv_get(
         config: config.clone(),
         job_args: args,
         threads: config.nanomonsv_threads,
+        memory: Some(40),
     };
     run!(config, &mut runner)
 }
@@ -883,3 +907,27 @@ pub fn replace_filename_suffix(path: &Path, from: &str, to: &str) -> PathBuf {
 
     path.with_file_name(new_file_name)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::helpers::test_init;
+
+    #[test]
+    fn nanomonsv_version() -> anyhow::Result<()> {
+        test_init();
+        let vl = NanomonSV::version(&Config::default())?;
+        info!("Nanomonsv version: {vl}");
+        Ok(())
+    }
+
+    #[test]
+    fn nanomonsv_run() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let mut caller = NanomonSV::initialize("DUMCO", &config)?;
+        caller.run()?;
+        Ok(())
+    }
+}

+ 446 - 95
src/callers/straglr.rs

@@ -135,6 +135,8 @@ use crate::{
 use anyhow::Context;
 use log::{debug, info};
 use std::{
+    collections::HashMap,
+    fmt,
     fs::{self, File},
     io::{BufRead, BufReader, Write},
     path::Path,
@@ -329,7 +331,12 @@ impl Run for Straglr {
             }
         }
 
-        let differences = self.find_somatic_changes(self.config.straglr_min_diff)?;
+        let differences = self.find_somatic_changes(
+            self.config.straglr_min_size_diff.into(),
+            self.config.straglr_min_support_diff,
+        )?;
+        let stats = compute_stats(&differences);
+        info!("{}", stats.summary());
         self.save_somatic_changes(&differences, &self.config.straglr_tumor_normal_diff_tsv(id))?;
         Ok(())
     }
@@ -337,12 +344,6 @@ impl Run for Straglr {
 
 impl Straglr {
     /// Loads and parses the normal sample Straglr TSV results.
-    ///
-    /// # Returns
-    /// Vector of STR loci from the normal sample
-    ///
-    /// # Errors
-    /// Returns an error if the TSV file cannot be read or parsed.
     pub fn load_normal_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
         let tsv_path = self.config.straglr_normal_tsv(&self.id);
         read_straglr_tsv(&tsv_path).context(format!(
@@ -352,12 +353,6 @@ impl Straglr {
     }
 
     /// Loads and parses the tumor sample Straglr TSV results.
-    ///
-    /// # Returns
-    /// Vector of STR loci from the tumor sample
-    ///
-    /// # Errors
-    /// Returns an error if the TSV file cannot be read or parsed.
     pub fn load_tumor_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
         let tsv_path = self.config.straglr_tumor_tsv(&self.id);
         read_straglr_tsv(&tsv_path).context(format!(
@@ -366,69 +361,114 @@ impl Straglr {
         ))
     }
 
-    /// Loads both normal and tumor results as a tuple.
+    /// Finds somatic STR changes between tumor and normal samples.
     ///
-    /// # Returns
-    /// `(normal_results, tumor_results)` tuple
-    ///
-    /// # Errors
-    /// Returns an error if either TSV file cannot be read or parsed.
-    pub fn load_results(&self) -> anyhow::Result<(Vec<StraglrRow>, Vec<StraglrRow>)> {
-        Ok((self.load_normal_results()?, self.load_tumor_results()?))
-    }
-
-    /// Finds STR loci that differ between tumor and normal samples.
-    ///
-    /// Compares copy numbers at matching loci to identify somatic STR changes.
+    /// Reports loci that are either:
+    /// 1. Present only in tumor (de novo) with sufficient support
+    /// 2. Present in both with allele size difference exceeding threshold
     ///
     /// # Arguments
-    /// * `min_difference` - Minimum copy number difference to report (default: 2)
+    /// * `min_size_diff` - Minimum allele size difference in bp to report as changed
+    /// * `min_support` - Minimum read support required for an allele to be considered
     ///
     /// # Returns
-    /// Vector of tuples: `(locus_id, normal_row, tumor_row, copy_number_diff)`
-    ///
-    /// # Errors
-    /// Returns an error if results cannot be loaded.
+    /// Vector of `SomaticStrChange` containing matched loci with differences
     pub fn find_somatic_changes(
         &self,
-        min_difference: u32,
-    ) -> anyhow::Result<Vec<(String, StraglrRow, StraglrRow, i64)>> {
-        let (normal, tumor) = self.load_results()?;
+        min_size_diff: f64,
+        min_support: u32,
+    ) -> anyhow::Result<Vec<SomaticStrChange>> {
+        let normal = self.load_normal_results()?;
+        let tumor = self.load_tumor_results()?;
+
+        // Index normal by locus key, filtering by min_support
+        let normal_map: HashMap<(String, u64, u64, String), StraglrRow> = normal
+            .into_iter()
+            .filter(|r| r.support >= min_support)
+            .map(|r| ((r.chrom.clone(), r.start, r.end, r.repeat_unit.clone()), r))
+            .collect();
 
-        // Index tumor by location for O(1) lookup
-        let tumor_map: std::collections::HashMap<(String, u64, u64), StraglrRow> = tumor
+        // Index tumor by locus key, filtering by min_support
+        let tumor_map: HashMap<(String, u64, u64, String), StraglrRow> = tumor
             .into_iter()
-            .map(|r| ((r.chrom.clone(), r.start, r.end), r))
+            .filter(|r| r.support >= min_support)
+            .map(|r| ((r.chrom.clone(), r.start, r.end, r.repeat_unit.clone()), r))
             .collect();
 
         let mut changes = Vec::new();
 
-        for normal_row in normal {
-            let key = (normal_row.chrom.clone(), normal_row.start, normal_row.end);
-
-            if let Some(tumor_row) = tumor_map.get(&key) {
-                if let (Some(normal_cn), Some(tumor_cn)) =
-                    (normal_row.max_copy_number(), tumor_row.max_copy_number())
-                {
-                    let diff = tumor_cn as i64 - normal_cn as i64;
-                    if diff.abs() >= min_difference as i64 {
-                        let location = normal_row.location_string();
-                        changes.push((location, normal_row, tumor_row.clone(), diff));
+        // Check tumor loci
+        for (key, tumor_row) in &tumor_map {
+            match normal_map.get(key) {
+                // Paired: compare alleles
+                Some(normal_row) => {
+                    let comparisons = compare_alleles(normal_row, tumor_row, min_support);
+
+                    // Find the largest absolute size difference and its direction
+                    let max_diff_comparison = comparisons
+                        .iter()
+                        .filter_map(|c| c.size_diff_bp.map(|d| (d, c)))
+                        .max_by(|(a, _), (b, _)| a.abs().partial_cmp(&b.abs()).unwrap());
+
+                    let has_novel = comparisons.iter().any(|c| c.normal.is_none());
+
+                    let (passes_threshold, status) = match max_diff_comparison {
+                        Some((diff, _)) if diff.abs() >= min_size_diff => {
+                            let status = if diff > 0.0 {
+                                ChangeStatus::Expansion
+                            } else {
+                                ChangeStatus::Contraction
+                            };
+                            (true, status)
+                        }
+                        _ if has_novel => (true, ChangeStatus::Expansion), // Novel allele = expansion
+                        _ => (false, ChangeStatus::Expansion), // Won't be used
+                    };
+
+                    if passes_threshold {
+                        changes.push(SomaticStrChange {
+                            chrom: tumor_row.chrom.clone(),
+                            start: tumor_row.start,
+                            end: tumor_row.end,
+                            repeat_unit: tumor_row.repeat_unit.clone(),
+                            normal: Some(normal_row.clone()),
+                            tumor: Some(tumor_row.clone()),
+                            comparisons,
+                            status,
+                        });
                     }
                 }
+                // Tumor-only: de novo (already filtered by min_support)
+                None => {
+                    changes.push(SomaticStrChange {
+                        chrom: tumor_row.chrom.clone(),
+                        start: tumor_row.start,
+                        end: tumor_row.end,
+                        repeat_unit: tumor_row.repeat_unit.clone(),
+                        normal: None,
+                        tumor: Some(tumor_row.clone()),
+                        comparisons: vec![],
+                        status: ChangeStatus::TumorOnly,
+                    });
+                }
             }
         }
 
+        // Sort by genomic position
+        changes.sort_by(|a, b| {
+            a.chrom
+                .cmp(&b.chrom)
+                .then(a.start.cmp(&b.start))
+                .then(a.end.cmp(&b.end))
+        });
+
         Ok(changes)
     }
 
     /// Saves somatic STR changes to a TSV file.
-    ///
-    /// # Format
-    /// Tab-separated: chrom, start, end, repeat_unit, normal_cn, tumor_cn, diff, normal_support, tumor_support
     pub fn save_somatic_changes(
         &self,
-        differences: &[(String, StraglrRow, StraglrRow, i64)],
+        changes: &[SomaticStrChange],
         output_path: &str,
     ) -> anyhow::Result<()> {
         use std::io::Write;
@@ -436,37 +476,51 @@ impl Straglr {
         let mut file = File::create(output_path)
             .context(format!("Failed to create output file: {}", output_path))?;
 
-        // Header
         writeln!(
             file,
-            "#chrom\tstart\tend\trepeat_unit\tnormal_genotype\ttumor_genotype\tnormal_cn\ttumor_cn\tdiff\tnormal_support\ttumor_support"
+            "#chrom\tstart\tend\trepeat_unit\tnormal_genotype\ttumor_genotype\tmax_size_diff\tnormal_support\ttumor_support\tstatus"
         )?;
 
-        for (_, normal, tumor, diff) in differences {
+        for change in changes {
+            let normal_geno = change
+                .normal
+                .as_ref()
+                .map(|r| r.genotype.as_str())
+                .unwrap_or(".");
+            let tumor_geno = change
+                .tumor
+                .as_ref()
+                .map(|r| r.genotype.as_str())
+                .unwrap_or(".");
+            let normal_support = change
+                .normal
+                .as_ref()
+                .map(|r| r.support.to_string())
+                .unwrap_or_else(|| ".".to_string());
+            let tumor_support = change
+                .tumor
+                .as_ref()
+                .map(|r| r.support.to_string())
+                .unwrap_or_else(|| ".".to_string());
+
+            let max_size = change
+                .max_size_diff()
+                .map(|d| format!("{:.1}", d))
+                .unwrap_or_else(|| ".".to_string());
+
             writeln!(
                 file,
-                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
-                normal.chrom,
-                normal.start,
-                normal.end,
-                normal.repeat_unit,
-                normal.genotype,
-                tumor.genotype,
-                normal
-                    .copy_numbers
-                    .iter()
-                    .map(|n| n.to_string())
-                    .collect::<Vec<_>>()
-                    .join(","),
-                tumor
-                    .copy_numbers
-                    .iter()
-                    .map(|n| n.to_string())
-                    .collect::<Vec<_>>()
-                    .join(","),
-                diff,
-                normal.support,
-                tumor.support,
+                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
+                change.chrom,
+                change.start,
+                change.end,
+                change.repeat_unit,
+                normal_geno,
+                tumor_geno,
+                max_size,
+                normal_support,
+                tumor_support,
+                change.status,
             )?;
         }
 
@@ -475,6 +529,171 @@ impl Straglr {
     }
 }
 
+/// Status of somatic STR change.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ChangeStatus {
+    /// Tumor allele larger than normal
+    Expansion,
+    /// Tumor allele smaller than normal
+    Contraction,
+    /// Present only in tumor
+    TumorOnly,
+}
+
+impl fmt::Display for ChangeStatus {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            ChangeStatus::Expansion => write!(f, "EXPANSION"),
+            ChangeStatus::Contraction => write!(f, "CONTRACTION"),
+            ChangeStatus::TumorOnly => write!(f, "TUMOR_ONLY"),
+        }
+    }
+}
+
+/// Per-allele comparison between normal and tumor.
+#[derive(Debug, Clone)]
+pub struct AlleleComparison {
+    /// Normal allele size in bp (None if tumor-only allele)
+    pub normal: Option<f64>,
+    /// Normal allele read support
+    pub normal_support: Option<u32>,
+    /// Tumor allele size in bp (None if lost in tumor)
+    pub tumor: Option<f64>,
+    /// Tumor allele read support
+    pub tumor_support: Option<u32>,
+    /// Size difference in bp (tumor - normal)
+    pub size_diff_bp: Option<f64>,
+}
+
+/// A somatic STR change between tumor and normal.
+#[derive(Debug, Clone)]
+pub struct SomaticStrChange {
+    pub chrom: String,
+    pub start: u64,
+    pub end: u64,
+    pub repeat_unit: String,
+    pub normal: Option<StraglrRow>,
+    pub tumor: Option<StraglrRow>,
+    pub comparisons: Vec<AlleleComparison>,
+    pub status: ChangeStatus,
+}
+
+impl SomaticStrChange {
+    /// Returns location string.
+    pub fn location_string(&self) -> String {
+        format!("{}:{}-{}", self.chrom, self.start, self.end)
+    }
+
+    /// Returns max absolute size difference across alleles.
+    pub fn max_size_diff(&self) -> Option<f64> {
+        self.comparisons
+            .iter()
+            .filter_map(|c| c.size_diff_bp.map(|d| d.abs()))
+            .max_by(|a, b| a.partial_cmp(b).unwrap())
+    }
+
+    /// Returns true if there are novel tumor alleles.
+    pub fn has_novel_allele(&self) -> bool {
+        self.comparisons.iter().any(|c| c.normal.is_none() && c.tumor.is_some())
+    }
+}
+
+/// Compares alleles between normal and tumor rows using greedy size matching.
+///
+/// Only considers alleles with at least `min_support` reads.
+/// Parses genotype strings in "size(count);size(count)" format and matches
+/// tumor alleles to closest normal alleles within tolerance.
+fn compare_alleles(
+    normal: &StraglrRow,
+    tumor: &StraglrRow,
+    min_support: u32,
+) -> Vec<AlleleComparison> {
+    let normal_alleles = parse_genotype_alleles(&normal.genotype)
+        .into_iter()
+        .filter(|(_, support)| *support >= min_support)
+        .collect::<Vec<_>>();
+    let tumor_alleles = parse_genotype_alleles(&tumor.genotype)
+        .into_iter()
+        .filter(|(_, support)| *support >= min_support)
+        .collect::<Vec<_>>();
+
+    let mut comparisons = Vec::new();
+    let mut matched_normal = vec![false; normal_alleles.len()];
+
+    // Tolerance for matching: alleles within 10bp are considered same
+    let match_tolerance = 10.0;
+
+    // Match each tumor allele to closest normal
+    for (tumor_size, tumor_support) in &tumor_alleles {
+        let mut best: Option<(usize, f64)> = None;
+
+        for (i, (normal_size, _)) in normal_alleles.iter().enumerate() {
+            if matched_normal[i] {
+                continue;
+            }
+            let diff = (*tumor_size - *normal_size).abs();
+            if diff <= match_tolerance
+                && (best.is_none() || diff < best.unwrap().1) {
+                    best = Some((i, diff));
+                }
+        }
+
+        if let Some((idx, _)) = best {
+            matched_normal[idx] = true;
+            let (normal_size, normal_support) = normal_alleles[idx];
+            comparisons.push(AlleleComparison {
+                normal: Some(normal_size),
+                normal_support: Some(normal_support),
+                tumor: Some(*tumor_size),
+                tumor_support: Some(*tumor_support),
+                size_diff_bp: Some(*tumor_size - normal_size),
+            });
+        } else {
+            // Novel tumor allele
+            comparisons.push(AlleleComparison {
+                normal: None,
+                normal_support: None,
+                tumor: Some(*tumor_size),
+                tumor_support: Some(*tumor_support),
+                size_diff_bp: None,
+            });
+        }
+    }
+
+    // Add unmatched normal alleles (lost in tumor)
+    for (i, (normal_size, normal_support)) in normal_alleles.iter().enumerate() {
+        if !matched_normal[i] {
+            comparisons.push(AlleleComparison {
+                normal: Some(*normal_size),
+                normal_support: Some(*normal_support),
+                tumor: None,
+                tumor_support: None,
+                size_diff_bp: None,
+            });
+        }
+    }
+
+    comparisons
+}
+
+/// Parses genotype string "size(count);size(count)" into (size, support) pairs.
+fn parse_genotype_alleles(genotype: &str) -> Vec<(f64, u32)> {
+    genotype
+        .split(';')
+        .filter_map(|part| {
+            let part = part.trim();
+            if part.is_empty() || part == "." {
+                return None;
+            }
+            let open = part.find('(')?;
+            let close = part.find(')')?;
+            let size: f64 = part[..open].parse().ok()?;
+            let support: u32 = part[open + 1..close].parse().ok()?;
+            Some((size, support))
+        })
+        .collect()
+}
+
 #[derive(Debug, Clone)]
 struct StraglrJob {
     conda_sh: String,
@@ -728,7 +947,7 @@ impl StraglrSolo {
     ///
     /// # Errors
     /// Returns an error if results cannot be loaded.
-    pub fn load_expanded_repeats(&self, min_copy_number: u32) -> anyhow::Result<Vec<StraglrRow>> {
+    pub fn load_expanded_repeats(&self, min_copy_number: f64) -> anyhow::Result<Vec<StraglrRow>> {
         let results = self.load_results()?;
         Ok(results
             .into_iter()
@@ -899,19 +1118,19 @@ pub fn run_straglr_chunked(
     }
 
     // Run all chunks in parallel
-    // info!("Executing {} Straglr jobs in parallel", actual_n_parts);
-    // let outputs = run_many!(config, jobs)?;
-    //
-    // // Save logs
-    // let log_dir = format!("{}/{}/log/straglr_chunked", config.result_dir, id);
-    // fs::create_dir_all(&log_dir).context("Failed to create log directory")?;
-    //
-    // for (i, output) in outputs.iter().enumerate() {
-    //     let log_file = format!("{}/straglr_part{}_", log_dir, i + 1);
-    //     output
-    //         .save_to_file(&log_file)
-    //         .context(format!("Failed to save logs for part {}", i + 1))?;
-    // }
+    info!("Executing {} Straglr jobs in parallel", actual_n_parts);
+    let outputs = run_many!(config, jobs)?;
+
+    // Save logs
+    let log_dir = format!("{}/{}/log/straglr_chunked", config.result_dir, id);
+    fs::create_dir_all(&log_dir).context("Failed to create log directory")?;
+
+    for (i, output) in outputs.iter().enumerate() {
+        let log_file = format!("{}/straglr_part{}_", log_dir, i + 1);
+        output
+            .save_to_file(&log_file)
+            .context(format!("Failed to save logs for part {}", i + 1))?;
+    }
 
     // Merge TSV files
     info!("Merging {} TSV files", actual_n_parts);
@@ -988,6 +1207,141 @@ fn merge_tsv_files(input_files: &[String], output_file: &str) -> anyhow::Result<
     Ok(())
 }
 
+/// Summary statistics for somatic STR changes.
+#[derive(Debug, Clone, Default)]
+pub struct SomaticStrStats {
+    /// Total number of somatic changes
+    pub total: usize,
+    /// Number of expansions
+    pub expansions: usize,
+    /// Number of contractions
+    pub contractions: usize,
+    /// Number of tumor-only loci
+    pub tumor_only: usize,
+    /// Mean size difference (bp) for paired loci
+    pub mean_size_diff: f64,
+    /// Median size difference (bp) for paired loci
+    pub median_size_diff: f64,
+    /// Max size difference (bp)
+    pub max_size_diff: f64,
+    /// Min size difference (bp)
+    pub min_size_diff: f64,
+    /// Size differences by repeat unit motif
+    pub by_motif: HashMap<String, MotifStats>,
+    /// Count by chromosome
+    pub by_chrom: HashMap<String, usize>,
+}
+
+/// Per-motif statistics.
+#[derive(Debug, Clone, Default)]
+pub struct MotifStats {
+    pub count: usize,
+    pub expansions: usize,
+    pub contractions: usize,
+    pub mean_size_diff: f64,
+}
+
+/// Computes summary statistics for somatic STR changes.
+pub fn compute_stats(changes: &[SomaticStrChange]) -> SomaticStrStats {
+    if changes.is_empty() {
+        return SomaticStrStats::default();
+    }
+
+    let mut stats = SomaticStrStats {
+        total: changes.len(),
+        ..Default::default()
+    };
+
+    let mut size_diffs: Vec<f64> = Vec::new();
+    let mut motif_diffs: HashMap<String, Vec<f64>> = HashMap::new();
+
+    for change in changes {
+        // Count by status
+        match change.status {
+            ChangeStatus::Expansion => stats.expansions += 1,
+            ChangeStatus::Contraction => stats.contractions += 1,
+            ChangeStatus::TumorOnly => stats.tumor_only += 1,
+        }
+
+        // Count by chromosome
+        *stats.by_chrom.entry(change.chrom.clone()).or_insert(0) += 1;
+
+        // Collect size differences for paired loci
+        if let Some(diff) = change.max_size_diff() {
+            size_diffs.push(diff);
+            motif_diffs
+                .entry(change.repeat_unit.clone())
+                .or_default()
+                .push(diff);
+        }
+
+        // Track motif counts
+        let motif_stat = stats.by_motif.entry(change.repeat_unit.clone()).or_default();
+        motif_stat.count += 1;
+        match change.status {
+            ChangeStatus::Expansion => motif_stat.expansions += 1,
+            ChangeStatus::Contraction => motif_stat.contractions += 1,
+            ChangeStatus::TumorOnly => {}
+        }
+    }
+
+    // Compute global size diff stats
+    if !size_diffs.is_empty() {
+        size_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        
+        stats.mean_size_diff = size_diffs.iter().sum::<f64>() / size_diffs.len() as f64;
+        stats.min_size_diff = size_diffs[0];
+        stats.max_size_diff = size_diffs[size_diffs.len() - 1];
+        
+        let mid = size_diffs.len() / 2;
+        stats.median_size_diff = if size_diffs.len().is_multiple_of(2) {
+            (size_diffs[mid - 1] + size_diffs[mid]) / 2.0
+        } else {
+            size_diffs[mid]
+        };
+    }
+
+    // Compute per-motif mean size diff
+    for (motif, diffs) in motif_diffs {
+        if let Some(motif_stat) = stats.by_motif.get_mut(&motif) {
+            if !diffs.is_empty() {
+                motif_stat.mean_size_diff = diffs.iter().sum::<f64>() / diffs.len() as f64;
+            }
+        }
+    }
+
+    stats
+}
+
+impl SomaticStrStats {
+    /// Returns a formatted summary string.
+    pub fn summary(&self) -> String {
+        format!(
+            "Total: {} | Expansions: {} | Contractions: {} | Tumor-only: {}\n\
+             Size diff (bp): mean={:.1}, median={:.1}, range=[{:.1}, {:.1}]\n\
+             Motifs: {}",
+            self.total,
+            self.expansions,
+            self.contractions,
+            self.tumor_only,
+            self.mean_size_diff,
+            self.median_size_diff,
+            self.min_size_diff,
+            self.max_size_diff,
+            self.by_motif.len()
+        )
+    }
+
+    /// Returns expansion/contraction ratio.
+    pub fn expansion_ratio(&self) -> f64 {
+        if self.contractions == 0 {
+            f64::INFINITY
+        } else {
+            self.expansions as f64 / self.contractions as f64
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1007,10 +1361,7 @@ mod tests {
         let config = Config::default();
 
         let mut caller = Straglr::initialize("DUMCO", &config)?;
-        // caller.run()?;
-
-        let differences = caller.find_somatic_changes(caller.config.straglr_min_diff)?;
-        caller.save_somatic_changes(&differences, &caller.config.straglr_tumor_normal_diff_tsv("DUMCO"))?;
+        caller.run()?;
 
         Ok(())
     }

+ 6 - 1
src/config.rs

@@ -319,6 +319,8 @@ pub struct Config {
     /// Template for solo nanomonsv passed VCF (`{output_dir}`, `{id}`, `{time}`).
     pub nanomonsv_solo_passed_vcf: String,
 
+    pub nanomonsv_simple_repeat_bed: String,
+
     // === Straglr configuration ===
     /// Path to Straglr executable.
     pub straglr_bin: String,
@@ -327,7 +329,10 @@ pub struct Config {
     pub straglr_loci_bed: String,
 
     /// Size of reported difference between normal and tumoral
-    pub straglr_min_diff: u32,
+    pub straglr_min_size_diff: u32,
+
+    /// Minimum CN of reported difference between normal and tumoral
+    pub straglr_min_support_diff: u32,
 
     /// Minimum read support for STR genotyping.
     pub straglr_min_support: u32,

+ 203 - 165
src/io/straglr.rs

@@ -1,10 +1,12 @@
 //! Straglr TSV output parser
+//! Straglr TSV output parser
 //!
-//! Parses TSV files produced by Straglr STR genotyper.
+//! Parses TSV files produced by Straglr STR genotyper with `--genotype_in_size`.
 
 use anyhow::Context;
 use log::warn;
 use std::{
+    collections::HashMap,
     io::{BufRead, BufReader},
     str::FromStr,
 };
@@ -13,8 +15,8 @@ use super::readers::get_reader;
 
 /// Represents a single STR locus genotyped by Straglr.
 ///
-/// Each row corresponds to one STR locus with its genotype information,
-/// read support, and confidence metrics.
+/// Aggregated from per-read rows sharing the same (chrom, start, end, repeat_unit).
+/// The genotype field contains allele sizes in format "size(count)" or "size1(n1);size2(n2)".
 #[derive(Debug, Clone, PartialEq)]
 pub struct StraglrRow {
     /// Chromosome name (e.g., "chr4", "chrX")
@@ -25,86 +27,52 @@ pub struct StraglrRow {
     pub end: u64,
     /// Repeat unit motif (e.g., "CAG", "GGCCCC")
     pub repeat_unit: String,
-    /// Genotype as string (e.g., "12/13", "45/45", ".")
+    /// Genotype string: "size(count)" or "size1(n1);size2(n2)" for het
     pub genotype: String,
-    /// Copy number for each allele (e.g., [12, 13])
-    pub copy_numbers: Vec<u32>,
-    /// Allele lengths in base pairs
-    pub allele_lengths: Option<Vec<u32>>,
-    /// Read support count
+    /// Total read support (sum of counts from genotype)
     pub support: u32,
-    /// Confidence score
-    pub score: Option<f64>,
 }
 
-impl FromStr for StraglrRow {
+/// Raw per-read row from Straglr output (internal).
+#[derive(Debug)]
+struct RawReadRow {
+    chrom: String,
+    start: u64,
+    end: u64,
+    repeat_unit: String,
+    genotype: String,
+}
+
+impl FromStr for RawReadRow {
     type Err = anyhow::Error;
 
     /// Parses a single TSV line from Straglr output.
     ///
-    /// Expected format (tab-separated):
+    /// Expected format (11 tab-separated fields):
     /// ```text
-    /// #chrom  start   end repeat_unit genotype    copy_number support [optional: allele_length, score, ...]
+    /// #chrom  start  end  repeat_unit  genotype  read  copy_number  size  read_start  strand  allele
+    /// chr1    100075838  100075872  TTTAATT  35.9(54)  uuid  5.3  37  2186  +  35.9
     /// ```
-    ///
-    /// # Errors
-    /// Returns an error if required fields are missing or cannot be parsed.
     fn from_str(s: &str) -> anyhow::Result<Self> {
         let fields: Vec<&str> = s.split('\t').collect();
 
-        if fields.len() < 7 {
+        if fields.len() < 11 {
             anyhow::bail!(
-                "Invalid Straglr TSV line: expected at least 7 fields, got {}",
+                "Invalid Straglr TSV line: expected 11 fields, got {}",
                 fields.len()
             );
         }
 
-        let chrom = fields[0].to_string();
-        let start: u64 = fields[1]
-            .parse()
-            .context(format!("Failed to parse start position: {}", fields[1]))?;
-        let end: u64 = fields[2]
-            .parse()
-            .context(format!("Failed to parse end position: {}", fields[2]))?;
-        let repeat_unit = fields[3].to_string();
-        let genotype = fields[4].to_string();
-
-        // Parse copy numbers (format: "12,13" or "45")
-        let copy_numbers: Vec<u32> = fields[5]
-            .split(',')
-            .filter_map(|s| s.parse().ok())
-            .collect();
-
-        if copy_numbers.is_empty() && fields[5] != "." {
-            warn!("Failed to parse any copy numbers from: {}", fields[5]);
-        }
-
-        let support: u32 = fields[6]
-            .parse()
-            .context(format!("Failed to parse support: {}", fields[6]))?;
-
-        // Optional fields
-        let allele_lengths = fields.get(7).and_then(|s| {
-            let lengths: Vec<u32> = s.split(',').filter_map(|v| v.parse().ok()).collect();
-            if lengths.is_empty() {
-                None
-            } else {
-                Some(lengths)
-            }
-        });
-
-        let score = fields.get(8).and_then(|s| s.parse().ok());
-
         Ok(Self {
-            chrom,
-            start,
-            end,
-            repeat_unit,
-            genotype,
-            copy_numbers,
-            allele_lengths,
-            support,
-            score,
+            chrom: fields[0].to_string(),
+            start: fields[1]
+                .parse()
+                .context(format!("Failed to parse start: {}", fields[1]))?,
+            end: fields[2]
+                .parse()
+                .context(format!("Failed to parse end: {}", fields[2]))?,
+            repeat_unit: fields[3].to_string(),
+            genotype: fields[4].to_string(),
         })
     }
 }
@@ -115,26 +83,47 @@ impl StraglrRow {
         self.end.saturating_sub(self.start)
     }
 
-    /// Returns true if this locus has an expanded repeat (based on copy number threshold).
-    ///
-    /// # Arguments
-    /// * `threshold` - Minimum copy number to consider as expanded
-    pub fn is_expanded(&self, threshold: u32) -> bool {
-        self.copy_numbers.iter().any(|&cn| cn >= threshold)
+    /// Returns the motif length.
+    pub fn motif_length(&self) -> usize {
+        self.repeat_unit.len()
+    }
+
+    /// Returns allele sizes parsed from genotype string.
+    pub fn allele_sizes(&self) -> Vec<f64> {
+        parse_genotype_sizes(&self.genotype)
+    }
+
+    /// Returns copy numbers (size / motif_length) for each allele.
+    pub fn copy_numbers(&self) -> Vec<f64> {
+        let motif_len = self.motif_length() as f64;
+        self.allele_sizes()
+            .into_iter()
+            .map(|s| s / motif_len)
+            .collect()
+    }
+
+    /// Returns the maximum allele size in bp.
+    pub fn max_allele_size(&self) -> Option<f64> {
+        self.allele_sizes()
+            .into_iter()
+            .max_by(|a, b| a.partial_cmp(b).unwrap())
     }
 
     /// Returns the maximum copy number across all alleles.
-    pub fn max_copy_number(&self) -> Option<u32> {
-        self.copy_numbers.iter().max().copied()
+    pub fn max_copy_number(&self) -> Option<f64> {
+        self.copy_numbers()
+            .into_iter()
+            .max_by(|a, b| a.partial_cmp(b).unwrap())
+    }
+
+    /// Returns true if this locus has an expanded repeat.
+    pub fn is_expanded(&self, threshold_cn: f64) -> bool {
+        self.copy_numbers().iter().any(|&cn| cn >= threshold_cn)
     }
 
-    /// Returns true if the locus is homozygous (all alleles have same copy number).
+    /// Returns true if the locus is homozygous (single allele).
     pub fn is_homozygous(&self) -> bool {
-        if self.copy_numbers.len() <= 1 {
-            return true;
-        }
-        let first = self.copy_numbers[0];
-        self.copy_numbers.iter().all(|&cn| cn == first)
+        self.allele_sizes().len() <= 1
     }
 
     /// Returns the location as a string (e.g., "chr4:3074876-3074933").
@@ -143,89 +132,127 @@ impl StraglrRow {
     }
 }
 
+/// Parses genotype string "size(count)" or "size1(n1);size2(n2)" into sizes.
+fn parse_genotype_sizes(genotype: &str) -> Vec<f64> {
+    genotype
+        .split(';')
+        .filter_map(|part| {
+            let part = part.trim();
+            if part.is_empty() || part == "." {
+                return None;
+            }
+            if let Some(paren_idx) = part.find('(') {
+                part[..paren_idx].parse().ok()
+            } else {
+                part.parse().ok()
+            }
+        })
+        .collect()
+}
+
+/// Parses genotype string and returns total read support.
+fn parse_genotype_support(genotype: &str) -> u32 {
+    genotype
+        .split(';')
+        .filter_map(|part| {
+            let part = part.trim();
+            if part.is_empty() || part == "." {
+                return None;
+            }
+            let open = part.find('(')?;
+            let close = part.find(')')?;
+            part[open + 1..close].parse::<u32>().ok()
+        })
+        .sum()
+}
+
+/// Unique key for grouping reads by locus.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+struct LocusKey {
+    chrom: String,
+    start: u64,
+    end: u64,
+    repeat_unit: String,
+}
+
 /// Reads and parses a Straglr TSV output file.
 ///
+/// Aggregates per-read rows into unique loci.
 /// Skips header lines (starting with `#`) and empty lines.
-/// Logs warnings for malformed lines but continues parsing.
 ///
 /// # Arguments
 /// * `path` - Path to the Straglr TSV file (can be gzipped)
 ///
 /// # Returns
-/// `Ok(Vec<StraglrRow>)` with all successfully parsed STR loci
-///
-/// # Errors
-/// Returns an error if the file cannot be opened or read.
-///
-/// # Example
-/// ```ignore
-/// use pandora_lib_promethion::io::straglr::read_straglr_tsv;
-///
-/// let strs = read_straglr_tsv("/path/to/sample_straglr.tsv")?;
-/// for str_locus in strs {
-///     if str_locus.is_expanded(40) {
-///         println!("Expanded repeat at {}: {} copies",
-///                  str_locus.location_string(),
-///                  str_locus.max_copy_number().unwrap());
-///     }
-/// }
-/// # Ok::<(), anyhow::Error>(())
-/// ```
+/// `Ok(Vec<StraglrRow>)` with one entry per unique locus
 pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
     let reader = BufReader::new(get_reader(path)?);
-    let mut results = Vec::new();
+    let mut locus_map: HashMap<LocusKey, String> = HashMap::new();
 
     for (line_num, line) in reader.lines().enumerate() {
-        match line {
-            Ok(line) => {
-                // Skip header lines and empty lines
-                if line.starts_with('#') || line.trim().is_empty() {
-                    continue;
-                }
-
-                match line.parse::<StraglrRow>() {
-                    Ok(row) => results.push(row),
-                    Err(e) => warn!(
-                        "Failed to parse line {}: {} (error: {})",
-                        line_num + 1,
-                        line,
-                        e
-                    ),
-                }
+        let line = match line {
+            Ok(l) => l,
+            Err(e) => {
+                warn!("Failed to read line {}: {}", line_num + 1, e);
+                continue;
+            }
+        };
+
+        if line.starts_with('#') || line.trim().is_empty() {
+            continue;
+        }
+
+        match line.parse::<RawReadRow>() {
+            Ok(row) => {
+                let key = LocusKey {
+                    chrom: row.chrom,
+                    start: row.start,
+                    end: row.end,
+                    repeat_unit: row.repeat_unit,
+                };
+                // All reads at same locus share genotype; store first occurrence
+                locus_map.entry(key).or_insert(row.genotype);
             }
-            Err(e) => warn!("Failed to read line {}: {}", line_num + 1, e),
+            Err(e) => warn!("Failed to parse line {}: {} (error: {})", line_num + 1, line, e),
         }
     }
 
+    // Convert to StraglrRow
+    let mut results: Vec<StraglrRow> = locus_map
+        .into_iter()
+        .map(|(key, genotype)| {
+            let support = parse_genotype_support(&genotype);
+            StraglrRow {
+                chrom: key.chrom,
+                start: key.start,
+                end: key.end,
+                repeat_unit: key.repeat_unit,
+                genotype,
+                support,
+            }
+        })
+        .collect();
+
+    // Sort by genomic position
+    results.sort_by(|a, b| {
+        a.chrom
+            .cmp(&b.chrom)
+            .then(a.start.cmp(&b.start))
+            .then(a.end.cmp(&b.end))
+    });
+
     Ok(results)
 }
 
 /// Filters Straglr results to keep only loci with copy numbers above a threshold.
-///
-/// # Arguments
-/// * `rows` - Vector of Straglr STR loci
-/// * `min_copy_number` - Minimum copy number threshold
-///
-/// # Returns
-/// Filtered vector containing only expanded repeats
-pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: u32) -> Vec<StraglrRow> {
+pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: f64) -> Vec<StraglrRow> {
     rows.into_iter()
         .filter(|row| row.is_expanded(min_copy_number))
         .collect()
 }
 
 /// Groups Straglr results by chromosome.
-///
-/// # Arguments
-/// * `rows` - Vector of Straglr STR loci
-///
-/// # Returns
-/// HashMap mapping chromosome names to vectors of STR loci
-pub fn group_by_chromosome(
-    rows: Vec<StraglrRow>,
-) -> std::collections::HashMap<String, Vec<StraglrRow>> {
-    use std::collections::HashMap;
-
+pub fn group_by_chromosome(rows: Vec<StraglrRow>) -> HashMap<String, Vec<StraglrRow>> {
     let mut map: HashMap<String, Vec<StraglrRow>> = HashMap::new();
     for row in rows {
         map.entry(row.chrom.clone()).or_default().push(row);
@@ -238,19 +265,29 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_parse_straglr_row() {
-        let line = "chr4\t3074876\t3074933\tCAG\t19/20\t19,20\t45\t57,60\t0.95";
-        let row: StraglrRow = line.parse().unwrap();
-
-        assert_eq!(row.chrom, "chr4");
-        assert_eq!(row.start, 3074876);
-        assert_eq!(row.end, 3074933);
-        assert_eq!(row.repeat_unit, "CAG");
-        assert_eq!(row.genotype, "19/20");
-        assert_eq!(row.copy_numbers, vec![19, 20]);
-        assert_eq!(row.support, 45);
-        assert_eq!(row.allele_lengths, Some(vec![57, 60]));
-        assert_eq!(row.score, Some(0.95));
+    fn test_parse_raw_read_row() {
+        let line = "chr1\t100075838\t100075872\tTTTAATT\t35.9(54)\t40ef6b9e-e0d6-4220-8300-eb66f4bfa5ea\t5.3\t37\t2186\t+\t35.9";
+        let row: RawReadRow = line.parse().unwrap();
+
+        assert_eq!(row.chrom, "chr1");
+        assert_eq!(row.start, 100075838);
+        assert_eq!(row.end, 100075872);
+        assert_eq!(row.repeat_unit, "TTTAATT");
+        assert_eq!(row.genotype, "35.9(54)");
+    }
+
+    #[test]
+    fn test_parse_genotype_sizes() {
+        assert_eq!(parse_genotype_sizes("35.9(54)"), vec![35.9]);
+        assert_eq!(parse_genotype_sizes("20.1(25);45.3(29)"), vec![20.1, 45.3]);
+        assert_eq!(parse_genotype_sizes("."), Vec::<f64>::new());
+    }
+
+    #[test]
+    fn test_parse_genotype_support() {
+        assert_eq!(parse_genotype_support("35.9(54)"), 54);
+        assert_eq!(parse_genotype_support("20.1(25);45.3(29)"), 54);
+        assert_eq!(parse_genotype_support("."), 0);
     }
 
     #[test]
@@ -260,36 +297,37 @@ mod tests {
             start: 100,
             end: 200,
             repeat_unit: "CAG".to_string(),
-            genotype: "50/50".to_string(),
-            copy_numbers: vec![50, 50],
-            allele_lengths: None,
-            support: 30,
-            score: None,
+            genotype: "60.0(30);90.0(25)".to_string(),
+            support: 55,
         };
 
         assert_eq!(row.locus_length(), 100);
-        assert!(row.is_expanded(40));
-        assert!(!row.is_expanded(60));
-        assert_eq!(row.max_copy_number(), Some(50));
-        assert!(row.is_homozygous());
+        assert_eq!(row.motif_length(), 3);
+        assert_eq!(row.allele_sizes(), vec![60.0, 90.0]);
+        
+        let cns = row.copy_numbers();
+        assert!((cns[0] - 20.0).abs() < 0.01);
+        assert!((cns[1] - 30.0).abs() < 0.01);
+        
+        assert!((row.max_allele_size().unwrap() - 90.0).abs() < 0.01);
+        assert!((row.max_copy_number().unwrap() - 30.0).abs() < 0.01);
+        assert!(row.is_expanded(25.0));
+        assert!(!row.is_expanded(35.0));
+        assert!(!row.is_homozygous());
         assert_eq!(row.location_string(), "chr4:100-200");
     }
 
     #[test]
-    fn test_heterozygous_detection() {
+    fn test_homozygous() {
         let row = StraglrRow {
             chrom: "chr4".to_string(),
             start: 100,
             end: 200,
             repeat_unit: "CAG".to_string(),
-            genotype: "19/45".to_string(),
-            copy_numbers: vec![19, 45],
-            allele_lengths: None,
-            support: 30,
-            score: None,
+            genotype: "60.0(55)".to_string(),
+            support: 55,
         };
 
-        assert!(!row.is_homozygous());
-        assert_eq!(row.max_copy_number(), Some(45));
+        assert!(row.is_homozygous());
     }
 }