|
|
@@ -135,6 +135,8 @@ use crate::{
|
|
|
use anyhow::Context;
|
|
|
use log::{debug, info};
|
|
|
use std::{
|
|
|
+ collections::HashMap,
|
|
|
+ fmt,
|
|
|
fs::{self, File},
|
|
|
io::{BufRead, BufReader, Write},
|
|
|
path::Path,
|
|
|
@@ -329,7 +331,12 @@ impl Run for Straglr {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- let differences = self.find_somatic_changes(self.config.straglr_min_diff)?;
|
|
|
+ let differences = self.find_somatic_changes(
|
|
|
+ self.config.straglr_min_size_diff.into(),
|
|
|
+ self.config.straglr_min_support_diff,
|
|
|
+ )?;
|
|
|
+ let stats = compute_stats(&differences);
|
|
|
+ info!("{}", stats.summary());
|
|
|
self.save_somatic_changes(&differences, &self.config.straglr_tumor_normal_diff_tsv(id))?;
|
|
|
Ok(())
|
|
|
}
|
|
|
@@ -337,12 +344,6 @@ impl Run for Straglr {
|
|
|
|
|
|
impl Straglr {
|
|
|
/// Loads and parses the normal sample Straglr TSV results.
|
|
|
- ///
|
|
|
- /// # Returns
|
|
|
- /// Vector of STR loci from the normal sample
|
|
|
- ///
|
|
|
- /// # Errors
|
|
|
- /// Returns an error if the TSV file cannot be read or parsed.
|
|
|
pub fn load_normal_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
let tsv_path = self.config.straglr_normal_tsv(&self.id);
|
|
|
read_straglr_tsv(&tsv_path).context(format!(
|
|
|
@@ -352,12 +353,6 @@ impl Straglr {
|
|
|
}
|
|
|
|
|
|
/// Loads and parses the tumor sample Straglr TSV results.
|
|
|
- ///
|
|
|
- /// # Returns
|
|
|
- /// Vector of STR loci from the tumor sample
|
|
|
- ///
|
|
|
- /// # Errors
|
|
|
- /// Returns an error if the TSV file cannot be read or parsed.
|
|
|
pub fn load_tumor_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
let tsv_path = self.config.straglr_tumor_tsv(&self.id);
|
|
|
read_straglr_tsv(&tsv_path).context(format!(
|
|
|
@@ -366,69 +361,114 @@ impl Straglr {
|
|
|
))
|
|
|
}
|
|
|
|
|
|
- /// Loads both normal and tumor results as a tuple.
|
|
|
+ /// Finds somatic STR changes between tumor and normal samples.
|
|
|
///
|
|
|
- /// # Returns
|
|
|
- /// `(normal_results, tumor_results)` tuple
|
|
|
- ///
|
|
|
- /// # Errors
|
|
|
- /// Returns an error if either TSV file cannot be read or parsed.
|
|
|
- pub fn load_results(&self) -> anyhow::Result<(Vec<StraglrRow>, Vec<StraglrRow>)> {
|
|
|
- Ok((self.load_normal_results()?, self.load_tumor_results()?))
|
|
|
- }
|
|
|
-
|
|
|
- /// Finds STR loci that differ between tumor and normal samples.
|
|
|
- ///
|
|
|
- /// Compares copy numbers at matching loci to identify somatic STR changes.
|
|
|
+ /// Reports loci that are either:
|
|
|
+ /// 1. Present only in tumor (de novo) with sufficient support
|
|
|
+ /// 2. Present in both with allele size difference exceeding threshold
|
|
|
///
|
|
|
/// # Arguments
|
|
|
- /// * `min_difference` - Minimum copy number difference to report (default: 2)
|
|
|
+ /// * `min_size_diff` - Minimum allele size difference in bp to report as changed
|
|
|
+ /// * `min_support` - Minimum read support required for an allele to be considered
|
|
|
///
|
|
|
/// # Returns
|
|
|
- /// Vector of tuples: `(locus_id, normal_row, tumor_row, copy_number_diff)`
|
|
|
- ///
|
|
|
- /// # Errors
|
|
|
- /// Returns an error if results cannot be loaded.
|
|
|
+ /// Vector of `SomaticStrChange` containing matched loci with differences
|
|
|
pub fn find_somatic_changes(
|
|
|
&self,
|
|
|
- min_difference: u32,
|
|
|
- ) -> anyhow::Result<Vec<(String, StraglrRow, StraglrRow, i64)>> {
|
|
|
- let (normal, tumor) = self.load_results()?;
|
|
|
+ min_size_diff: f64,
|
|
|
+ min_support: u32,
|
|
|
+ ) -> anyhow::Result<Vec<SomaticStrChange>> {
|
|
|
+ let normal = self.load_normal_results()?;
|
|
|
+ let tumor = self.load_tumor_results()?;
|
|
|
+
|
|
|
+ // Index normal by locus key, filtering by min_support
|
|
|
+ let normal_map: HashMap<(String, u64, u64, String), StraglrRow> = normal
|
|
|
+ .into_iter()
|
|
|
+ .filter(|r| r.support >= min_support)
|
|
|
+ .map(|r| ((r.chrom.clone(), r.start, r.end, r.repeat_unit.clone()), r))
|
|
|
+ .collect();
|
|
|
|
|
|
- // Index tumor by location for O(1) lookup
|
|
|
- let tumor_map: std::collections::HashMap<(String, u64, u64), StraglrRow> = tumor
|
|
|
+ // Index tumor by locus key, filtering by min_support
|
|
|
+ let tumor_map: HashMap<(String, u64, u64, String), StraglrRow> = tumor
|
|
|
.into_iter()
|
|
|
- .map(|r| ((r.chrom.clone(), r.start, r.end), r))
|
|
|
+ .filter(|r| r.support >= min_support)
|
|
|
+ .map(|r| ((r.chrom.clone(), r.start, r.end, r.repeat_unit.clone()), r))
|
|
|
.collect();
|
|
|
|
|
|
let mut changes = Vec::new();
|
|
|
|
|
|
- for normal_row in normal {
|
|
|
- let key = (normal_row.chrom.clone(), normal_row.start, normal_row.end);
|
|
|
-
|
|
|
- if let Some(tumor_row) = tumor_map.get(&key) {
|
|
|
- if let (Some(normal_cn), Some(tumor_cn)) =
|
|
|
- (normal_row.max_copy_number(), tumor_row.max_copy_number())
|
|
|
- {
|
|
|
- let diff = tumor_cn as i64 - normal_cn as i64;
|
|
|
- if diff.abs() >= min_difference as i64 {
|
|
|
- let location = normal_row.location_string();
|
|
|
- changes.push((location, normal_row, tumor_row.clone(), diff));
|
|
|
+ // Check tumor loci
|
|
|
+ for (key, tumor_row) in &tumor_map {
|
|
|
+ match normal_map.get(key) {
|
|
|
+ // Paired: compare alleles
|
|
|
+ Some(normal_row) => {
|
|
|
+ let comparisons = compare_alleles(normal_row, tumor_row, min_support);
|
|
|
+
|
|
|
+ // Find the largest absolute size difference and its direction
|
|
|
+ let max_diff_comparison = comparisons
|
|
|
+ .iter()
|
|
|
+ .filter_map(|c| c.size_diff_bp.map(|d| (d, c)))
|
|
|
+ .max_by(|(a, _), (b, _)| a.abs().partial_cmp(&b.abs()).unwrap());
|
|
|
+
|
|
|
+ let has_novel = comparisons.iter().any(|c| c.normal.is_none());
|
|
|
+
|
|
|
+ let (passes_threshold, status) = match max_diff_comparison {
|
|
|
+ Some((diff, _)) if diff.abs() >= min_size_diff => {
|
|
|
+ let status = if diff > 0.0 {
|
|
|
+ ChangeStatus::Expansion
|
|
|
+ } else {
|
|
|
+ ChangeStatus::Contraction
|
|
|
+ };
|
|
|
+ (true, status)
|
|
|
+ }
|
|
|
+ _ if has_novel => (true, ChangeStatus::Expansion), // Novel allele = expansion
|
|
|
+ _ => (false, ChangeStatus::Expansion), // Won't be used
|
|
|
+ };
|
|
|
+
|
|
|
+ if passes_threshold {
|
|
|
+ changes.push(SomaticStrChange {
|
|
|
+ chrom: tumor_row.chrom.clone(),
|
|
|
+ start: tumor_row.start,
|
|
|
+ end: tumor_row.end,
|
|
|
+ repeat_unit: tumor_row.repeat_unit.clone(),
|
|
|
+ normal: Some(normal_row.clone()),
|
|
|
+ tumor: Some(tumor_row.clone()),
|
|
|
+ comparisons,
|
|
|
+ status,
|
|
|
+ });
|
|
|
}
|
|
|
}
|
|
|
+ // Tumor-only: de novo (already filtered by min_support)
|
|
|
+ None => {
|
|
|
+ changes.push(SomaticStrChange {
|
|
|
+ chrom: tumor_row.chrom.clone(),
|
|
|
+ start: tumor_row.start,
|
|
|
+ end: tumor_row.end,
|
|
|
+ repeat_unit: tumor_row.repeat_unit.clone(),
|
|
|
+ normal: None,
|
|
|
+ tumor: Some(tumor_row.clone()),
|
|
|
+ comparisons: vec![],
|
|
|
+ status: ChangeStatus::TumorOnly,
|
|
|
+ });
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // Sort by genomic position
|
|
|
+ changes.sort_by(|a, b| {
|
|
|
+ a.chrom
|
|
|
+ .cmp(&b.chrom)
|
|
|
+ .then(a.start.cmp(&b.start))
|
|
|
+ .then(a.end.cmp(&b.end))
|
|
|
+ });
|
|
|
+
|
|
|
Ok(changes)
|
|
|
}
|
|
|
|
|
|
/// Saves somatic STR changes to a TSV file.
|
|
|
- ///
|
|
|
- /// # Format
|
|
|
- /// Tab-separated: chrom, start, end, repeat_unit, normal_cn, tumor_cn, diff, normal_support, tumor_support
|
|
|
pub fn save_somatic_changes(
|
|
|
&self,
|
|
|
- differences: &[(String, StraglrRow, StraglrRow, i64)],
|
|
|
+ changes: &[SomaticStrChange],
|
|
|
output_path: &str,
|
|
|
) -> anyhow::Result<()> {
|
|
|
use std::io::Write;
|
|
|
@@ -436,37 +476,51 @@ impl Straglr {
|
|
|
let mut file = File::create(output_path)
|
|
|
.context(format!("Failed to create output file: {}", output_path))?;
|
|
|
|
|
|
- // Header
|
|
|
writeln!(
|
|
|
file,
|
|
|
- "#chrom\tstart\tend\trepeat_unit\tnormal_genotype\ttumor_genotype\tnormal_cn\ttumor_cn\tdiff\tnormal_support\ttumor_support"
|
|
|
+ "#chrom\tstart\tend\trepeat_unit\tnormal_genotype\ttumor_genotype\tmax_size_diff\tnormal_support\ttumor_support\tstatus"
|
|
|
)?;
|
|
|
|
|
|
- for (_, normal, tumor, diff) in differences {
|
|
|
+ for change in changes {
|
|
|
+ let normal_geno = change
|
|
|
+ .normal
|
|
|
+ .as_ref()
|
|
|
+ .map(|r| r.genotype.as_str())
|
|
|
+ .unwrap_or(".");
|
|
|
+ let tumor_geno = change
|
|
|
+ .tumor
|
|
|
+ .as_ref()
|
|
|
+ .map(|r| r.genotype.as_str())
|
|
|
+ .unwrap_or(".");
|
|
|
+ let normal_support = change
|
|
|
+ .normal
|
|
|
+ .as_ref()
|
|
|
+ .map(|r| r.support.to_string())
|
|
|
+ .unwrap_or_else(|| ".".to_string());
|
|
|
+ let tumor_support = change
|
|
|
+ .tumor
|
|
|
+ .as_ref()
|
|
|
+ .map(|r| r.support.to_string())
|
|
|
+ .unwrap_or_else(|| ".".to_string());
|
|
|
+
|
|
|
+ let max_size = change
|
|
|
+ .max_size_diff()
|
|
|
+ .map(|d| format!("{:.1}", d))
|
|
|
+ .unwrap_or_else(|| ".".to_string());
|
|
|
+
|
|
|
writeln!(
|
|
|
file,
|
|
|
- "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
|
|
|
- normal.chrom,
|
|
|
- normal.start,
|
|
|
- normal.end,
|
|
|
- normal.repeat_unit,
|
|
|
- normal.genotype,
|
|
|
- tumor.genotype,
|
|
|
- normal
|
|
|
- .copy_numbers
|
|
|
- .iter()
|
|
|
- .map(|n| n.to_string())
|
|
|
- .collect::<Vec<_>>()
|
|
|
- .join(","),
|
|
|
- tumor
|
|
|
- .copy_numbers
|
|
|
- .iter()
|
|
|
- .map(|n| n.to_string())
|
|
|
- .collect::<Vec<_>>()
|
|
|
- .join(","),
|
|
|
- diff,
|
|
|
- normal.support,
|
|
|
- tumor.support,
|
|
|
+ "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
|
|
|
+ change.chrom,
|
|
|
+ change.start,
|
|
|
+ change.end,
|
|
|
+ change.repeat_unit,
|
|
|
+ normal_geno,
|
|
|
+ tumor_geno,
|
|
|
+ max_size,
|
|
|
+ normal_support,
|
|
|
+ tumor_support,
|
|
|
+ change.status,
|
|
|
)?;
|
|
|
}
|
|
|
|
|
|
@@ -475,6 +529,171 @@ impl Straglr {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/// Classification of a somatic STR change at one locus.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ChangeStatus {
    /// Tumor allele larger than normal.
    Expansion,
    /// Tumor allele smaller than normal.
    Contraction,
    /// Locus present only in the tumor sample.
    TumorOnly,
}

impl fmt::Display for ChangeStatus {
    /// Writes the upper-case label used in the output TSV `status` column.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            ChangeStatus::Expansion => "EXPANSION",
            ChangeStatus::Contraction => "CONTRACTION",
            ChangeStatus::TumorOnly => "TUMOR_ONLY",
        };
        f.write_str(label)
    }
}
|
|
|
+
|
|
|
/// One allele-level pairing between the normal and tumor genotypes of a locus.
///
/// Either side may be missing: a tumor allele with no normal partner has
/// `normal == None`, and a normal allele with no tumor partner has
/// `tumor == None`.
#[derive(Clone, Debug)]
pub struct AlleleComparison {
    /// Normal allele size in bp (`None` if the allele is tumor-only).
    pub normal: Option<f64>,
    /// Read support of the normal allele.
    pub normal_support: Option<u32>,
    /// Tumor allele size in bp (`None` if the allele is lost in tumor).
    pub tumor: Option<f64>,
    /// Read support of the tumor allele.
    pub tumor_support: Option<u32>,
    /// Size difference in bp, computed as tumor minus normal; `None` when
    /// either side is missing.
    pub size_diff_bp: Option<f64>,
}
|
|
|
+
|
|
|
+/// A somatic STR change between tumor and normal.
|
|
|
+#[derive(Debug, Clone)]
|
|
|
+pub struct SomaticStrChange {
|
|
|
+ pub chrom: String,
|
|
|
+ pub start: u64,
|
|
|
+ pub end: u64,
|
|
|
+ pub repeat_unit: String,
|
|
|
+ pub normal: Option<StraglrRow>,
|
|
|
+ pub tumor: Option<StraglrRow>,
|
|
|
+ pub comparisons: Vec<AlleleComparison>,
|
|
|
+ pub status: ChangeStatus,
|
|
|
+}
|
|
|
+
|
|
|
+impl SomaticStrChange {
|
|
|
+ /// Returns location string.
|
|
|
+ pub fn location_string(&self) -> String {
|
|
|
+ format!("{}:{}-{}", self.chrom, self.start, self.end)
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns max absolute size difference across alleles.
|
|
|
+ pub fn max_size_diff(&self) -> Option<f64> {
|
|
|
+ self.comparisons
|
|
|
+ .iter()
|
|
|
+ .filter_map(|c| c.size_diff_bp.map(|d| d.abs()))
|
|
|
+ .max_by(|a, b| a.partial_cmp(b).unwrap())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns true if there are novel tumor alleles.
|
|
|
+ pub fn has_novel_allele(&self) -> bool {
|
|
|
+ self.comparisons.iter().any(|c| c.normal.is_none() && c.tumor.is_some())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Compares alleles between normal and tumor rows using greedy size matching.
|
|
|
+///
|
|
|
+/// Only considers alleles with at least `min_support` reads.
|
|
|
+/// Parses genotype strings in "size(count);size(count)" format and matches
|
|
|
+/// tumor alleles to closest normal alleles within tolerance.
|
|
|
+fn compare_alleles(
|
|
|
+ normal: &StraglrRow,
|
|
|
+ tumor: &StraglrRow,
|
|
|
+ min_support: u32,
|
|
|
+) -> Vec<AlleleComparison> {
|
|
|
+ let normal_alleles = parse_genotype_alleles(&normal.genotype)
|
|
|
+ .into_iter()
|
|
|
+ .filter(|(_, support)| *support >= min_support)
|
|
|
+ .collect::<Vec<_>>();
|
|
|
+ let tumor_alleles = parse_genotype_alleles(&tumor.genotype)
|
|
|
+ .into_iter()
|
|
|
+ .filter(|(_, support)| *support >= min_support)
|
|
|
+ .collect::<Vec<_>>();
|
|
|
+
|
|
|
+ let mut comparisons = Vec::new();
|
|
|
+ let mut matched_normal = vec![false; normal_alleles.len()];
|
|
|
+
|
|
|
+ // Tolerance for matching: alleles within 10bp are considered same
|
|
|
+ let match_tolerance = 10.0;
|
|
|
+
|
|
|
+ // Match each tumor allele to closest normal
|
|
|
+ for (tumor_size, tumor_support) in &tumor_alleles {
|
|
|
+ let mut best: Option<(usize, f64)> = None;
|
|
|
+
|
|
|
+ for (i, (normal_size, _)) in normal_alleles.iter().enumerate() {
|
|
|
+ if matched_normal[i] {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let diff = (*tumor_size - *normal_size).abs();
|
|
|
+ if diff <= match_tolerance
|
|
|
+ && (best.is_none() || diff < best.unwrap().1) {
|
|
|
+ best = Some((i, diff));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if let Some((idx, _)) = best {
|
|
|
+ matched_normal[idx] = true;
|
|
|
+ let (normal_size, normal_support) = normal_alleles[idx];
|
|
|
+ comparisons.push(AlleleComparison {
|
|
|
+ normal: Some(normal_size),
|
|
|
+ normal_support: Some(normal_support),
|
|
|
+ tumor: Some(*tumor_size),
|
|
|
+ tumor_support: Some(*tumor_support),
|
|
|
+ size_diff_bp: Some(*tumor_size - normal_size),
|
|
|
+ });
|
|
|
+ } else {
|
|
|
+ // Novel tumor allele
|
|
|
+ comparisons.push(AlleleComparison {
|
|
|
+ normal: None,
|
|
|
+ normal_support: None,
|
|
|
+ tumor: Some(*tumor_size),
|
|
|
+ tumor_support: Some(*tumor_support),
|
|
|
+ size_diff_bp: None,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Add unmatched normal alleles (lost in tumor)
|
|
|
+ for (i, (normal_size, normal_support)) in normal_alleles.iter().enumerate() {
|
|
|
+ if !matched_normal[i] {
|
|
|
+ comparisons.push(AlleleComparison {
|
|
|
+ normal: Some(*normal_size),
|
|
|
+ normal_support: Some(*normal_support),
|
|
|
+ tumor: None,
|
|
|
+ tumor_support: None,
|
|
|
+ size_diff_bp: None,
|
|
|
+ });
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ comparisons
|
|
|
+}
|
|
|
+
|
|
|
/// Parses a genotype string of the form `size(count);size(count)` into
/// `(size, support)` pairs, in the order they appear.
///
/// Entries that are empty, equal to `.`, or otherwise malformed are skipped.
/// Entries whose size is not a finite number (e.g. `NaN` or `inf`, which
/// `f64::from_str` accepts) are skipped too, so they cannot poison later size
/// comparisons. Whitespace around an entry or around its numbers is tolerated.
fn parse_genotype_alleles(genotype: &str) -> Vec<(f64, u32)> {
    genotype
        .split(';')
        .filter_map(|entry| {
            // Entries without '(' (including "" and ".") drop out here.
            let (size_text, rest) = entry.trim().split_once('(')?;
            let (support_text, _) = rest.split_once(')')?;
            let size: f64 = size_text.trim().parse().ok()?;
            let support: u32 = support_text.trim().parse().ok()?;
            size.is_finite().then_some((size, support))
        })
        .collect()
}
|
|
|
+
|
|
|
#[derive(Debug, Clone)]
|
|
|
struct StraglrJob {
|
|
|
conda_sh: String,
|
|
|
@@ -728,7 +947,7 @@ impl StraglrSolo {
|
|
|
///
|
|
|
/// # Errors
|
|
|
/// Returns an error if results cannot be loaded.
|
|
|
- pub fn load_expanded_repeats(&self, min_copy_number: u32) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
+ pub fn load_expanded_repeats(&self, min_copy_number: f64) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
let results = self.load_results()?;
|
|
|
Ok(results
|
|
|
.into_iter()
|
|
|
@@ -899,19 +1118,19 @@ pub fn run_straglr_chunked(
|
|
|
}
|
|
|
|
|
|
// Run all chunks in parallel
|
|
|
- // info!("Executing {} Straglr jobs in parallel", actual_n_parts);
|
|
|
- // let outputs = run_many!(config, jobs)?;
|
|
|
- //
|
|
|
- // // Save logs
|
|
|
- // let log_dir = format!("{}/{}/log/straglr_chunked", config.result_dir, id);
|
|
|
- // fs::create_dir_all(&log_dir).context("Failed to create log directory")?;
|
|
|
- //
|
|
|
- // for (i, output) in outputs.iter().enumerate() {
|
|
|
- // let log_file = format!("{}/straglr_part{}_", log_dir, i + 1);
|
|
|
- // output
|
|
|
- // .save_to_file(&log_file)
|
|
|
- // .context(format!("Failed to save logs for part {}", i + 1))?;
|
|
|
- // }
|
|
|
+ info!("Executing {} Straglr jobs in parallel", actual_n_parts);
|
|
|
+ let outputs = run_many!(config, jobs)?;
|
|
|
+
|
|
|
+ // Save logs
|
|
|
+ let log_dir = format!("{}/{}/log/straglr_chunked", config.result_dir, id);
|
|
|
+ fs::create_dir_all(&log_dir).context("Failed to create log directory")?;
|
|
|
+
|
|
|
+ for (i, output) in outputs.iter().enumerate() {
|
|
|
+ let log_file = format!("{}/straglr_part{}_", log_dir, i + 1);
|
|
|
+ output
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context(format!("Failed to save logs for part {}", i + 1))?;
|
|
|
+ }
|
|
|
|
|
|
// Merge TSV files
|
|
|
info!("Merging {} TSV files", actual_n_parts);
|
|
|
@@ -988,6 +1207,141 @@ fn merge_tsv_files(input_files: &[String], output_file: &str) -> anyhow::Result<
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
+/// Summary statistics for somatic STR changes.
|
|
|
+#[derive(Debug, Clone, Default)]
|
|
|
+pub struct SomaticStrStats {
|
|
|
+ /// Total number of somatic changes
|
|
|
+ pub total: usize,
|
|
|
+ /// Number of expansions
|
|
|
+ pub expansions: usize,
|
|
|
+ /// Number of contractions
|
|
|
+ pub contractions: usize,
|
|
|
+ /// Number of tumor-only loci
|
|
|
+ pub tumor_only: usize,
|
|
|
+ /// Mean size difference (bp) for paired loci
|
|
|
+ pub mean_size_diff: f64,
|
|
|
+ /// Median size difference (bp) for paired loci
|
|
|
+ pub median_size_diff: f64,
|
|
|
+ /// Max size difference (bp)
|
|
|
+ pub max_size_diff: f64,
|
|
|
+ /// Min size difference (bp)
|
|
|
+ pub min_size_diff: f64,
|
|
|
+ /// Size differences by repeat unit motif
|
|
|
+ pub by_motif: HashMap<String, MotifStats>,
|
|
|
+ /// Count by chromosome
|
|
|
+ pub by_chrom: HashMap<String, usize>,
|
|
|
+}
|
|
|
+
|
|
|
/// Statistics for a single repeat unit motif.
#[derive(Clone, Debug, Default)]
pub struct MotifStats {
    /// Number of changes involving this motif.
    pub count: usize,
    /// Number of those changes classified as expansions.
    pub expansions: usize,
    /// Number of those changes classified as contractions.
    pub contractions: usize,
    /// Mean size difference (bp) for this motif.
    pub mean_size_diff: f64,
}
|
|
|
+
|
|
|
+/// Computes summary statistics for somatic STR changes.
|
|
|
+pub fn compute_stats(changes: &[SomaticStrChange]) -> SomaticStrStats {
|
|
|
+ if changes.is_empty() {
|
|
|
+ return SomaticStrStats::default();
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut stats = SomaticStrStats {
|
|
|
+ total: changes.len(),
|
|
|
+ ..Default::default()
|
|
|
+ };
|
|
|
+
|
|
|
+ let mut size_diffs: Vec<f64> = Vec::new();
|
|
|
+ let mut motif_diffs: HashMap<String, Vec<f64>> = HashMap::new();
|
|
|
+
|
|
|
+ for change in changes {
|
|
|
+ // Count by status
|
|
|
+ match change.status {
|
|
|
+ ChangeStatus::Expansion => stats.expansions += 1,
|
|
|
+ ChangeStatus::Contraction => stats.contractions += 1,
|
|
|
+ ChangeStatus::TumorOnly => stats.tumor_only += 1,
|
|
|
+ }
|
|
|
+
|
|
|
+ // Count by chromosome
|
|
|
+ *stats.by_chrom.entry(change.chrom.clone()).or_insert(0) += 1;
|
|
|
+
|
|
|
+ // Collect size differences for paired loci
|
|
|
+ if let Some(diff) = change.max_size_diff() {
|
|
|
+ size_diffs.push(diff);
|
|
|
+ motif_diffs
|
|
|
+ .entry(change.repeat_unit.clone())
|
|
|
+ .or_default()
|
|
|
+ .push(diff);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Track motif counts
|
|
|
+ let motif_stat = stats.by_motif.entry(change.repeat_unit.clone()).or_default();
|
|
|
+ motif_stat.count += 1;
|
|
|
+ match change.status {
|
|
|
+ ChangeStatus::Expansion => motif_stat.expansions += 1,
|
|
|
+ ChangeStatus::Contraction => motif_stat.contractions += 1,
|
|
|
+ ChangeStatus::TumorOnly => {}
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Compute global size diff stats
|
|
|
+ if !size_diffs.is_empty() {
|
|
|
+ size_diffs.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
|
+
|
|
|
+ stats.mean_size_diff = size_diffs.iter().sum::<f64>() / size_diffs.len() as f64;
|
|
|
+ stats.min_size_diff = size_diffs[0];
|
|
|
+ stats.max_size_diff = size_diffs[size_diffs.len() - 1];
|
|
|
+
|
|
|
+ let mid = size_diffs.len() / 2;
|
|
|
+ stats.median_size_diff = if size_diffs.len().is_multiple_of(2) {
|
|
|
+ (size_diffs[mid - 1] + size_diffs[mid]) / 2.0
|
|
|
+ } else {
|
|
|
+ size_diffs[mid]
|
|
|
+ };
|
|
|
+ }
|
|
|
+
|
|
|
+ // Compute per-motif mean size diff
|
|
|
+ for (motif, diffs) in motif_diffs {
|
|
|
+ if let Some(motif_stat) = stats.by_motif.get_mut(&motif) {
|
|
|
+ if !diffs.is_empty() {
|
|
|
+ motif_stat.mean_size_diff = diffs.iter().sum::<f64>() / diffs.len() as f64;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ stats
|
|
|
+}
|
|
|
+
|
|
|
+impl SomaticStrStats {
|
|
|
+ /// Returns a formatted summary string.
|
|
|
+ pub fn summary(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "Total: {} | Expansions: {} | Contractions: {} | Tumor-only: {}\n\
|
|
|
+ Size diff (bp): mean={:.1}, median={:.1}, range=[{:.1}, {:.1}]\n\
|
|
|
+ Motifs: {}",
|
|
|
+ self.total,
|
|
|
+ self.expansions,
|
|
|
+ self.contractions,
|
|
|
+ self.tumor_only,
|
|
|
+ self.mean_size_diff,
|
|
|
+ self.median_size_diff,
|
|
|
+ self.min_size_diff,
|
|
|
+ self.max_size_diff,
|
|
|
+ self.by_motif.len()
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns expansion/contraction ratio.
|
|
|
+ pub fn expansion_ratio(&self) -> f64 {
|
|
|
+ if self.contractions == 0 {
|
|
|
+ f64::INFINITY
|
|
|
+ } else {
|
|
|
+ self.expansions as f64 / self.contractions as f64
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use super::*;
|
|
|
@@ -1007,10 +1361,7 @@ mod tests {
|
|
|
let config = Config::default();
|
|
|
|
|
|
let mut caller = Straglr::initialize("DUMCO", &config)?;
|
|
|
- // caller.run()?;
|
|
|
-
|
|
|
- let differences = caller.find_somatic_changes(caller.config.straglr_min_diff)?;
|
|
|
- caller.save_somatic_changes(&differences, &caller.config.straglr_tumor_normal_diff_tsv("DUMCO"))?;
|
|
|
+ caller.run()?;
|
|
|
|
|
|
Ok(())
|
|
|
}
|