use std::collections::HashMap; use crate::io::bam::{FbInv, SegmentOrder}; /// Distribution statistics for a numeric field. #[derive(Debug, Clone, Default)] pub struct DistributionStats { pub count: usize, pub min: i64, pub max: i64, pub sum: i64, pub mean: f64, pub median: f64, pub std_dev: f64, } impl DistributionStats { /// Compute distribution stats from a slice of values. pub fn from_values(values: &mut [i64]) -> Self { if values.is_empty() { return Self::default(); } let count = values.len(); let sum: i64 = values.iter().sum(); let mean = sum as f64 / count as f64; // Sort for min/max/median values.sort_unstable(); let min = values[0]; let max = values[count - 1]; let median = if count.is_multiple_of(2) { (values[count / 2 - 1] + values[count / 2]) as f64 / 2.0 } else { values[count / 2] as f64 }; // Standard deviation let variance: f64 = values .iter() .map(|&v| (v as f64 - mean).powi(2)) .sum::() / count as f64; let std_dev = variance.sqrt(); Self { count, min, max, sum, mean, median, std_dev, } } } /// Strand pattern for fold-back inversions. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum StrandPattern { PlusMinus, // A on +, B on - MinusPlus, // A on -, B on + } impl StrandPattern { pub fn from_strands(a_strand: char, b_strand: char) -> Self { match (a_strand, b_strand) { ('+', '-') => StrandPattern::PlusMinus, ('-', '+') => StrandPattern::MinusPlus, _ => unreachable!("FbInv requires opposite strands"), } } } /// Per-contig statistics. #[derive(Debug, Clone, Default)] pub struct ContigStats { pub count: usize, pub mean_distance: f64, // mean distance between consecutive events (sorted by position) pub positions: Vec, // stored temporarily for distance calculation } /// Comprehensive statistics computed from Vec. #[derive(Debug, Clone)] pub struct FbInvStats { // Counts pub total_count: usize, pub per_contig_count: HashMap, pub by_segment_order: HashMap, // Size distributions pub aln_a_len_stats: DistributionStats, pub aln_b_len_stats: DistributionStats, pub b_c_stats: DistributionStats, pub a_d_stats: DistributionStats, // Overlap metrics pub overlap_stats: OverlapStats, // Strand patterns pub strand_pattern_counts: HashMap, // Per-contig with mean distance pub per_contig_stats: HashMap, } /// Overlap metrics between alignment A and B. #[derive(Debug, Clone, Default)] pub struct OverlapStats { pub overlap_bp_stats: DistributionStats, // overlap in base pairs pub overlap_fraction_a: DistributionStats, // overlap / aln_a_len (stored as i64 * 1000 for precision) pub overlap_fraction_b: DistributionStats, // overlap / aln_b_len pub jaccard_stats: DistributionStats, // overlap / union (stored as i64 * 1000) } impl FbInvStats { /// Compute all statistics from a Vec. /// Assumes events are sorted by (contig, position) for distance calculations. pub fn from_events(events: &[FbInv]) -> Self { if events.is_empty() { return Self::empty(); } let total_count = events.len(); // Counts let mut per_contig_count: HashMap = HashMap::new(); let mut by_segment_order: HashMap = HashMap::new(); let mut strand_pattern_counts: HashMap = HashMap::new(); // Collect values for distributions let mut aln_a_lens: Vec = Vec::with_capacity(total_count); let mut aln_b_lens: Vec = Vec::with_capacity(total_count); let mut b_c_vals: Vec = Vec::with_capacity(total_count); let mut a_d_vals: Vec = Vec::with_capacity(total_count); // Overlap values let mut overlap_bps: Vec = Vec::with_capacity(total_count); let mut overlap_frac_a: Vec = Vec::with_capacity(total_count); let mut overlap_frac_b: Vec = Vec::with_capacity(total_count); let mut jaccard_vals: Vec = Vec::with_capacity(total_count); // Per-contig positions for distance calculation let mut per_contig_positions: HashMap> = HashMap::new(); for ev in events { // Counts *per_contig_count.entry(ev.ref_name.clone()).or_insert(0) += 1; *by_segment_order.entry(ev.first).or_insert(0) += 1; let pattern = StrandPattern::from_strands(ev.aln_a_strand, ev.aln_b_strand); *strand_pattern_counts.entry(pattern).or_insert(0) += 1; // Size distributions aln_a_lens.push(ev.aln_a_len); aln_b_lens.push(ev.aln_b_len); b_c_vals.push(ev.b_c); a_d_vals.push(ev.a_d); // Overlap calculation let overlap_bp = compute_overlap_bp(ev); overlap_bps.push(overlap_bp); if ev.aln_a_len > 0 { overlap_frac_a.push((overlap_bp * 1000) / ev.aln_a_len); } if ev.aln_b_len > 0 { overlap_frac_b.push((overlap_bp * 1000) / ev.aln_b_len); } let union = compute_union_bp(ev); if union > 0 { jaccard_vals.push((overlap_bp * 1000) / union); } // Store position for distance calculation (use midpoint of A alignment) let midpoint = (ev.aln_a_start + ev.aln_a_end) / 2; per_contig_positions .entry(ev.ref_name.clone()) .or_default() .push(midpoint); } // Compute per-contig stats with mean distance let mut per_contig_stats: HashMap = HashMap::new(); for (contig, mut positions) in per_contig_positions { let count = positions.len(); positions.sort_unstable(); // ensure sorted let mean_distance = if count > 1 { let total_distance: i64 = positions.windows(2).map(|w| (w[1] - w[0]).abs()).sum(); total_distance as f64 / (count - 1) as f64 } else { 0.0 }; per_contig_stats.insert( contig.clone(), ContigStats { count, mean_distance, positions, }, ); } Self { total_count, per_contig_count, by_segment_order, aln_a_len_stats: DistributionStats::from_values(&mut aln_a_lens), aln_b_len_stats: DistributionStats::from_values(&mut aln_b_lens), b_c_stats: DistributionStats::from_values(&mut b_c_vals), a_d_stats: DistributionStats::from_values(&mut a_d_vals), overlap_stats: OverlapStats { overlap_bp_stats: DistributionStats::from_values(&mut overlap_bps), overlap_fraction_a: DistributionStats::from_values(&mut overlap_frac_a), overlap_fraction_b: DistributionStats::from_values(&mut overlap_frac_b), jaccard_stats: DistributionStats::from_values(&mut jaccard_vals), }, strand_pattern_counts, per_contig_stats, } } fn empty() -> Self { Self { total_count: 0, per_contig_count: HashMap::new(), by_segment_order: HashMap::new(), aln_a_len_stats: DistributionStats::default(), aln_b_len_stats: DistributionStats::default(), b_c_stats: DistributionStats::default(), a_d_stats: DistributionStats::default(), overlap_stats: OverlapStats::default(), strand_pattern_counts: HashMap::new(), per_contig_stats: HashMap::new(), } } /// Summary report as a formatted string. pub fn summary(&self) -> String { let mut s = String::new(); s.push_str("=== FbInv Statistics ===\n\n"); s.push_str(&format!("Total events: {}\n\n", self.total_count)); // Per-contig counts s.push_str("Per-contig counts:\n"); let mut contigs: Vec<_> = self.per_contig_count.iter().collect(); contigs.sort_by(|a, b| b.1.cmp(a.1)); // sort by count descending for (contig, count) in contigs { if let Some(cstats) = self.per_contig_stats.get(contig) { s.push_str(&format!( " {}: {} events, mean distance: {:.1} bp\n", contig, count, cstats.mean_distance )); } else { s.push_str(&format!(" {}: {}\n", contig, count)); } } s.push('\n'); // Segment order s.push_str("By segment order:\n"); for (order, count) in &self.by_segment_order { s.push_str(&format!(" {:?}: {}\n", order, count)); } s.push('\n'); // Strand patterns s.push_str("Strand patterns:\n"); for (pattern, count) in &self.strand_pattern_counts { let pct = (*count as f64 / self.total_count as f64) * 100.0; s.push_str(&format!(" {:?}: {} ({:.1}%)\n", pattern, count, pct)); } s.push('\n'); // Size distributions s.push_str("Size distributions:\n"); s.push_str(&format_dist(" aln_a_len", &self.aln_a_len_stats)); s.push_str(&format_dist(" aln_b_len", &self.aln_b_len_stats)); s.push_str(&format_dist(" b_c", &self.b_c_stats)); s.push_str(&format_dist(" a_d", &self.a_d_stats)); s.push('\n'); // Overlap metrics s.push_str("Overlap metrics:\n"); s.push_str(&format_dist( " overlap_bp", &self.overlap_stats.overlap_bp_stats, )); s.push_str(&format!( " overlap_frac_a: mean={:.1}%, median={:.1}%\n", self.overlap_stats.overlap_fraction_a.mean / 10.0, self.overlap_stats.overlap_fraction_a.median / 10.0 )); s.push_str(&format!( " overlap_frac_b: mean={:.1}%, median={:.1}%\n", self.overlap_stats.overlap_fraction_b.mean / 10.0, self.overlap_stats.overlap_fraction_b.median / 10.0 )); s.push_str(&format!( " jaccard: mean={:.1}%, median={:.1}%\n", self.overlap_stats.jaccard_stats.mean / 10.0, self.overlap_stats.jaccard_stats.median / 10.0 )); s } } /// Compute overlap in base pairs between alignment A and B on the reference. fn compute_overlap_bp(ev: &FbInv) -> i64 { let max_start = ev.aln_a_start.max(ev.aln_b_start); let min_end = ev.aln_a_end.min(ev.aln_b_end); (min_end - max_start).max(0) } /// Compute union in base pairs between alignment A and B on the reference. fn compute_union_bp(ev: &FbInv) -> i64 { let min_start = ev.aln_a_start.min(ev.aln_b_start); let max_end = ev.aln_a_end.max(ev.aln_b_end); max_end - min_start } fn format_dist(name: &str, d: &DistributionStats) -> String { format!( "{}: min={}, max={}, mean={:.1}, median={:.1}, std={:.1}\n", name, d.min, d.max, d.mean, d.median, d.std_dev ) }