| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333 |
- use std::collections::HashMap;
- use crate::io::bam::{FbInv, SegmentOrder};
- /// Distribution statistics for a numeric field.
- #[derive(Debug, Clone, Default)]
- pub struct DistributionStats {
- pub count: usize,
- pub min: i64,
- pub max: i64,
- pub sum: i64,
- pub mean: f64,
- pub median: f64,
- pub std_dev: f64,
- }
- impl DistributionStats {
- /// Compute distribution stats from a slice of values.
- pub fn from_values(values: &mut [i64]) -> Self {
- if values.is_empty() {
- return Self::default();
- }
- let count = values.len();
- let sum: i64 = values.iter().sum();
- let mean = sum as f64 / count as f64;
- // Sort for min/max/median
- values.sort_unstable();
- let min = values[0];
- let max = values[count - 1];
- let median = if count.is_multiple_of(2) {
- (values[count / 2 - 1] + values[count / 2]) as f64 / 2.0
- } else {
- values[count / 2] as f64
- };
- // Standard deviation
- let variance: f64 = values
- .iter()
- .map(|&v| (v as f64 - mean).powi(2))
- .sum::<f64>()
- / count as f64;
- let std_dev = variance.sqrt();
- Self {
- count,
- min,
- max,
- sum,
- mean,
- median,
- std_dev,
- }
- }
- }
- /// Strand pattern for fold-back inversions.
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
- pub enum StrandPattern {
- PlusMinus, // A on +, B on -
- MinusPlus, // A on -, B on +
- }
- impl StrandPattern {
- pub fn from_strands(a_strand: char, b_strand: char) -> Self {
- match (a_strand, b_strand) {
- ('+', '-') => StrandPattern::PlusMinus,
- ('-', '+') => StrandPattern::MinusPlus,
- _ => unreachable!("FbInv requires opposite strands"),
- }
- }
- }
- /// Per-contig statistics.
- #[derive(Debug, Clone, Default)]
- pub struct ContigStats {
- pub count: usize,
- pub mean_distance: f64, // mean distance between consecutive events (sorted by position)
- pub positions: Vec<i64>, // stored temporarily for distance calculation
- }
- /// Comprehensive statistics computed from Vec<FbInv>.
- #[derive(Debug, Clone)]
- pub struct FbInvStats {
- // Counts
- pub total_count: usize,
- pub per_contig_count: HashMap<String, usize>,
- pub by_segment_order: HashMap<SegmentOrder, usize>,
- // Size distributions
- pub aln_a_len_stats: DistributionStats,
- pub aln_b_len_stats: DistributionStats,
- pub b_c_stats: DistributionStats,
- pub a_d_stats: DistributionStats,
- // Overlap metrics
- pub overlap_stats: OverlapStats,
- // Strand patterns
- pub strand_pattern_counts: HashMap<StrandPattern, usize>,
- // Per-contig with mean distance
- pub per_contig_stats: HashMap<String, ContigStats>,
- }
- /// Overlap metrics between alignment A and B.
- #[derive(Debug, Clone, Default)]
- pub struct OverlapStats {
- pub overlap_bp_stats: DistributionStats, // overlap in base pairs
- pub overlap_fraction_a: DistributionStats, // overlap / aln_a_len (stored as i64 * 1000 for precision)
- pub overlap_fraction_b: DistributionStats, // overlap / aln_b_len
- pub jaccard_stats: DistributionStats, // overlap / union (stored as i64 * 1000)
- }
- impl FbInvStats {
- /// Compute all statistics from a Vec<FbInv>.
- /// Assumes events are sorted by (contig, position) for distance calculations.
- pub fn from_events(events: &[FbInv]) -> Self {
- if events.is_empty() {
- return Self::empty();
- }
- let total_count = events.len();
- // Counts
- let mut per_contig_count: HashMap<String, usize> = HashMap::new();
- let mut by_segment_order: HashMap<SegmentOrder, usize> = HashMap::new();
- let mut strand_pattern_counts: HashMap<StrandPattern, usize> = HashMap::new();
- // Collect values for distributions
- let mut aln_a_lens: Vec<i64> = Vec::with_capacity(total_count);
- let mut aln_b_lens: Vec<i64> = Vec::with_capacity(total_count);
- let mut b_c_vals: Vec<i64> = Vec::with_capacity(total_count);
- let mut a_d_vals: Vec<i64> = Vec::with_capacity(total_count);
- // Overlap values
- let mut overlap_bps: Vec<i64> = Vec::with_capacity(total_count);
- let mut overlap_frac_a: Vec<i64> = Vec::with_capacity(total_count);
- let mut overlap_frac_b: Vec<i64> = Vec::with_capacity(total_count);
- let mut jaccard_vals: Vec<i64> = Vec::with_capacity(total_count);
- // Per-contig positions for distance calculation
- let mut per_contig_positions: HashMap<String, Vec<i64>> = HashMap::new();
- for ev in events {
- // Counts
- *per_contig_count.entry(ev.ref_name.clone()).or_insert(0) += 1;
- *by_segment_order.entry(ev.first).or_insert(0) += 1;
- let pattern = StrandPattern::from_strands(ev.aln_a_strand, ev.aln_b_strand);
- *strand_pattern_counts.entry(pattern).or_insert(0) += 1;
- // Size distributions
- aln_a_lens.push(ev.aln_a_len);
- aln_b_lens.push(ev.aln_b_len);
- b_c_vals.push(ev.b_c);
- a_d_vals.push(ev.a_d);
- // Overlap calculation
- let overlap_bp = compute_overlap_bp(ev);
- overlap_bps.push(overlap_bp);
- if ev.aln_a_len > 0 {
- overlap_frac_a.push((overlap_bp * 1000) / ev.aln_a_len);
- }
- if ev.aln_b_len > 0 {
- overlap_frac_b.push((overlap_bp * 1000) / ev.aln_b_len);
- }
- let union = compute_union_bp(ev);
- if union > 0 {
- jaccard_vals.push((overlap_bp * 1000) / union);
- }
- // Store position for distance calculation (use midpoint of A alignment)
- let midpoint = (ev.aln_a_start + ev.aln_a_end) / 2;
- per_contig_positions
- .entry(ev.ref_name.clone())
- .or_default()
- .push(midpoint);
- }
- // Compute per-contig stats with mean distance
- let mut per_contig_stats: HashMap<String, ContigStats> = HashMap::new();
- for (contig, mut positions) in per_contig_positions {
- let count = positions.len();
- positions.sort_unstable(); // ensure sorted
- let mean_distance = if count > 1 {
- let total_distance: i64 = positions.windows(2).map(|w| (w[1] - w[0]).abs()).sum();
- total_distance as f64 / (count - 1) as f64
- } else {
- 0.0
- };
- per_contig_stats.insert(
- contig.clone(),
- ContigStats {
- count,
- mean_distance,
- positions,
- },
- );
- }
- Self {
- total_count,
- per_contig_count,
- by_segment_order,
- aln_a_len_stats: DistributionStats::from_values(&mut aln_a_lens),
- aln_b_len_stats: DistributionStats::from_values(&mut aln_b_lens),
- b_c_stats: DistributionStats::from_values(&mut b_c_vals),
- a_d_stats: DistributionStats::from_values(&mut a_d_vals),
- overlap_stats: OverlapStats {
- overlap_bp_stats: DistributionStats::from_values(&mut overlap_bps),
- overlap_fraction_a: DistributionStats::from_values(&mut overlap_frac_a),
- overlap_fraction_b: DistributionStats::from_values(&mut overlap_frac_b),
- jaccard_stats: DistributionStats::from_values(&mut jaccard_vals),
- },
- strand_pattern_counts,
- per_contig_stats,
- }
- }
- fn empty() -> Self {
- Self {
- total_count: 0,
- per_contig_count: HashMap::new(),
- by_segment_order: HashMap::new(),
- aln_a_len_stats: DistributionStats::default(),
- aln_b_len_stats: DistributionStats::default(),
- b_c_stats: DistributionStats::default(),
- a_d_stats: DistributionStats::default(),
- overlap_stats: OverlapStats::default(),
- strand_pattern_counts: HashMap::new(),
- per_contig_stats: HashMap::new(),
- }
- }
- /// Summary report as a formatted string.
- pub fn summary(&self) -> String {
- let mut s = String::new();
- s.push_str("=== FbInv Statistics ===\n\n");
- s.push_str(&format!("Total events: {}\n\n", self.total_count));
- // Per-contig counts
- s.push_str("Per-contig counts:\n");
- let mut contigs: Vec<_> = self.per_contig_count.iter().collect();
- contigs.sort_by(|a, b| b.1.cmp(a.1)); // sort by count descending
- for (contig, count) in contigs {
- if let Some(cstats) = self.per_contig_stats.get(contig) {
- s.push_str(&format!(
- " {}: {} events, mean distance: {:.1} bp\n",
- contig, count, cstats.mean_distance
- ));
- } else {
- s.push_str(&format!(" {}: {}\n", contig, count));
- }
- }
- s.push('\n');
- // Segment order
- s.push_str("By segment order:\n");
- for (order, count) in &self.by_segment_order {
- s.push_str(&format!(" {:?}: {}\n", order, count));
- }
- s.push('\n');
- // Strand patterns
- s.push_str("Strand patterns:\n");
- for (pattern, count) in &self.strand_pattern_counts {
- let pct = (*count as f64 / self.total_count as f64) * 100.0;
- s.push_str(&format!(" {:?}: {} ({:.1}%)\n", pattern, count, pct));
- }
- s.push('\n');
- // Size distributions
- s.push_str("Size distributions:\n");
- s.push_str(&format_dist(" aln_a_len", &self.aln_a_len_stats));
- s.push_str(&format_dist(" aln_b_len", &self.aln_b_len_stats));
- s.push_str(&format_dist(" b_c", &self.b_c_stats));
- s.push_str(&format_dist(" a_d", &self.a_d_stats));
- s.push('\n');
- // Overlap metrics
- s.push_str("Overlap metrics:\n");
- s.push_str(&format_dist(
- " overlap_bp",
- &self.overlap_stats.overlap_bp_stats,
- ));
- s.push_str(&format!(
- " overlap_frac_a: mean={:.1}%, median={:.1}%\n",
- self.overlap_stats.overlap_fraction_a.mean / 10.0,
- self.overlap_stats.overlap_fraction_a.median / 10.0
- ));
- s.push_str(&format!(
- " overlap_frac_b: mean={:.1}%, median={:.1}%\n",
- self.overlap_stats.overlap_fraction_b.mean / 10.0,
- self.overlap_stats.overlap_fraction_b.median / 10.0
- ));
- s.push_str(&format!(
- " jaccard: mean={:.1}%, median={:.1}%\n",
- self.overlap_stats.jaccard_stats.mean / 10.0,
- self.overlap_stats.jaccard_stats.median / 10.0
- ));
- s
- }
- }
- /// Compute overlap in base pairs between alignment A and B on the reference.
- fn compute_overlap_bp(ev: &FbInv) -> i64 {
- let max_start = ev.aln_a_start.max(ev.aln_b_start);
- let min_end = ev.aln_a_end.min(ev.aln_b_end);
- (min_end - max_start).max(0)
- }
- /// Compute union in base pairs between alignment A and B on the reference.
- fn compute_union_bp(ev: &FbInv) -> i64 {
- let min_start = ev.aln_a_start.min(ev.aln_b_start);
- let max_end = ev.aln_a_end.max(ev.aln_b_end);
- max_end - min_start
- }
- fn format_dist(name: &str, d: &DistributionStats) -> String {
- format!(
- "{}: min={}, max={}, mean={:.1}, median={:.1}, std={:.1}\n",
- name, d.min, d.max, d.mean, d.median, d.std_dev
- )
- }
|