fb_inv_stats.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. use std::collections::HashMap;
  2. use crate::io::bam::{FbInv, SegmentOrder};
  3. /// Distribution statistics for a numeric field.
  4. #[derive(Debug, Clone, Default)]
  5. pub struct DistributionStats {
  6. pub count: usize,
  7. pub min: i64,
  8. pub max: i64,
  9. pub sum: i64,
  10. pub mean: f64,
  11. pub median: f64,
  12. pub std_dev: f64,
  13. }
  14. impl DistributionStats {
  15. /// Compute distribution stats from a slice of values.
  16. pub fn from_values(values: &mut [i64]) -> Self {
  17. if values.is_empty() {
  18. return Self::default();
  19. }
  20. let count = values.len();
  21. let sum: i64 = values.iter().sum();
  22. let mean = sum as f64 / count as f64;
  23. // Sort for min/max/median
  24. values.sort_unstable();
  25. let min = values[0];
  26. let max = values[count - 1];
  27. let median = if count.is_multiple_of(2) {
  28. (values[count / 2 - 1] + values[count / 2]) as f64 / 2.0
  29. } else {
  30. values[count / 2] as f64
  31. };
  32. // Standard deviation
  33. let variance: f64 = values
  34. .iter()
  35. .map(|&v| (v as f64 - mean).powi(2))
  36. .sum::<f64>()
  37. / count as f64;
  38. let std_dev = variance.sqrt();
  39. Self {
  40. count,
  41. min,
  42. max,
  43. sum,
  44. mean,
  45. median,
  46. std_dev,
  47. }
  48. }
  49. }
  50. /// Strand pattern for fold-back inversions.
  51. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
  52. pub enum StrandPattern {
  53. PlusMinus, // A on +, B on -
  54. MinusPlus, // A on -, B on +
  55. }
  56. impl StrandPattern {
  57. pub fn from_strands(a_strand: char, b_strand: char) -> Self {
  58. match (a_strand, b_strand) {
  59. ('+', '-') => StrandPattern::PlusMinus,
  60. ('-', '+') => StrandPattern::MinusPlus,
  61. _ => unreachable!("FbInv requires opposite strands"),
  62. }
  63. }
  64. }
  65. /// Per-contig statistics.
  66. #[derive(Debug, Clone, Default)]
  67. pub struct ContigStats {
  68. pub count: usize,
  69. pub mean_distance: f64, // mean distance between consecutive events (sorted by position)
  70. pub positions: Vec<i64>, // stored temporarily for distance calculation
  71. }
  72. /// Comprehensive statistics computed from Vec<FbInv>.
  73. #[derive(Debug, Clone)]
  74. pub struct FbInvStats {
  75. // Counts
  76. pub total_count: usize,
  77. pub per_contig_count: HashMap<String, usize>,
  78. pub by_segment_order: HashMap<SegmentOrder, usize>,
  79. // Size distributions
  80. pub aln_a_len_stats: DistributionStats,
  81. pub aln_b_len_stats: DistributionStats,
  82. pub b_c_stats: DistributionStats,
  83. pub a_d_stats: DistributionStats,
  84. // Overlap metrics
  85. pub overlap_stats: OverlapStats,
  86. // Strand patterns
  87. pub strand_pattern_counts: HashMap<StrandPattern, usize>,
  88. // Per-contig with mean distance
  89. pub per_contig_stats: HashMap<String, ContigStats>,
  90. }
  91. /// Overlap metrics between alignment A and B.
  92. #[derive(Debug, Clone, Default)]
  93. pub struct OverlapStats {
  94. pub overlap_bp_stats: DistributionStats, // overlap in base pairs
  95. pub overlap_fraction_a: DistributionStats, // overlap / aln_a_len (stored as i64 * 1000 for precision)
  96. pub overlap_fraction_b: DistributionStats, // overlap / aln_b_len
  97. pub jaccard_stats: DistributionStats, // overlap / union (stored as i64 * 1000)
  98. }
  99. impl FbInvStats {
  100. /// Compute all statistics from a Vec<FbInv>.
  101. /// Assumes events are sorted by (contig, position) for distance calculations.
  102. pub fn from_events(events: &[FbInv]) -> Self {
  103. if events.is_empty() {
  104. return Self::empty();
  105. }
  106. let total_count = events.len();
  107. // Counts
  108. let mut per_contig_count: HashMap<String, usize> = HashMap::new();
  109. let mut by_segment_order: HashMap<SegmentOrder, usize> = HashMap::new();
  110. let mut strand_pattern_counts: HashMap<StrandPattern, usize> = HashMap::new();
  111. // Collect values for distributions
  112. let mut aln_a_lens: Vec<i64> = Vec::with_capacity(total_count);
  113. let mut aln_b_lens: Vec<i64> = Vec::with_capacity(total_count);
  114. let mut b_c_vals: Vec<i64> = Vec::with_capacity(total_count);
  115. let mut a_d_vals: Vec<i64> = Vec::with_capacity(total_count);
  116. // Overlap values
  117. let mut overlap_bps: Vec<i64> = Vec::with_capacity(total_count);
  118. let mut overlap_frac_a: Vec<i64> = Vec::with_capacity(total_count);
  119. let mut overlap_frac_b: Vec<i64> = Vec::with_capacity(total_count);
  120. let mut jaccard_vals: Vec<i64> = Vec::with_capacity(total_count);
  121. // Per-contig positions for distance calculation
  122. let mut per_contig_positions: HashMap<String, Vec<i64>> = HashMap::new();
  123. for ev in events {
  124. // Counts
  125. *per_contig_count.entry(ev.ref_name.clone()).or_insert(0) += 1;
  126. *by_segment_order.entry(ev.first).or_insert(0) += 1;
  127. let pattern = StrandPattern::from_strands(ev.aln_a_strand, ev.aln_b_strand);
  128. *strand_pattern_counts.entry(pattern).or_insert(0) += 1;
  129. // Size distributions
  130. aln_a_lens.push(ev.aln_a_len);
  131. aln_b_lens.push(ev.aln_b_len);
  132. b_c_vals.push(ev.b_c);
  133. a_d_vals.push(ev.a_d);
  134. // Overlap calculation
  135. let overlap_bp = compute_overlap_bp(ev);
  136. overlap_bps.push(overlap_bp);
  137. if ev.aln_a_len > 0 {
  138. overlap_frac_a.push((overlap_bp * 1000) / ev.aln_a_len);
  139. }
  140. if ev.aln_b_len > 0 {
  141. overlap_frac_b.push((overlap_bp * 1000) / ev.aln_b_len);
  142. }
  143. let union = compute_union_bp(ev);
  144. if union > 0 {
  145. jaccard_vals.push((overlap_bp * 1000) / union);
  146. }
  147. // Store position for distance calculation (use midpoint of A alignment)
  148. let midpoint = (ev.aln_a_start + ev.aln_a_end) / 2;
  149. per_contig_positions
  150. .entry(ev.ref_name.clone())
  151. .or_default()
  152. .push(midpoint);
  153. }
  154. // Compute per-contig stats with mean distance
  155. let mut per_contig_stats: HashMap<String, ContigStats> = HashMap::new();
  156. for (contig, mut positions) in per_contig_positions {
  157. let count = positions.len();
  158. positions.sort_unstable(); // ensure sorted
  159. let mean_distance = if count > 1 {
  160. let total_distance: i64 = positions.windows(2).map(|w| (w[1] - w[0]).abs()).sum();
  161. total_distance as f64 / (count - 1) as f64
  162. } else {
  163. 0.0
  164. };
  165. per_contig_stats.insert(
  166. contig.clone(),
  167. ContigStats {
  168. count,
  169. mean_distance,
  170. positions,
  171. },
  172. );
  173. }
  174. Self {
  175. total_count,
  176. per_contig_count,
  177. by_segment_order,
  178. aln_a_len_stats: DistributionStats::from_values(&mut aln_a_lens),
  179. aln_b_len_stats: DistributionStats::from_values(&mut aln_b_lens),
  180. b_c_stats: DistributionStats::from_values(&mut b_c_vals),
  181. a_d_stats: DistributionStats::from_values(&mut a_d_vals),
  182. overlap_stats: OverlapStats {
  183. overlap_bp_stats: DistributionStats::from_values(&mut overlap_bps),
  184. overlap_fraction_a: DistributionStats::from_values(&mut overlap_frac_a),
  185. overlap_fraction_b: DistributionStats::from_values(&mut overlap_frac_b),
  186. jaccard_stats: DistributionStats::from_values(&mut jaccard_vals),
  187. },
  188. strand_pattern_counts,
  189. per_contig_stats,
  190. }
  191. }
  192. fn empty() -> Self {
  193. Self {
  194. total_count: 0,
  195. per_contig_count: HashMap::new(),
  196. by_segment_order: HashMap::new(),
  197. aln_a_len_stats: DistributionStats::default(),
  198. aln_b_len_stats: DistributionStats::default(),
  199. b_c_stats: DistributionStats::default(),
  200. a_d_stats: DistributionStats::default(),
  201. overlap_stats: OverlapStats::default(),
  202. strand_pattern_counts: HashMap::new(),
  203. per_contig_stats: HashMap::new(),
  204. }
  205. }
  206. /// Summary report as a formatted string.
  207. pub fn summary(&self) -> String {
  208. let mut s = String::new();
  209. s.push_str("=== FbInv Statistics ===\n\n");
  210. s.push_str(&format!("Total events: {}\n\n", self.total_count));
  211. // Per-contig counts
  212. s.push_str("Per-contig counts:\n");
  213. let mut contigs: Vec<_> = self.per_contig_count.iter().collect();
  214. contigs.sort_by(|a, b| b.1.cmp(a.1)); // sort by count descending
  215. for (contig, count) in contigs {
  216. if let Some(cstats) = self.per_contig_stats.get(contig) {
  217. s.push_str(&format!(
  218. " {}: {} events, mean distance: {:.1} bp\n",
  219. contig, count, cstats.mean_distance
  220. ));
  221. } else {
  222. s.push_str(&format!(" {}: {}\n", contig, count));
  223. }
  224. }
  225. s.push('\n');
  226. // Segment order
  227. s.push_str("By segment order:\n");
  228. for (order, count) in &self.by_segment_order {
  229. s.push_str(&format!(" {:?}: {}\n", order, count));
  230. }
  231. s.push('\n');
  232. // Strand patterns
  233. s.push_str("Strand patterns:\n");
  234. for (pattern, count) in &self.strand_pattern_counts {
  235. let pct = (*count as f64 / self.total_count as f64) * 100.0;
  236. s.push_str(&format!(" {:?}: {} ({:.1}%)\n", pattern, count, pct));
  237. }
  238. s.push('\n');
  239. // Size distributions
  240. s.push_str("Size distributions:\n");
  241. s.push_str(&format_dist(" aln_a_len", &self.aln_a_len_stats));
  242. s.push_str(&format_dist(" aln_b_len", &self.aln_b_len_stats));
  243. s.push_str(&format_dist(" b_c", &self.b_c_stats));
  244. s.push_str(&format_dist(" a_d", &self.a_d_stats));
  245. s.push('\n');
  246. // Overlap metrics
  247. s.push_str("Overlap metrics:\n");
  248. s.push_str(&format_dist(
  249. " overlap_bp",
  250. &self.overlap_stats.overlap_bp_stats,
  251. ));
  252. s.push_str(&format!(
  253. " overlap_frac_a: mean={:.1}%, median={:.1}%\n",
  254. self.overlap_stats.overlap_fraction_a.mean / 10.0,
  255. self.overlap_stats.overlap_fraction_a.median / 10.0
  256. ));
  257. s.push_str(&format!(
  258. " overlap_frac_b: mean={:.1}%, median={:.1}%\n",
  259. self.overlap_stats.overlap_fraction_b.mean / 10.0,
  260. self.overlap_stats.overlap_fraction_b.median / 10.0
  261. ));
  262. s.push_str(&format!(
  263. " jaccard: mean={:.1}%, median={:.1}%\n",
  264. self.overlap_stats.jaccard_stats.mean / 10.0,
  265. self.overlap_stats.jaccard_stats.median / 10.0
  266. ));
  267. s
  268. }
  269. }
  270. /// Compute overlap in base pairs between alignment A and B on the reference.
  271. fn compute_overlap_bp(ev: &FbInv) -> i64 {
  272. let max_start = ev.aln_a_start.max(ev.aln_b_start);
  273. let min_end = ev.aln_a_end.min(ev.aln_b_end);
  274. (min_end - max_start).max(0)
  275. }
  276. /// Compute union in base pairs between alignment A and B on the reference.
  277. fn compute_union_bp(ev: &FbInv) -> i64 {
  278. let min_start = ev.aln_a_start.min(ev.aln_b_start);
  279. let max_end = ev.aln_a_end.max(ev.aln_b_end);
  280. max_end - min_start
  281. }
  282. fn format_dist(name: &str, d: &DistributionStats) -> String {
  283. format!(
  284. "{}: min={}, max={}, mean={:.1}, median={:.1}, std={:.1}\n",
  285. name, d.min, d.max, d.mean, d.median, d.std_dev
  286. )
  287. }