utils.rs 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. use std::time::Duration;
  2. use anyhow::{Context, Ok, Result};
  3. use hashbrown::HashMap;
  4. use indicatif::{ProgressBar, ProgressStyle};
  5. use statrs::distribution::{ChiSquared, ContinuousCDF};
  6. pub fn chi_square_test_impl(observed: &[f64], expected: &[f64]) -> anyhow::Result<f64> {
  7. if observed.len() != expected.len() {
  8. return Err(anyhow::anyhow!("Input vectors must have the same length"));
  9. }
  10. // Calculate the chi-squared statistic
  11. let chi_squared_statistic: f64 = observed
  12. .iter()
  13. .zip(expected.iter())
  14. .map(|(obs, exp)| ((obs - exp).abs() - 0.5).powi(2) / exp)
  15. .sum();
  16. // Degrees of freedom is the number of categories minus 1
  17. // let degrees_of_freedom = (observed.len() - 1) as f64;
  18. let degrees_of_freedom = 1.0;
  19. // Calculate p-value using chi-squared distribution
  20. let chi_squared_distribution = ChiSquared::new(degrees_of_freedom).unwrap();
  21. let p_value = 1.0 - chi_squared_distribution.cdf(chi_squared_statistic);
  22. // You can use the p-value to make decisions based on your significance level
  23. // For example, with a significance level of 0.05, if p_value < 0.05, reject the null hypothesis
  24. Ok(p_value)
  25. }
  26. /// 2-sample test for equality of proportions with continuity correction
  27. /// remerciements to chatGPT
  28. pub fn chi_square_test_for_proportions(
  29. success_a: f64,
  30. total_a: f64,
  31. success_b: f64,
  32. total_b: f64,
  33. ) -> anyhow::Result<f64> {
  34. let observed_counts = vec![
  35. success_a,
  36. total_a - success_a,
  37. success_b,
  38. total_b - success_b,
  39. ];
  40. let expected_counts = vec![
  41. total_a * (success_a + success_b) / (total_a + total_b),
  42. total_a * (total_a - success_a + total_b - success_b) / (total_a + total_b),
  43. total_b * (success_a + success_b) / (total_a + total_b),
  44. total_b * (total_a - success_a + total_b - success_b) / (total_a + total_b),
  45. ];
  46. chi_square_test_impl(&observed_counts, &expected_counts)
  47. }
  48. pub fn get_hts_nt_pileup(
  49. bam: &mut rust_htslib::bam::IndexedReader,
  50. chr: &str,
  51. start: i32,
  52. with_next_ins: bool,
  53. ) -> Result<Vec<u8>> {
  54. use rust_htslib::{bam, bam::Read};
  55. let stop = start + 1;
  56. let mut bases = Vec::new();
  57. bam.fetch((chr, start, stop))?;
  58. let mut bam_pileup = Vec::new();
  59. for p in bam.pileup() {
  60. let pileup = p.context(format!(
  61. "Can't pilup bam at position {}:{}-{}",
  62. chr, start, stop
  63. ))?;
  64. let position = pileup.pos() as i32;
  65. if position == start {
  66. for alignment in pileup.alignments() {
  67. match alignment.indel() {
  68. bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
  69. bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
  70. _ => {
  71. let record = alignment.record();
  72. if record.seq_len() > 0 {
  73. if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
  74. bases.push(b);
  75. }
  76. } else {
  77. if alignment.is_del() {
  78. bases.push(b'D');
  79. }
  80. }
  81. }
  82. }
  83. }
  84. }
  85. }
  86. Ok(bases)
  87. }
  88. pub fn hts_base_at(
  89. record: &rust_htslib::bam::record::Record,
  90. at_pos: u32,
  91. with_next_ins: bool,
  92. ) -> Result<Option<u8>> {
  93. use rust_htslib::bam::record::Cigar;
  94. let cigar = record.cigar();
  95. let seq = record.seq();
  96. let pos = cigar.pos() as u32;
  97. let mut read_i = 0u32;
  98. let at_pos = at_pos - 1;
  99. let mut ref_pos = pos;
  100. if ref_pos > at_pos {
  101. return Ok(None);
  102. }
  103. for (id, op) in cigar.iter().enumerate() {
  104. let (add_read, add_ref) = match *op {
  105. Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
  106. Cigar::Ins(len) => (len, 0),
  107. Cigar::Del(len) => (0, len),
  108. Cigar::RefSkip(len) => (0, len),
  109. Cigar::SoftClip(len) => (len, 0),
  110. Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
  111. };
  112. // If at the end of the op len and next op is Ins return I
  113. if with_next_ins && ref_pos + add_read == at_pos + 1 {
  114. if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
  115. return Ok(Some(b'I'));
  116. }
  117. }
  118. if ref_pos + add_ref > at_pos {
  119. // Handle deletions directly
  120. if let Cigar::Del(_) = *op {
  121. return Ok(Some(b'D'));
  122. } else if let Cigar::RefSkip(_) = op {
  123. return Ok(None);
  124. } else {
  125. let diff = at_pos - ref_pos;
  126. let p = read_i + diff;
  127. return Ok(Some(seq[p as usize]));
  128. }
  129. }
  130. read_i += add_read;
  131. ref_pos += add_ref;
  132. }
  133. Ok(None)
  134. }
  135. // thanks to chatGPT (the best)
  136. pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
  137. let m = dna_sequence.len() as f64;
  138. // Count occurrences of each base
  139. let mut bases = HashMap::<char, usize>::new();
  140. for base in dna_sequence.chars() {
  141. *bases.entry(base).or_insert(0) += 1;
  142. }
  143. // Calculate Shannon entropy
  144. let mut shannon_entropy_value = 0.0;
  145. for &n_i in bases.values() {
  146. let p_i = n_i as f64 / m;
  147. shannon_entropy_value -= p_i * p_i.log2();
  148. }
  149. shannon_entropy_value
  150. }
  151. pub fn print_stat_cat(s: &HashMap<String, u32>, denum: u32) {
  152. let denum = denum as f32;
  153. let mut v: Vec<(&String, &u32)> = s.iter().map(|e| e).collect();
  154. v.sort_by(|(_, a), (_, b)| b.cmp(a));
  155. let mut table = prettytable::table!(["category", "n", "%"]);
  156. v.iter().for_each(|(k, v)| {
  157. let p = (**v as f32) * 100 as f32 / denum;
  158. let p = format!("{:.2}", p);
  159. table.add_row([*k, &v.to_string(), &p].into());
  160. });
  161. table.printstd();
  162. }
  163. pub fn new_pg(len: u64) -> ProgressBar {
  164. let sty = ProgressStyle::with_template(
  165. " {spinner} {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {human_pos:>7}/{human_len:7}",
  166. )
  167. .unwrap()
  168. .progress_chars("=>-");
  169. let pg = ProgressBar::new(len);
  170. pg.set_style(sty);
  171. pg.enable_steady_tick(Duration::from_millis(200));
  172. pg
  173. }
  174. pub fn new_pg_speed(len: u64) -> ProgressBar {
  175. let sty = ProgressStyle::with_template(
  176. " {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {human_pos:>7}/{human_len:7} {per_sec}",
  177. )
  178. .unwrap()
  179. .progress_chars("=>-");
  180. let pg = ProgressBar::new(len);
  181. pg.set_style(sty);
  182. pg.enable_steady_tick(Duration::from_millis(200));
  183. pg
  184. }
  185. pub fn new_pg_bytes(len: u64) -> ProgressBar {
  186. let sty = ProgressStyle::with_template(
  187. " {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {decimal_bytes:>7}/{decimal_total_bytes:7} {decimal_bytes_per_sec}",
  188. )
  189. .unwrap()
  190. .progress_chars("=>-");
  191. let pg = ProgressBar::new(len);
  192. pg.set_style(sty);
  193. pg.enable_steady_tick(Duration::from_millis(200));
  194. pg
  195. }