Thomas
/
pandora_lib_variants


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
							use std::time::Duration;

use anyhow::{Context, Ok, Result};
use hashbrown::HashMap;
use indicatif::{ProgressBar, ProgressStyle};
use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn chi_square_test_impl(observed: &[f64], expected: &[f64]) -> anyhow::Result<f64> {
    if observed.len() != expected.len() {
        return Err(anyhow::anyhow!("Input vectors must have the same length"));
    }

    // Calculate the chi-squared statistic
    let chi_squared_statistic: f64 = observed
        .iter()
        .zip(expected.iter())
        .map(|(obs, exp)| ((obs - exp).abs() - 0.5).powi(2) / exp)
        .sum();

    // Degrees of freedom is the number of categories minus 1
    // let degrees_of_freedom = (observed.len() - 1) as f64;
    let degrees_of_freedom = 1.0;

    // Calculate p-value using chi-squared distribution
    let chi_squared_distribution = ChiSquared::new(degrees_of_freedom).unwrap();
    let p_value = 1.0 - chi_squared_distribution.cdf(chi_squared_statistic);

    // You can use the p-value to make decisions based on your significance level
    // For example, with a significance level of 0.05, if p_value < 0.05, reject the null hypothesis
    Ok(p_value)
}

/// 2-sample test for equality of proportions with continuity correction
/// remerciements to chatGPT
pub fn chi_square_test_for_proportions(
    success_a: f64,
    total_a: f64,
    success_b: f64,
    total_b: f64,
) -> anyhow::Result<f64> {
    let observed_counts = vec![
        success_a,
        total_a - success_a,
        success_b,
        total_b - success_b,
    ];
    let expected_counts = vec![
        total_a * (success_a + success_b) / (total_a + total_b),
        total_a * (total_a - success_a + total_b - success_b) / (total_a + total_b),
        total_b * (success_a + success_b) / (total_a + total_b),
        total_b * (total_a - success_a + total_b - success_b) / (total_a + total_b),
    ];

    chi_square_test_impl(&observed_counts, &expected_counts)
}

pub fn get_hts_nt_pileup(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    start: i32,
    with_next_ins: bool,
) -> Result<Vec<u8>> {
    use rust_htslib::{bam, bam::Read};
    let stop = start + 1;
    let mut bases = Vec::new();
    bam.fetch((chr, start, stop))?;
    let mut bam_pileup = Vec::new();
    for p in bam.pileup() {
        let pileup = p.context(format!(
            "Can't pilup bam at position {}:{}-{}",
            chr, start, stop
        ))?;
        let position = pileup.pos() as i32;
        if position == start {
            for alignment in pileup.alignments() {
                match alignment.indel() {
                    bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
                    bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
                    _ => {
                        let record = alignment.record();
                        if record.seq_len() > 0 {
                            if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
                                bases.push(b);
                            }
                        } else {
                            if alignment.is_del() {
                                bases.push(b'D');
                            }
                        }
                    }
                }
            }
        }
    }
    Ok(bases)
}

pub fn hts_base_at(
    record: &rust_htslib::bam::record::Record,
    at_pos: u32,
    with_next_ins: bool,
) -> Result<Option<u8>> {
    use rust_htslib::bam::record::Cigar;

    let cigar = record.cigar();
    let seq = record.seq();
    let pos = cigar.pos() as u32;

    let mut read_i = 0u32;
    let at_pos = at_pos - 1;
    let mut ref_pos = pos;
    if ref_pos > at_pos {
        return Ok(None);
    }

    for (id, op) in cigar.iter().enumerate() {
        let (add_read, add_ref) = match *op {
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
            Cigar::Ins(len) => (len, 0),
            Cigar::Del(len) => (0, len),
            Cigar::RefSkip(len) => (0, len),
            Cigar::SoftClip(len) => (len, 0),
            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
        };
        // If at the end of the op len and next op is Ins return I
        if with_next_ins && ref_pos + add_read == at_pos + 1 {
            if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
                return Ok(Some(b'I'));
            }
        }

        if ref_pos + add_ref > at_pos {
            // Handle deletions directly
            if let Cigar::Del(_) = *op {
                return Ok(Some(b'D'));
            } else if let Cigar::RefSkip(_) = op {
                return Ok(None);
            } else {
                let diff = at_pos - ref_pos;
                let p = read_i + diff;
                return Ok(Some(seq[p as usize]));
            }
        }

        read_i += add_read;
        ref_pos += add_ref;
    }
    Ok(None)
}

// thanks to chatGPT (the best)
pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
    let m = dna_sequence.len() as f64;

    // Count occurrences of each base
    let mut bases = HashMap::<char, usize>::new();
    for base in dna_sequence.chars() {
        *bases.entry(base).or_insert(0) += 1;
    }

    // Calculate Shannon entropy
    let mut shannon_entropy_value = 0.0;
    for &n_i in bases.values() {
        let p_i = n_i as f64 / m;
        shannon_entropy_value -= p_i * p_i.log2();
    }

    shannon_entropy_value
}

pub fn print_stat_cat(s: &HashMap<String, u32>, denum: u32) {
    let denum = denum as f32;
    let mut v: Vec<(&String, &u32)> = s.iter().map(|e| e).collect();
    v.sort_by(|(_, a), (_, b)| b.cmp(a));

    let mut table = prettytable::table!(["category", "n", "%"]);

    v.iter().for_each(|(k, v)| {
        let p = (**v as f32) * 100 as f32 / denum;
        let p = format!("{:.2}", p);
        table.add_row([*k, &v.to_string(), &p].into());
    });

    table.printstd();
}

pub fn new_pg(len: u64) -> ProgressBar {
    let sty = ProgressStyle::with_template(
        " {spinner}  {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {human_pos:>7}/{human_len:7}",
    )
    .unwrap()
    .progress_chars("=>-");
    let pg = ProgressBar::new(len);
    pg.set_style(sty);
    pg.enable_steady_tick(Duration::from_millis(200));
    pg
}

pub fn new_pg_speed(len: u64) -> ProgressBar {
    let sty = ProgressStyle::with_template(
        "  {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {human_pos:>7}/{human_len:7} {per_sec}",
    )
    .unwrap()
    .progress_chars("=>-");
    let pg = ProgressBar::new(len);
    pg.set_style(sty);
    pg.enable_steady_tick(Duration::from_millis(200));
    pg
}

pub fn new_pg_bytes(len: u64) -> ProgressBar {
    let sty = ProgressStyle::with_template(
        "  {msg:>7.cyan} [{elapsed_precise}] [{bar:40}] {decimal_bytes:>7}/{decimal_total_bytes:7} {decimal_bytes_per_sec}",
    )
    .unwrap()
    .progress_chars("=>-");
    let pg = ProgressBar::new(len);
    pg.set_style(sty);
    pg.enable_steady_tick(Duration::from_millis(200));
    pg
}