Thomas
/
pandora_lib_promethion


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120
							use core::fmt;
use std::{
    collections::{BTreeMap, HashMap},
    fs::{self, File},
    path::PathBuf,
};

use anyhow::{anyhow, Context};
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use glob::glob;
use log::{debug, info, warn};
use rand::{rng, Rng};
use rayon::prelude::*;
use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
use serde::{Deserialize, Serialize};

use crate::config::Config;

use super::flowcells::FlowCell;

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct WGSBam {
    pub id: String,
    pub time_point: String,
    pub reference_genome: String,
    // pub bam_type: BamType,
    pub path: PathBuf,
    pub modified: DateTime<Utc>,
    pub bam_stats: WGSBamStats,
    // pub cramino: Option<CraminoRes>,
    pub composition: Vec<(String, String, f64)>, // acquisition id, fn
}

// #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)]
// pub enum BamType {
//     WGS,
//     Panel(String),
//     ChIP(String),
// }
//
impl WGSBam {
    pub fn new(path: PathBuf) -> anyhow::Result<Self> {
        let stem = path
            .clone()
            .file_stem()
            .context(format!("Can't parse stem from {}", path.display()))?
            .to_string_lossy()
            .to_string();
        let stem: Vec<&str> = stem.split('_').collect();

        if stem.len() != 3 {
            return Err(anyhow!("Error in bam name: {}", path.display()));
        }

        let id = stem[0].to_string();
        let time_point = stem[1].to_string();

        let reference_genome = stem
            .last()
            .context("Can't get last from stem {stem}")?
            .to_string();

        // let bam_type = if stem.len() == 4 {
        //     match stem[2] {
        //         "oncoT" => BamType::Panel("oncoT".to_string()),
        //         "H3K27ac" => BamType::ChIP("H3K27ac".to_string()),
        //         "H3K4me3" => BamType::ChIP("H3K4me3".to_string()),
        //         _ => return Err(anyhow!("Error in bam name: {}", path.display())),
        //     }
        // } else {
        //     BamType::WGS
        // };
        let file_metadata = fs::metadata(&path)?;
        let modified = file_metadata.modified()?;

        let tp_dir = path
            .parent()
            .context("Can't parse parent from: {bam_path}")?;

        let json_path = format!(
            "{}/{id}_{time_point}_{reference_genome}_info.json",
            tp_dir.to_string_lossy()
        );
        let json_file = PathBuf::from(&json_path);

        if json_file.exists() && json_file.metadata()?.modified()? > modified {
            match Self::load_json(&json_path) {
                Ok(s) => return Ok(s),
                Err(e) => {
                    warn!("Error while loading {}.\n{e}", json_path);
                }
            }
        }

        // let cramino_path = format!(
        //     "{}/{id}_{time_point}_{reference_genome}_cramino.txt",
        //     tp_dir.to_string_lossy()
        // );

        // let cramino = if bam_type == BamType::WGS {
        //     if !PathBuf::from_str(&cramino_path)?.exists() {
        //         // return Err(anyhow!("Cramino file missing {cramino_path}"));
        //         Cramino::default().with_bam(path.to_str().unwrap())?;
        //     }
        //     let mut cramino = Cramino::default().with_result_path(&cramino_path);
        //     cramino
        //         .parse_results()
        //         .context(format!("Error while parsing cramino for {cramino_path}"))?;
        //
        //     if let Some(cramino) = cramino.results {
        //         Some(cramino)
        //     } else {
        //         return Err(anyhow!("Cramino results parsing failed"));
        //     }
        // } else {
        //     None
        // };

        let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
            "Error while reading BAM composition for {}",
            path.display()
        ))?;

        let s = Self {
            path,
            bam_stats: WGSBamStats::new(&id, &time_point, &Config::default())?,
            // cramino,
            id: id.to_string(),
            time_point: time_point.to_string(),
            // bam_type,
            reference_genome,
            composition,
            modified: modified.into(),
        };
        s.save_json(&json_path)?;
        Ok(s)
    }

    pub fn load_json(path: &str) -> anyhow::Result<Self> {
        let f = File::open(path)?;
        let s: Self = serde_json::from_reader(f)?;
        Ok(s)
    }

    pub fn save_json(&self, path: &str) -> anyhow::Result<()> {
        let f = File::create(path)?;
        serde_json::to_writer(f, self)?;
        Ok(())
    }
}

#[derive(Debug, Default, Clone)]
pub struct BamCollection {
    pub bams: Vec<WGSBam>,
}

impl BamCollection {
    pub fn new(result_dir: &str) -> Self {
        load_bam_collection(result_dir)
    }

    pub fn by_acquisition_id(&self) -> HashMap<String, Vec<&WGSBam>> {
        let mut acq: HashMap<String, Vec<&WGSBam>> = HashMap::new();
        for bam in self.bams.iter() {
            for (acq_id, _, _) in bam.composition.iter() {
                if let Some(entry) = acq.get_mut(acq_id) {
                    entry.push(bam);
                } else {
                    acq.insert(acq_id.to_string(), vec![bam]);
                }
            }
        }
        acq
    }

    pub fn get(&self, id: &str, time_point: &str) -> Vec<&WGSBam> {
        self.bams
            .iter()
            .filter(|b| b.id == id && b.time_point == time_point)
            .collect()
    }

    pub fn by_id_completed(&self, min_diag_cov: f32, min_mrd_cov: f32) -> Vec<WGSBam> {
        self.bams
            .iter()
            // .filter(|b| matches!(b.bam_type, BamType::WGS))
            .filter(|b| match b.time_point.as_str() {
                "diag" => b.bam_stats.global_coverage >= min_diag_cov as f64,
                "mrd" => b.bam_stats.global_coverage >= min_mrd_cov as f64,
                _ => false,
            })
            // .filter(|b| match &b.cramino {
            //     Some(cramino) => match b.time_point.as_str() {
            //         "diag" => cramino.mean_length >= min_diag_cov as f64,
            //         "mrd" => cramino.mean_length >= min_mrd_cov as f64,
            //         _ => false,
            //     },
            //     _ => false,
            // })
            .cloned()
            .collect()
    }

    pub fn ids(&self) -> Vec<String> {
        let mut ids: Vec<String> = self.bams.iter().map(|b| b.id.clone()).collect();
        ids.sort();
        ids.dedup();
        ids
    }
}

pub fn load_bam_collection(result_dir: &str) -> BamCollection {
    let pattern = format!("{}/*/*/*.bam", result_dir);
    let bams = glob(&pattern)
        .expect("Failed to read glob pattern.")
        // .par_bridge()
        .filter_map(|entry| {
            match entry {
                Ok(path) => match WGSBam::new(path) {
                    Ok(bam) => return Some(bam),
                    Err(e) => warn!("{e}"),
                },
                Err(e) => warn!("Error: {:?}", e),
            }
            None
        })
        .collect();

    BamCollection { bams }
}

/// Computes the composition of `(RG, fn)` tag prefixes in a BAM file sample.
///
/// This function parses a BAM file and, for a subset of records (`sample_size`),
/// extracts the `RG` (read group) and `fn` (typically a file or flowcell ID) string tags from each record.
/// It groups records by the prefix (before `_`, if present) of each tag,
/// counts occurrences for each pair, and reports their proportion in the sample.
///
/// # Arguments
///
/// * `file_path` - Path to the BAM file.
/// * `sample_size` - Maximum number of records to sample and process.
///
/// # Returns
///
/// Returns a vector of tuples, each containing:
///   - `String`: the <runid> (up to the first underscore of the `RG` tag, or full tag if no underscore)
///   - `String`: the <flowcell-id> (up to the first underscore of the `fn` tag, or full tag if no underscore)
///   - `f64`: percentage of records with this `(runid, flowcell-id)` prefix pair, relative to total successfully processed records
///
/// # Errors
///
/// Returns an error if the BAM file cannot be opened or read.
/// Any record in the sampled set does not have both required tags, or the tags are not strings.
///
/// # Examples
///
/// ```no_run
/// # use anyhow::Result;
/// let stats = bam_composition("reads.bam", 10_000)?;
/// for (rg, flowcell, pct) in stats {
///     println!("{} / {}: {:.2}%", rg, flowcell, pct);
/// }
/// # Ok::<(), anyhow::Error>(())
/// ```
///
/// # Notes
///
/// - Only the first `sample_size` records in the BAM are processed.
/// - Percentages are normalized by the number of records that had both required tags.
/// - This function uses the `rust-htslib` crate for BAM parsing.
///
/// # Tag requirements
///
/// - Both `RG` and `fn` tags must be present and of string type.
/// - Tag values are split on the first underscore; only the prefix is used for grouping.
///
/// # See also
///
/// - [SAM/BAM tag conventions](https://samtools.github.io/hts-specs/SAMv1.pdf)
/// - [Dorado SAM specifications](https://github.com/nanoporetech/dorado/blob/release-v1.0/documentation/SAM.md)
///
pub fn bam_composition(
    file_path: &str,
    sample_size: usize,
) -> anyhow::Result<Vec<(String, String, f64)>> {
    use std::collections::HashMap;
    use rust_htslib::bam::{Reader, record::Aux};

    let mut bam = Reader::from_path(file_path)?;
    let mut rgs: HashMap<(String, String), u64> = HashMap::new();

    let mut processed = 0u64;
    for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() {
        let rg = match rec.aux(b"RG") {
            Ok(Aux::String(s)) => s,
            Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)),
            Err(_) => return Err(anyhow::anyhow!("Record {}: RG tag missing", i + 1)),
        };
        let fn_tag = match rec.aux(b"fn") {
            Ok(Aux::String(b)) => b,
            Ok(_) => return Err(anyhow::anyhow!("Record {}: fn tag is not a string", i + 1)),
            Err(_) => return Err(anyhow::anyhow!("Record {}: fn tag missing", i + 1)),
        };
        let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
        let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
        *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
        processed += 1;
    }

    let results: Vec<_> = rgs
        .into_iter()
        .map(|((rg, fn_tag), n)| (
            rg,
            fn_tag,
            n as f64 * 100.0 / processed as f64
        ))
        .collect();

    Ok(results)
}

pub fn bam_has_fc(bam: &WGSBam, flow_cell: &FlowCell) -> anyhow::Result<bool> {
    let mut bam = rust_htslib::bam::Reader::from_path(&bam.path)?;

    let mut rgs: HashMap<String, u64> = HashMap::new();
    for result in bam.records().filter_map(Result::ok).take(2_000) {
        if let rust_htslib::bam::record::Aux::String(s) = result.aux(b"fn")? {
            *rgs.entry(s.to_string()).or_default() += 1;
        }
    }

    for k in rgs.keys() {
        if k.contains(&flow_cell.sample_sheet.flow_cell_id) {
            return Ok(true);
        }
    }

    Ok(false)
}

pub struct BamReadsSampler {
    pub positions: Vec<(String, u64)>,
    pub reader: rust_htslib::bam::IndexedReader,
    current_index: usize,
}

impl BamReadsSampler {
    pub fn new(path: &str, n: usize) -> anyhow::Result<Self> {
        debug!("Reading BAM file: {path}");
        let reader = rust_htslib::bam::IndexedReader::from_path(path)?;
        let header = reader.header();

        let contig_len: Vec<(String, u64)> = header
            .target_names()
            .into_iter()
            .map(|target_name| {
                let tid = header.tid(target_name).unwrap();
                (
                    String::from_utf8(target_name.to_vec()).unwrap(),
                    header.target_len(tid).unwrap(),
                )
            })
            .collect();

        let positions = sample_random_positions(&contig_len, n);

        Ok(Self {
            positions,
            reader,
            current_index: 0,
        })
    }
}

impl Iterator for BamReadsSampler {
    type Item = Result<rust_htslib::bam::Record, rust_htslib::errors::Error>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current_index < self.positions.len() {
                let (contig, pos) = &self.positions[self.current_index];

                match self.reader.fetch((contig, *pos, pos + 1)) {
                    Ok(_) => (),
                    Err(e) => warn!("Error while reading BAM {e}"),
                }

                let result = self.reader.records().next();

                self.current_index += 1;
                if result.is_some() {
                    return result;
                }
            } else {
                return None;
            }
        }
    }
}

pub fn n_mapped_reads(bam_path: &str) -> anyhow::Result<u64> {
    let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path)?;
    let mapped = bam
        .index_stats()?
        .into_iter()
        .filter(|(tid, _, _, _)| *tid < 24)
        .map(|(_, _, m, _)| m)
        .sum();
    Ok(mapped)
}

// 0-based inclusif
pub fn nt_pileup(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    position: u32,
    with_next_ins: bool,
) -> anyhow::Result<Vec<u8>> {
    use rust_htslib::{bam, bam::Read};
    let mut bases = Vec::new();
    bam.fetch((chr, position, position + 1))?;
    let mut bam_pileup = Vec::new();
    for p in bam.pileup() {
        let pileup = p.context(format!(
            "Can't pileup bam at position {}:{} (0-based)",
            chr, position
        ))?;
        let cur_position = pileup.pos();
        if cur_position == position {
            for alignment in pileup.alignments() {
                match alignment.indel() {
                    bam::pileup::Indel::Ins(_len) => bam_pileup.push(b'I'),
                    bam::pileup::Indel::Del(_len) => bam_pileup.push(b'D'),
                    _ => {
                        let record = alignment.record();
                        if record.seq_len() > 0 {
                            if let Some(b) = base_at(&record, position, with_next_ins)? {
                                bases.push(b);
                            }
                        } else if alignment.is_del() {
                            bases.push(b'D');
                        }
                    }
                }
            }
        }
    }
    Ok(bases)
}

pub fn base_at(
    record: &rust_htslib::bam::record::Record,
    at_pos: u32,
    with_next_ins: bool,
) -> anyhow::Result<Option<u8>> {
    let cigar = record.cigar();
    let seq = record.seq();
    let pos = cigar.pos() as u32;

    let mut read_i = 0u32;
    // let at_pos = at_pos - 1;
    let mut ref_pos = pos;
    if ref_pos > at_pos {
        return Ok(None);
    }

    for (id, op) in cigar.iter().enumerate() {
        let (add_read, add_ref) = match *op {
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
            Cigar::Ins(len) => (len, 0),
            Cigar::Del(len) => (0, len),
            Cigar::RefSkip(len) => (0, len),
            Cigar::SoftClip(len) => (len, 0),
            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
        };
        // If at the end of the op len and next op is Ins return I
        if with_next_ins && ref_pos + add_read == at_pos + 1 {
            if let Some(Cigar::Ins(_)) = cigar.get(id + 1) {
                return Ok(Some(b'I'));
            }
        }

        if ref_pos + add_ref > at_pos {
            // Handle deletions directly
            if let Cigar::Del(_) = *op {
                return Ok(Some(b'D'));
            } else if let Cigar::RefSkip(_) = op {
                return Ok(None);
            } else {
                let diff = at_pos - ref_pos;
                let p = read_i + diff;
                return Ok(Some(seq[p as usize]));
            }
        }

        read_i += add_read;
        ref_pos += add_ref;
    }
    Ok(None)
}

pub fn counts_at(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    position: u32,
) -> anyhow::Result<HashMap<String, i32>> {
    let p = nt_pileup(bam, chr, position, false)?
        .iter()
        .map(|e| String::from_utf8(vec![*e]).unwrap())
        .collect::<Vec<_>>();
    let mut counts = HashMap::new();
    for item in p.iter() {
        *counts.entry(item.to_string()).or_insert(0) += 1;
    }
    Ok(counts)
}

pub fn ins_pileup(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    position: u32,
) -> anyhow::Result<Vec<String>> {
    let mut bases = Vec::new();
    bam.fetch((chr, position, position + 10))?;
    for p in bam.pileup() {
        let pileup = p.context(format!(
            "Can't pileup bam at position {}:{} (0-based)",
            chr, position
        ))?;
        let cur_position = pileup.pos();
        // Ins in the next position
        if cur_position == position + 1 {
            // debug!("{cur_position}");
            for alignment in pileup.alignments() {
                let record = alignment.record();
                if record.seq_len() > 0 {
                    if let Some(b) = ins_at(&record, position)? {
                        bases.push(b);
                    }
                }
            }
        }
    }
    Ok(bases)
}

pub fn ins_at(
    record: &rust_htslib::bam::record::Record,
    at_pos: u32,
) -> anyhow::Result<Option<String>> {
    use rust_htslib::bam::record::Cigar;

    let cigar = record.cigar();
    let seq = record.seq();
    let pos = cigar.pos() as u32;

    let mut read_i = 0u32;
    let mut ref_pos = pos;
    if ref_pos > at_pos {
        return Ok(None);
    }
    // debug!(
    //     "read: {}",
    //     String::from_utf8(record.qname().to_vec()).unwrap()
    // );

    for op in cigar.iter() {
        let (add_read, add_ref) = match *op {
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
            Cigar::Ins(len) => (len, 0),
            Cigar::Del(len) => (0, len),
            Cigar::RefSkip(len) => (0, len),
            Cigar::SoftClip(len) => (len, 0),
            Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
        };

        if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 {
            if let Cigar::Ins(v) = *op {
                // debug!(
                //     "ins size {v} @ {} (corrected {})",
                //     ref_pos + add_read,
                //     (ref_pos + add_read) - v - 1
                // );

                if (ref_pos + add_read) - v - 1 == at_pos {
                    let inserted_seq =
                        seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec();
                    return Ok(Some(String::from_utf8(inserted_seq)?));
                }
            }
        }

        read_i += add_read;
        ref_pos += add_ref;
    }
    Ok(None)
}

pub fn counts_ins_at(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    position: u32,
) -> anyhow::Result<HashMap<String, i32>> {
    let p = ins_pileup(bam, chr, position)?;
    let mut counts = HashMap::new();
    for item in p.iter() {
        *counts.entry(item.to_string()).or_insert(0) += 1;
    }
    Ok(counts)
}

pub fn sample_random_positions(chromosomes: &Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
    let mut rng = rng();
    let total_length: u64 = chromosomes.iter().map(|(_, length)| length).sum();

    (0..n)
        .map(|_| {
            let random_position = rng.random_range(0..total_length);
            let mut cumulative_length = 0;

            for (chr, length) in chromosomes {
                cumulative_length += length;
                if random_position < cumulative_length {
                    let position_within_chr = random_position - (cumulative_length - length);
                    return (chr.clone(), position_within_chr);
                }
            }

            unreachable!("Should always find a chromosome")
        })
        .collect()
}

/// High-level mapping statistics for a WGS BAM file.
///
/// This struct aggregates essential quality control and mapping metrics, suitable for reporting or downstream QC.
/// Unless otherwise specified, units are in base pairs (bp) or counts of reads.
///
/// # Fields
/// - `all_records`: Total number of records in the BAM, before any filtering.
/// - `n_reads`: Number of primary, mapped, QC-passed, non-duplicate reads counted for statistics.
/// - `mapped_fraction`: Fraction of mapped (counted) reads relative to all records.
/// - `n_unmapped`: Number of unmapped reads (`BAM_FUNMAP`).
/// - `n_duplicates`: Number of reads marked as duplicate (`BAM_FDUP`).
/// - `mapped_yield`: Total sum of mapped read lengths (bp).
/// - `mean_read_length`: Mean length of mapped reads (bp).
/// - `median_read_length`: Median mapped read length (bp).
/// - `coverage_stddev`: Standard deviation of per-read global coverage contribution.
/// - `global_coverage`: Mean mapped yield divided by total genome size.
/// - `karyotype`: Per-contig summary: (TID, contig length, contig name, mapped yield, coverage stddev).
/// - `n50`: N50 statistic for mapped read lengths (bp).
/// - `by_lengths`: Histogram of mapped read lengths as (length, count), sorted by length.
///
/// # Filtering Rule
/// Only records that are:
/// - primary alignments (not secondary/supplementary),
/// - mapped (`!BAM_FUNMAP`),
/// - pass vendor quality checks (`!BAM_FQCFAIL`),
/// - not marked as duplicate (`!BAM_FDUP`),
/// - and with mapping quality ≥ `bam_min_mapq`,
/// are counted as mapped reads.
///
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct WGSBamStats {
    /// Total number of BAM records (including unmapped, filtered, duplicates).
    pub all_records: u64,
    /// Number of mapped, primary, QC-passed, non-duplicate reads (final filtered count).
    pub n_reads: u64,
    /// Fraction of counted mapped reads over all BAM records.
    pub mapped_fraction: f64,
    /// Number of unmapped records (flag 0x4).
    pub n_unmapped: u64,
    /// Number of duplicate records (flag 0x400).
    pub n_duplicates: u64,
    /// Number of duplicate records (flag 0x400).
    pub n_lowq: u64,
    /// Total sum of mapped read lengths (bp).
    pub mapped_yield: u64,
    /// Mean mapped read length (bp).
    pub mean_read_length: f64,
    /// Median mapped read length (bp).
    pub median_read_length: u64,
    /// Standard deviation of per-read global coverage contributions.
    pub coverage_stddev: f64,
    /// Mean global coverage: `mapped_yield / genome_size`.
    pub global_coverage: f64,
    /// (tid, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev, coverage_ratio)
    pub karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)>,
    /// N50 value of mapped read lengths (bp).
    pub n50: u64,
    /// Histogram of mapped read lengths: (length, count), sorted by length.
    pub by_lengths: Vec<(u64, u32)>,
}

impl WGSBamStats {
    /// Computes `WGSBamStats` by scanning a BAM file and summarizing essential statistics.
    ///
    /// # Parameters
    /// - `case_id`: Sample/case identifier used to locate the BAM file via config.
    /// - `time`: Timestamp string for progress/logging.
    /// - `config`: Configuration struct. Must provide:
    ///     - `solo_bam(case_id, time)`: path to BAM file.
    ///     - `bam_min_mapq`: minimum mapping quality to include read.
    ///     - `bam_n_threads`: number of BAM decompression threads.
    ///
    /// # Filtering Rule
    /// Only primary, mapped, non-duplicate, QC-passed reads with sufficient mapping quality are included (see struct doc).
    ///
    /// # Returns
    /// On success, returns a fully populated `WGSBamStats` with all core and extended fields.
    /// Returns an error if the BAM cannot be parsed or genome sizes cannot be retrieved.
    pub fn new(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
        let bam_path = config.solo_bam(case_id, time);
        let bam_min_mapq = config.bam_min_mapq;
        let bam_n_threads = config.bam_n_threads;

        let mut bam = rust_htslib::bam::Reader::from_path(&bam_path)?;
        let h = bam.header().clone();
        let header = rust_htslib::bam::Header::from_template(&h);
        bam.set_threads(bam_n_threads as usize)?;

        info!("Parsing BAM file: {bam_path}");

        let mut all_records = 0u64;
        let mut n_reads = 0u64;
        let mut n_unmapped = 0u64;
        let mut n_duplicates = 0u64;
        let mut n_lowq = 0u64;
        let mut mapped_lengths = Vec::new();
        let mut lengths_by_tid: HashMap<i32, Vec<u64>> = HashMap::new();

        // Main scan loop: apply flag and MAPQ filters.
        for rec in bam.rc_records() {
            all_records += 1;
            let r = rec.with_context(|| "failed to parse BAM record")?;

            let flags = r.flags();
            // Count and exclude unmapped.
            if flags & (rust_htslib::htslib::BAM_FUNMAP as u16) != 0 {
                n_unmapped += 1;
                continue;
            }
            // Count and exclude duplicates.
            if flags & (rust_htslib::htslib::BAM_FDUP as u16) != 0 {
                n_duplicates += 1;
                continue;
            }
            // Filter by mapping quality.
            if r.mapq() < bam_min_mapq {
                n_lowq += 1;
                continue;
            }
            // Exclude secondary and QC-failed.
            let bad_flags = rust_htslib::htslib::BAM_FSECONDARY | rust_htslib::htslib::BAM_FQCFAIL;
            if (flags & (bad_flags as u16)) != 0 {
                continue;
            }

            n_reads += 1;
            let start = r.pos();
            let end = r.reference_end();
            let len = if end > start { (end - start) as u64 } else { 0 };
            mapped_lengths.push(len);
            lengths_by_tid.entry(r.tid()).or_default().push(len);

            if n_reads % 500_000 == 0 {
                info!("{case_id} {time}: processed {n_reads} mapped reads");
            }
        }

        let mapped_fraction = if all_records > 0 {
            n_reads as f64 / all_records as f64
        } else {
            0.0
        };

        // Compute mean, median, N50 (single sort).
        let mut lens = mapped_lengths.clone();
        lens.sort_unstable();

        let mean_read_length = if n_reads > 0 {
            mapped_lengths.iter().sum::<u64>() as f64 / n_reads as f64
        } else {
            0.0
        };
        let median_read_length = if lens.is_empty() {
            0
        } else {
            lens[lens.len() / 2]
        };
        let mapped_yield: u64 = mapped_lengths.iter().sum();
        // N50: minimal length L such that sum(lengths >= L) >= 50% yield.
        let mut cum = 0u64;
        let half = mapped_yield / 2;
        let n50 = lens
            .iter()
            .rev()
            .find_map(|&l| {
                cum += l;
                if cum >= half {
                    Some(l)
                } else {
                    None
                }
            })
            .unwrap_or(0);

        // Histogram (sorted by length).
        let mut histo = BTreeMap::new();
        for &len in &mapped_lengths {
            *histo.entry(len).or_insert(0) += 1;
        }
        let by_lengths: Vec<_> = histo.into_iter().collect();

        // Genome/coverage.
        let genome = get_genome_sizes(&header)?;
        let genome_size: u64 = genome.values().sum();
        let global_coverage = if genome_size > 0 {
            mapped_yield as f64 / genome_size as f64
        } else {
            0.0
        };

        // Global coverage stdev: stdev of per-read contributions to global coverage.
        let mut total_sq_cov = 0.0;
        for &len in &mapped_lengths {
            let cov = if genome_size > 0 {
                len as f64 / genome_size as f64
            } else {
                0.0
            };
            total_sq_cov += cov * cov;
        }
        let mean_cov = global_coverage;
        let variance = if n_reads > 0 {
            let var = total_sq_cov / n_reads as f64 - mean_cov.powi(2);
            if var < 0.0 {
                0.0
            } else {
                var
            }
        } else {
            0.0
        };
        let coverage_stddev = variance.sqrt();

        // Per-contig: (TID, contig_length, contig_name, mapped_yield, mean_coverage, coverage_stddev , coverage_ratio).
        let mut karyotype: Vec<(i32, u64, String, u64, f64, f64, f64)> = lengths_by_tid
            .iter()
            .map(|(tid, lengths)| {
                let contig = String::from_utf8(h.tid2name(*tid as u32).to_vec()).unwrap();
                let contig_len = genome.get(&contig).copied().unwrap_or(0);
                let mapped_sum: u64 = lengths.iter().sum();
                // Per-contig mean coverage
                let mean_cov = if contig_len > 0 {
                    mapped_sum as f64 / contig_len as f64
                } else {
                    0.0
                };
                let coverage_ratio = if global_coverage > 0.0 {
                    mean_cov / global_coverage
                } else {
                    0.0
                };

                // Per-contig coverage stdev (over all reads mapped to this contig)
                let var = if contig_len > 0 && !lengths.is_empty() {
                    lengths
                        .iter()
                        .map(|&l| {
                            let cov = l as f64 / contig_len as f64;
                            (cov - mean_cov).powi(2)
                        })
                        .sum::<f64>()
                        / lengths.len() as f64
                } else {
                    0.0
                };
                let stddev = var.sqrt();
                (
                    *tid,
                    contig_len,
                    contig,
                    mapped_sum,
                    mean_cov,
                    stddev,
                    coverage_ratio,
                )
            })
            .collect();

        karyotype.sort_unstable_by_key(|&(tid, _, _, _, _, _, _)| tid);

        Ok(Self {
            all_records,
            n_reads,
            mapped_fraction,
            n_unmapped,
            n_duplicates,
            mapped_yield,
            mean_read_length,
            median_read_length,
            coverage_stddev,
            global_coverage,
            karyotype,
            n50,
            by_lengths,
            n_lowq,
        })
    }
}

impl fmt::Display for WGSBamStats {
    /// Formats the BAM statistics as a readable summary table.
    ///
    /// Shows all global stats and a per-contig breakdown with coverage stdev.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "BAM statistics summary:")?;
        writeln!(f, "  All BAM records:         {}", self.all_records)?;
        writeln!(f, "  Unmapped reads:          {}", self.n_unmapped)?;
        writeln!(f, "  Duplicate reads:         {}", self.n_duplicates)?;
        writeln!(f, "  Low Q reads:         {}", self.n_lowq)?;
        writeln!(f, "  Counted (mapped) reads:  {}", self.n_reads)?;
        writeln!(f, "  Mapped fraction:         {:.4}", self.mapped_fraction)?;
        writeln!(
            f,
            "  Mapped yield [Gb]:       {:.2}",
            self.mapped_yield as f64 / 1e9
        )?;
        writeln!(f, "  Mean read length [bp]:   {:.2}", self.mean_read_length)?;
        writeln!(f, "  Median read length [bp]: {}", self.median_read_length)?;
        writeln!(f, "  N50 [bp]:                {}", self.n50)?;
        writeln!(f, "  Global mean coverage:    {:.2}", self.global_coverage)?;
        writeln!(f, "  Global coverage stdev:   {:.2}", self.coverage_stddev)?;
        writeln!(f, "  Per-contig stats:")?;
        writeln!(
            f,
            "    {:<8} {:<10} {:<12} {:<14} {:<12} {:<10} {:<10}",
            "TID", "Length", "Name", "MappedYield", "CovMean", "CovStdev", "CovRatio"
        )?;
        for (tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio) in
            &self.karyotype
        {
            writeln!(
                f,
                "    {:<8} {:<10} {:<12} {:<14} {:<12.4} {:.4} {:.2}",
                tid, contig_len, contig, mapped_sum, mean_cov, stddev, coverage_ratio
            )?;
        }

        Ok(())
    }
}

pub fn get_genome_sizes(header: &rust_htslib::bam::Header) -> anyhow::Result<HashMap<String, u64>> {
    let mut genome = HashMap::new();
    for (key, records) in header.to_hashmap() {
        for record in records {
            if key == "SQ" {
                genome.insert(
                    record["SN"].to_string(),
                    record["LN"]
                        .parse::<u64>()
                        .expect("Failed parsing length of chromosomes"),
                );
            }
        }
    }
    Ok(genome)
}

/// Result of querying a read at a reference position.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum PileBase {
    A,
    C,
    G,
    T,
    N,
    Ins,                 // insertion immediately after the queried base
    Del((Vec<u8>, u32)), // deletion covering the queried base
    Skip,                // reference skip (N)
}

/// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
#[inline]
fn decode(b: u8) -> PileBase {
    match b & 0b1111 {
        0 => PileBase::A,
        1 => PileBase::C,
        2 => PileBase::G,
        3 => PileBase::T,
        _ => PileBase::N,
    }
}

pub fn base_at_new(
    record: &rust_htslib::bam::Record,
    at_pos: i64,
    with_next_ins: bool,
) -> Option<PileBase> {
    if record.pos() > at_pos {
        return None;
    }

    let mut ref_pos = record.pos();
    let mut read_pos = 0_i64;
    let cigar = record.cigar();
    let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)

    for (i, op) in cigar.iter().enumerate() {
        let l = op.len() as i64;
        let (consume_read, consume_ref) = match op.char() {
            'M' | '=' | 'X' => (l, l),
            'I' | 'S' => (l, 0),
            'D' | 'N' => (0, l),
            'H' | 'P' => (0, 0),
            _ => unreachable!(),
        };

        /* look ahead for an insertion immediately AFTER the queried base */
        if with_next_ins
            && ref_pos + consume_ref == at_pos + 1
            && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
        {
            return Some(PileBase::Ins);
        }

        // Instead of PileBase::Ins, consider
        // extracting the inserted sequence and returning its length or actual bases:
        // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
        //     let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
        //         ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
        //     return Some(PileBase::Ins(ins_seq.to_vec()));
        // }

        /* Does the queried reference coordinate fall inside this CIGAR op? */
        if ref_pos + consume_ref > at_pos {
            return Some(match op {
                Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
                Cigar::RefSkip(_) => PileBase::Skip,
                _ => {
                    let offset = (at_pos - ref_pos) as usize;
                    if read_pos as usize + offset >= seq.len() {
                        return None; // out of range, malformed BAM
                    }
                    decode(seq[read_pos as usize + offset])
                }
            });
        }

        read_pos += consume_read;
        ref_pos += consume_ref;
    }

    None // beyond alignment
}

/// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
///
/// * `bam`           – an open [`rust_htslib::bam::IndexedReader`].  
/// * `chr` / `pos`   – reference contig name and coordinate.  
/// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
///                     **right after** the queried base.
///
/// The function bounds the internal pile‑up depth to 10 000 reads to protect
/// against malformed BAM files that could otherwise allocate unbounded memory.
///
/// # Errors
///
/// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
/// [`anyhow::Error`].
pub fn nt_pileup_new(
    bam: &mut rust_htslib::bam::IndexedReader,
    chr: &str,
    pos: u32,
    with_next_ins: bool,
) -> anyhow::Result<Vec<PileBase>> {
    // Storage for the per‑read observations.
    let mut pile = Vec::new();

    // Restrict BAM iterator to the one‑base span of interest.
    bam.fetch((chr, pos, pos + 1))?;

    // Hard cap on depth (adjust if your use‑case needs deeper coverage).
    bam.pileup().set_max_depth(10_000);

    // Stream the pileup; HTSlib walks the reads for us.
    for pileup in bam.pileup() {
        // Attach context to any low‑level error.
        let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;

        // Skip other positions quickly.
        if pileup.pos() != pos {
            continue;
        }

        for aln in pileup.alignments() {
            // Handle the three indel states reported by HTSlib.
            match aln.indel() {
                rust_htslib::bam::pileup::Indel::Ins(_) => {
                    pile.push(PileBase::Ins); // insertion *starts at* this base
                }
                rust_htslib::bam::pileup::Indel::Del(e) => {
                    let qname = aln.record().qname().to_vec();
                    pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
                }
                rust_htslib::bam::pileup::Indel::None => {
                    // Regular match/mismatch: delegate to `base_at`.
                    if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
                        pile.push(base);
                    }
                }
            }
        }
    }
    Ok(pile)
}