Thomas
/
pandora_lib_promethion


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
							use std::collections::HashMap;

use anyhow::Context;
use csv::ByteRecord;
use log::error;
use rust_htslib::bam::{HeaderView, IndexedReader, Read, Record, ext::BamRecordExtensions, record::Aux};

use crate::io::{bam::{fb_inv_from_record, primary_record, primary_records}, tsv::{parse_csv_u32_into, parse_u32}};

/// A genomic bin containing reads from a specific region.
///
/// This struct represents a segment of a BAM file, storing reads that overlap
/// a specific genomic region. It provides methods to analyze read distributions
/// and extract primary alignments for supplementary alignments.
#[derive(Debug)]
pub struct Bin {
    /// The contig/chromosome name
    pub contig: String,
    /// The 0-based inclusive start position
    pub start: u32,
    /// The 0-based inclusive end position
    pub end: u32,
    /// Map of read query names to their BAM records
    pub reads_store: HashMap<Vec<u8>, Record>,
    /// Indexed BAM reader for fetching additional records
    // pub bam_reader: IndexedReader,
    /// Average read length within this bin
    // pub reads_mean_len: u32,
    /// Number of reads filtered due to low mapping quality
    pub low_qualities: Vec<u32>,
    /// Vec of depths
    pub depths: Vec<u32>,
    pub fb_invs: Vec<u32>,
}

impl Bin {
    /// Creates a new genomic bin from a BAM file.
    ///
    /// # Arguments
    ///
    /// * `bam_path` - Path to the BAM file
    /// * `contig` - Chromosome/contig name
    /// * `start` - 0-based inclusive start position
    /// * `length` - Length of the region to extract
    ///
    /// # Returns
    ///
    /// A Result containing the new Bin if successful, or an error if the BAM file
    /// couldn't be read or the region couldn't be fetched.
    ///
    /// # Examples
    ///
    /// ```
    /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
    /// println!("Loaded {} reads", bin.n_reads());
    /// ```
    pub fn new(
        bam_reader: &mut IndexedReader,
        contig: &str,
        start: u32,
        length: u32,
        min_mapq: u8,
    ) -> anyhow::Result<Self> {
        anyhow::ensure!(length > 0, "bin length must be > 0");
        let end = start + length - 1;

        bam_reader
            .fetch((contig, start as i64, end as i64))
            .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
        let header = bam_reader.header().clone();

        let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
        let mut depths = vec![0u32; length as usize]; // Optional pseudo-depth
        let mut low_qualities = vec![0u32; length as usize]; // Optional pseudo-depth
        let mut fb_invs = vec![0u32; length as usize];

        for record_result in bam_reader.rc_records() {
            let rc_record = match record_result {
                Ok(rc) => rc,
                Err(e) => {
                    error!(
                        "Failed to parse record in {}:{}-{}: {}",
                        contig, start, end, e
                    );
                    continue;
                }
            };

            let record = rc_record.as_ref();

            if let Some((read_start, read_end)) = read_range(record) {
                if read_end < start || read_start > end {
                    continue; // Not overlapping this bin
                }

                let has_fbinv =
                    fb_inv_from_record(record, &header, min_mapq, Some(150), Some(200)).is_some();

                // Clone the underlying Record
                reads_store.insert(record.qname().to_vec(), record.clone());

                // Optional: depth accumulation
                let local_start = start.max(read_start);
                let local_end = end.min(read_end);
                for pos in local_start..=local_end {
                    let i = (pos - start) as usize;
                    if i < depths.len() {
                        depths[i] += 1;
                        if record.mapq() < min_mapq {
                            low_qualities[i] += 1;
                        }
                        if has_fbinv {
                            fb_invs[i] += 1;
                        }
                    }
                }
            }
        }

        Ok(Bin {
            contig: contig.to_string(),
            start,
            end,
            reads_store,
            low_qualities,
            depths,
            fb_invs,
        })
    }
    /// Returns the total number of reads in this bin.
    pub fn n_reads(&self) -> usize {
        self.reads_store.len()
    }

    /// Returns the number of reads with SA (supplementary alignment) tags.
    pub fn n_sa(&self) -> usize {
        self.reads_store
            .values()
            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
            .count()
    }

    /// Retrieves primary alignments for all reads with SA tags in this bin.
    ///
    /// # Returns
    ///
    /// A vector of primary alignment records for reads with supplementary alignments.
    pub fn sa_primary(&mut self, bam_reader: &mut IndexedReader) -> Vec<Record> {
        self.reads_store
            .values()
            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
            .map(|record| primary_record(bam_reader, record.clone()))
            .collect()
    }

    /// Finds the position with the highest concentration of read starts or ends.
    ///
    /// # Returns
    ///
    /// A tuple containing (position, count) of the position with the most read starts or ends.
    pub fn max_start_or_end(&self) -> (u32, usize) {
        let mut position_counts: HashMap<u32, usize> = HashMap::new();

        for record in self.reads_store.values() {
            let reference_start = record.reference_start() as u32;
            let reference_end = record.reference_end() as u32;

            // Count start positions within the bin
            if reference_start >= self.start && reference_start <= self.end {
                *position_counts.entry(reference_start).or_default() += 1;
            }

            // Count end positions within the bin
            if reference_end >= self.start && reference_end <= self.end {
                *position_counts.entry(reference_end).or_default() += 1;
            }
        }

        position_counts
            .into_iter()
            .max_by_key(|(_, count)| *count)
            .unwrap_or((0, 0))
    }

    /// Computes the number of reads with SA tags, and finds the highest
    /// counts of read starts and ends at the same position.
    ///
    /// # Returns
    ///
    /// A tuple containing:
    /// - Number of reads with SA tags
    /// - Highest count of reads starting at any single position
    /// - Highest count of reads ending at any single position
    pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
        let mut n_sa = 0;
        let mut start_counts: HashMap<u32, usize> = HashMap::new();
        let mut end_counts: HashMap<u32, usize> = HashMap::new();

        for record in self.reads_store.values() {
            // Count reads with SA tags
            if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
                n_sa += 1;
            }

            let reference_start = record.reference_start() as u32;
            let reference_end = record.reference_end() as u32;

            // Count start positions within the bin
            if reference_start >= self.start && reference_start <= self.end {
                *start_counts.entry(reference_start).or_default() += 1;
            }

            // Count end positions within the bin
            if reference_end >= self.start && reference_end <= self.end {
                *end_counts.entry(reference_end).or_default() += 1;
            }
        }

        let max_start_count = start_counts.values().max().copied().unwrap_or(0);
        let max_end_count = end_counts.values().max().copied().unwrap_or(0);

        (n_sa, max_start_count, max_end_count)
    }

    /// Finds the position with the highest concentration of read starts.
    ///
    /// # Returns
    ///
    /// A tuple containing (position, count) of the position with the most read starts.
    pub fn max_start(&self) -> (u32, usize) {
        let mut start_counts: HashMap<u32, usize> = HashMap::new();

        for record in self.reads_store.values() {
            let reference_start = record.reference_start() as u32;

            if reference_start >= self.start && reference_start <= self.end {
                *start_counts.entry(reference_start).or_default() += 1;
            }
        }

        start_counts
            .into_iter()
            .max_by_key(|(_, count)| *count)
            .unwrap_or((0, 0))
    }

    /// Finds the position with the highest concentration of read ends.
    ///
    /// # Returns
    ///
    /// A tuple containing (position, count) of the position with the most read ends.
    pub fn max_end(&self) -> (u32, usize) {
        let mut end_counts: HashMap<u32, usize> = HashMap::new();

        for record in self.reads_store.values() {
            let reference_end = record.reference_end() as u32;

            if reference_end >= self.start && reference_end <= self.end {
                *end_counts.entry(reference_end).or_default() += 1;
            }
        }

        end_counts
            .into_iter()
            .max_by_key(|(_, count)| *count)
            .unwrap_or((0, 0))
    }

    /// Gets primary alignments for reads that start or end at a specific position.
    ///
    /// # Arguments
    ///
    /// * `pos` - The position to check for read starts or ends
    ///
    /// # Returns
    ///
    /// A vector of primary alignment records.
    pub fn se_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
        let records: Vec<Record> = self
            .reads_store
            .values()
            .filter(|record| {
                record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
            })
            .cloned()
            .collect();

        primary_records(bam_reader, records)
    }

    /// Gets primary alignments for reads that start at a specific position.
    ///
    /// # Arguments
    ///
    /// * `pos` - The position to check for read starts
    ///
    /// # Returns
    ///
    /// A vector of primary alignment records.
    pub fn s_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
        self.reads_store
            .values()
            .filter(|record| record.reference_start() as u32 == pos)
            .map(|record| primary_record(bam_reader, record.clone()))
            .collect()
    }

    /// Gets primary alignments for reads that end at a specific position.
    ///
    /// # Arguments
    ///
    /// * `pos` - The position to check for read ends
    ///
    /// # Returns
    ///
    /// A vector of primary alignment records.
    pub fn e_primary(&mut self, pos: u32, bam_reader: &mut IndexedReader) -> Vec<Record> {
        self.reads_store
            .values()
            .filter(|record| record.reference_end() as u32 == pos)
            .map(|record| primary_record(bam_reader, record.clone()))
            .collect()
    }

    /// Calculates the mean coverage by considering the actual
    /// span of each read in the bin.
    ///
    /// # Returns
    ///
    /// The mean bin coverage as a floating-point value.
    pub fn mean_coverage(&self) -> f64 {
        // If bin has no length or no reads, return 0
        if self.end <= self.start || self.reads_store.is_empty() {
            return 0.0;
        }

        // Calculate the total length of the bin
        let bin_length = (self.end - self.start + 1) as f64;

        // Calculate the total bases covered by all reads within the bin
        let mut total_bases_covered = 0.0;
        for record in self.reads_store.values() {
            let read_start = record.reference_start() as u32;
            let read_end = record.reference_end() as u32;

            // Skip reads that don't overlap with our bin
            if read_end < self.start || read_start > self.end {
                continue;
            }

            // Calculate the overlapping region
            let overlap_start = read_start.max(self.start);
            let overlap_end = read_end.min(self.end);

            // Add the number of bases this read covers within our bin
            total_bases_covered += (overlap_end - overlap_start + 1) as f64;
        }

        // Divide by the bin length to get average coverage
        total_bases_covered / bin_length
    }

    pub fn mean_coverage_from_depths(&self) -> f64 {
        if self.depths.is_empty() {
            return 0.0;
        }
        self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
    }
}

/// Helper: get start and end position of a read
fn read_range(record: &Record) -> Option<(u32, u32)> {
    let start = record.pos();
    let end = record.cigar().end_pos();
    if start >= 0 && end > start {
        Some((start as u32, end as u32 - 1))
    } else {
        None
    }
}

/// Reused per-reader parse buffers
#[derive(Default)]
pub struct BinRowBuf {
    pub depths: Vec<u32>,
    pub lowq: Vec<u32>,
}

/// Example: parse one TSV record into buffers.
/// Returns (start, depths_slice, lowq_slice)
pub fn parse_bin_record_into<'a>(
    rec: &'a ByteRecord,
    buf: &'a mut BinRowBuf,
    contig_expected: &str,
) -> anyhow::Result<(u32, &'a [u32], &'a [u32])> {
    let i_contig = 0usize;
    let i_start  = 1usize;
    let i_end    = 2usize;

    // adjust if your file has more/less scalar columns, but for your pasted line:
    let i_depths = 9usize;   // big CSV list
    let i_lowq   = 10usize;  // big CSV list

    let contig = std::str::from_utf8(rec.get(i_contig).context("missing contig")?)
        .context("non-utf8 contig")?;
    anyhow::ensure!(contig == contig_expected, "unexpected contig");

    let start = parse_u32(rec.get(i_start).context("missing start")?)?;
    let end = parse_u32(rec.get(i_end).context("missing end")?)?;
    anyhow::ensure!(end >= start, "invalid bin coordinates: end < start ({start} > {end})");

    parse_csv_u32_into(&mut buf.depths, rec.get(i_depths).context("missing depths")?)
        .context("parse depths")?;
    parse_csv_u32_into(&mut buf.lowq, rec.get(i_lowq).context("missing lowq")?)
        .context("parse lowq")?;

    // critical sanity check: end-start+1 should match vector length for per-base bins
    anyhow::ensure!(
        (end - start + 1) as usize == buf.depths.len(),
        "bin width mismatch: {}..{} has width {}, depths has len {}",
        start, end, end - start + 1, buf.depths.len()
    );
    anyhow::ensure!(buf.depths.len() == buf.lowq.len(), "depth/lowq len mismatch");

    Ok((start, &buf.depths, &buf.lowq))
}

#[derive(Debug)]
pub struct BinStats {
    pub contig: String,
    pub start: u32,
    pub end: u32,

    pub n_reads: u32,
    pub n_sa: u32,
    pub max_start_count: u32,
    pub max_end_count: u32,

    pub depths: Vec<u32>,
    pub low_qualities: Vec<u32>,
    // keep fb_invs too if you want
    pub fb_invs: Vec<u32>,
}

impl BinStats {
    #[inline]
    pub fn mean_coverage_from_depths(&self) -> f64 {
        if self.depths.is_empty() { return 0.0; }
        self.depths.iter().sum::<u32>() as f64 / self.depths.len() as f64
    }
}

impl BinStats {
    pub fn from_bam_region(
        bam_reader: &mut IndexedReader,
        header: &HeaderView,
        contig: &str,
        start: u32,
        length: u32,
        min_mapq: u8,
    ) -> anyhow::Result<Self> {
        anyhow::ensure!(length > 0, "bin length must be > 0");
        let end = start + length - 1;

        bam_reader
            .fetch((contig, start as i64, end as i64))
            .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;

        let mut depths = vec![0u32; length as usize];
        let mut low_qualities = vec![0u32; length as usize];
        let mut fb_invs = vec![0u32; length as usize];

        let mut n_reads: u32 = 0;
        let mut n_sa: u32 = 0;

        let mut start_counts: HashMap<u32, u32> = HashMap::new();
        let mut end_counts: HashMap<u32, u32> = HashMap::new();

        for record_result in bam_reader.rc_records() {
            let rc_record = match record_result {
                Ok(rc) => rc,
                Err(e) => {
                    error!(
                        "Failed to parse record in {}:{}-{}: {}",
                        contig, start, end, e
                    );
                    continue;
                }
            };
            let record: &Record = rc_record.as_ref();

            // read genomic span (inclusive)
            let rs_i64 = record.pos();
            let re_i64 = record.cigar().end_pos(); // exclusive
            if rs_i64 < 0 || re_i64 <= rs_i64 {
                continue;
            }
            let read_start = rs_i64 as u32;
            let read_end = (re_i64 as u32).saturating_sub(1);

            if read_end < start || read_start > end {
                continue;
            }

            // counts
            n_reads += 1;

            if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
                n_sa += 1;
            }

            let aln_start = record.reference_start() as u32;
            let aln_end = record.reference_end() as u32;
            if (start..=end).contains(&aln_start) {
                *start_counts.entry(aln_start).or_insert(0) += 1;
            }
            if (start..=end).contains(&aln_end) {
                *end_counts.entry(aln_end).or_insert(0) += 1;
            }

            // per-record flags (hoisted)
            let is_lowq = record.mapq() < min_mapq;
            let has_fbinv =
                fb_inv_from_record(record, header, min_mapq, Some(150), Some(200)).is_some();

            // per-base accumulation (same semantics as your original)
            let local_start = start.max(read_start);
            let local_end = end.min(read_end);

            for pos in local_start..=local_end {
                let i = (pos - start) as usize;
                depths[i] += 1;
                if is_lowq {
                    low_qualities[i] += 1;
                }
                if has_fbinv {
                    fb_invs[i] += 1;
                }
            }
        }

        let max_start_count = start_counts.values().copied().max().unwrap_or(0);
        let max_end_count = end_counts.values().copied().max().unwrap_or(0);

        Ok(Self {
            contig: contig.to_string(),
            start,
            end,
            n_reads,
            n_sa,
            max_start_count,
            max_end_count,
            depths,
            low_qualities,
            fb_invs,
        })
    }
}