|
|
@@ -117,10 +117,9 @@ impl WGSBam {
|
|
|
// None
|
|
|
// };
|
|
|
|
|
|
- let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(format!(
|
|
|
- "Error while reading BAM composition for {}",
|
|
|
- path.display()
|
|
|
- ))?;
|
|
|
+ let composition = bam_composition(path.to_string_lossy().as_ref(), 20_000).context(
|
|
|
+ format!("Error while reading BAM composition for {}", path.display()),
|
|
|
+ )?;
|
|
|
|
|
|
let s = Self {
|
|
|
path,
|
|
|
@@ -285,14 +284,19 @@ pub fn bam_composition(
|
|
|
file_path: &str,
|
|
|
sample_size: usize,
|
|
|
) -> anyhow::Result<Vec<(String, String, f64)>> {
|
|
|
+ use rust_htslib::bam::{record::Aux, Reader};
|
|
|
use std::collections::HashMap;
|
|
|
- use rust_htslib::bam::{Reader, record::Aux};
|
|
|
|
|
|
let mut bam = Reader::from_path(file_path)?;
|
|
|
let mut rgs: HashMap<(String, String), u64> = HashMap::new();
|
|
|
|
|
|
let mut processed = 0u64;
|
|
|
- for (i, rec) in bam.records().filter_map(Result::ok).take(sample_size).enumerate() {
|
|
|
+ for (i, rec) in bam
|
|
|
+ .records()
|
|
|
+ .filter_map(Result::ok)
|
|
|
+ .take(sample_size)
|
|
|
+ .enumerate()
|
|
|
+ {
|
|
|
let rg = match rec.aux(b"RG") {
|
|
|
Ok(Aux::String(s)) => s,
|
|
|
Ok(_) => return Err(anyhow::anyhow!("Record {}: RG tag is not a string", i + 1)),
|
|
|
@@ -305,17 +309,14 @@ pub fn bam_composition(
|
|
|
};
|
|
|
let rg_prefix = rg.split_once('_').map(|(a, _)| a).unwrap_or(rg);
|
|
|
let fn_prefix = fn_tag.split_once('_').map(|(b, _)| b).unwrap_or(fn_tag);
|
|
|
- *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string())).or_default() += 1;
|
|
|
+ *rgs.entry((rg_prefix.to_string(), fn_prefix.to_string()))
|
|
|
+ .or_default() += 1;
|
|
|
processed += 1;
|
|
|
}
|
|
|
|
|
|
let results: Vec<_> = rgs
|
|
|
.into_iter()
|
|
|
- .map(|((rg, fn_tag), n)| (
|
|
|
- rg,
|
|
|
- fn_tag,
|
|
|
- n as f64 * 100.0 / processed as f64
|
|
|
- ))
|
|
|
+ .map(|((rg, fn_tag), n)| (rg, fn_tag, n as f64 * 100.0 / processed as f64))
|
|
|
.collect();
|
|
|
|
|
|
Ok(results)
|
|
|
@@ -979,15 +980,18 @@ pub enum PileBase {
|
|
|
G,
|
|
|
T,
|
|
|
N,
|
|
|
- Ins, // insertion immediately after the queried base
|
|
|
- Del((Vec<u8>, u32)), // deletion covering the queried base
|
|
|
- Skip, // reference skip (N)
|
|
|
+ /// insertion immediately after the queried base
|
|
|
+ Ins,
|
|
|
+ /// deletion covering the queried base: (qname, del_len)
|
|
|
+ Del((Vec<u8>, u32)),
|
|
|
+ /// reference skip (N)
|
|
|
+ Skip,
|
|
|
}
|
|
|
|
|
|
-/// Decode one HTSlib 4bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `Base`
|
|
|
+/// Decode one HTSlib 4-bit nucleotide (0=A,1=C,2=G,3=T,4=N, …) to `PileBase`
|
|
|
#[inline]
|
|
|
-fn decode(b: u8) -> PileBase {
|
|
|
- match b & 0b1111 {
|
|
|
+fn decode(nibble: u8) -> PileBase {
|
|
|
+ match nibble & 0x0f {
|
|
|
0 => PileBase::A,
|
|
|
1 => PileBase::C,
|
|
|
2 => PileBase::G,
|
|
|
@@ -996,31 +1000,24 @@ fn decode(b: u8) -> PileBase {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-pub fn base_at_new(
|
|
|
- record: &rust_htslib::bam::Record,
|
|
|
- at_pos: i64,
|
|
|
- with_next_ins: bool,
|
|
|
-) -> Option<PileBase> {
|
|
|
+pub fn base_at_new(record: &rust_htslib::bam::Record, at_pos: i64, with_next_ins: bool) -> Option<PileBase> {
|
|
|
if record.pos() > at_pos {
|
|
|
return None;
|
|
|
}
|
|
|
|
|
|
let mut ref_pos = record.pos();
|
|
|
- let mut read_pos = 0_i64;
|
|
|
+ let mut read_pos: i64 = 0;
|
|
|
let cigar = record.cigar();
|
|
|
- let seq = record.seq().as_bytes(); // already expanded to ASCII (A,C,G,T,N)
|
|
|
|
|
|
for (i, op) in cigar.iter().enumerate() {
|
|
|
- let l = op.len() as i64;
|
|
|
- let (consume_read, consume_ref) = match op.char() {
|
|
|
- 'M' | '=' | 'X' => (l, l),
|
|
|
- 'I' | 'S' => (l, 0),
|
|
|
- 'D' | 'N' => (0, l),
|
|
|
- 'H' | 'P' => (0, 0),
|
|
|
- _ => unreachable!(),
|
|
|
+ let (consume_read, consume_ref) = match *op {
|
|
|
+ Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (l as i64, l as i64),
|
|
|
+ Cigar::Ins(l) | Cigar::SoftClip(l) => (l as i64, 0),
|
|
|
+ Cigar::Del(l) | Cigar::RefSkip(l) => (0, l as i64),
|
|
|
+ Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
|
|
|
};
|
|
|
|
|
|
- /* look ahead for an insertion immediately AFTER the queried base */
|
|
|
+ // Look-ahead: insertion immediately AFTER the queried base?
|
|
|
if with_next_ins
|
|
|
&& ref_pos + consume_ref == at_pos + 1
|
|
|
&& matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
|
|
|
@@ -1028,25 +1025,21 @@ pub fn base_at_new(
|
|
|
return Some(PileBase::Ins);
|
|
|
}
|
|
|
|
|
|
- // Instead of PileBase::Ins, consider
|
|
|
- // extracting the inserted sequence and returning its length or actual bases:
|
|
|
- // if let Some(Cigar::Ins(len)) = cigar.get(i + 1) {
|
|
|
- // let ins_seq = &seq[read_pos as usize + (consume_ref as usize)
|
|
|
- // ..read_pos as usize + (consume_ref as usize) + (*len as usize)];
|
|
|
- // return Some(PileBase::Ins(ins_seq.to_vec()));
|
|
|
- // }
|
|
|
-
|
|
|
- /* Does the queried reference coordinate fall inside this CIGAR op? */
|
|
|
+ // Does the queried reference coordinate fall inside this CIGAR op?
|
|
|
if ref_pos + consume_ref > at_pos {
|
|
|
- return Some(match op {
|
|
|
- Cigar::Del(e) => PileBase::Del((record.qname().to_vec(), *e)),
|
|
|
+ return Some(match *op {
|
|
|
+ Cigar::Del(l) => PileBase::Del((record.qname().to_vec(), l)),
|
|
|
Cigar::RefSkip(_) => PileBase::Skip,
|
|
|
+ // Match/mismatch or soft-clipped covering (only match/mismatch should consume ref)
|
|
|
_ => {
|
|
|
let offset = (at_pos - ref_pos) as usize;
|
|
|
- if read_pos as usize + offset >= seq.len() {
|
|
|
- return None; // out of range, malformed BAM
|
|
|
+ // Guard against malformed records
|
|
|
+ let seq_len = record.seq_len();
|
|
|
+ if (read_pos as usize) + offset >= seq_len {
|
|
|
+ return None;
|
|
|
}
|
|
|
- decode(seq[read_pos as usize + offset])
|
|
|
+ let b4 = record.seq().encoded_base((read_pos as usize) + offset);
|
|
|
+ decode(b4)
|
|
|
}
|
|
|
});
|
|
|
}
|
|
|
@@ -1058,57 +1051,47 @@ pub fn base_at_new(
|
|
|
None // beyond alignment
|
|
|
}
|
|
|
|
|
|
-/// Return the pile‑up of **all reads** at `chr:pos` (0‑based, inclusive).
|
|
|
+/// Return the pile-up of **all reads** at `chr:pos` (0-based, inclusive).
|
|
|
///
|
|
|
-/// * `bam` – an open [`rust_htslib::bam::IndexedReader`].
|
|
|
-/// * `chr` / `pos` – reference contig name and coordinate.
|
|
|
-/// * `with_next_ins` – if `true`, report [`Base::Ins`] when an insertion starts
|
|
|
+/// * `bam` – an open `rust_htslib::bam::IndexedReader`.
|
|
|
+/// * `chr` / `pos` – reference contig name and coordinate.
|
|
|
+/// * `with_next_ins` – if `true`, report `PileBase::Ins` when an insertion starts
|
|
|
/// **right after** the queried base.
|
|
|
///
|
|
|
-/// The function bounds the internal pile‑up depth to 10 000 reads to protect
|
|
|
+/// The function bounds the internal pile-up depth to 10 000 reads to protect
|
|
|
/// against malformed BAM files that could otherwise allocate unbounded memory.
|
|
|
-///
|
|
|
-/// # Errors
|
|
|
-///
|
|
|
-/// Propagates any I/O or parsing error from *rust‑htslib*, wrapped in
|
|
|
-/// [`anyhow::Error`].
|
|
|
pub fn nt_pileup_new(
|
|
|
bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
chr: &str,
|
|
|
pos: u32,
|
|
|
with_next_ins: bool,
|
|
|
) -> anyhow::Result<Vec<PileBase>> {
|
|
|
- // Storage for the per‑read observations.
|
|
|
let mut pile = Vec::new();
|
|
|
|
|
|
- // Restrict BAM iterator to the one‑base span of interest.
|
|
|
+ // Restrict BAM iterator to the one-base span of interest.
|
|
|
bam.fetch((chr, pos, pos + 1))?;
|
|
|
|
|
|
- // Hard cap on depth (adjust if your use‑case needs deeper coverage).
|
|
|
- bam.pileup().set_max_depth(10_000);
|
|
|
+ // Apply the depth cap to the SAME iterator we traverse.
|
|
|
+ let mut pls = bam.pileup();
|
|
|
+ pls.set_max_depth(10_000);
|
|
|
|
|
|
- // Stream the pileup; HTSlib walks the reads for us.
|
|
|
- for pileup in bam.pileup() {
|
|
|
- // Attach context to any low‑level error.
|
|
|
+ for pileup in pls {
|
|
|
let pileup = pileup.context(format!("Failed to pileup BAM at {chr}:{pos}"))?;
|
|
|
-
|
|
|
- // Skip other positions quickly.
|
|
|
if pileup.pos() != pos {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
for aln in pileup.alignments() {
|
|
|
- // Handle the three indel states reported by HTSlib.
|
|
|
match aln.indel() {
|
|
|
rust_htslib::bam::pileup::Indel::Ins(_) => {
|
|
|
- pile.push(PileBase::Ins); // insertion *starts at* this base
|
|
|
+ // insertion *starts at* this reference position
|
|
|
+ pile.push(PileBase::Ins);
|
|
|
}
|
|
|
rust_htslib::bam::pileup::Indel::Del(e) => {
|
|
|
let qname = aln.record().qname().to_vec();
|
|
|
- pile.push(PileBase::Del((qname, e))); // deletion length > 1, but still covers here
|
|
|
+ pile.push(PileBase::Del((qname, e)));
|
|
|
}
|
|
|
rust_htslib::bam::pileup::Indel::None => {
|
|
|
- // Regular match/mismatch: delegate to `base_at`.
|
|
|
if let Some(base) = base_at_new(&aln.record(), pos as i64, with_next_ins) {
|
|
|
pile.push(base);
|
|
|
}
|