|
|
@@ -1,4 +1,5 @@
|
|
|
use std::{
|
|
|
+ collections::HashMap,
|
|
|
fs::{self, File},
|
|
|
path::PathBuf,
|
|
|
str::FromStr,
|
|
|
@@ -7,14 +8,13 @@ use std::{
|
|
|
use anyhow::{anyhow, Context};
|
|
|
use chrono::{DateTime, Utc};
|
|
|
use glob::glob;
|
|
|
-use hashbrown::HashMap;
|
|
|
-use log::warn;
|
|
|
+use log::{debug, warn};
|
|
|
use pandora_lib_bindings::{
|
|
|
progs::cramino::{Cramino, CraminoRes},
|
|
|
utils::RunBin,
|
|
|
};
|
|
|
use rayon::prelude::*;
|
|
|
-use rust_htslib::bam::Read;
|
|
|
+use rust_htslib::bam::{record::Cigar, Read};
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
|
|
@@ -108,10 +108,10 @@ impl Bam {
|
|
|
None
|
|
|
};
|
|
|
|
|
|
- let composition =
|
|
|
- bam_compo(path.to_string_lossy().as_ref(), 20000).context(
|
|
|
- format!("Error while reading BAM composition for {}", path.display()),
|
|
|
- )?;
|
|
|
+ let composition = bam_compo(path.to_string_lossy().as_ref(), 20000).context(format!(
|
|
|
+ "Error while reading BAM composition for {}",
|
|
|
+ path.display()
|
|
|
+ ))?;
|
|
|
|
|
|
let s = Self {
|
|
|
path,
|
|
|
@@ -239,19 +239,19 @@ pub fn bam_compo(file_path: &str, sample_size: usize) -> anyhow::Result<Vec<(Str
|
|
|
pub fn nt_pileup(
|
|
|
bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
chr: &str,
|
|
|
- position: i32,
|
|
|
+ position: u32,
|
|
|
with_next_ins: bool,
|
|
|
) -> anyhow::Result<Vec<u8>> {
|
|
|
use rust_htslib::{bam, bam::Read};
|
|
|
let mut bases = Vec::new();
|
|
|
- bam.fetch((chr, position, position + 1))?;
|
|
|
+ bam.fetch((chr, position, position + 1))?;
|
|
|
let mut bam_pileup = Vec::new();
|
|
|
for p in bam.pileup() {
|
|
|
let pileup = p.context(format!(
|
|
|
"Can't pileup bam at position {}:{} (0-based)",
|
|
|
chr, position
|
|
|
))?;
|
|
|
- let cur_position = pileup.pos() as i32;
|
|
|
+ let cur_position = pileup.pos();
|
|
|
if cur_position == position {
|
|
|
for alignment in pileup.alignments() {
|
|
|
match alignment.indel() {
|
|
|
@@ -260,7 +260,7 @@ pub fn nt_pileup(
|
|
|
_ => {
|
|
|
let record = alignment.record();
|
|
|
if record.seq_len() > 0 {
|
|
|
- if let Some(b) = base_at(&record, position as u32, with_next_ins)? {
|
|
|
+ if let Some(b) = base_at(&record, position, with_next_ins)? {
|
|
|
bases.push(b);
|
|
|
}
|
|
|
} else if alignment.is_del() {
|
|
|
@@ -279,8 +279,6 @@ pub fn base_at(
|
|
|
at_pos: u32,
|
|
|
with_next_ins: bool,
|
|
|
) -> anyhow::Result<Option<u8>> {
|
|
|
- use rust_htslib::bam::record::Cigar;
|
|
|
-
|
|
|
let cigar = record.cigar();
|
|
|
let seq = record.seq();
|
|
|
let pos = cigar.pos() as u32;
|
|
|
@@ -327,23 +325,112 @@ pub fn base_at(
|
|
|
Ok(None)
|
|
|
}
|
|
|
|
|
|
-// thanks to chatGPT (the best)
|
|
|
-pub fn estimate_shannon_entropy(dna_sequence: &str) -> f64 {
|
|
|
- let m = dna_sequence.len() as f64;
|
|
|
+pub fn counts_at(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ position: u32,
|
|
|
+) -> anyhow::Result<HashMap<String, i32>> {
|
|
|
+ let p = nt_pileup(bam, chr, position, false)?
|
|
|
+ .iter()
|
|
|
+ .map(|e| String::from_utf8(vec![*e]).unwrap())
|
|
|
+ .collect::<Vec<_>>();
|
|
|
+ let mut counts = HashMap::new();
|
|
|
+ for item in p.iter() {
|
|
|
+ *counts.entry(item.to_string()).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+ Ok(counts)
|
|
|
+}
|
|
|
|
|
|
- // Count occurrences of each base
|
|
|
- let mut bases = HashMap::<char, usize>::new();
|
|
|
- for base in dna_sequence.chars() {
|
|
|
- *bases.entry(base).or_insert(0) += 1;
|
|
|
+pub fn ins_pileup(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ position: u32,
|
|
|
+) -> anyhow::Result<Vec<String>> {
|
|
|
+ let mut bases = Vec::new();
|
|
|
+ bam.fetch((chr, position, position + 10))?;
|
|
|
+ for p in bam.pileup() {
|
|
|
+ let pileup = p.context(format!(
|
|
|
+ "Can't pileup bam at position {}:{} (0-based)",
|
|
|
+ chr, position
|
|
|
+ ))?;
|
|
|
+ let cur_position = pileup.pos();
|
|
|
+ // Ins in the next position
|
|
|
+ if cur_position == position + 1 {
|
|
|
+ // debug!("{cur_position}");
|
|
|
+ for alignment in pileup.alignments() {
|
|
|
+ let record = alignment.record();
|
|
|
+ if record.seq_len() > 0 {
|
|
|
+ if let Some(b) = ins_at(&record, position)? {
|
|
|
+ bases.push(b);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
+ Ok(bases)
|
|
|
+}
|
|
|
+
|
|
|
+pub fn ins_at(
|
|
|
+ record: &rust_htslib::bam::record::Record,
|
|
|
+ at_pos: u32,
|
|
|
+) -> anyhow::Result<Option<String>> {
|
|
|
+ use rust_htslib::bam::record::Cigar;
|
|
|
+
|
|
|
+ let cigar = record.cigar();
|
|
|
+ let seq = record.seq();
|
|
|
+ let pos = cigar.pos() as u32;
|
|
|
|
|
|
- // Calculate Shannon entropy
|
|
|
- let mut shannon_entropy_value = 0.0;
|
|
|
- for &n_i in bases.values() {
|
|
|
- let p_i = n_i as f64 / m;
|
|
|
- shannon_entropy_value -= p_i * p_i.log2();
|
|
|
+ let mut read_i = 0u32;
|
|
|
+ let mut ref_pos = pos;
|
|
|
+ if ref_pos > at_pos {
|
|
|
+ return Ok(None);
|
|
|
}
|
|
|
+ // debug!(
|
|
|
+ // "read: {}",
|
|
|
+ // String::from_utf8(record.qname().to_vec()).unwrap()
|
|
|
+ // );
|
|
|
+
|
|
|
+ for (id, op) in cigar.iter().enumerate() {
|
|
|
+ let (add_read, add_ref) = match *op {
|
|
|
+ Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => (len, len),
|
|
|
+ Cigar::Ins(len) => (len, 0),
|
|
|
+ Cigar::Del(len) => (0, len),
|
|
|
+ Cigar::RefSkip(len) => (0, len),
|
|
|
+ Cigar::SoftClip(len) => (len, 0),
|
|
|
+ Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
|
|
|
+ };
|
|
|
|
|
|
- shannon_entropy_value
|
|
|
+ if ref_pos + add_read > at_pos && ref_pos + add_read < at_pos + 10 {
|
|
|
+ if let Cigar::Ins(v) = *op {
|
|
|
+ // debug!(
|
|
|
+ // "ins size {v} @ {} (corrected {})",
|
|
|
+ // ref_pos + add_read,
|
|
|
+ // (ref_pos + add_read) - v - 1
|
|
|
+ // );
|
|
|
+
|
|
|
+ if (ref_pos + add_read) - v - 1 == at_pos {
|
|
|
+ let inserted_seq =
|
|
|
+ seq.as_bytes()[read_i as usize..(read_i + v) as usize].to_vec();
|
|
|
+ return Ok(Some(String::from_utf8(inserted_seq)?));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ read_i += add_read;
|
|
|
+ ref_pos += add_ref;
|
|
|
+ }
|
|
|
+ Ok(None)
|
|
|
}
|
|
|
|
|
|
+pub fn counts_ins_at(
|
|
|
+ bam: &mut rust_htslib::bam::IndexedReader,
|
|
|
+ chr: &str,
|
|
|
+ position: u32,
|
|
|
+) -> anyhow::Result<HashMap<String, i32>> {
|
|
|
+ let p = ins_pileup(bam, chr, position)?;
|
|
|
+ let mut counts = HashMap::new();
|
|
|
+ for item in p.iter() {
|
|
|
+ *counts.entry(item.to_string()).or_insert(0) += 1;
|
|
|
+ }
|
|
|
+ Ok(counts)
|
|
|
+}
|