|
|
@@ -143,8 +143,8 @@ impl WGSBamStats {
|
|
|
info!("Saved stats to: {}", json_path.display());
|
|
|
|
|
|
let qnames_path = Self::qnames_path_from_bam(bam_path);
|
|
|
- qnames.save(qnames_path)?;
|
|
|
- info!("Saved qnames to: {}", json_path.display());
|
|
|
+ qnames.save(&qnames_path)?;
|
|
|
+ info!("Saved qnames to: {}", qnames_path.display());
|
|
|
Ok(stats)
|
|
|
} else {
|
|
|
info!("Loading cached stats from: {}", json_path.display());
|
|
|
@@ -155,19 +155,7 @@ impl WGSBamStats {
|
|
|
/// Derive JSON path from BAM path
|
|
|
/// e.g., /path/to/34528_norm_hs1.bam -> /path/to/34528_norm_bam_stats.json
|
|
|
fn json_path_from_bam(bam_path: &Path) -> PathBuf {
|
|
|
- let dir = bam_path.parent().unwrap_or(Path::new("."));
|
|
|
- let stem = bam_path
|
|
|
- .file_stem()
|
|
|
- .map(|s| s.to_string_lossy())
|
|
|
- .unwrap_or_default();
|
|
|
-
|
|
|
- // "34528_norm_hs1" -> "34528_norm"
|
|
|
- let parts: Vec<&str> = stem.split('_').collect();
|
|
|
- let prefix = if parts.len() >= 2 {
|
|
|
- format!("{}_{}", parts[0], parts[1])
|
|
|
- } else {
|
|
|
- stem.to_string()
|
|
|
- };
|
|
|
+ let (dir, prefix) = bam_prefix(bam_path);
|
|
|
|
|
|
dir.join(format!("{}_bam_stats.json", prefix))
|
|
|
}
|
|
|
@@ -175,19 +163,7 @@ impl WGSBamStats {
|
|
|
/// Derive JSON path from BAM path
|
|
|
/// e.g., /path/to/34528_norm_hs1.bam -> /path/to/34528_norm_bam_stats.json
|
|
|
pub fn qnames_path_from_bam(bam_path: &Path) -> PathBuf {
|
|
|
- let dir = bam_path.parent().unwrap_or(Path::new("."));
|
|
|
- let stem = bam_path
|
|
|
- .file_stem()
|
|
|
- .map(|s| s.to_string_lossy())
|
|
|
- .unwrap_or_default();
|
|
|
-
|
|
|
- // "34528_norm_hs1" -> "34528_norm"
|
|
|
- let parts: Vec<&str> = stem.split('_').collect();
|
|
|
- let prefix = if parts.len() >= 2 {
|
|
|
- format!("{}_{}", parts[0], parts[1])
|
|
|
- } else {
|
|
|
- stem.to_string()
|
|
|
- };
|
|
|
+ let (dir, prefix) = bam_prefix(bam_path);
|
|
|
|
|
|
dir.join(format!("{}_bam_qnames.bin", prefix))
|
|
|
}
|
|
|
@@ -197,8 +173,38 @@ impl WGSBamStats {
|
|
|
/// - BAM is newer than JSON
|
|
|
pub fn open(case_id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
|
|
|
let bam_path = PathBuf::from(config.solo_bam(case_id, time));
|
|
|
+ let json_path = Self::json_path(case_id, time, config);
|
|
|
+ Self::from_bam_with_cache(&bam_path, &json_path, config)
|
|
|
+ }
|
|
|
|
|
|
- Self::from_bam(&bam_path, config)
|
|
|
+ fn from_bam_with_cache(
|
|
|
+ bam_path: &Path,
|
|
|
+ json_path: &Path,
|
|
|
+ config: &Config,
|
|
|
+ ) -> anyhow::Result<Self> {
|
|
|
+ let should_recompute = if !json_path.exists() {
|
|
|
+ info!("No cached stats found, computing...");
|
|
|
+ true
|
|
|
+ } else if Self::is_bam_newer(bam_path, json_path)? {
|
|
|
+ info!("BAM is newer than cached stats, recomputing...");
|
|
|
+ true
|
|
|
+ } else {
|
|
|
+ false
|
|
|
+ };
|
|
|
+
|
|
|
+ if should_recompute {
|
|
|
+ let (stats, qnames) = Self::from_bam_path(bam_path, config)?;
|
|
|
+ if let Some(parent) = json_path.parent() {
|
|
|
+ fs::create_dir_all(parent)?;
|
|
|
+ }
|
|
|
+ stats.save_json(json_path)?;
|
|
|
+ let qnames_path = Self::qnames_path_from_bam(bam_path); // or case/time version
|
|
|
+ qnames.save(&qnames_path)?;
|
|
|
+ Ok(stats)
|
|
|
+ } else {
|
|
|
+ info!("Loading cached stats from: {}", json_path.display());
|
|
|
+ Self::load_json(json_path)
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/// Check if BAM file is newer than JSON cache
|
|
|
@@ -496,12 +502,29 @@ impl WGSBamStats {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/// Split a BAM path into its directory and a shortened file-name prefix.
///
/// The prefix is the first two `_`-separated fields of the file stem
/// (e.g. "34528_norm_hs1" -> "34528_norm"); a stem with fewer than two
/// fields is returned unchanged. When the path has no parent the directory
/// is ".", and when it has no file stem the prefix is empty.
fn bam_prefix(bam_path: &Path) -> (PathBuf, String) {
    let dir = bam_path
        .parent()
        .map_or_else(|| PathBuf::from("."), Path::to_path_buf);

    let stem = bam_path
        .file_stem()
        .map(|name| name.to_string_lossy())
        .unwrap_or_default();

    // Only the first two fields matter, so pull them lazily instead of
    // collecting every field.
    let mut fields = stem.split('_');
    let prefix = match (fields.next(), fields.next()) {
        (Some(first), Some(second)) => format!("{first}_{second}"),
        _ => stem.to_string(),
    };

    (dir, prefix)
}
|
|
|
+
|
|
|
pub fn median_from_hist(hist: &BTreeMap<u64, u64>, n_reads: u64) -> u64 {
|
|
|
if n_reads == 0 {
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
- let mid = n_reads / 2;
|
|
|
+ // 1-based median index
|
|
|
+ let mid = (n_reads + 1) / 2;
|
|
|
let mut cum = 0u64;
|
|
|
|
|
|
for (&len, &count) in hist {
|
|
|
@@ -573,7 +596,8 @@ pub fn fmt_histogram(
|
|
|
} else {
|
|
|
let min_len = data.first().map(|(l, _)| *l).unwrap_or(0);
|
|
|
let max_len = data.last().map(|(l, _)| *l).unwrap_or(0);
|
|
|
- let bin_size = ((max_len - min_len) / 20).max(LEN_BIN);
|
|
|
+ let span = max_len.saturating_sub(min_len);
|
|
|
+ let bin_size = (span / 20).max(LEN_BIN);
|
|
|
|
|
|
let mut bins: Vec<(u64, u64, u64)> = Vec::new();
|
|
|
let mut current_start = min_len;
|
|
|
@@ -623,12 +647,13 @@ pub fn fmt_histogram(
|
|
|
|
|
|
/// Shorten `s` to at most `max_len` characters, marking the cut with "...".
///
/// Length is measured in `char`s, so multi-byte text is never split inside
/// a code point. When `max_len` is 3 or less there is no room for the
/// ellipsis and the string is simply cut to `max_len` characters.
fn truncate_str(s: &str, max_len: usize) -> String {
    let fits = s.chars().count() <= max_len;
    if fits {
        return s.to_owned();
    }

    if max_len <= 3 {
        // Too narrow for an ellipsis: hard cut.
        return s.chars().take(max_len).collect();
    }

    // Reserve three characters for the trailing dots.
    let mut shortened: String = s.chars().take(max_len - 3).collect();
    shortened.push_str("...");
    shortened
}
|
|
|
|
|
|
@@ -790,14 +815,44 @@ impl fmt::Display for WGSBamStats {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+fn decode_uuid(qname: &str) -> Option<Vec<u8>> {
|
|
|
+ let clean = qname.replace('-', "");
|
|
|
+ if clean.len() != 32 {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+ hex::decode(&clean).ok()
|
|
|
+}
|
|
|
+
|
|
|
/// A set of read names (QNAMEs), each stored as a raw byte vector.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
|
pub struct QNameSet {
|
|
|
pub qnames: FxHashSet<Vec<u8>>, // 16 raw UUID bytes each
|
|
|
}
|
|
|
|
|
|
impl QNameSet {
|
|
|
+ pub fn qnames_path_from_bam(bam_path: &Path) -> PathBuf {
|
|
|
+ let dir = bam_path.parent().unwrap_or(Path::new("."));
|
|
|
+ let stem = bam_path
|
|
|
+ .file_stem()
|
|
|
+ .map(|s| s.to_string_lossy())
|
|
|
+ .unwrap_or_default();
|
|
|
+
|
|
|
+ // "34528_norm_hs1" -> "34528_norm"
|
|
|
+ let parts: Vec<&str> = stem.split('_').collect();
|
|
|
+ let prefix = if parts.len() >= 2 {
|
|
|
+ format!("{}_{}", parts[0], parts[1])
|
|
|
+ } else {
|
|
|
+ stem.to_string()
|
|
|
+ };
|
|
|
+
|
|
|
+ dir.join(format!("{}_bam_qnames.bin", prefix))
|
|
|
+ }
|
|
|
+
|
|
|
/// Load QNAMEs from a BAM file into memory (no stats, no disk write)
|
|
|
- pub fn from_bam_in_memory(bam_path: impl AsRef<Path>, bam_min_mapq: u8, n_threads: usize) -> anyhow::Result<Self> {
|
|
|
+ pub fn from_bam_in_memory(
|
|
|
+ bam_path: impl AsRef<Path>,
|
|
|
+ bam_min_mapq: u8,
|
|
|
+ n_threads: usize,
|
|
|
+ ) -> anyhow::Result<Self> {
|
|
|
let mut bam = rust_htslib::bam::Reader::from_path(bam_path.as_ref())?;
|
|
|
rust_htslib::bam::Read::set_threads(&mut bam, n_threads)?;
|
|
|
let mut qs = Self::default();
|
|
|
@@ -805,26 +860,84 @@ impl QNameSet {
|
|
|
for rec in rust_htslib::bam::Read::rc_records(&mut bam) {
|
|
|
let r = rec?;
|
|
|
let flags = r.flags();
|
|
|
- if flags & SKIP_FLAGS != 0 { continue; }
|
|
|
- if flags & DUP_FLAG != 0 { continue; }
|
|
|
- if r.mapq() < bam_min_mapq { continue; }
|
|
|
+ if flags & SKIP_FLAGS != 0 {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if flags & DUP_FLAG != 0 {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if r.mapq() < bam_min_mapq {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
qs.add_qname_bytes(r.qname());
|
|
|
}
|
|
|
Ok(qs)
|
|
|
}
|
|
|
|
|
|
- /// Add a new QNAME (hex or dashed UUID string), store as 16 bytes
|
|
|
- pub fn add(&mut self, qname: &str) -> bool {
|
|
|
- let clean = qname.replace('-', "");
|
|
|
- if clean.len() != 32 {
|
|
|
- return false;
|
|
|
+ /// Load from cache if present and up-to-date; otherwise build from BAM and save.
|
|
|
+ pub fn load_or_create(
|
|
|
+ bam_path: impl AsRef<Path>,
|
|
|
+ bam_min_mapq: u8,
|
|
|
+ n_threads: usize,
|
|
|
+ ) -> anyhow::Result<Self> {
|
|
|
+ use std::fs;
|
|
|
+
|
|
|
+ // Normalize bam_path to a PathBuf so we can reuse it
|
|
|
+ let bam_path = bam_path.as_ref().to_path_buf();
|
|
|
+ let cache = Self::qnames_path_from_bam(&bam_path);
|
|
|
+
|
|
|
+ // Remove cache if older than BAM
|
|
|
+ if cache.exists() {
|
|
|
+ let cb = fs::metadata(&cache)?;
|
|
|
+ let ba = fs::metadata(&bam_path)?;
|
|
|
+ let tb = |m: &fs::Metadata| m.modified().ok();
|
|
|
+
|
|
|
+ if let (Some(tcb), Some(tba)) = (tb(&cb), tb(&ba)) {
|
|
|
+ if tcb < tba {
|
|
|
+ fs::remove_file(&cache)?;
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
- match hex::decode(&clean) {
|
|
|
- Ok(bytes) => self.qnames.insert(bytes),
|
|
|
- Err(_) => false,
|
|
|
+
|
|
|
+ // If cache exists → load it
|
|
|
+ if cache.exists() {
|
|
|
+ return Self::load(&cache);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Otherwise → build from BAM
|
|
|
+ let qs = Self::from_bam_in_memory(&bam_path, bam_min_mapq, n_threads)?;
|
|
|
+
|
|
|
+ // Save atomic: write to tmp then rename
|
|
|
+ if let Some(parent) = cache.parent() {
|
|
|
+ fs::create_dir_all(parent)?;
|
|
|
+ }
|
|
|
+
|
|
|
+ let tmp = cache.with_extension("tmp");
|
|
|
+ qs.clone().save(&tmp)?;
|
|
|
+ fs::rename(&tmp, &cache)?;
|
|
|
+
|
|
|
+ Ok(qs)
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Merge `other` into `self`. Keep existing and insert only new 16-byte QNAMEs.
|
|
|
+ pub fn merge(&mut self, other: &Self) {
|
|
|
+ for q in &other.qnames {
|
|
|
+ self.qnames.insert(q.clone());
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /// Merge two sets into a new one, returning the result
|
|
|
+ pub fn merged(&self, other: &Self) -> Self {
|
|
|
+ let mut out = self.clone();
|
|
|
+ out.merge(other);
|
|
|
+ out
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Add a new QNAME (hex or dashed UUID string), store as 16 bytes
|
|
|
+ pub fn add(&mut self, qname: &str) -> bool {
|
|
|
+ decode_uuid(qname).is_some_and(|bytes| self.qnames.insert(bytes))
|
|
|
+ }
|
|
|
+
|
|
|
/// Add QNAME directly from BAM record (`&[u8]`), expect ASCII UUID
|
|
|
pub fn add_qname_bytes(&mut self, qname: &[u8]) -> bool {
|
|
|
match std::str::from_utf8(qname) {
|
|
|
@@ -834,7 +947,7 @@ impl QNameSet {
|
|
|
}
|
|
|
|
|
|
/// Save raw bytes (binary format), 16 bytes per QNAME
|
|
|
- pub fn save(self, path: impl AsRef<Path>) -> anyhow::Result<()> {
|
|
|
+ pub fn save(&self, path: impl AsRef<Path>) -> anyhow::Result<()> {
|
|
|
let mut f = fs::File::create(path.as_ref()).context("write QNameSet")?;
|
|
|
for q in &self.qnames {
|
|
|
f.write_all(q)?;
|
|
|
@@ -856,12 +969,9 @@ impl QNameSet {
|
|
|
}
|
|
|
|
|
|
pub fn exists(&self, qname: &str) -> bool {
|
|
|
- let clean = qname.replace('-', "");
|
|
|
- if let Ok(bytes) = hex::decode(&clean) {
|
|
|
- self.qnames.contains(&bytes)
|
|
|
- } else {
|
|
|
- false
|
|
|
- }
|
|
|
+ decode_uuid(qname)
|
|
|
+ .map(|bytes| self.qnames.contains(&bytes))
|
|
|
+ .unwrap_or(false)
|
|
|
}
|
|
|
|
|
|
pub fn len(&self) -> usize {
|