|
@@ -9,6 +9,7 @@ use anyhow::Context;
|
|
|
use bgzip::{BGZFReader, BGZFWriter};
|
|
use bgzip::{BGZFReader, BGZFWriter};
|
|
|
use bitcode::{Decode, Encode};
|
|
use bitcode::{Decode, Encode};
|
|
|
use csv::ReaderBuilder;
|
|
use csv::ReaderBuilder;
|
|
|
|
|
+use dashmap::DashMap;
|
|
|
use log::{debug, error, info, warn};
|
|
use log::{debug, error, info, warn};
|
|
|
use pandora_lib_assembler::assembler::calculate_shannon_entropy;
|
|
use pandora_lib_assembler::assembler::calculate_shannon_entropy;
|
|
|
use rayon::prelude::*;
|
|
use rayon::prelude::*;
|
|
@@ -31,7 +32,9 @@ use crate::{
|
|
|
bam::{counts_at, counts_ins_at},
|
|
bam::{counts_at, counts_ins_at},
|
|
|
vcf::Vcf,
|
|
vcf::Vcf,
|
|
|
},
|
|
},
|
|
|
- helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
|
|
|
|
|
|
|
+ helpers::{
|
|
|
|
|
+ app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, temp_file_path, Hash128, Repeat,
|
|
|
|
|
+ },
|
|
|
io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
|
|
io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header, writers::get_gz_writer},
|
|
|
positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
|
|
positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
|
|
|
variant::variant::VariantId,
|
|
variant::variant::VariantId,
|
|
@@ -316,6 +319,56 @@ impl VariantCollection {
|
|
|
});
|
|
});
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+    /// Removes repeat-stretch deletions that collide at the same genomic position.
+    ///
+    /// A variant is a candidate when `deletion_seq()` returns a sequence that
+    /// `detect_repetition` classifies as `Repeat::RepOne` or `Repeat::RepTwo`.
+    /// Candidates are grouped under the key `"contig:position"`. When two or
+    /// more candidates share a key, every one of them is removed; a candidate
+    /// that is alone at its position is kept, as is every non-candidate variant.
+    ///
+    /// The relative order of the remaining variants is preserved.
+    ///
+    /// # Returns
+    ///
+    /// The number of variants removed from the collection.
+    pub fn remove_strech(&mut self) -> usize {
+        // 1) Concurrently collect indices of RepOne/RepTwo deletions keyed by "contig:pos"
+        let deletions_to_rm: DashMap<String, Vec<usize>> = DashMap::new();
+        self.variants.par_iter().enumerate().for_each(|(i, v)| {
+            if let Some(del_seq) = v.deletion_seq() {
+                if matches!(
+                    detect_repetition(&del_seq),
+                    Repeat::RepOne(_, _) | Repeat::RepTwo(_, _)
+                ) {
+                    let key = format!("{}:{}", v.position.contig, v.position.position);
+                    deletions_to_rm.entry(key).or_default().value_mut().push(i);
+                }
+            }
+        });
+
+        // 2) Flatten the indices of every position holding more than one candidate.
+        //    The map is not needed afterwards, so it is consumed instead of cloning
+        //    each per-position Vec.
+        let to_remove: HashSet<usize> = deletions_to_rm
+            .into_iter()
+            .filter(|(_, idxs)| idxs.len() > 1)
+            .flat_map(|(_, idxs)| idxs)
+            .collect();
+
+        // Nothing collided: leave the variants untouched.
+        if to_remove.is_empty() {
+            return 0;
+        }
+
+        // 3) Drop the flagged variants in place. `retain` visits each element exactly
+        //    once, in original order, so a running counter reproduces the indices
+        //    recorded in step 1.
+        //    NOTE(review): assumes `self.variants` is a `Vec` — confirm against the struct.
+        let mut idx = 0usize;
+        self.variants.retain(|_| {
+            let keep = !to_remove.contains(&idx);
+            idx += 1;
+            keep
+        });
+
+        to_remove.len()
+    }
|
|
|
/// Annotates variants with information from a constitutional BAM file.
|
|
/// Annotates variants with information from a constitutional BAM file.
|
|
|
///
|
|
///
|
|
|
/// This function processes variants in parallel chunks and adds annotations
|
|
/// This function processes variants in parallel chunks and adds annotations
|
|
@@ -350,8 +403,9 @@ impl VariantCollection {
|
|
|
constit_bam_path: &str,
|
|
constit_bam_path: &str,
|
|
|
max_threads: u8,
|
|
max_threads: u8,
|
|
|
) -> anyhow::Result<()> {
|
|
) -> anyhow::Result<()> {
|
|
|
- fn folder<'a>(alt_seq: &'a str) -> impl Fn((i32, i32), (String, i32)) -> (i32, i32) + 'a {
|
|
|
|
|
|
|
+ fn folder<'a>(alt_seq: &'a str) -> impl Fn((u32, u32), (String, i32)) -> (u32, u32) + 'a {
|
|
|
move |(depth_acc, alt_acc), (seq, n): (String, i32)| {
|
|
move |(depth_acc, alt_acc), (seq, n): (String, i32)| {
|
|
|
|
|
+ let n = n as u32;
|
|
|
if seq == alt_seq {
|
|
if seq == alt_seq {
|
|
|
(depth_acc + n, alt_acc + n)
|
|
(depth_acc + n, alt_acc + n)
|
|
|
} else {
|
|
} else {
|
|
@@ -361,7 +415,7 @@ impl VariantCollection {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
fn match_repeats(
|
|
fn match_repeats(
|
|
|
- v: &Vec<(String, i32)>,
|
|
|
|
|
|
|
+ v: &[(String, i32)],
|
|
|
nt: char,
|
|
nt: char,
|
|
|
n: usize,
|
|
n: usize,
|
|
|
e: usize,
|
|
e: usize,
|
|
@@ -390,6 +444,11 @@ impl VariantCollection {
|
|
|
let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
|
|
let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
|
|
|
.map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
|
|
.map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
|
|
|
|
|
|
|
|
|
|
+ let c = crate::config::Config::default();
|
|
|
|
|
+
|
|
|
|
|
+ let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default()
|
|
|
|
|
+ .build_from_path(c.reference)?;
|
|
|
|
|
+
|
|
|
for var in chunk {
|
|
for var in chunk {
|
|
|
let key = var.hash();
|
|
let key = var.hash();
|
|
|
let mut anns = annotations.store.entry(key).or_default();
|
|
let mut anns = annotations.store.entry(key).or_default();
|
|
@@ -419,40 +478,79 @@ impl VariantCollection {
|
|
|
}
|
|
}
|
|
|
AlterationCategory::DEL => {
|
|
AlterationCategory::DEL => {
|
|
|
if let Some(del_repr) = var.deletion_desc() {
|
|
if let Some(del_repr) = var.deletion_desc() {
|
|
|
- let pileup_start = counts_at(
|
|
|
|
|
|
|
+ let len = var.deletion_len().unwrap_or_default();
|
|
|
|
|
+
|
|
|
|
|
+ let pileup_start = crate::collection::bam::nt_pileup_new(
|
|
|
&mut bam,
|
|
&mut bam,
|
|
|
&var.position.contig(),
|
|
&var.position.contig(),
|
|
|
del_repr.start.saturating_sub(1),
|
|
del_repr.start.saturating_sub(1),
|
|
|
|
|
+ false,
|
|
|
)?;
|
|
)?;
|
|
|
- let (start_depth, start_alt) =
|
|
|
|
|
- pileup_start.into_iter().fold((0, 0), folder("D"));
|
|
|
|
|
|
|
|
|
|
- let pileup_end = counts_at(
|
|
|
|
|
|
|
+ let pileup_end = crate::collection::bam::nt_pileup_new(
|
|
|
&mut bam,
|
|
&mut bam,
|
|
|
&var.position.contig(),
|
|
&var.position.contig(),
|
|
|
del_repr.end.saturating_sub(1),
|
|
del_repr.end.saturating_sub(1),
|
|
|
|
|
+ false,
|
|
|
)?;
|
|
)?;
|
|
|
- let (end_depth, end_alt) =
|
|
|
|
|
- pileup_end.into_iter().fold((0, 0), folder("D"));
|
|
|
|
|
|
|
|
|
|
- // outside start (one base upstream)
|
|
|
|
|
- let pileup_out_start = counts_at(
|
|
|
|
|
- &mut bam,
|
|
|
|
|
- &var.position.contig(),
|
|
|
|
|
- del_repr.start.saturating_sub(2),
|
|
|
|
|
- )?;
|
|
|
|
|
- let (_out_start_depth, out_start_alt) =
|
|
|
|
|
- pileup_out_start.into_iter().fold((0, 0), folder("D"));
|
|
|
|
|
-
|
|
|
|
|
- // outside end (one base downstream)
|
|
|
|
|
- let pileup_out_end =
|
|
|
|
|
- counts_at(&mut bam, &var.position.contig(), del_repr.end)?;
|
|
|
|
|
- let (_out_end_depth, out_end_alt) =
|
|
|
|
|
- pileup_out_end.into_iter().fold((0, 0), folder("D"));
|
|
|
|
|
-
|
|
|
|
|
- let depth = start_depth.min(end_depth);
|
|
|
|
|
- let alt = start_alt.min(end_alt).saturating_sub(out_start_alt.max(out_end_alt));
|
|
|
|
|
- // debug!("{} {alt} / {depth}", var.variant_id());
|
|
|
|
|
|
|
+ let tol = if len > 1 {
|
|
|
|
|
+ let seq = crate::io::fasta::sequence_range(
|
|
|
|
|
+ &mut fasta_reader,
|
|
|
|
|
+ &var.position.contig(),
|
|
|
|
|
+ del_repr.start as usize - 1,
|
|
|
|
|
+ del_repr.end as usize - 1,
|
|
|
|
|
+ )?;
|
|
|
|
|
+
|
|
|
|
|
+ match detect_repetition(&seq) {
|
|
|
|
|
+ Repeat::None => 0,
|
|
|
|
|
+ Repeat::RepOne(_, _) => 3,
|
|
|
|
|
+ Repeat::RepTwo(_, _) => 3,
|
|
|
|
|
+ }
|
|
|
|
|
+ } else {
|
|
|
|
|
+ 0
|
|
|
|
|
+ };
|
|
|
|
|
+ // println!("TOL {tol}");
|
|
|
|
|
+
|
|
|
|
|
+ let end_qnames: Vec<Vec<u8>> = pileup_end
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ // .inspect(|e| {
|
|
|
|
|
+ // if let crate::collection::bam::PileBase::Del((_, l)) =
|
|
|
|
|
+ // e
|
|
|
|
|
+ // {
|
|
|
|
|
+ // println!("{l}");
|
|
|
|
|
+ // }
|
|
|
|
|
+ // })
|
|
|
|
|
+ .filter_map(|e| match e {
|
|
|
|
|
+ crate::collection::bam::PileBase::Del((qn, l))
|
|
|
|
|
+ if *l >= len.saturating_sub(tol).max(1)
|
|
|
|
|
+ && *l <= len + tol =>
|
|
|
|
|
+ {
|
|
|
|
|
+ Some(qn.to_vec())
|
|
|
|
|
+ }
|
|
|
|
|
+ _ => None,
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect();
|
|
|
|
|
+ // println!("ends {}", end_qnames.len());
|
|
|
|
|
+
|
|
|
|
|
+ let alt: u32 = pileup_start
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|pb| match pb {
|
|
|
|
|
+ crate::collection::bam::PileBase::Del((qn, l))
|
|
|
|
|
+ if /* end_qnames.contains(qn) */
|
|
|
|
|
+ *l >= len.saturating_sub(tol).max(1)
|
|
|
|
|
+ && *l <= len + tol =>
|
|
|
|
|
+ {
|
|
|
|
|
+ 1
|
|
|
|
|
+ }
|
|
|
|
|
+ _ => 0,
|
|
|
|
|
+ })
|
|
|
|
|
+ .sum();
|
|
|
|
|
+
|
|
|
|
|
+ let depth = pileup_start.len().min(pileup_end.len());
|
|
|
|
|
+
|
|
|
|
|
+ debug!("{} {alt} / {depth} {len}", var.variant_id());
|
|
|
|
|
+
|
|
|
anns.push(Annotation::ConstitAlt(alt as u16));
|
|
anns.push(Annotation::ConstitAlt(alt as u16));
|
|
|
anns.push(Annotation::ConstitDepth(depth as u16));
|
|
anns.push(Annotation::ConstitDepth(depth as u16));
|
|
|
}
|
|
}
|
|
@@ -471,8 +569,8 @@ impl VariantCollection {
|
|
|
// If stretch of same nt consider eq +/- 3 nt
|
|
// If stretch of same nt consider eq +/- 3 nt
|
|
|
let pv = pileup.clone().into_iter().collect::<Vec<_>>();
|
|
let pv = pileup.clone().into_iter().collect::<Vec<_>>();
|
|
|
let res = match_repeats(&pv, repeated, n, 3);
|
|
let res = match_repeats(&pv, repeated, n, 3);
|
|
|
- let depth = pileup.values().sum();
|
|
|
|
|
- let alt = res.iter().map(|(_, n)| n).sum();
|
|
|
|
|
|
|
+ let depth = pileup.values().map(|e| *e as u32).sum();
|
|
|
|
|
+ let alt = res.iter().map(|(_, n)| *n as u32).sum();
|
|
|
(depth, alt)
|
|
(depth, alt)
|
|
|
}
|
|
}
|
|
|
_ => pileup.into_iter().fold((0, 0), folder(&alt_seq)),
|
|
_ => pileup.into_iter().fold((0, 0), folder(&alt_seq)),
|