| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
- use std::collections::HashSet;
- use rayon::prelude::*;
- use super::variant::{AlterationCategory, VcfVariant};
- use crate::{
- annotation::{Annotation, Annotations},
- collection::{bam::{counts_at, counts_ins_at}, vcf::Vcf},
- helpers::estimate_shannon_entropy,
- pipes::somatic::sequence_at,
- };
/// A collection of variants parsed from a VCF, kept together with the
/// source [`Vcf`] they were read from.
#[derive(Debug, Clone)]
pub struct VariantCollection {
    /// The variants under analysis; annotated in parallel by the
    /// `annotate_with_*` methods on this type.
    pub variants: Vec<VcfVariant>,
    /// The VCF these variants came from.
    // NOTE(review): presumably retained for header/metadata access by
    // callers — not used within this file; confirm against call sites.
    pub vcf: Vcf,
}
- impl VariantCollection {
- pub fn keys(&self) -> Vec<u128> {
- self.variants.iter().map(|v| v.hash_variant()).collect()
- }
- pub fn retain_keys(&mut self, keys_to_keep: &HashSet<u128>) {
- self.variants
- .retain(|v| keys_to_keep.contains(&v.hash_variant()));
- }
- pub fn chunk_size(&self, max_threads: u8) -> usize {
- let total_items = self.variants.len();
- let min_chunk_size = 1000;
- let max_chunks = max_threads;
- let optimal_chunk_size = total_items.div_ceil(max_chunks as usize);
- optimal_chunk_size.max(min_chunk_size)
- }
- pub fn annotate_with_sequence_entropy(
- &self,
- annotations: &Annotations,
- reference: &str,
- seq_len: usize,
- max_threads: u8,
- ) {
- self.variants
- .par_chunks(self.chunk_size(max_threads))
- .for_each(|chunk| {
- let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default()
- .build_from_path(reference)
- .unwrap();
- for c in chunk {
- let key = c.hash_variant();
- let mut anns = annotations.store.entry(key).or_default();
- if !anns
- .iter()
- .any(|e| matches!(e, Annotation::ShannonEntropy(_)))
- {
- if let Ok(r) = sequence_at(
- &mut fasta_reader,
- &c.position.contig(),
- c.position.position as usize,
- seq_len,
- ) {
- let ent = estimate_shannon_entropy(r.as_str());
- anns.push(Annotation::ShannonEntropy(ent));
- }
- }
- }
- });
- }
- pub fn annotate_with_constit_bam(
- &self,
- annotations: &Annotations,
- constit_bam_path: &str,
- max_threads: u8,
- ) -> anyhow::Result<()> {
- self.variants
- .par_chunks(self.chunk_size(max_threads))
- .try_for_each(|chunk| {
- let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
- .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
- for var in chunk {
- let key = var.hash_variant();
- let mut anns = annotations.store.entry(key).or_default();
- if anns
- .iter()
- .filter(|e| {
- matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
- })
- .count()
- != 2
- {
- let mut n_alt = 0;
- let mut depth = 0;
- let mut alt_seq = var.alternative.to_string();
- let c = if var.alteration_category() == AlterationCategory::Ins {
- alt_seq = alt_seq.split_off(1);
- counts_ins_at(&mut bam, &var.position.contig(), var.position.position)?
- } else {
- counts_at(&mut bam, &var.position.contig(), var.position.position)?
- };
- for (seq, n) in c {
- if seq == alt_seq {
- n_alt = n;
- }
- depth += n;
- }
- anns.push(Annotation::ConstitAlt(n_alt as u16));
- anns.push(Annotation::ConstitDepth(depth as u16));
- }
- }
- anyhow::Ok(())
- })?;
- Ok(())
- }
- }
|