use std::collections::HashSet; use rayon::prelude::*; use super::variant::{AlterationCategory, VcfVariant}; use crate::{ annotation::{Annotation, Annotations}, collection::{bam::{counts_at, counts_ins_at}, vcf::Vcf}, helpers::estimate_shannon_entropy, pipes::somatic::sequence_at, }; #[derive(Debug, Clone)] pub struct VariantCollection { pub variants: Vec, pub vcf: Vcf, } impl VariantCollection { pub fn keys(&self) -> Vec { self.variants.iter().map(|v| v.hash_variant()).collect() } pub fn retain_keys(&mut self, keys_to_keep: &HashSet) { self.variants .retain(|v| keys_to_keep.contains(&v.hash_variant())); } pub fn chunk_size(&self, max_threads: u8) -> usize { let total_items = self.variants.len(); let min_chunk_size = 1000; let max_chunks = max_threads; let optimal_chunk_size = total_items.div_ceil(max_chunks as usize); optimal_chunk_size.max(min_chunk_size) } pub fn annotate_with_sequence_entropy( &self, annotations: &Annotations, reference: &str, seq_len: usize, max_threads: u8, ) { self.variants .par_chunks(self.chunk_size(max_threads)) .for_each(|chunk| { let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default() .build_from_path(reference) .unwrap(); for c in chunk { let key = c.hash_variant(); let mut anns = annotations.store.entry(key).or_default(); if !anns .iter() .any(|e| matches!(e, Annotation::ShannonEntropy(_))) { if let Ok(r) = sequence_at( &mut fasta_reader, &c.position.contig(), c.position.position as usize, seq_len, ) { let ent = estimate_shannon_entropy(r.as_str()); anns.push(Annotation::ShannonEntropy(ent)); } } } }); } pub fn annotate_with_constit_bam( &self, annotations: &Annotations, constit_bam_path: &str, max_threads: u8, ) -> anyhow::Result<()> { self.variants .par_chunks(self.chunk_size(max_threads)) .try_for_each(|chunk| { let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path) .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?; for var in chunk { let key = var.hash_variant(); let mut anns = annotations.store.entry(key).or_default(); if anns .iter() .filter(|e| { matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_)) }) .count() != 2 { let mut n_alt = 0; let mut depth = 0; let mut alt_seq = var.alternative.to_string(); let c = if var.alteration_category() == AlterationCategory::Ins { alt_seq = alt_seq.split_off(1); counts_ins_at(&mut bam, &var.position.contig(), var.position.position)? } else { counts_at(&mut bam, &var.position.contig(), var.position.position)? }; for (seq, n) in c { if seq == alt_seq { n_alt = n; } depth += n; } anns.push(Annotation::ConstitAlt(n_alt as u16)); anns.push(Annotation::ConstitDepth(depth as u16)); } } anyhow::Ok(()) })?; Ok(()) } }