variant_collection.rs 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. use std::collections::HashSet;
  2. use rayon::prelude::*;
  3. use super::variant::{AlterationCategory, VcfVariant};
  4. use crate::{
  5. annotation::{Annotation, Annotations},
  6. collection::{bam::{counts_at, counts_ins_at}, vcf::Vcf},
  7. helpers::estimate_shannon_entropy,
  8. pipes::somatic::sequence_at,
  9. };
  10. #[derive(Debug, Clone)]
  11. pub struct VariantCollection {
  12. pub variants: Vec<VcfVariant>,
  13. pub vcf: Vcf,
  14. }
  15. impl VariantCollection {
  16. pub fn keys(&self) -> Vec<u128> {
  17. self.variants.iter().map(|v| v.hash_variant()).collect()
  18. }
  19. pub fn retain_keys(&mut self, keys_to_keep: &HashSet<u128>) {
  20. self.variants
  21. .retain(|v| keys_to_keep.contains(&v.hash_variant()));
  22. }
  23. pub fn chunk_size(&self, max_threads: u8) -> usize {
  24. let total_items = self.variants.len();
  25. let min_chunk_size = 1000;
  26. let max_chunks = max_threads;
  27. let optimal_chunk_size = total_items.div_ceil(max_chunks as usize);
  28. optimal_chunk_size.max(min_chunk_size)
  29. }
  30. pub fn annotate_with_sequence_entropy(
  31. &self,
  32. annotations: &Annotations,
  33. reference: &str,
  34. seq_len: usize,
  35. max_threads: u8,
  36. ) {
  37. self.variants
  38. .par_chunks(self.chunk_size(max_threads))
  39. .for_each(|chunk| {
  40. let mut fasta_reader = noodles_fasta::indexed_reader::Builder::default()
  41. .build_from_path(reference)
  42. .unwrap();
  43. for c in chunk {
  44. let key = c.hash_variant();
  45. let mut anns = annotations.store.entry(key).or_default();
  46. if !anns
  47. .iter()
  48. .any(|e| matches!(e, Annotation::ShannonEntropy(_)))
  49. {
  50. if let Ok(r) = sequence_at(
  51. &mut fasta_reader,
  52. &c.position.contig(),
  53. c.position.position as usize,
  54. seq_len,
  55. ) {
  56. let ent = estimate_shannon_entropy(r.as_str());
  57. anns.push(Annotation::ShannonEntropy(ent));
  58. }
  59. }
  60. }
  61. });
  62. }
  63. pub fn annotate_with_constit_bam(
  64. &self,
  65. annotations: &Annotations,
  66. constit_bam_path: &str,
  67. max_threads: u8,
  68. ) -> anyhow::Result<()> {
  69. self.variants
  70. .par_chunks(self.chunk_size(max_threads))
  71. .try_for_each(|chunk| {
  72. let mut bam = rust_htslib::bam::IndexedReader::from_path(constit_bam_path)
  73. .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
  74. for var in chunk {
  75. let key = var.hash_variant();
  76. let mut anns = annotations.store.entry(key).or_default();
  77. if anns
  78. .iter()
  79. .filter(|e| {
  80. matches!(e, Annotation::ConstitAlt(_) | Annotation::ConstitDepth(_))
  81. })
  82. .count()
  83. != 2
  84. {
  85. let mut n_alt = 0;
  86. let mut depth = 0;
  87. let mut alt_seq = var.alternative.to_string();
  88. let c = if var.alteration_category() == AlterationCategory::Ins {
  89. alt_seq = alt_seq.split_off(1);
  90. counts_ins_at(&mut bam, &var.position.contig(), var.position.position)?
  91. } else {
  92. counts_at(&mut bam, &var.position.contig(), var.position.position)?
  93. };
  94. for (seq, n) in c {
  95. if seq == alt_seq {
  96. n_alt = n;
  97. }
  98. depth += n;
  99. }
  100. anns.push(Annotation::ConstitAlt(n_alt as u16));
  101. anns.push(Annotation::ConstitDepth(depth as u16));
  102. }
  103. }
  104. anyhow::Ok(())
  105. })?;
  106. Ok(())
  107. }
  108. }