rna_seq.rs 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. use crate::variants::{ReferenceAlternative, Variants};
  2. use log::info;
  3. use noodles_gff as gff;
  4. use std::{collections::HashMap, fs::File, io::BufReader};
  5. #[derive(Debug, Default, Clone)]
  6. struct Gene {
  7. name: String,
  8. exons: Vec<(usize, usize)>,
  9. }
  10. pub fn find_monoallelics(
  11. const_variants: &Variants,
  12. rna_bam: &str,
  13. ) -> anyhow::Result<HashMap<String, Vec<(bool, String, String, u32)>>> {
  14. info!("{} variants to check", const_variants.len());
  15. let min_rna_depth = 6;
  16. let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1.gff3";
  17. let mut reader = File::open(gff)
  18. .map(BufReader::new)
  19. .map(gff::io::Reader::new)?;
  20. let mut genes_by_contig: HashMap<String, Vec<Gene>> = HashMap::new();
  21. let mut curr_gene = Gene::default();
  22. let mut contig = String::new();
  23. for result in reader.records() {
  24. let record = result?;
  25. contig = record.reference_sequence_name().to_string();
  26. if record.ty() == "exon" {
  27. if let Some(gene) = record.attributes().get("gene") {
  28. let gene_name = gene.to_string();
  29. if curr_gene.name != gene_name {
  30. genes_by_contig
  31. .entry(contig.clone())
  32. .or_default()
  33. .push(curr_gene);
  34. curr_gene = Gene::default();
  35. curr_gene.name = gene_name;
  36. }
  37. curr_gene
  38. .exons
  39. .push((record.start().into(), record.end().into()));
  40. }
  41. }
  42. }
  43. genes_by_contig.entry(contig).or_default().push(curr_gene);
  44. let mut curr_contig = String::default();
  45. let mut genes_iter = std::slice::Iter::default();
  46. let mut results = Vec::new();
  47. for variant in const_variants.data.iter() {
  48. if !matches!(variant.alternative, ReferenceAlternative::Nucleotide(_)) {
  49. continue;
  50. }
  51. if variant.contig != curr_contig {
  52. if let Some(contigs) = genes_by_contig.get(&variant.contig) {
  53. curr_contig = variant.contig.clone();
  54. genes_iter = contigs.iter();
  55. } else {
  56. continue;
  57. }
  58. }
  59. while let Some(gene) = genes_iter.clone().next() {
  60. let mut overlapped = false;
  61. for (exon_index, &(start, end)) in gene.exons.iter().enumerate() {
  62. if variant.position < start as u32 {
  63. break; // No more overlaps possible for this gene
  64. }
  65. if variant.position <= end as u32 {
  66. results.push((
  67. variant.clone(),
  68. gene.name.clone(),
  69. format!("exon_{}", exon_index + 1),
  70. ));
  71. overlapped = true;
  72. }
  73. }
  74. if overlapped {
  75. genes_iter.next(); // Move to next gene only if we found an overlap
  76. } else if gene
  77. .exons
  78. .last()
  79. .map_or(true, |&(_, end)| variant.position > end as u32)
  80. {
  81. genes_iter.next(); // Move to next gene if variant is past all exons
  82. } else {
  83. break; // No overlap, but variant might overlap with next gene
  84. }
  85. }
  86. }
  87. let mut bam = rust_htslib::bam::IndexedReader::from_path(rna_bam)?;
  88. let mut monoall: HashMap<String, Vec<(bool, String, String, u32)>> = HashMap::new();
  89. for (variant, gene_name, exon) in results {
  90. if let (
  91. ReferenceAlternative::Nucleotide(ref_base),
  92. ReferenceAlternative::Nucleotide(alt_base),
  93. ) = (&variant.reference, &variant.alternative)
  94. {
  95. let rna_bases = pandora_lib_pileup::qnames_at_base(
  96. &mut bam,
  97. &variant.contig,
  98. variant.position as i32,
  99. false,
  100. 50,
  101. )?;
  102. if rna_bases.len() < min_rna_depth {
  103. continue;
  104. }
  105. let ref_base = ref_base.clone().into_u8();
  106. let alt_base = alt_base.clone().into_u8();
  107. let n_rna_ref = rna_bases.iter().filter(|(_, b)| *b == ref_base).count();
  108. let n_rna_alt = rna_bases.iter().filter(|(_, b)| *b == alt_base).count();
  109. let is_monoal = n_rna_ref == 0 || n_rna_alt == 0;
  110. monoall.entry(gene_name).or_default().push((
  111. is_monoal,
  112. exon,
  113. variant.contig,
  114. variant.position,
  115. ));
  116. }
  117. }
  118. Ok(monoall)
  119. }