| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- use crate::variants::{ReferenceAlternative, Variants};
- use log::info;
- use noodles_gff as gff;
- use std::{collections::HashMap, fs::File, io::BufReader};
- #[derive(Debug, Default, Clone)]
- struct Gene {
- name: String,
- exons: Vec<(usize, usize)>,
- }
- pub fn find_monoallelics(
- const_variants: &Variants,
- rna_bam: &str,
- ) -> anyhow::Result<HashMap<String, Vec<(bool, String, String, u32)>>> {
- info!("{} variants to check", const_variants.len());
- let min_rna_depth = 6;
- let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1.gff3";
- let mut reader = File::open(gff)
- .map(BufReader::new)
- .map(gff::io::Reader::new)?;
- let mut genes_by_contig: HashMap<String, Vec<Gene>> = HashMap::new();
- let mut curr_gene = Gene::default();
- let mut contig = String::new();
- for result in reader.records() {
- let record = result?;
- contig = record.reference_sequence_name().to_string();
- if record.ty() == "exon" {
- if let Some(gene) = record.attributes().get("gene") {
- let gene_name = gene.to_string();
- if curr_gene.name != gene_name {
- genes_by_contig
- .entry(contig.clone())
- .or_default()
- .push(curr_gene);
- curr_gene = Gene::default();
- curr_gene.name = gene_name;
- }
- curr_gene
- .exons
- .push((record.start().into(), record.end().into()));
- }
- }
- }
- genes_by_contig.entry(contig).or_default().push(curr_gene);
- let mut curr_contig = String::default();
- let mut genes_iter = std::slice::Iter::default();
- let mut results = Vec::new();
- for variant in const_variants.data.iter() {
- if !matches!(variant.alternative, ReferenceAlternative::Nucleotide(_)) {
- continue;
- }
- if variant.contig != curr_contig {
- if let Some(contigs) = genes_by_contig.get(&variant.contig) {
- curr_contig = variant.contig.clone();
- genes_iter = contigs.iter();
- } else {
- continue;
- }
- }
- while let Some(gene) = genes_iter.clone().next() {
- let mut overlapped = false;
- for (exon_index, &(start, end)) in gene.exons.iter().enumerate() {
- if variant.position < start as u32 {
- break; // No more overlaps possible for this gene
- }
- if variant.position <= end as u32 {
- results.push((
- variant.clone(),
- gene.name.clone(),
- format!("exon_{}", exon_index + 1),
- ));
- overlapped = true;
- }
- }
- if overlapped {
- genes_iter.next(); // Move to next gene only if we found an overlap
- } else if gene
- .exons
- .last()
- .map_or(true, |&(_, end)| variant.position > end as u32)
- {
- genes_iter.next(); // Move to next gene if variant is past all exons
- } else {
- break; // No overlap, but variant might overlap with next gene
- }
- }
- }
- let mut bam = rust_htslib::bam::IndexedReader::from_path(rna_bam)?;
- let mut monoall: HashMap<String, Vec<(bool, String, String, u32)>> = HashMap::new();
- for (variant, gene_name, exon) in results {
- if let (
- ReferenceAlternative::Nucleotide(ref_base),
- ReferenceAlternative::Nucleotide(alt_base),
- ) = (&variant.reference, &variant.alternative)
- {
- let rna_bases = pandora_lib_pileup::qnames_at_base(
- &mut bam,
- &variant.contig,
- variant.position as i32,
- false,
- 50,
- )?;
- if rna_bases.len() < min_rna_depth {
- continue;
- }
- let ref_base = ref_base.clone().into_u8();
- let alt_base = alt_base.clone().into_u8();
- let n_rna_ref = rna_bases.iter().filter(|(_, b)| *b == ref_base).count();
- let n_rna_alt = rna_bases.iter().filter(|(_, b)| *b == alt_base).count();
- let is_monoal = n_rna_ref == 0 || n_rna_alt == 0;
- monoall.entry(gene_name).or_default().push((
- is_monoal,
- exon,
- variant.contig,
- variant.position,
- ));
- }
- }
- Ok(monoall)
- }
|