|
|
@@ -0,0 +1,134 @@
|
|
|
+use std::{
|
|
|
+ collections::{BinaryHeap, HashMap},
|
|
|
+ fs::File,
|
|
|
+ io::BufReader,
|
|
|
+};
|
|
|
+
|
|
|
+use log::info;
|
|
|
+use noodles_gff as gff;
|
|
|
+
|
|
|
+use crate::{
|
|
|
+ utils::chi_square_test_for_proportions,
|
|
|
+ variants::{ReferenceAlternative, Variants},
|
|
|
+};
|
|
|
+
|
|
|
+#[derive(Debug, Default, Clone)]
|
|
|
+struct Gene {
|
|
|
+ name: String,
|
|
|
+ exons: Vec<(usize, usize)>,
|
|
|
+}
|
|
|
+
|
|
|
+pub fn find_monoallelics(const_variants: &Variants, rna_bam: &str) -> anyhow::Result<()> {
|
|
|
+ info!("{} variants to check", const_variants.len());
|
|
|
+ let min_rna_depth = 6;
|
|
|
+ let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1.gff3";
|
|
|
+
|
|
|
+ let mut reader = File::open(gff)
|
|
|
+ .map(BufReader::new)
|
|
|
+ .map(gff::io::Reader::new)?;
|
|
|
+
|
|
|
+ let mut genes_by_contig: HashMap<String, Vec<Gene>> = HashMap::new();
|
|
|
+ let mut curr_gene = Gene::default();
|
|
|
+
|
|
|
+ for result in reader.records() {
|
|
|
+ let record = result?;
|
|
|
+
|
|
|
+ if record.ty() == "exon" {
|
|
|
+ if let Some(gene) = record.attributes().get("gene") {
|
|
|
+ let gene_name = gene.to_string();
|
|
|
+ let contig = record.reference_sequence_name().to_string();
|
|
|
+ if curr_gene.name != gene_name {
|
|
|
+ genes_by_contig.entry(contig).or_default().push(curr_gene);
|
|
|
+ curr_gene = Gene::default();
|
|
|
+ curr_gene.name = gene_name;
|
|
|
+ }
|
|
|
+
|
|
|
+ curr_gene
|
|
|
+ .exons
|
|
|
+ .push((record.start().into(), record.end().into()));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut curr_contig = String::default();
|
|
|
+ let mut genes_iter = std::slice::Iter::default();
|
|
|
+ let mut results = Vec::new();
|
|
|
+ for variant in const_variants.data.iter() {
|
|
|
+ if !matches!(variant.alternative, ReferenceAlternative::Nucleotide(_)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ if variant.contig != curr_contig {
|
|
|
+ if let Some(contigs) = genes_by_contig.get(&variant.contig) {
|
|
|
+ curr_contig = variant.contig.clone();
|
|
|
+ genes_iter = contigs.iter();
|
|
|
+ } else {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ while let Some(gene) = genes_iter.clone().next() {
|
|
|
+ let mut overlapped = false;
|
|
|
+ for (exon_index, &(start, end)) in gene.exons.iter().enumerate() {
|
|
|
+ if variant.position < start as u32 {
|
|
|
+ break; // No more overlaps possible for this gene
|
|
|
+ }
|
|
|
+ if variant.position <= end as u32 {
|
|
|
+ results.push((
|
|
|
+ variant.clone(),
|
|
|
+ gene.name.clone(),
|
|
|
+ format!("exon_{}", exon_index + 1),
|
|
|
+ ));
|
|
|
+ overlapped = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if overlapped {
|
|
|
+ genes_iter.next(); // Move to next gene only if we found an overlap
|
|
|
+ } else if gene
|
|
|
+ .exons
|
|
|
+ .last()
|
|
|
+ .map_or(true, |&(_, end)| variant.position > end as u32)
|
|
|
+ {
|
|
|
+ genes_iter.next(); // Move to next gene if variant is past all exons
|
|
|
+ } else {
|
|
|
+ break; // No overlap, but variant might overlap with next gene
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ let mut bam = rust_htslib::bam::IndexedReader::from_path(rna_bam)?;
|
|
|
+ for (variant, gene_name, exon) in results {
|
|
|
+ // let mut variant = variant;
|
|
|
+ if let (
|
|
|
+ ReferenceAlternative::Nucleotide(ref_base),
|
|
|
+ ReferenceAlternative::Nucleotide(alt_base),
|
|
|
+ ) = (&variant.reference, &variant.alternative)
|
|
|
+ {
|
|
|
+ let rna_bases = pandora_lib_pileup::qnames_at_base(
|
|
|
+ &mut bam,
|
|
|
+ &variant.contig,
|
|
|
+ variant.position as i32,
|
|
|
+ false,
|
|
|
+ 50,
|
|
|
+ )?;
|
|
|
+ if rna_bases.len() < min_rna_depth {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ let ref_base = ref_base.clone().into_u8();
|
|
|
+ let alt_base = alt_base.clone().into_u8();
|
|
|
+
|
|
|
+ let n_rna_ref = rna_bases.iter().filter(|(_, b)| *b == ref_base).count();
|
|
|
+ let n_rna_alt = rna_bases.iter().filter(|(_, b)| *b == alt_base).count();
|
|
|
+
|
|
|
+ if n_rna_ref == 0 || n_rna_alt == 0 {
|
|
|
+ println!(
|
|
|
+ "{gene_name} exon_{exon}: {}:{}",
|
|
|
+ variant.contig, variant.position
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+}
|