Thomas 1 year ago
parent
commit
1175378a00
4 changed files with 253 additions and 5 deletions
  1. 57 3
      src/lib.rs
  2. 134 0
      src/rna_seq.rs
  3. 6 2
      src/sql/variants_sql.rs
  4. 56 0
      test.txt_summary.html

+ 57 - 3
src/lib.rs

@@ -2,6 +2,7 @@ pub mod annotations;
 pub mod callers;
 pub mod config;
 pub mod in_out;
+pub mod rna_seq;
 pub mod sql;
 pub mod utils;
 pub mod variants;
@@ -16,14 +17,22 @@ mod tests {
         utils::count_repetitions,
         variants::{AnnotationType, Category, Variants},
     };
-    use anyhow::{Ok, Result};
+    use anyhow::{Context, Ok, Result};
     use indicatif::MultiProgress;
     use indicatif_log_bridge::LogWrapper;
     use log::info;
     use noodles_core::{Position, Region};
+    use rna_seq::find_monoallelics;
     use sql::variants_sql::write_annotaded;
     use variants::AllStats;
 
+    // export RUST_LOG="debug"
+    fn init() {
+        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+            .is_test(true)
+            .try_init();
+    }
+
     use super::*;
     #[test]
     fn get_config() -> Result<()> {
@@ -34,7 +43,7 @@ mod tests {
 
     #[test]
     fn load_from_vcf() -> Result<()> {
-        let name = "VEILLEUR";
+        let name = "COLLE";
 
         let logger =
             env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
@@ -49,7 +58,17 @@ mod tests {
 
     #[test]
     fn annot() -> anyhow::Result<()> {
-        let id = "VEILLEUR";
+        init();
+        let id = "ROBIN";
+        let stats = AllStats::load_json(&format!(
+            "/data/longreads_basic_pipe/{id}/diag/report/data/{id}_variants_stats.json"
+        ))
+        .context("Error while loading stats json")?;
+
+        stats
+            .generate_graph("/data/stats")
+            .context("Error while generating graph")?;
+
         write_annotaded(
             &format!("/data/longreads_basic_pipe/{id}/diag/{id}_variants.sqlite"),
             &format!("/data/longreads_basic_pipe/{id}/diag/report/data/{id}_annot_variants.json"),
@@ -224,4 +243,39 @@ mod tests {
         let config = PhaserConfig::new(id, "/data/longreads_basic_pipe", min_records, 0.35);
         phase::phase(config, multi)
     }
+
+    #[test]
+    fn find_monoal() -> anyhow::Result<()> {
+        let id = "CAMARA";
+        let constits = format!("/data/longreads_basic_pipe/{id}/diag/{id}_constit.bytes.gz");
+        let rna_bam = format!("/data/longreads_basic_pipe/{id}/diag/RNAseq/{id}_diag_RNA_hs1.bam");
+
+        let logger =
+            env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
+                .build();
+        let multi = MultiProgress::new();
+        LogWrapper::new(multi.clone(), logger).try_init().unwrap();
+
+        let mut variants = Variants::new_from_bytes(id, &constits, multi)?;
+
+        variants.data = variants
+            .data
+            .into_iter()
+            .filter_map(|mut v| {
+                if v.vaf() < 0.75 && v.vaf() > 0.25 && v.get_depth() > 6 {
+                    Some(v)
+                } else {
+                    None
+                }
+            })
+            .collect();
+        // variants.save_bytes("/data/test_var.bytes_chr1.gz")?;
+
+        find_monoallelics(
+            &variants,
+            &rna_bam,
+        )?;
+
+        Ok(())
+    }
 }

+ 134 - 0
src/rna_seq.rs

@@ -0,0 +1,134 @@
+use std::{
+    collections::{BinaryHeap, HashMap},
+    fs::File,
+    io::BufReader,
+};
+
+use log::info;
+use noodles_gff as gff;
+
+use crate::{
+    utils::chi_square_test_for_proportions,
+    variants::{ReferenceAlternative, Variants},
+};
+
+#[derive(Debug, Default, Clone)]
+struct Gene {
+    name: String,
+    exons: Vec<(usize, usize)>,
+}
+
+pub fn find_monoallelics(const_variants: &Variants, rna_bam: &str) -> anyhow::Result<()> {
+    info!("{} variants to check", const_variants.len());
+    let min_rna_depth = 6;
+    let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1.gff3";
+
+    let mut reader = File::open(gff)
+        .map(BufReader::new)
+        .map(gff::io::Reader::new)?;
+
+    let mut genes_by_contig: HashMap<String, Vec<Gene>> = HashMap::new();
+    let mut curr_gene = Gene::default();
+
+    for result in reader.records() {
+        let record = result?;
+
+        if record.ty() == "exon" {
+            if let Some(gene) = record.attributes().get("gene") {
+                let gene_name = gene.to_string();
+                let contig = record.reference_sequence_name().to_string();
+                if curr_gene.name != gene_name {
+                    genes_by_contig.entry(contig).or_default().push(curr_gene);
+                    curr_gene = Gene::default();
+                    curr_gene.name = gene_name;
+                }
+
+                curr_gene
+                    .exons
+                    .push((record.start().into(), record.end().into()));
+            }
+        }
+    }
+
+    let mut curr_contig = String::default();
+    let mut genes_iter = std::slice::Iter::default();
+    let mut results = Vec::new();
+    for variant in const_variants.data.iter() {
+        if !matches!(variant.alternative, ReferenceAlternative::Nucleotide(_)) {
+            continue;
+        }
+
+        if variant.contig != curr_contig {
+            if let Some(contigs) = genes_by_contig.get(&variant.contig) {
+                curr_contig = variant.contig.clone();
+                genes_iter = contigs.iter();
+            } else {
+                continue;
+            }
+        }
+
+        while let Some(gene) = genes_iter.clone().next() {
+            let mut overlapped = false;
+            for (exon_index, &(start, end)) in gene.exons.iter().enumerate() {
+                if variant.position < start as u32 {
+                    break; // No more overlaps possible for this gene
+                }
+                if variant.position <= end as u32 {
+                    results.push((
+                        variant.clone(),
+                        gene.name.clone(),
+                        format!("exon_{}", exon_index + 1),
+                    ));
+                    overlapped = true;
+                }
+            }
+            if overlapped {
+                genes_iter.next(); // Move to next gene only if we found an overlap
+            } else if gene
+                .exons
+                .last()
+                .map_or(true, |&(_, end)| variant.position > end as u32)
+            {
+                genes_iter.next(); // Move to next gene if variant is past all exons
+            } else {
+                break; // No overlap, but variant might overlap with next gene
+            }
+        }
+    }
+
+    let mut bam = rust_htslib::bam::IndexedReader::from_path(rna_bam)?;
+    for (variant, gene_name, exon) in results {
+        // let mut variant = variant;
+        if let (
+            ReferenceAlternative::Nucleotide(ref_base),
+            ReferenceAlternative::Nucleotide(alt_base),
+        ) = (&variant.reference, &variant.alternative)
+        {
+            let rna_bases = pandora_lib_pileup::qnames_at_base(
+                &mut bam,
+                &variant.contig,
+                variant.position as i32,
+                false,
+                50,
+            )?;
+            if rna_bases.len() < min_rna_depth {
+                continue;
+            }
+            
+            let ref_base = ref_base.clone().into_u8();
+            let alt_base = alt_base.clone().into_u8();
+
+            let n_rna_ref = rna_bases.iter().filter(|(_, b)| *b == ref_base).count();
+            let n_rna_alt = rna_bases.iter().filter(|(_, b)| *b == alt_base).count();
+
+            if n_rna_ref == 0 || n_rna_alt == 0 {
+                println!(
+                    "{gene_name} exon_{exon}: {}:{}",
+                    variant.contig, variant.position
+                );
+            }
+        }
+    }
+
+    Ok(())
+}

+ 6 - 2
src/sql/variants_sql.rs

@@ -6,7 +6,7 @@ use std::{
 
 use anyhow::{anyhow, Context, Ok, Result};
 use indicatif::MultiProgress;
-use log::warn;
+use log::{info, warn};
 use serde::{Deserialize, Serialize};
 use serde_rusqlite::*;
 
@@ -256,6 +256,7 @@ pub fn load_variants_name(name: &str, mp: &MultiProgress) -> Result<Variants> {
 }
 
 pub fn load_annotaded(db_path: &str) -> anyhow::Result<String> {
+    info!("Loading DB {db_path}");
     let connection = rusqlite::Connection::open(db_path)?;
     let mut stmt = connection.prepare(
         "SELECT * FROM Variants WHERE interpretation != '' AND interpretation IS NOT NULL",
@@ -263,7 +264,10 @@ pub fn load_annotaded(db_path: &str) -> anyhow::Result<String> {
 
     let rows = stmt.query_and_then([], |r| match from_row::<VariantSQL>(r) {
         std::result::Result::Ok(v) => Ok(v),
-        Err(e) => Err(anyhow!(e)),
+        Err(e) => {
+            println!("{:#?}", r);
+            Err(anyhow!(e)) 
+        },
     })?;
 
     let mut results = Vec::new();

File diff suppressed because it is too large
+ 56 - 0
test.txt_summary.html


Some files were not shown because too many files changed in this diff