Browse Source

N50 and size for BAMStats

Thomas 10 months ago
parent
commit
b793b4a698
4 changed files with 43 additions and 11 deletions
  1. 4 0
      README.md
  2. 3 1
      jq_filters/jq_variants.jq
  3. 24 9
      src/collection/bam.rs
  4. 12 1
      src/lib.rs

+ 4 - 0
README.md

@@ -44,6 +44,10 @@ zcat /data/longreads_basic_pipe/ADJAGBA/diag/somatic_variants.json.gz | jq -L ./
 find /data/longreads_basic_pipe/ -name "somatic_variants.json.gz" -type f -exec sh -c 'dirname=$(basename $(dirname $(dirname "$1"))); count=$(zcat "$1" | jq -L ./jq_filters -r '\''include "jq_variants"; count_consequence("SynonymousVariant") | [.true_count, .total_count, .proportion] | @tsv'\''); echo "${dirname}\t${count}"' sh {} \;
 ```
 
+### Find recurrence by VEP consequence
+```
+find /data/longreads_basic_pipe/ -name "somatic_variants.json.gz" -type f -exec sh -c 'dirname=$(basename $(dirname $(dirname "$1"))); count=$(zcat "$1" | jq -L ./jq_filters -r '\''include "jq_variants"; consequence("StopGained") | .[] | select(.has_consequence == true) | [.chr, .position, .ref, .alt] | @tsv'\''); echo "${count}"' sh {} \; | sort -k1,1V -k2,2n | uniq -c |  awk '$1 > 1 {print $2"\t"$3"\t"$4"\t"$5"\t"$1}'
+```
 
 ### Reading log files
 ```

+ 3 - 1
jq_filters/jq_variants.jq

@@ -44,7 +44,9 @@ def consequence(csq):
 		.data[] | 
 		{
 			chr: num_to_contig(.position.contig),
-			position: .position.position + 1,
+			position: ((.position.position | tonumber) + 1),
+			ref: ref_alt_vcf(.reference),
+			alt: ref_alt_vcf(.alternative),
 			has_consequence: ([ .annotations[] | select(has("VEP")) | .VEP[] | .consequence | contains([csq])] | any) 
 		}
 	];

+ 24 - 9
src/collection/bam.rs

@@ -2,17 +2,13 @@ use std::{
     collections::HashMap,
     fs::{self, File},
     path::PathBuf,
-    str::FromStr,
 };
 
 use anyhow::{anyhow, Context};
 use chrono::{DateTime, Utc};
+use dashmap::DashMap;
 use glob::glob;
 use log::{debug, info, warn};
-use pandora_lib_bindings::{
-    progs::cramino::{Cramino, CraminoRes},
-    utils::RunBin,
-};
 use rand::{thread_rng, Rng};
 use rayon::prelude::*;
 use rust_htslib::bam::{ext::BamRecordExtensions, record::Cigar, Read};
@@ -86,7 +82,12 @@ impl WGSBam {
         let json_file = PathBuf::from(&json_path);
 
         if json_file.exists() && json_file.metadata()?.modified()? > modified {
-            return Self::load_json(&json_path);
+            match Self::load_json(&json_path) {
+                Ok(s) => return Ok(s),
+                Err(e) => {
+                    warn!("Error while loading {}.\n{e}", json_path);
+                }
+            }
         }
 
         // let cramino_path = format!(
@@ -546,6 +547,8 @@ pub struct WGSBamStats {
     pub mapped_yield: u128,
     pub global_coverage: f64,
     pub karyotype: Vec<(i32, u64, String, u128)>,
+    pub n50: u128,
+    pub by_lengths: Vec<(u128, u32)>,
 }
 
 impl WGSBamStats {
@@ -561,7 +564,7 @@ impl WGSBamStats {
         let header = rust_htslib::bam::Header::from_template(&h);
         bam.set_threads(bam_n_threads as usize)?;
 
-        let mut mapped_lens = Vec::new();
+        let mut mapped_lengths = Vec::new();
         let mut all_records = 0;
         let mut n_reads = 0;
         let mut lengths_by_tid: HashMap<i32, Vec<u128>> = HashMap::new();
@@ -582,7 +585,7 @@ impl WGSBamStats {
         {
             n_reads += 1;
             let mapped_len = (read.reference_end() - read.pos()) as u128;
-            mapped_lens.push(mapped_len);
+            mapped_lengths.push(mapped_len);
             lengths_by_tid
                 .entry(read.tid())
                 .or_default()
@@ -592,11 +595,21 @@ impl WGSBamStats {
             }
         }
 
-        let mapped_yield = mapped_lens.par_iter().sum::<u128>();
+        let mapped_yield = mapped_lengths.par_iter().sum::<u128>();
         let genome = get_genome_sizes(&header)?;
         let genome_size = genome.values().map(|v| *v as u128).sum::<u128>();
         let global_coverage = mapped_yield as f64 / genome_size as f64;
 
+        let by_lengths: DashMap<u128, u32> = DashMap::new();
+        mapped_lengths
+            .par_iter()
+            .for_each(|l| *by_lengths.entry(*l).or_default() += 1);
+
+        let mut by_lengths: Vec<(u128, u32)> =
+            by_lengths.iter().map(|e| (*e.key(), *e.value())).collect();
+        by_lengths.par_sort_by(|a, b| a.0.cmp(&b.0));
+        let n50 = by_lengths[by_lengths.len() / 2].0;
+
         let mut karyotype: Vec<(i32, u64, String, u128)> = lengths_by_tid
             .iter()
             .map(|(tid, lengths)| {
@@ -618,6 +631,8 @@ impl WGSBamStats {
             mapped_yield,
             global_coverage,
             karyotype,
+            n50,
+            by_lengths,
         })
     }
 

+ 12 - 1
src/lib.rs

@@ -29,7 +29,7 @@ mod tests {
 
     use annotation::{vep::{VepLine, VEP}, Annotations};
     use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaCopyNumber, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
-    use collection::{bam::{counts_at, counts_ins_at, nt_pileup, WGSBamStats}, pod5::{Pod5, Pod5Config}, Initialize, InitializeSolo, Version};
+    use collection::{bam::{counts_at, counts_ins_at, nt_pileup, WGSBam, WGSBamStats}, pod5::{Pod5, Pod5Config}, Initialize, InitializeSolo, Version};
     use commands::{longphase::{LongphaseConfig, LongphaseHap, LongphaseModcallSolo, LongphasePhase}, modkit::{bed_methyl, ModkitConfig}};
     use functions::assembler::{Assembler, AssemblerConfig};
     use helpers::estimate_shannon_entropy;
@@ -608,6 +608,17 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn load_bam() -> anyhow::Result<()> {
+        init();
+        let id = "CAMARA";
+        let time = "diag";
+        let bam_path = Config::default().solo_bam(id, time);
+        WGSBam::new(Path::new(&bam_path).to_path_buf())?;
+        Ok(())
+    }
+
+
     #[test]
     fn tar() {
         init();