Thomas 1 년 전
부모
커밋
183de749d0
3개의 파일 변경: 384줄 추가, 22줄 삭제
  1. 114 0
      Cargo.lock
  2. 3 0
      Cargo.toml
  3. 267 22
      src/lib.rs

+ 114 - 0
Cargo.lock

@@ -83,6 +83,31 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+
 [[package]]
 name = "curl-sys"
 version = "0.4.72+curl-8.6.0"
@@ -124,6 +149,25 @@ dependencies = [
  "libm",
 ]
 
+[[package]]
+name = "either"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
+
+[[package]]
+name = "env_logger"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
+dependencies = [
+ "humantime",
+ "is-terminal",
+ "log",
+ "regex",
+ "termcolor",
+]
+
 [[package]]
 name = "float-ord"
 version = "0.3.2"
@@ -160,6 +204,12 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
+[[package]]
+name = "hermit-abi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
+
 [[package]]
 name = "hts-sys"
 version = "2.1.1"
@@ -176,6 +226,12 @@ dependencies = [
  "openssl-sys",
 ]
 
+[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
 [[package]]
 name = "idna"
 version = "0.5.0"
@@ -192,6 +248,17 @@ version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c"
 
+[[package]]
+name = "is-terminal"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys",
+]
+
 [[package]]
 name = "jobserver"
 version = "0.1.28"
@@ -238,6 +305,12 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee"
 
+[[package]]
+name = "log"
+version = "0.4.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
+
 [[package]]
 name = "lzma-sys"
 version = "0.1.20"
@@ -308,6 +381,9 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "average",
+ "env_logger",
+ "log",
+ "rayon",
  "rust-htslib",
 ]
 
@@ -341,6 +417,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "regex"
 version = "1.10.3"
@@ -448,6 +544,15 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.57"
@@ -521,6 +626,15 @@ version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 
+[[package]]
+name = "winapi-util"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b"
+dependencies = [
+ "windows-sys",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.52.0"

+ 3 - 0
Cargo.toml

@@ -9,5 +9,8 @@ edition = "2021"
 rust-htslib = "0.46.0"
 anyhow = "^1.0.75"
 average = "0.14.1"
+rayon = "1.9.0"
+log = "^0.4.20"
+env_logger = "^0.10.1"
 
 

+ 267 - 22
src/lib.rs

@@ -1,4 +1,8 @@
-use anyhow::{Context, Ok, Result};
+use std::collections::HashMap;
+
+use anyhow::{anyhow, Context, Ok, Result};
+use log::info;
+use rayon::prelude::*;
 use rust_htslib::{
     bam,
     bam::{ext::BamRecordExtensions, record, Read, Record},
@@ -278,8 +282,6 @@ pub fn range_depths(
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
-            // depths.push(pileup.depth());
-
             let v = depths
                 .get_mut((rstart - start) as usize)
                 .context(format!("Errrr {}", rstart - start))?;
@@ -371,7 +373,7 @@ pub fn scan_sa(
     start: i32,
     stop: i32,
     mapq: u8,
-) -> Result<(Vec<Vec<Record>>, Vec<Vec<Record>>)> {
+) -> Result<Vec<(Vec<Record>, Vec<Record>)>> {
     bam.fetch((chr, start - 1, stop))?;
     let length = stop - start;
     let mut results_start: Vec<Vec<Record>> = Vec::new();
@@ -386,7 +388,6 @@ pub fn scan_sa(
 
         if rstart >= start && rstart < stop {
             if record.mapq() >= mapq && record.aux(b"SA").is_ok() {
-                
                 let index = rstart - start;
                 let u = results_start.get_mut(index as usize).unwrap();
                 u.push(record.clone());
@@ -401,7 +402,13 @@ pub fn scan_sa(
             }
         }
     }
-    Ok((results_start, results_end))
+
+    let res: Vec<(Vec<Record>, Vec<Record>)> = results_start
+        .iter()
+        .zip(results_end.iter())
+        .map(|(s, e)| (s.to_owned(), e.to_owned()))
+        .collect();
+    Ok(res)
 }
 
 pub fn records_at_base(
@@ -485,12 +492,250 @@ pub fn qnames_at_base(
     Ok(bases)
 }
 
+/// Returns the arithmetic mean of `data`, or `None` when the slice is empty.
+fn mean(data: &[f64]) -> Option<f64> {
+    // Guard first so an empty slice never evaluates 0.0 / 0.0 (NaN).
+    if data.is_empty() {
+        return None;
+    }
+    let sum: f64 = data.iter().sum();
+    Some(sum / data.len() as f64)
+}
+
+/// Returns the population standard deviation of `data` (the squared
+/// deviations are divided by `n`, not `n - 1`), or `None` when the slice is
+/// empty.
+fn std_deviation(data: &[f64]) -> Option<f64> {
+    // `mean` is `None` exactly when `data` is empty, so `?` handles the empty
+    // case and guarantees the divisor below is non-zero.
+    let data_mean = mean(data)?;
+    let variance = data
+        .iter()
+        .map(|value| {
+            let diff = data_mean - *value;
+            diff * diff
+        })
+        .sum::<f64>()
+        / data.len() as f64;
+
+    Some(variance.sqrt())
+}
+
+/// Collects groups of records from the diag BAM at positions of
+/// `[start, stop)` on `chr` where the diag value is above 3 and the MRD value
+/// is 0.
+///
+/// The region is scanned in parallel in bins of 1 000 positions. For each bin
+/// `get_start_end_qual` is called on both BAM files and the two results are
+/// paired position by position (presumably per-position counts of read
+/// starts/ends passing `mapq` — TODO confirm against `get_start_end_qual`).
+/// Summary statistics are logged with `info!`. For every selected position the
+/// start and end record groups are fetched with `get_start_end_qual_rec`,
+/// passed through `swap_by_primary`, and kept when they hold at least
+/// `min_reads` (3) records.
+///
+/// # Panics
+///
+/// Every fallible step is `unwrap`ped: opening either BAM file, the helper
+/// calls, `mean`/`std_deviation` on an empty result, and `pop()` on an empty
+/// vector returned by `get_start_end_qual_rec`.
+fn get_se_diag_mrd(
+    diag_bam_path: &str,
+    mrd_bam_path: &str,
+    chr: &str,
+    start: i32,
+    stop: i32,
+    mapq: u8,
+) -> Vec<Vec<Record>> {
+    let min_reads = 3;
+    let bin_size = 1_000;
+    // NOTE(review): despite the name this is the number of positions in the
+    // range, not the number of bins.
+    let n_bins = stop - start;
+
+    // Placeholder vector with one slot per position; only its chunking is
+    // used below (the chunk contents are ignored by the closure).
+    let mut se_raw = vec![(0, 0)];
+    se_raw.resize(n_bins as usize, (0, 0));
+
+    // Start coordinate of the i-th bin.
+    let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
+    let se_raw: Vec<(i32, i32)> = se_raw
+        .par_chunks(bin_size)
+        .enumerate()
+        .flat_map(|(i, _)| {
+            // Readers are opened inside the closure, once per bin.
+            let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
+                .context(anyhow!("Reading {}", diag_bam_path))
+                .unwrap();
+            let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
+                .context(anyhow!("Reading {}", mrd_bam_path))
+                .unwrap();
+
+            let s = get_pos(i);
+            // NOTE(review): `e` is not clamped to `stop`, so the last bin can
+            // extend past the requested range — confirm this is intended.
+            let e = s + bin_size as i32;
+            let d = get_start_end_qual(&mut diag_bam, chr, s, e, mapq).unwrap();
+            let m = get_start_end_qual(&mut mrd_bam, chr, s, e, mapq).unwrap();
+
+            // Pair the diag and MRD values position by position.
+            d.iter()
+                .zip(m.iter())
+                .map(|(d, m)| (*d, *m))
+                .collect::<Vec<(i32, i32)>>()
+        })
+        .collect();
+
+    info!("raw {}", se_raw.len());
+
+    // Totals over all positions, used only for the log line below.
+    let (diag_sum, mrd_sum) = se_raw
+        .iter()
+        .fold((0i32, 0i32), |acc, (d, m)| (acc.0 + d, acc.1 + m));
+
+    let diag = se_raw.iter().map(|(d, _)| *d as f64).collect::<Vec<f64>>();
+    let diag_mean = mean(&diag).unwrap();
+    let diag_std = std_deviation(&diag).unwrap();
+
+    let mrd = se_raw.iter().map(|(_, m)| *m as f64).collect::<Vec<f64>>();
+    let mrd_mean = mean(&mrd).unwrap();
+    let mrd_std = std_deviation(&mrd).unwrap();
+
+    info!(
+        "N/nt diag {} mrd {}",
+        diag_sum as f64 / (stop - start) as f64,
+        mrd_sum as f64 / (stop - start) as f64
+    );
+    info!("Mean diag {diag_mean} mrd {mrd_mean}");
+    info!("std dev diag {diag_std} mrd {mrd_std}");
+
+    // Mean-normalised diag/MRD ratio per position, shifted by 1 on both sides
+    // and re-centred by subtracting 1. Only its mean and standard deviation
+    // are logged; the selection below does not read it.
+    let ratio_se: Vec<f64> = diag
+        .iter()
+        .zip(mrd.iter())
+        .map(|(d, m)| (((d / diag_mean) + 1.0) / ((m / mrd_mean) + 1.0)) - 1.0)
+        .collect();
+
+    let r_stddev = std_deviation(&ratio_se).unwrap();
+    info!("ratio mean {} std dev {r_stddev}", mean(&ratio_se).unwrap());
+
+    // Keep positions where diag is above 3 and MRD is exactly 0.
+    // NOTE(review): the literal 3 with `>` differs from the `>= min_reads`
+    // test used further down — confirm whether both thresholds are intended.
+    let all: Vec<_> = se_raw
+        .into_iter()
+        .enumerate()
+        .zip(ratio_se.iter())
+        .filter(|((_i, (d, m)), _r)| *d > 3 && *m == 0)
+        .collect();
+
+    info!("{} locations to assemble", all.len());
+
+    // getting primary reads from bam at given positions
+    let se_data: Vec<(Vec<Record>, Vec<Record>)> = all
+        .par_chunks(100)
+        .flat_map(|chunks| {
+            // Loading bam reader.
+            let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
+                .context(anyhow!("Reading {}", diag_bam_path))
+                .unwrap();
+            chunks
+                .to_vec()
+                .iter()
+                .map(|((i, (_d, _m)), _r)| {
+                    // `i` is the offset from `start` in the flattened scan.
+                    let pos = *i as i32 + start;
+                    let (mut s, mut e) =
+                        get_start_end_qual_rec(&mut diag_bam, chr, pos, pos + 1, mapq).unwrap();
+                    // A one-base window is queried, so the last (presumably
+                    // only) entry of each vector is taken — TODO confirm.
+                    let s = s.pop().unwrap();
+                    let s = swap_by_primary(diag_bam_path, s);
+                    let e = e.pop().unwrap();
+                    let e = swap_by_primary(diag_bam_path, e);
+                    // info!("{chr}:{pos}\t{r}\t{d} (s:{} e:{:?})\t{m}", s.len(), e.len());
+                    // println!("e {e:?}");
+                    // println!("s {s:?}");
+
+                    (s, e)
+                })
+                .collect::<Vec<(Vec<Record>, Vec<Record>)>>()
+        })
+        .collect();
+
+    // filtering by number of reads
+    // let mut groups: HashMap<String, Vec<Record>> = HashMap::new();
+    let mut res = Vec::new();
+    for (s, e) in se_data {
+        // NOTE(review): `s` and `e` are owned here, so the clones below look
+        // avoidable.
+        if s.len() >= min_reads {
+            res.push(s.clone());
+            // for r in s {
+            //     let k = format!("{}-{}", r.tid(), r.pos());
+            //     if let Some(vr) = groups.get_mut(&k) {
+            //         vr.push(r);
+            //         vr.dedup();
+            //     } else {
+            //         groups.insert(k, vec![r]);
+            //     }
+            // }
+        }
+        if e.len() >= min_reads {
+            res.push(e.clone());
+            // for r in e {
+            //     let k = format!("{}-{}", r.tid(), r.pos());
+            //     if let Some(vr) = groups.get_mut(&k) {
+            //         vr.push(r);
+            //         vr.dedup();
+            //     } else {
+            //         groups.insert(k, vec![r]);
+            //     }
+            // }
+        }
+    }
+    res
+}
+
+/// Computes, for each position returned by `scan_sa`, the ratio between the
+/// number of records found in the diag BAM and in the MRD BAM.
+///
+/// The range `[start, stop)` on `chr` is processed in parallel in bins of
+/// `bin_size` positions. Each returned tuple is
+/// `(start_ratio, end_ratio)`: the diag/MRD ratio of the record counts in the
+/// first and second vectors of the `scan_sa` result, respectively.
+///
+/// NOTE(review): the division is done on `f32`, so a position with no MRD
+/// records yields `inf` (or `NaN` when diag is also empty) rather than an
+/// error — confirm callers expect that.
+///
+/// # Panics
+///
+/// Panics if either BAM file cannot be opened or if `scan_sa` fails, because
+/// both results are `unwrap`ped.
+pub fn sa_ratio(
+    diag_bam_path: &str,
+    mrd_bam_path: &str,
+    chr: &str,
+    start: i32,
+    stop: i32,
+    mapq: u8,
+    bin_size: usize,
+) -> Vec<(f32, f32)> {
+    // NOTE(review): despite the name this is the number of positions in the
+    // range, not the number of bins.
+    let n_bins = stop - start;
+
+    // Start coordinate of the i-th bin.
+    let get_pos = |i: usize| -> i32 { start + (i as i32 * bin_size as i32) };
+    // Placeholder vector with one slot per position; only its chunking is
+    // used below (the chunk contents are ignored by the closure).
+    let mut se_raw = vec![(0, 0)];
+    se_raw.resize(n_bins as usize, (0, 0));
+
+    se_raw
+        .par_chunks(bin_size)
+        .enumerate()
+        .flat_map(|(i, _)| {
+            // Readers are opened inside the closure, once per bin.
+            let mut diag_bam = rust_htslib::bam::IndexedReader::from_path(diag_bam_path)
+                .context(anyhow!("Reading {}", diag_bam_path))
+                .unwrap();
+            let mut mrd_bam = rust_htslib::bam::IndexedReader::from_path(mrd_bam_path)
+                .context(anyhow!("Reading {}", mrd_bam_path))
+                .unwrap();
+
+            let s = get_pos(i);
+            // NOTE(review): `e` is not clamped to `stop`, so the last bin can
+            // extend past the requested range — confirm this is intended.
+            let e = s + bin_size as i32;
+            let d = scan_sa(&mut diag_bam, chr, s, e, mapq).unwrap();
+            let m = scan_sa(&mut mrd_bam, chr, s, e, mapq).unwrap();
+
+            // Pair diag and MRD results position by position and turn the
+            // record counts into ratios.
+            d.iter()
+                .zip(m.iter())
+                .map(|(d, m)| {
+                    let start_diff: f32 = d.0.len() as f32 / m.0.len() as f32;
+                    let end_diff: f32 = d.1.len() as f32 / m.1.len() as f32;
+                    (start_diff, end_diff)
+                })
+                .collect::<Vec<(f32, f32)>>()
+        })
+        .collect()
+}
 
 #[cfg(test)]
 mod tests {
+    use env_logger::Env;
+    use log::info;
+
     // Note this useful idiom: importing names from outer (for mod tests) scope.
     use super::*;
 
+    /// Sets up `env_logger` for tests with a default filter of `info`.
+    ///
+    /// The result of `try_init` is discarded, so calling this from several
+    /// tests does not panic if initialisation returns an error.
+    fn init() {
+        let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
+            .is_test(true)
+            .try_init();
+    }
+
+    /// Smoke test: runs `get_se_diag_mrd` on chr7:144103157-144183195 for the
+    /// case `MERY` and prints the result. It makes no assertions and needs the
+    /// BAM files under `/data/longreads_basic_pipe` to be present.
+    #[test]
+    fn se_diff() {
+        init();
+        let case = "MERY";
+        let chr = "chr7";
+        let start = 144_103_157;
+        let stop = 144_183_195;
+        let mapq = 50;
+        let r = get_se_diag_mrd(
+            &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
+            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_mrd_hs1.bam"),
+            chr,
+            start,
+            stop,
+            mapq,
+        );
+        println!("{r:?}");
+    }
+
     #[test]
     fn test_se() {
         let bam_path = "/data/longreads_basic_pipe/CAMARA/diag/CAMARA_diag_hs1.bam";
@@ -512,22 +757,22 @@ mod tests {
     }
 
     #[test]
-    fn test_sa() {
-        let bam_path = "/data/longreads_basic_pipe/CAMARA/diag/CAMARA_diag_hs1.bam";
-        let chr = "chr9";
-        let start = 21839796;
+    fn sa_r() {
+        init();
+        let case = "MERY";
+        let chr = "chr7";
+        let start = 144_153_157;
+        let stop = 144_153_195;
         let mapq = 50;
-
-        let mut bam = rust_htslib::bam::IndexedReader::from_path(bam_path).unwrap();
-
-        let a = get_start_end_qual(&mut bam, chr, start, start + 1, mapq).unwrap().pop().unwrap();
-
-        let (_, e) = get_start_end_qual_rec(&mut bam, chr, start, start + 1, mapq).unwrap();
-        let res_records = swap_by_primary(bam_path, e.get(0).unwrap().clone());
-
-        let (_, e) = scan_sa(&mut bam, chr, start, start + 1, mapq).unwrap();
-        let res_records_sa = swap_by_primary(bam_path, e.get(0).unwrap().clone());
-        assert_eq!(a as usize, res_records.len());
-        assert_eq!(a as usize, res_records_sa.len());
+        let r = sa_ratio(
+            &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
+            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_diag_hs1.bam"),
+            chr,
+            start,
+            stop,
+            mapq,
+            1_000
+        );
+        println!("{r:?}");
     }
 }