Thomas 1 gadu atpakaļ
vecāks
revīzija
dc57a2068e
3 mainīti faili ar 248 papildinājumiem un 79 dzēšanām
  1. 143 2
      Cargo.lock
  2. 2 0
      Cargo.toml
  3. 103 77
      src/lib.rs

+ 143 - 2
Cargo.lock

@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
 [[package]]
 name = "aho-corasick"
 version = "1.1.3"
@@ -122,12 +128,28 @@ version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
 
+[[package]]
+name = "bstr"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
 [[package]]
 name = "byteorder"
 version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
+[[package]]
+name = "bytes"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952"
+
 [[package]]
 name = "bzip2-sys"
 version = "0.1.11+1.0.8"
@@ -191,6 +213,24 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
 
+[[package]]
+name = "crc32fast"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "crossbeam-channel"
+version = "0.5.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "crossbeam-deque"
 version = "0.8.5"
@@ -286,6 +326,16 @@ dependencies = [
  "log",
 ]
 
+[[package]]
+name = "flate2"
+version = "1.0.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "float-ord"
 version = "0.3.2"
@@ -310,6 +360,17 @@ dependencies = [
  "quick-error",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
 [[package]]
 name = "glob"
 version = "0.3.1"
@@ -399,9 +460,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.153"
+version = "0.2.155"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
 
 [[package]]
 name = "libloading"
@@ -467,6 +528,15 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
+[[package]]
+name = "miniz_oxide"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
+dependencies = [
+ "adler",
+]
+
 [[package]]
 name = "newtype_derive"
 version = "0.1.6"
@@ -486,6 +556,40 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "noodles-bgzf"
+version = "0.32.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2fba0f4a64cc897d9396d730a0c444d148daed7de31ad5904ecc673178fc9d"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "crossbeam-channel",
+ "flate2",
+]
+
+[[package]]
+name = "noodles-core"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5a8c6b020d1205abef2b0fab4463a6c5ecc3c8f4d561ca8b0d1a42323376200"
+dependencies = [
+ "bstr",
+]
+
+[[package]]
+name = "noodles-fasta"
+version = "0.41.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7a1662ac3ace299515c982a322e378bbeb4c1bd90fb098d823ef0f3a6abcc00"
+dependencies = [
+ "bstr",
+ "bytes",
+ "memchr",
+ "noodles-bgzf",
+ "noodles-core",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.19"
@@ -532,8 +636,10 @@ dependencies = [
  "average",
  "env_logger",
  "log",
+ "noodles-fasta",
  "rayon",
  "rust-htslib",
+ "uuid",
 ]
 
 [[package]]
@@ -670,6 +776,26 @@ version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
 
+[[package]]
+name = "serde"
+version = "1.0.204"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.204"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.64",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -784,12 +910,27 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
+[[package]]
+name = "uuid"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+dependencies = [
+ "getrandom",
+]
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
 [[package]]
 name = "windows-sys"
 version = "0.52.0"

+ 2 - 0
Cargo.toml

@@ -12,5 +12,7 @@ average = "0.15.1"
 rayon = "1.10.0"
 log = "^0.4.22"
 env_logger = "^0.11.3"
+uuid = { version = "1.10.0", features = ["v4"] }
+noodles-fasta = "0.41.0"
 
 

+ 103 - 77
src/lib.rs

@@ -1,12 +1,9 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, fs::File, io::Write};
 
 use anyhow::{anyhow, Context, Ok, Result};
 use log::info;
 use rayon::prelude::*;
-use rust_htslib::{
-    bam,
-    bam::{ext::BamRecordExtensions, record, Read, Record},
-};
+use rust_htslib::bam::{self, ext::BamRecordExtensions, record, Header, Read, Record, Writer};
 
 pub fn get_hts_nt_pileup(
     bam: &mut rust_htslib::bam::IndexedReader,
@@ -162,7 +159,7 @@ pub fn get_start_end_qual(
     results.resize(length as usize, 0);
 
     for read in bam.records() {
-        let record = read.context(format!("Error while parsing record"))?;
+        let record = read.context("Error while parsing record")?;
         let rstart = record.pos() as i32;
         // let rstart = record.reference_start() as i32;
         let rend = record.reference_end() as i32;
@@ -276,7 +273,7 @@ pub fn range_depths(
     let mut depths = vec![0];
     depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -304,7 +301,7 @@ pub fn range_depths_qual(
     let mut depths = vec![0];
     depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -334,7 +331,7 @@ pub fn range_len_qual(
     let mut depths = vec![0];
     depths.resize((stop - start) as usize, 0);
     for p in bam.pileup() {
-        let pileup = p.context(format!("eRR"))?;
+        let pileup = p.context("eRR")?;
         let rstart = pileup.pos() as i32;
 
         if rstart >= start && rstart < stop {
@@ -380,24 +377,20 @@ pub fn scan_sa(
     results_end.resize(length as usize, vec![]);
 
     for read in bam.records() {
-        let record = read.context(format!("Error while parsing record"))?;
+        let record = read.context("Error while parsing record".to_string())?;
         let rstart = record.pos() as i32;
         let rend = record.reference_end() as i32;
 
-        if rstart >= start && rstart < stop {
-            if record.mapq() >= mapq && record.aux(b"SA").is_ok() {
-                let index = rstart - start;
-                let u = results_start.get_mut(index as usize).unwrap();
-                u.push(record.clone());
-            }
+        if rstart >= start && rstart < stop && record.mapq() >= mapq && record.aux(b"SA").is_ok() {
+            let index = rstart - start;
+            let u = results_start.get_mut(index as usize).unwrap();
+            u.push(record.clone());
         }
 
-        if rend >= start && rend < stop && record.aux(b"SA").is_ok() {
-            if record.mapq() >= mapq {
-                let index = rend - start;
-                let u = results_end.get_mut(index as usize).unwrap();
-                u.push(record.clone());
-            }
+        if rend >= start && rend < stop && record.aux(b"SA").is_ok() && record.mapq() >= mapq {
+            let index = rend - start;
+            let u = results_end.get_mut(index as usize).unwrap();
+            u.push(record.clone());
         }
     }
 
@@ -436,10 +429,8 @@ pub fn records_at_base(
                             if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
                                 bases.push((record.clone(), b));
                             }
-                        } else {
-                            if alignment.is_del() {
-                                bases.push((record.clone(), b'D'));
-                            }
+                        } else if alignment.is_del() {
+                            bases.push((record.clone(), b'D'));
                         }
                     }
                 }
@@ -477,10 +468,8 @@ pub fn qnames_at_base(
                             if let Some(b) = hts_base_at(&record, start as u32, with_next_ins)? {
                                 bases.push((qname, b));
                             }
-                        } else {
-                            if alignment.is_del() {
-                                bases.push((qname, b'D'));
-                            }
+                        } else if alignment.is_del() {
+                            bases.push((qname, b'D'));
                         }
                     }
                 }
@@ -491,7 +480,7 @@ pub fn qnames_at_base(
 }
 
 fn mean(data: &[f64]) -> Option<f64> {
-    let sum = data.iter().sum::<f64>() as f64;
+    let sum = data.iter().sum::<f64>();
     let count = data.len() as f64;
 
     match count {
@@ -506,7 +495,7 @@ fn std_deviation(data: &[f64]) -> Option<f64> {
             let variance = data
                 .iter()
                 .map(|value| {
-                    let diff = data_mean - (*value as f64);
+                    let diff = data_mean - *value;
                     diff * diff
                 })
                 .sum::<f64>()
@@ -526,7 +515,7 @@ pub fn get_se_diag_mrd(
     stop: i32,
     mapq: u8,
 ) -> Vec<Vec<Record>> {
-    let min_reads = 3;
+    // let min_reads = 3;
     let bin_size = 1_000;
     let n_bins = stop - start;
 
@@ -595,9 +584,7 @@ pub fn get_se_diag_mrd(
         .filter(|((_i, (d, m)), _r)| *d > 3 && *m == 0)
         .collect();
 
-    info!("{} locations to assemble", all.len());
-
-    // getting priamry reads from bam at given positions
+    // getting primary reads from bam at given positions
     let se_data: Vec<(Vec<Record>, Vec<Record>)> = all
         .par_chunks(100)
         .flat_map(|chunks| {
@@ -626,36 +613,11 @@ pub fn get_se_diag_mrd(
         })
         .collect();
 
-    // filtering by number of reads
-    // let mut groups: HashMap<String, Vec<Record>> = HashMap::new();
-    let mut res = Vec::new();
-    for (s, e) in se_data {
-        if s.len() >= min_reads {
-            res.push(s.clone());
-            // for r in s {
-            //     let k = format!("{}-{}", r.tid(), r.pos());
-            //     if let Some(vr) = groups.get_mut(&k) {
-            //         vr.push(r);
-            //         vr.dedup();
-            //     } else {
-            //         groups.insert(k, vec![r]);
-            //     }
-            // }
-        }
-        if e.len() >= min_reads {
-            res.push(e.clone());
-            // for r in e {
-            //     let k = format!("{}-{}", r.tid(), r.pos());
-            //     if let Some(vr) = groups.get_mut(&k) {
-            //         vr.push(r);
-            //         vr.dedup();
-            //     } else {
-            //         groups.insert(k, vec![r]);
-            //     }
-            // }
-        }
-    }
-    res
+    se_data
+        .into_iter()
+        .flat_map(|(s, e)| vec![s.clone(), e.clone()])
+        .filter(|e| !e.is_empty())
+        .collect()
 }
 
 pub fn sa_ratio(
@@ -701,26 +663,55 @@ pub fn sa_ratio(
         .collect()
 }
 
-pub fn bam_compo(file_path: &str, sample_size: usize) -> Result<Vec<(String, f64 )>> {
+pub fn bam_compo(file_path: &str, sample_size: usize) -> Result<Vec<(String, f64)>> {
     let mut bam = bam::Reader::from_path(file_path)?;
 
     let mut rgs: HashMap<String, u64> = HashMap::new();
-    for result in bam.records().filter_map(Result::ok).take(sample_size)
-    {
-        if let std::result::Result::Ok(t) = result.aux(b"RG") {
-            if let record::Aux::String(s) = t {
-                *rgs.entry(s.to_string()).or_default() += 1;
-            }
+    for result in bam.records().filter_map(Result::ok).take(sample_size) {
+        if let record::Aux::String(s) = result.aux(b"RG")? {
+            *rgs.entry(s.to_string()).or_default() += 1;
         }
     }
 
-    Ok(rgs.into_iter().map(|(rg, n)| (rg.to_string(), n as f64 * 100.0 / sample_size as f64)).collect())
+    Ok(rgs
+        .into_iter()
+        .map(|(rg, n)| (rg.to_string(), n as f64 * 100.0 / sample_size as f64))
+        .collect())
+}
+
+pub fn write_bam(records: &Vec<Record>, header: &Header, path: &str) -> Result<()> {
+    info!("{} records to write", records.len());
+    let mut writer = Writer::from_path(path, header, bam::Format::Bam).unwrap();
+    for record in records {
+        writer.write(record)?;
+    }
+    Ok(())
+}
+
+pub fn write_fasta(path: &str, records: &Vec<Record>) -> anyhow::Result<()> {
+    let mut file = File::create(path)?;
+    let mut writer = noodles_fasta::io::Writer::new(Vec::new());
+
+    for record in records {
+        let qname = String::from_utf8(record.qname().to_vec())?;
+        let qseq = record.seq().as_bytes();
+        let qqual = String::from_utf8(record.qual().to_vec())?;
+        assert_eq!(qseq.len(), qqual.len());
+        let fastq_record = noodles_fasta::Record::new(
+            noodles_fasta::record::Definition::new(qname.clone(), None),
+            qseq.into(),
+        );
+        writer.write_record(&fastq_record)?;
+    }
+    file.write_all(writer.get_ref())?;
+    Ok(())
 }
 
 #[cfg(test)]
 mod tests {
     use env_logger::Env;
     use log::info;
+    use uuid::Uuid;
 
     // Note this useful idiom: importing names from outer (for mod tests) scope.
     use super::*;
@@ -764,7 +755,7 @@ mod tests {
 
         let res = get_start_end_qual_rec(&mut bam, chr, start, start + 1, mapq).unwrap();
         let (_, e) = res;
-        let res_records = swap_by_primary(bam_path, e.get(0).unwrap().clone());
+        let res_records = swap_by_primary(bam_path, e.first().unwrap().clone());
         println!("{res_records:?}");
         println!("{}", res_records.len());
         assert!(res_records.len() == 12);
@@ -780,7 +771,7 @@ mod tests {
         let mapq = 50;
         let r = sa_ratio(
             &format!("/data/longreads_basic_pipe/{case}/diag/{case}_diag_hs1.bam"),
-            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_diag_hs1.bam"),
+            &format!("/data/longreads_basic_pipe/{case}/mrd/{case}_mrd_hs1.bam"),
             chr,
             start,
             stop,
@@ -792,7 +783,42 @@ mod tests {
 
     #[test]
     fn rg() {
-        let rg = bam_compo("/data/longreads_basic_pipe/KENNOUCHE/mrd/KENNOUCHE_mrd_hs1.bam", 20000);
+        let rg = bam_compo(
+            "/data/longreads_basic_pipe/KENNOUCHE/mrd/KENNOUCHE_mrd_hs1.bam",
+            20000,
+        );
         println!("{rg:#?}");
     }
+
+    #[test]
+    fn se_scan() -> Result<()> {
+        init();
+        // chr9:21,830,434_22,010,143
+        let id = "BECERRA";
+        let chr = "chr9";
+        let start = 21_820_000;
+        let stop = 23_000_000;
+
+        let result_dir = "/data/longreads_basic_pipe";
+        let diag_bam_path = format!("{result_dir}/{id}/diag/{id}_diag_hs1.bam");
+        let mrd_bam_path = format!("{result_dir}/{id}/mrd/{id}_mrd_hs1.bam");
+        let records = get_se_diag_mrd(&diag_bam_path, &mrd_bam_path, chr, start, stop, 50);
+
+        let reader = rust_htslib::bam::Reader::from_path(diag_bam_path)?;
+        let header = reader.header().clone();
+        let h = rust_htslib::bam::header::Header::from_template(&header);
+
+        let result_dir = format!("{result_dir}/{id}/diag/asm_contigs");
+        if !std::path::PathBuf::from(&result_dir).exists() {
+            std::fs::create_dir(&result_dir)?;
+        }
+        for e in records.iter() {
+            let uuid = Uuid::new_v4();
+            write_fasta(&format!("{result_dir}/{uuid}.fasta"), e)?;
+            write_bam(e, &h, &format!("{result_dir}/{uuid}.bam"))
+                .context("Error while writing bam")?;
+        }
+        info!("{} group of records", records.len());
+        Ok(())
+    }
 }