소스 검색

scan and save

Thomas 1 년 전
부모
커밋
8deac1f481
4개의 변경된 파일170개의 추가작업 그리고 94개의 파일을 삭제
  1. 27 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 41 70
      src/bam.rs
  4. 101 24
      src/lib.rs

+ 27 - 0
Cargo.lock

@@ -288,6 +288,17 @@ dependencies = [
  "quick-error",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
 [[package]]
 name = "glob"
 version = "0.3.1"
@@ -489,6 +500,7 @@ dependencies = [
  "log",
  "rayon",
  "rust-htslib",
+ "uuid",
 ]
 
 [[package]]
@@ -739,12 +751,27 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
+[[package]]
+name = "uuid"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+dependencies = [
+ "getrandom",
+]
+
 [[package]]
 name = "vcpkg"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 
+[[package]]
+name = "wasi"
+version = "0.11.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
+
 [[package]]
 name = "windows-sys"
 version = "0.52.0"

+ 1 - 0
Cargo.toml

@@ -9,3 +9,4 @@ env_logger = "0.11.5"
 log = "0.4.22"
 rayon = "1.10.0"
 rust-htslib = "0.47.0"
+uuid = { version = "1.10.0", features = ["v4"] }

+ 41 - 70
src/bam.rs

@@ -1,8 +1,11 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, u32};
 
 use anyhow::Context;
+use log::warn;
 use rust_htslib::bam::{
-    ext::BamRecordExtensions, record::{Aux, Cigar, CigarString}, Header, IndexedReader, Read, Record,
+    ext::BamRecordExtensions,
+    record::{Aux, Cigar, CigarString},
+    Header, HeaderView, IndexedReader, Read, Record,
 };
 
 pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
@@ -34,87 +37,55 @@ pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
     record
 }
 
-pub fn get_all_positions(record: &Record, tid_2_contig: &HashMap<i32, String>, bam: &mut IndexedReader) {
+pub fn get_all_positions(
+    record: &Record,
+    header: &HeaderView,
+    bam: &mut IndexedReader,
+) -> anyhow::Result<Vec<(String, u32, u32)>> {
     let mut positions = Vec::new();
     let qname = record.qname();
-    if let Some(contig) = tid_2_contig.get(&record.tid()) {
-        positions.push((contig.to_string(), record.reference_start() as u32));
-    }
+    let contig = String::from_utf8(header.tid2name(record.tid().try_into()?).to_vec())?;
+    positions.push((
+        contig,
+        record.reference_start() as u32,
+        record.reference_end() as u32,
+    ));
 
     if let Ok(Aux::String(sa)) = record.aux(b"SA") {
-        let positions: Vec<(&str, u32, u32)> = sa
-            .split(';')
-            .filter(|s| !s.is_empty())
-            .map(|s| {
-                let parts: Vec<&str> = s.split(',').take(4).collect();
-                let chr = parts.first().unwrap();
-                let start: u32 = parts.get(1).unwrap().parse().unwrap();
-                // SAM format is 1-based!
-                let start = start - 1;
-                let strand = parts.get(2).unwrap();
-                let cigar_str = *parts.get(3).unwrap();
-                let end = calculate_end_position(start, cigar_str).unwrap();
-
-                let mut founded = false;
-                bam.fetch((chr, start, start + 1)).unwrap();
-                for record in bam.records().flatten() {
-                        let rc = record.cigar().to_string();
-                    if qname == record.qname() && cigar_str == rc {
-                        founded = true;
-                        assert_eq!(start, record.reference_start() as u32, "start {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
-                        if end != record.reference_end() as u32 {
-                            println!("end {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
-                        }
-                    }
-                }
-                if !founded {
-                    println!("NOT FouNDED {chr}:{start}-{end} {strand} {cigar_str}");
-
-                }
+        let sa_splits: Vec<&str> = sa.split(';').filter(|s| !s.is_empty()).collect();
+        for s in sa_splits {
+            let parts: Vec<&str> = s.split(',').take(4).collect();
+            let chr = parts.first().unwrap();
+            let start: u32 = parts.get(1).unwrap().parse().unwrap();
+            // SAM format is 1-based!
+            let start = start - 1;
+            // let strand = parts.get(2).unwrap();
+            // // CIGAR SA != CIGAR can't calculate the end from it.
+            // let cigar_str = *parts.get(3).unwrap();
+            // let end = calculate_end_position(start, cigar_str).unwrap();
 
-                (*chr, start, end)
-            })
-            .collect();
-
-        // for (chr, start) in positions {
-        //     bam.fetch((chr, start, start + 1)).unwrap();
-        //     for record in bam.records().flatten() {
-        //         if qname == record.qname() && !record.is_supplementary() {
-        //             return record.clone();
-        //         }
-        //     }
-        // }
-    }
-}
-
-fn calculate_end_position(start: u32, cigar_str: &str) -> Result<u32, String> {
-    let mut end = start;
-    let mut count = String::new();
-
-    for c in cigar_str.chars() {
-        if c.is_digit(10) {
-            count.push(c);
-        } else {
-            let len: u32 = count.parse().map_err(|e| format!("Parse error: {}", e))?;
-            count.clear();
-            match c {
-                'M' | 'D' | 'N' | '=' | 'X' => {
-                    end += len;
-                }
-                'I' | 'S' | 'H' | 'P' => {
-                    // These operations do not consume reference positions
+            bam.fetch((chr, start, start + 1))?;
+            let mut founded = false;
+            for record in bam.records().flatten() {
+                if qname == record.qname() {
+                    founded = true;
+                    positions.push((chr.to_string(), start, record.reference_end() as u32));
                 }
-                _ => return Err(format!("Invalid CIGAR operation: {}", c)),
+            }
+            if !founded {
+                warn!("Not founded");
             }
         }
     }
-
-    Ok(end)
+    positions.sort_by(|a, b| a.1.cmp(&b.1));
+    positions.sort_by(|a, b| a.0.cmp(&b.0));
+    positions.dedup();
+    Ok(positions)
 }
 
 pub fn create_tid_2_contig(bam_path: &str) -> anyhow::Result<HashMap<i32, String>> {
     let bam_reader = rust_htslib::bam::IndexedReader::from_path(bam_path)
-            .context(format!("Can't open {}", bam_path))?;
+        .context(format!("Can't open {}", bam_path))?;
     let h = bam_reader.header();
     let mut r = HashMap::new();
     h.target_names().iter().enumerate().for_each(|(i, name)| {

+ 101 - 24
src/lib.rs

@@ -1,5 +1,5 @@
-pub mod bin;
 pub mod bam;
+pub mod bin;
 
 // fn get_se_diag_mrd(
 //     diag_bam_path: &str,
@@ -138,9 +138,15 @@ fn std_deviation(data: &[f64]) -> Option<f64> {
 }
 #[cfg(test)]
 mod tests {
+    use anyhow::Ok;
     use log::{info, warn};
-    use rust_htslib::bam::Record;
-    use std::iter::FromIterator;
+    use rust_htslib::bam::{record, Format, Header, Read, Record, Writer};
+    use std::{
+        collections::{HashMap, HashSet},
+        fs,
+        iter::FromIterator,
+        path::Path,
+    };
 
     use crate::bam::{create_tid_2_contig, get_all_positions};
 
@@ -240,8 +246,9 @@ mod tests {
     }
 
     #[test]
-    fn outliers() {
+    fn outliers() -> anyhow::Result<()> {
         init();
+        let min_reads = 3;
         let id = "SALICETTO";
         let contig = "chr10";
         let start = 91_000_000;
@@ -253,33 +260,103 @@ mod tests {
         let result_dir = "/data/longreads_basic_pipe";
         let diag_bam_path = format!("{result_dir}/{id}/diag/{id}_diag_hs1.bam");
 
-        let outliers_records: Vec<Vec<Record>> = scan_outliers(&diag_bam_path, contig, start, end, bin_length)
-            .iter()
-            .filter(|(_, _, _, sa_outlier, _, se_outlier)| *sa_outlier || *se_outlier)
-            .map(|(start, n, sa, sa_outlier, se, se_outlier)| {
-                let mut bin = Bin::new(&diag_bam_path, contig, *start, bin_length).unwrap();
-                let mut records = Vec::new();
-                if *sa_outlier {
-                    records.extend(bin.sa_primary());
-                } else if *se_outlier {
-                    let (pos, _) = bin.max_start_or_end();
-                    records.extend(bin.se_primary(pos));
-                };
-                info!("{}:{start}-{}\t{n}\t{sa}\t{se}\t{}", contig, start + bin_length - 1, records.len());
-                records
-            }).collect();
+        let outliers_records: Vec<Vec<Record>> =
+            scan_outliers(&diag_bam_path, contig, start, end, bin_length)
+                .iter()
+                .filter(|(_, _, _, sa_outlier, _, se_outlier)| *sa_outlier || *se_outlier)
+                .map(|(start, n, sa, sa_outlier, se, se_outlier)| {
+                    let mut bin = Bin::new(&diag_bam_path, contig, *start, bin_length).unwrap();
+                    let mut records = Vec::new();
+                    if *sa_outlier {
+                        records.extend(bin.sa_primary());
+                    } else if *se_outlier {
+                        let (pos, _) = bin.max_start_or_end();
+                        records.extend(bin.se_primary(pos));
+                    };
+                    info!(
+                        "{}:{start}-{}\t{n}\t{sa}\t{se}\t{}",
+                        contig,
+                        start + bin_length - 1,
+                        records.len()
+                    );
+                    records
+                })
+                .collect();
         info!("n group of records {}", outliers_records.len());
 
-        let mut bam = rust_htslib::bam::IndexedReader::from_path(&diag_bam_path)
-            .unwrap();
-        let tid_2_contig = create_tid_2_contig(&diag_bam_path).unwrap();
+        let mut bam = rust_htslib::bam::IndexedReader::from_path(&diag_bam_path).unwrap();
+        // let u = bam.header().clone();
+        let header = bam.header().to_owned();
         let mut n_recors = 0;
+        let mut grouped_records = Vec::new();
         for outlier_record in outliers_records {
             n_recors += outlier_record.len();
+            let mut hm_positions: HashMap<String, Vec<String>> = HashMap::new();
             outlier_record.iter().for_each(|r| {
-                get_all_positions(r, &tid_2_contig, &mut bam)
-            })
+                for pos in get_all_positions(r, &header, &mut bam).unwrap() {
+                    let qname = String::from_utf8(r.qname().to_vec()).unwrap();
+                    hm_positions
+                        .entry(format!("{}-{}", pos.0, pos.1))
+                        .or_default()
+                        .push(qname.clone());
+                    hm_positions
+                        .entry(format!("{}-{}", pos.0, pos.2))
+                        .or_default()
+                        .push(qname);
+                }
+            });
+            for v in hm_positions.values_mut() {
+                v.sort(); // Ensure dedup works correctly
+                v.dedup();
+
+                if v.len() >= min_reads {
+                    let rec: Vec<_> = outlier_record
+                        .clone()
+                        .into_iter()
+                        .filter(|r| {
+                            let qname = String::from_utf8(r.qname().to_vec()).unwrap();
+                            v.contains(&qname)
+                        })
+                        .collect();
+                    grouped_records.push(rec);
+                }
+            }
         }
+        println!("{grouped_records:#?}");
         println!("n records: {n_recors}");
+
+        let header = Header::from_template(&header);
+
+        let tmp_dir = format!("/data/tmp/scan_{}", uuid::Uuid::new_v4());
+        fs::create_dir_all(&tmp_dir).unwrap();
+        let mut dedup = HashSet::new();
+
+        let grouped_records: Vec<Vec<Record>> = grouped_records
+            .iter()
+            .filter(|r| {
+                let mut s: Vec<String> = r
+                    .iter()
+                    .map(|rr| String::from_utf8(rr.qname().to_vec()).unwrap())
+                    .collect();
+                s.sort();
+                let s = s.join("//");
+                if dedup.contains(&s) {
+                    false
+                } else {
+                    dedup.insert(s);
+                    true
+                }
+            })
+            .map(|r| r.to_vec())
+            .collect();
+        for records in grouped_records {
+            // Create a new BAM writer with the new header
+            let output_bam_path = format!("{tmp_dir}/{}.bam", uuid::Uuid::new_v4());
+            let mut output_bam = Writer::from_path(output_bam_path, &header, Format::Bam).unwrap();
+            for record in records {
+                output_bam.write(&record)?;
+            }
+        }
+        Ok(())
     }
 }