1 년 전 · 8deac1f481
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -288,6 +288,17 @@ dependencies = [
 
				  "quick-error",
			
 
				 ]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "getrandom"
			
 
				+version = "0.2.15"
			
 
				+source = "registry+https://github.com/rust-lang/crates.io-index"
			
 
				+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
			
 
				+dependencies = [
			
 
				+ "cfg-if",
			
 
				+ "libc",
			
 
				+ "wasi",
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "glob"
			
 
				 version = "0.3.1"
			
@@ -489,6 +500,7 @@ dependencies = [
 
				  "log",
			
 
				  "rayon",
			
 
				  "rust-htslib",
			
 
				+ "uuid",
			
 
				 ]
			
 
				 
			
 
				 [[package]]
			
@@ -739,12 +751,27 @@ version = "0.2.2"
 
				 source = "registry+https://github.com/rust-lang/crates.io-index"
			
 
				 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
			
 
				 
			
 
				+[[package]]
			
 
				+name = "uuid"
			
 
				+version = "1.10.0"
			
 
				+source = "registry+https://github.com/rust-lang/crates.io-index"
			
 
				+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
			
 
				+dependencies = [
			
 
				+ "getrandom",
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "vcpkg"
			
 
				 version = "0.2.15"
			
 
				 source = "registry+https://github.com/rust-lang/crates.io-index"
			
 
				 checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
			
 
				 
			
 
				+[[package]]
			
 
				+name = "wasi"
			
 
				+version = "0.11.0+wasi-snapshot-preview1"
			
 
				+source = "registry+https://github.com/rust-lang/crates.io-index"
			
 
				+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
			
 
				+
			
 
				 [[package]]
			
 
				 name = "windows-sys"
			
 
				 version = "0.52.0"
			
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,3 +9,4 @@ env_logger = "0.11.5"
 
				 log = "0.4.22"
			
 
				 rayon = "1.10.0"
			
 
				 rust-htslib = "0.47.0"
			
 
				+uuid = { version = "1.10.0", features = ["v4"] }
			
--- a/src/bam.rs
+++ b/src/bam.rs
@@ -1,8 +1,11 @@
 
				-use std::collections::HashMap;
			
 
				+use std::{collections::HashMap, u32};
			
 
				 
			
 
				 use anyhow::Context;
			
 
				+use log::warn;
			
 
				 use rust_htslib::bam::{
			
 
				-    ext::BamRecordExtensions, record::{Aux, Cigar, CigarString}, Header, IndexedReader, Read, Record,
			
 
				+    ext::BamRecordExtensions,
			
 
				+    record::{Aux, Cigar, CigarString},
			
 
				+    Header, HeaderView, IndexedReader, Read, Record,
			
 
				 };
			
 
				 
			
 
				 pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
			
@@ -34,87 +37,55 @@ pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
 
				     record
			
 
				 }
			
 
				 
			
 
				-pub fn get_all_positions(record: &Record, tid_2_contig: &HashMap<i32, String>, bam: &mut IndexedReader) {
			
 
				+pub fn get_all_positions(
			
 
				+    record: &Record,
			
 
				+    header: &HeaderView,
			
 
				+    bam: &mut IndexedReader,
			
 
				+) -> anyhow::Result<Vec<(String, u32, u32)>> {
			
 
				     let mut positions = Vec::new();
			
 
				     let qname = record.qname();
			
 
				-    if let Some(contig) = tid_2_contig.get(&record.tid()) {
			
 
				-        positions.push((contig.to_string(), record.reference_start() as u32));
			
 
				-    }
			
 
				+    let contig = String::from_utf8(header.tid2name(record.tid().try_into()?).to_vec())?;
			
 
				+    positions.push((
			
 
				+        contig,
			
 
				+        record.reference_start() as u32,
			
 
				+        record.reference_end() as u32,
			
 
				+    ));
			
 
				 
			
 
				     if let Ok(Aux::String(sa)) = record.aux(b"SA") {
			
 
				-        let positions: Vec<(&str, u32, u32)> = sa
			
 
				-            .split(';')
			
 
				-            .filter(|s| !s.is_empty())
			
 
				-            .map(|s| {
			
 
				-                let parts: Vec<&str> = s.split(',').take(4).collect();
			
 
				-                let chr = parts.first().unwrap();
			
 
				-                let start: u32 = parts.get(1).unwrap().parse().unwrap();
			
 
				-                // SAM format is 1-based!
			
 
				-                let start = start - 1;
			
 
				-                let strand = parts.get(2).unwrap();
			
 
				-                let cigar_str = *parts.get(3).unwrap();
			
 
				-                let end = calculate_end_position(start, cigar_str).unwrap();
			
 
				-
			
 
				-                let mut founded = false;
			
 
				-                bam.fetch((chr, start, start + 1)).unwrap();
			
 
				-                for record in bam.records().flatten() {
			
 
				-                        let rc = record.cigar().to_string();
			
 
				-                    if qname == record.qname() && cigar_str == rc {
			
 
				-                        founded = true;
			
 
				-                        assert_eq!(start, record.reference_start() as u32, "start {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
			
 
				-                        if end != record.reference_end() as u32 {
			
 
				-                            println!("end {chr}:{start}-{end} {strand} {cigar_str} {}", record.cigar());
			
 
				-                        }
			
 
				-                    }
			
 
				-                }
			
 
				-                if !founded {
			
 
				-                    println!("NOT FouNDED {chr}:{start}-{end} {strand} {cigar_str}");
			
 
				-
			
 
				-                }
			
 
				+        let sa_splits: Vec<&str> = sa.split(';').filter(|s| !s.is_empty()).collect();
			
 
				+        for s in sa_splits {
			
 
				+            let parts: Vec<&str> = s.split(',').take(4).collect();
			
 
				+            let chr = parts.first().unwrap();
			
 
				+            let start: u32 = parts.get(1).unwrap().parse().unwrap();
			
 
				+            // SAM format is 1-based!
			
 
				+            let start = start - 1;
			
 
				+            // let strand = parts.get(2).unwrap();
			
 
				+            // // CIGAR SA != CIGAR can't calculate the end from it.
			
 
				+            // let cigar_str = *parts.get(3).unwrap();
			
 
				+            // let end = calculate_end_position(start, cigar_str).unwrap();
			
 
				 
			
 
				-                (*chr, start, end)
			
 
				-            })
			
 
				-            .collect();
			
 
				-
			
 
				-        // for (chr, start) in positions {
			
 
				-        //     bam.fetch((chr, start, start + 1)).unwrap();
			
 
				-        //     for record in bam.records().flatten() {
			
 
				-        //         if qname == record.qname() && !record.is_supplementary() {
			
 
				-        //             return record.clone();
			
 
				-        //         }
			
 
				-        //     }
			
 
				-        // }
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-fn calculate_end_position(start: u32, cigar_str: &str) -> Result<u32, String> {
			
 
				-    let mut end = start;
			
 
				-    let mut count = String::new();
			
 
				-
			
 
				-    for c in cigar_str.chars() {
			
 
				-        if c.is_digit(10) {
			
 
				-            count.push(c);
			
 
				-        } else {
			
 
				-            let len: u32 = count.parse().map_err(|e| format!("Parse error: {}", e))?;
			
 
				-            count.clear();
			
 
				-            match c {
			
 
				-                'M' | 'D' | 'N' | '=' | 'X' => {
			
 
				-                    end += len;
			
 
				-                }
			
 
				-                'I' | 'S' | 'H' | 'P' => {
			
 
				-                    // These operations do not consume reference positions
			
 
				+            bam.fetch((chr, start, start + 1))?;
			
 
				+            let mut founded = false;
			
 
				+            for record in bam.records().flatten() {
			
 
				+                if qname == record.qname() {
			
 
				+                    founded = true;
			
 
				+                    positions.push((chr.to_string(), start, record.reference_end() as u32));
			
 
				                 }
			
 
				-                _ => return Err(format!("Invalid CIGAR operation: {}", c)),
			
 
				+            }
			
 
				+            if !founded {
			
 
				+                warn!("Not founded");
			
 
				             }
			
 
				         }
			
 
				     }
			
 
				-
			
 
				-    Ok(end)
			
 
				+    positions.sort_by(|a, b| a.1.cmp(&b.1));
			
 
				+    positions.sort_by(|a, b| a.0.cmp(&b.0));
			
 
				+    positions.dedup();
			
 
				+    Ok(positions)
			
 
				 }
			
 
				 
			
 
				 pub fn create_tid_2_contig(bam_path: &str) -> anyhow::Result<HashMap<i32, String>> {
			
 
				     let bam_reader = rust_htslib::bam::IndexedReader::from_path(bam_path)
			
 
				-            .context(format!("Can't open {}", bam_path))?;
			
 
				+        .context(format!("Can't open {}", bam_path))?;
			
 
				     let h = bam_reader.header();
			
 
				     let mut r = HashMap::new();
			
 
				     h.target_names().iter().enumerate().for_each(|(i, name)| {
			
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
 
				-pub mod bin;
			
 
				 pub mod bam;
			
 
				+pub mod bin;
			
 
				 
			
 
				 // fn get_se_diag_mrd(
			
 
				 //     diag_bam_path: &str,
			
@@ -138,9 +138,15 @@ fn std_deviation(data: &[f64]) -> Option<f64> {
 
				 }
			
 
				 #[cfg(test)]
			
 
				 mod tests {
			
 
				+    use anyhow::Ok;
			
 
				     use log::{info, warn};
			
 
				-    use rust_htslib::bam::Record;
			
 
				-    use std::iter::FromIterator;
			
 
				+    use rust_htslib::bam::{record, Format, Header, Read, Record, Writer};
			
 
				+    use std::{
			
 
				+        collections::{HashMap, HashSet},
			
 
				+        fs,
			
 
				+        iter::FromIterator,
			
 
				+        path::Path,
			
 
				+    };
			
 
				 
			
 
				     use crate::bam::{create_tid_2_contig, get_all_positions};
			
 
				 
			
@@ -240,8 +246,9 @@ mod tests {
 
				     }
			
 
				 
			
 
				     #[test]
			
 
				-    fn outliers() {
			
 
				+    fn outliers() -> anyhow::Result<()> {
			
 
				         init();
			
 
				+        let min_reads = 3;
			
 
				         let id = "SALICETTO";
			
 
				         let contig = "chr10";
			
 
				         let start = 91_000_000;
			
@@ -253,33 +260,103 @@ mod tests {
 
				         let result_dir = "/data/longreads_basic_pipe";
			
 
				         let diag_bam_path = format!("{result_dir}/{id}/diag/{id}_diag_hs1.bam");
			
 
				 
			
 
				-        let outliers_records: Vec<Vec<Record>> = scan_outliers(&diag_bam_path, contig, start, end, bin_length)
			
 
				-            .iter()
			
 
				-            .filter(|(_, _, _, sa_outlier, _, se_outlier)| *sa_outlier || *se_outlier)
			
 
				-            .map(|(start, n, sa, sa_outlier, se, se_outlier)| {
			
 
				-                let mut bin = Bin::new(&diag_bam_path, contig, *start, bin_length).unwrap();
			
 
				-                let mut records = Vec::new();
			
 
				-                if *sa_outlier {
			
 
				-                    records.extend(bin.sa_primary());
			
 
				-                } else if *se_outlier {
			
 
				-                    let (pos, _) = bin.max_start_or_end();
			
 
				-                    records.extend(bin.se_primary(pos));
			
 
				-                };
			
 
				-                info!("{}:{start}-{}\t{n}\t{sa}\t{se}\t{}", contig, start + bin_length - 1, records.len());
			
 
				-                records
			
 
				-            }).collect();
			
 
				+        let outliers_records: Vec<Vec<Record>> =
			
 
				+            scan_outliers(&diag_bam_path, contig, start, end, bin_length)
			
 
				+                .iter()
			
 
				+                .filter(|(_, _, _, sa_outlier, _, se_outlier)| *sa_outlier || *se_outlier)
			
 
				+                .map(|(start, n, sa, sa_outlier, se, se_outlier)| {
			
 
				+                    let mut bin = Bin::new(&diag_bam_path, contig, *start, bin_length).unwrap();
			
 
				+                    let mut records = Vec::new();
			
 
				+                    if *sa_outlier {
			
 
				+                        records.extend(bin.sa_primary());
			
 
				+                    } else if *se_outlier {
			
 
				+                        let (pos, _) = bin.max_start_or_end();
			
 
				+                        records.extend(bin.se_primary(pos));
			
 
				+                    };
			
 
				+                    info!(
			
 
				+                        "{}:{start}-{}\t{n}\t{sa}\t{se}\t{}",
			
 
				+                        contig,
			
 
				+                        start + bin_length - 1,
			
 
				+                        records.len()
			
 
				+                    );
			
 
				+                    records
			
 
				+                })
			
 
				+                .collect();
			
 
				         info!("n group of records {}", outliers_records.len());
			
 
				 
			
 
				-        let mut bam = rust_htslib::bam::IndexedReader::from_path(&diag_bam_path)
			
 
				-            .unwrap();
			
 
				-        let tid_2_contig = create_tid_2_contig(&diag_bam_path).unwrap();
			
 
				+        let mut bam = rust_htslib::bam::IndexedReader::from_path(&diag_bam_path).unwrap();
			
 
				+        // let u = bam.header().clone();
			
 
				+        let header = bam.header().to_owned();
			
 
				         let mut n_recors = 0;
			
 
				+        let mut grouped_records = Vec::new();
			
 
				         for outlier_record in outliers_records {
			
 
				             n_recors += outlier_record.len();
			
 
				+            let mut hm_positions: HashMap<String, Vec<String>> = HashMap::new();
			
 
				             outlier_record.iter().for_each(|r| {
			
 
				-                get_all_positions(r, &tid_2_contig, &mut bam)
			
 
				-            })
			
 
				+                for pos in get_all_positions(r, &header, &mut bam).unwrap() {
			
 
				+                    let qname = String::from_utf8(r.qname().to_vec()).unwrap();
			
 
				+                    hm_positions
			
 
				+                        .entry(format!("{}-{}", pos.0, pos.1))
			
 
				+                        .or_default()
			
 
				+                        .push(qname.clone());
			
 
				+                    hm_positions
			
 
				+                        .entry(format!("{}-{}", pos.0, pos.2))
			
 
				+                        .or_default()
			
 
				+                        .push(qname);
			
 
				+                }
			
 
				+            });
			
 
				+            for v in hm_positions.values_mut() {
			
 
				+                v.sort(); // Ensure dedup works correctly
			
 
				+                v.dedup();
			
 
				+
			
 
				+                if v.len() >= min_reads {
			
 
				+                    let rec: Vec<_> = outlier_record
			
 
				+                        .clone()
			
 
				+                        .into_iter()
			
 
				+                        .filter(|r| {
			
 
				+                            let qname = String::from_utf8(r.qname().to_vec()).unwrap();
			
 
				+                            v.contains(&qname)
			
 
				+                        })
			
 
				+                        .collect();
			
 
				+                    grouped_records.push(rec);
			
 
				+                }
			
 
				+            }
			
 
				         }
			
 
				+        println!("{grouped_records:#?}");
			
 
				         println!("n records: {n_recors}");
			
 
				+
			
 
				+        let header = Header::from_template(&header);
			
 
				+
			
 
				+        let tmp_dir = format!("/data/tmp/scan_{}", uuid::Uuid::new_v4());
			
 
				+        fs::create_dir_all(&tmp_dir).unwrap();
			
 
				+        let mut dedup = HashSet::new();
			
 
				+
			
 
				+        let grouped_records: Vec<Vec<Record>> = grouped_records
			
 
				+            .iter()
			
 
				+            .filter(|r| {
			
 
				+                let mut s: Vec<String> = r
			
 
				+                    .iter()
			
 
				+                    .map(|rr| String::from_utf8(rr.qname().to_vec()).unwrap())
			
 
				+                    .collect();
			
 
				+                s.sort();
			
 
				+                let s = s.join("//");
			
 
				+                if dedup.contains(&s) {
			
 
				+                    false
			
 
				+                } else {
			
 
				+                    dedup.insert(s);
			
 
				+                    true
			
 
				+                }
			
 
				+            })
			
 
				+            .map(|r| r.to_vec())
			
 
				+            .collect();
			
 
				+        for records in grouped_records {
			
 
				+            // Create a new BAM writer with the new header
			
 
				+            let output_bam_path = format!("{tmp_dir}/{}.bam", uuid::Uuid::new_v4());
			
 
				+            let mut output_bam = Writer::from_path(output_bam_path, &header, Format::Bam).unwrap();
			
 
				+            for record in records {
			
 
				+                output_bam.write(&record)?;
			
 
				+            }
			
 
				+        }
			
 
				+        Ok(())
			
 
				     }
			
 
				 }