Thomas 1 年之前
父節點
當前提交
c7f4a051c1
共有 2 個文件被更改,包括 171 次插入和 90 次刪除
  1. 17 12
      Cargo.lock
  2. 154 78
      src/lib.rs

+ 17 - 12
Cargo.lock

@@ -378,7 +378,7 @@ dependencies = [
  "directories 5.0.1",
  "serde",
  "thiserror",
- "toml 0.8.16",
+ "toml 0.8.17",
 ]
 
 [[package]]
@@ -504,9 +504,9 @@ dependencies = [
 
 [[package]]
 name = "curl-sys"
-version = "0.4.73+curl-8.8.0"
+version = "0.4.74+curl-8.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "450ab250ecf17227c39afb9a2dd9261dc0035cb80f2612472fc0c4aac2dcb84d"
+checksum = "8af10b986114528fcdc4b63b6f5f021b7057618411046a4de2ba0f0149a097bf"
 dependencies = [
  "cc",
  "libc",
@@ -2023,7 +2023,7 @@ dependencies = [
 [[package]]
 name = "pandora_lib_blastn"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/pandora_lib_blastn.git#b9d4d901ed51681f30fc797e8a5f00ed6ac8174e"
+source = "git+https://git.t0m4.fr/Thomas/pandora_lib_blastn.git#516523739733c85e09276ede7375145662aa88a5"
 dependencies = [
  "anyhow",
  "confy 0.6.1",
@@ -2164,9 +2164,13 @@ dependencies = [
 
 [[package]]
 name = "ppv-lite86"
-version = "0.2.17"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
+checksum = "2288c0e17cc8d342c712bb43a257a80ebffce59cdb33d5000d8348f3ec02528b"
+dependencies = [
+ "zerocopy",
+ "zerocopy-derive",
+]
 
 [[package]]
 name = "prettytable-rs"
@@ -3084,9 +3088,9 @@ dependencies = [
 
 [[package]]
 name = "toml"
-version = "0.8.16"
+version = "0.8.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81967dd0dd2c1ab0bc3468bd7caecc32b8a4aa47d0c8c695d8c2b2108168d62c"
+checksum = "7a44eede9b727419af8095cb2d72fab15487a541f54647ad4414b34096ee4631"
 dependencies = [
  "serde",
  "serde_spanned",
@@ -3096,18 +3100,18 @@ dependencies = [
 
 [[package]]
 name = "toml_datetime"
-version = "0.6.7"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8fb9f64314842840f1d940ac544da178732128f1c78c21772e876579e0da1db"
+checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41"
 dependencies = [
  "serde",
 ]
 
 [[package]]
 name = "toml_edit"
-version = "0.22.17"
+version = "0.22.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d9f8729f5aea9562aac1cc0441f5d6de3cff1ee0c5d67293eeca5eb36ee7c16"
+checksum = "1490595c74d930da779e944f5ba2ecdf538af67df1a9848cbd156af43c1b7cf0"
 dependencies = [
  "indexmap",
  "serde",
@@ -3587,6 +3591,7 @@ version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
 dependencies = [
+ "byteorder",
  "zerocopy-derive",
 ]
 

+ 154 - 78
src/lib.rs

@@ -24,7 +24,7 @@ use rust_htslib::bam::{Read, Reader, Record};
 use std::{
     collections::{HashMap, HashSet, VecDeque},
     fs::{self, File},
-    io::{BufReader, Cursor, Write},
+    io::{BufRead, BufReader, Cursor, Write},
     path::{Path, PathBuf},
 };
 use uuid::Uuid;
@@ -386,11 +386,8 @@ pub fn order_sequences_with_positions(blast_results: &[BlastResult]) -> Vec<(Str
 
     // Calculate absolute positions
     let mut positions: HashMap<String, usize> = HashMap::new();
-    let mut current_position = 0;
-
-    for seq in &ordered_sequences {
+    for (current_position, seq) in ordered_sequences.iter().enumerate() {
         positions.insert(seq.clone(), current_position);
-        current_position += 1; // Increment position for each sequence
     }
 
     let mut res: Vec<_> = ordered_sequences
@@ -401,74 +398,6 @@ pub fn order_sequences_with_positions(blast_results: &[BlastResult]) -> Vec<(Str
     res
 }
 
-// fn order_sequences_with_positions_bam(records: &Vec<bam::Record>, header: &Header) -> anyhow::Result<Vec<(String, usize)>> {
-//     let mut graph: HashMap<String, HashSet<String>> = HashMap::new();
-//     let mut in_degree: HashMap<String, usize> = HashMap::new();
-//     let mut all_sequences: HashSet<String> = HashSet::new();
-//
-//     for record in records {
-//         let query_id = String::from_utf8(record.name().to_vec());
-//         let subject_id = record.ref_id().clone();
-//         let q_start = record.aligned_query_start();
-//         let s_start = record.start();
-//
-//         all_sequences.insert(query_id.clone());
-//         all_sequences.insert(subject_id.clone());
-//
-//         if q_start < s_start {
-//             graph.entry(query_id.clone())
-//                 .or_insert_with(HashSet::new())
-//                 .insert(subject_id.clone());
-//             *in_degree.entry(subject_id.clone()).or_insert(0) += 1;
-//         } else {
-//             graph.entry(subject_id.clone())
-//                 .or_insert_with(HashSet::new())
-//                 .insert(query_id.clone());
-//             *in_degree.entry(query_id.clone()).or_insert(0) += 1;
-//         }
-//     }
-//
-//     let mut queue: VecDeque<String> = all_sequences.iter()
-//         .filter(|&seq| !in_degree.contains_key(seq))
-//         .cloned()
-//         .collect();
-//
-//     let mut ordered_sequences: Vec<String> = Vec::new();
-//
-//     while let Some(seq) = queue.pop_front() {
-//         ordered_sequences.push(seq.clone());
-//
-//         if let Some(neighbors) = graph.get(&seq) {
-//             for neighbor in neighbors {
-//                 if let Some(degree) = in_degree.get_mut(neighbor) {
-//                     *degree -= 1;
-//                     if *degree == 0 {
-//                         queue.push_back(neighbor.clone());
-//                     }
-//                 }
-//             }
-//         }
-//     }
-//
-//     for seq in all_sequences {
-//         if !ordered_sequences.contains(&seq) {
-//             ordered_sequences.push(seq.clone());
-//         }
-//     }
-//
-//     let mut positions: HashMap<String, usize> = HashMap::new();
-//     let mut current_position = 0;
-//
-//     for seq in &ordered_sequences {
-//         positions.insert(seq.clone(), current_position);
-//         current_position += 1;
-//     }
-//
-//     ordered_sequences.into_iter()
-//         .map(|seq| (seq.clone(), *positions.get(&seq).unwrap()))
-//         .collect()
-// }
-
 pub fn fai(path: &str) -> Result<std::process::Output, std::io::Error> {
     duct::cmd!("samtools", "faidx", path).run()
 }
@@ -524,6 +453,10 @@ impl FlyeAsm {
         fs::create_dir(&tmp_dir).unwrap();
         let input_fa = format!("{}/{}.fasta", self.asm_dir, self.input_id);
 
+        if !Path::new(&input_fa).exists() {
+            records_to_fasta(&self.input_records, &input_fa)?;
+        }
+
         // let input_fa = format!("{tmp_dir}/input.fa");
         // records_to_fasta(&self.input_records, &input_fa)?;
 
@@ -588,7 +521,7 @@ impl FlyeAsm {
             let bed_path = format!("{}/{contig_id}.bed", self.asm_dir);
             let bed = pandora_lib_blastn::Blast::init(&contig_fa)?
                 .run()?
-                .keep_best()
+                // .keep_best()
                 .to_bed()?;
             let mut f = File::create(bed_path)?;
             f.write_all(bed.as_bytes())?;
@@ -635,8 +568,7 @@ pub fn records_to_fasta(r: &Vec<bam::Record>, fa_path: &str) -> anyhow::Result<(
 
 pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
     let pattern = format!("{}/*.bam", dir);
-    let re =
-        Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.bam$").unwrap();
+    let re = Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.bam$")?;
 
     for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
         match entry {
@@ -644,7 +576,9 @@ pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
                 if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) {
                     if re.is_match(file_name) {
                         if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
-                            if PathBuf::from(format!("{dir}/{input_id}_flye.bam")).exists() && !force {
+                            if PathBuf::from(format!("{dir}/{input_id}_flye.bam")).exists()
+                                && !force
+                            {
                                 warn!("{input_id} already assembled: {}", igv_link(dir, input_id)?);
                             } else {
                                 let f = FlyeAsm::new(dir, input_id)?;
@@ -667,6 +601,102 @@ pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
     Ok(())
 }
 
+pub fn dedup_dir(dir: &str) -> anyhow::Result<()> {
+    // load bed
+    let pattern = format!("{}/*.bed", dir);
+    let re =
+        Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}_flye\.bed$")?;
+
+    let mut bed_hm: HashMap<String, Vec<String>> = HashMap::new();
+    for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
+        match entry {
+            Ok(path) => {
+                println!("{path:?}");
+                if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) {
+                    if re.is_match(file_name) {
+                        if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
+                            let mut bed_blast: Vec<String> = read_tsv_file(path.to_str().unwrap())?
+                                .iter()
+                                .map(|r| format!("{}{}", r.name, r.strand))
+                                .collect();
+                            bed_blast.sort();
+                            let key = bed_blast.join("|");
+                            bed_hm.entry(key).or_default().push(input_id.to_owned().replace("_flye", ""));
+                        }
+                    }
+                }
+            }
+            Err(e) => println!("{:?}", e),
+        }
+    }
+    println!("{bed_hm:#?}");
+
+    for group in bed_hm.values() {
+        if group.len() < 2 {
+            continue;
+        }
+        let new_locus_id = uuid::Uuid::new_v4();
+        let output_bam_path = format!("{dir}/{new_locus_id}.bam");
+        info!("Merging {} bams into {}", group.len(), output_bam_path);
+        let input_bam_paths = group.iter().map(|s| format!("{dir}/{s}.bam")).collect();
+        merge_bam_files(input_bam_paths, &output_bam_path)?;
+        dir_flye(dir, false)?;
+    }
+
+
+
+    Ok(())
+}
+
+pub struct BedRow {
+    pub contig: String,
+    pub start: u32,
+    pub end: u32,
+    pub name: String,
+    pub score: String,
+    pub strand: String,
+}
+
+// Function to read a TSV file and return a Vec of Row structs
+fn read_tsv_file(file_path: &str) -> anyhow::Result<Vec<BedRow>> {
+    // Open the file
+    let file = File::open(file_path)?;
+    let reader = BufReader::new(file);
+
+    // Create a vector to store the rows
+    let mut rows = Vec::new();
+
+    // Iterate over each line in the file
+    for line in reader.lines() {
+        // Unwrap the line, skipping any that cause errors
+        let line = line?;
+
+        // Split the line by tabs
+        let fields: Vec<&str> = line.split('\t').collect();
+
+        // Ensure the line has the correct number of fields
+        if fields.len() != 6 {
+            continue; // Skip lines with incorrect number of fields
+        }
+
+        // Parse the fields and create a Row struct
+        let row = BedRow {
+            contig: fields[0].to_string(),
+            start: fields[1].parse()?,
+            end: fields[2].parse()?,
+            name: fields[3].to_string(),
+            score: fields[4].to_string(),
+            strand: fields[5].to_string(),
+        };
+
+        // Add the row to the vector
+        rows.push(row);
+    }
+
+    // Return the vector of rows
+    Ok(rows)
+}
+
 pub fn igv_link(dir: &str, contig_id: &str) -> anyhow::Result<String> {
     let contig_fa = format!("{dir}/{contig_id}_flye.fa");
     let bam_track = Track::Bam(BamTrack::new(
@@ -698,6 +728,39 @@ pub fn igv_link(dir: &str, contig_id: &str) -> anyhow::Result<String> {
     Ok(link)
 }
 
+// Function to merge two BAM files into a new BAM file
+fn merge_bam_files(input_bam_paths: Vec<String>, output_bam_path: &str) -> anyhow::Result<()> {
+    // Ensure there is at least one BAM file to merge
+    if input_bam_paths.is_empty() {
+        return Err(anyhow!("No input BAM files provided"));
+    }
+
+    // Open the first input BAM file to get the header
+    let mut bam1 = rust_htslib::bam::Reader::from_path(&input_bam_paths[0])?;
+    let header = rust_htslib::bam::Header::from_template(bam1.header());
+
+    // Create a new BAM writer with the header from the first BAM file
+    let mut output_bam = rust_htslib::bam::Writer::from_path(output_bam_path, &header, rust_htslib::bam::Format::Bam)?;
+
+    // Write records from the first BAM file to the output BAM file
+    for result in bam1.records() {
+        let record = result?;
+        output_bam.write(&record)?;
+    }
+
+    // Iterate over the remaining BAM files and write their records to the output BAM file
+    for bam_path in &input_bam_paths[1..] {
+        let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
+        for result in bam.records() {
+            let record = result?;
+            output_bam.write(&record)?;
+        }
+    }
+
+    // The Writer will automatically close when it goes out of scope
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
     use std::fs;
@@ -766,9 +829,22 @@ mod tests {
     #[test]
     fn flye_d() -> anyhow::Result<()> {
         init();
-        let id = "SALICETTO";
+        let id = "LEVASSEUR";
         let res_dir = "/data/longreads_basic_pipe";
         let asm_dir = format!("{res_dir}/{id}/diag/asm_contigs");
         dir_flye(&asm_dir, true)
     }
+
+    #[test]
+    fn tmp() {
+        init();
+        dir_flye("/data/tmp/scan_ca67d4bc-a18e-40ab-9e0a-af90116ca20b/reads/chr9", true).unwrap();
+    }
+
+    #[test]
+    fn dedup() {
+        init();
+        dedup_dir("/data/tmp/scan_7ed2f43c-d16d-4dcc-bdb4-fb619d082991").unwrap();
+    }
+
 }