Thomas 1 year ago
parent
commit
7193ccdfdf
3 changed files with 682 additions and 123 deletions
  1. 595 54
      Cargo.lock
  2. 15 13
      Cargo.toml
  3. 72 56
      src/lib.rs

File diff suppressed because it is too large
+ 595 - 54
Cargo.lock


+ 15 - 13
Cargo.toml

@@ -6,26 +6,28 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-env_logger = "^0.10.1"
+env_logger = "^0.11.5"
 test-env-log = "0.2.8"
 minimap2 = { git = "https://github.com/jguhlin/minimap2-rs", features = ["simde"] }
 aligner_client = { git = "https://git.t0m4.fr/Thomas/aligner_client.git"}
 rust-htslib = "0.47.0"
-anyhow = "1.0.75"
-log = "0.4.19"
-rand = "0.8.4"
-uuid = { version = "1.6.1", features = ["serde", "v4"] }
+anyhow = "1.0.86"
+log = "0.4.22"
+rand = "0.8.5"
+uuid = { version = "1.10.0", features = ["serde", "v4"] }
 seq_io = "0.3.2"
-noodles-fasta = "0.34.0"
-noodles-vcf = "0.51.0"
+noodles-fasta = "0.42.0"
+noodles-vcf = "0.62.0"
 crossbeam = "0.8.4"
-crossbeam-channel = "0.5.12"
-test-log = "0.2.15"
+crossbeam-channel = "0.5.13"
+test-log = "0.2.16"
 num-format = "0.4.4"
-petgraph = "0.6.4"
+petgraph = "0.6.5"
 indicatif = {version = "0.17.8", features = ["rayon"]}
-indicatif-log-bridge = "0.2.2"
-rayon = "1.8.0"
-dashmap = "5.5.3"
+indicatif-log-bridge = "0.2.3"
+rayon = "1.10.0"
+dashmap = "6.0.1"
 duct = "0.13.7"
 pandora_lib_variants = {git = "https://git.t0m4.fr/Thomas/pandora_lib_variants.git"}
+pandora_lib_pileup = { git = "https://git.t0m4.fr/Thomas/pandora_lib_pileup.git" }
+glob = "0.3.1"

+ 72 - 56
src/lib.rs

@@ -1,6 +1,6 @@
 pub mod breakpoint;
 pub mod genomic_graph;
-mod phase;
+// mod phase;
 
 use anyhow::{anyhow, Ok, Result};
 use fasta::record::Sequence;
@@ -9,7 +9,6 @@ use minimap2::{Aligner, Mapping};
 use noodles_fasta as fasta;
 use num_format::{CustomFormat, Grouping, ToFormattedString, WriteFormatted};
 use petgraph::{dot::Dot, prelude::*};
-use rayon::prelude::*;
 use rust_htslib::bam::{self, Record};
 use std::{
     collections::HashMap,
@@ -138,7 +137,7 @@ impl ContigRef {
             }
         }
 
-        if res.len() > 0 {
+        if !res.is_empty() {
             Some(res)
         } else {
             None
@@ -210,7 +209,7 @@ impl ContigRef {
             ContigRef::RightAmbiguity(_) => None,
             ContigRef::Ambigous(_) => None,
             ContigRef::ChimericTriple((a, b, c)) => {
-                let mut v = vec![a, b, c];
+                let mut v = [a, b, c];
                 v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
                 let (a, b, c) = (
                     *v.get(0).clone().unwrap(),
@@ -221,7 +220,7 @@ impl ContigRef {
                 let b_target_name = b.target_name.clone().unwrap_or(uk.clone());
                 let c_target_name = c.target_name.clone().unwrap_or(uk.clone());
 
-                if a_target_name != b_target_name {}
+                // if a_target_name != b_target_name {}
 
                 // Insertions
                 // prioritize first len
@@ -393,7 +392,7 @@ impl Genome {
     pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
         self.chromosomes.iter()
     }
-    
+
     pub fn contigs(&self) -> impl Iterator<Item = &Contig> {
         self.chromosomes.iter().flat_map(|(_, e)| e.iter())
     }
@@ -503,11 +502,11 @@ impl Genome {
     pub fn add_contig_from_seq(
         &mut self,
         name: String,
-        sequence: &Vec<u8>,
+        sequence: &[u8],
         aligner: impl Fn(String) -> Result<Vec<Mapping>>,
     ) -> Result<()> {
         let mappings = aligner(String::from_utf8(sequence.to_vec())?)?;
-        println!("{mappings:?}");
+        // println!("{mappings:?}");
         self.add_contig(name, mappings, None, String::from_utf8(sequence.to_vec())?)?;
         Ok(())
     }
@@ -525,19 +524,20 @@ impl Genome {
         let aligner_url = "http://localhost:4444/align";
         let aligner = aligner_client::dist_align(aligner_url.to_string());
         let mut genome = Self::new();
-        let paths = get_ext_paths(dir, "fasta")?;
+        // let paths = get_ext_paths(dir, "fasta")?;
+        let paths = get_contigs_fa_paths(dir)?;
         for path in paths {
             let fa = read_fasta(path.to_str().unwrap())?;
             for (name, sequence) in fa {
-                genome.add_contig_from_seq(name, &sequence.as_ref().to_vec(), &aligner)?;
+                genome.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
             }
         }
         Ok(genome)
     }
-
-    pub fn from_dir_bed(dir: &str)  {
-
-    }
+    //
+    // pub fn from_dir_bed(dir: &str)  {
+    //
+    // }
 
     // pub fn write_records(&self, file: &str) {
     //     let mut records =  Vec::new();
@@ -561,11 +561,9 @@ impl Genome {
     }
 
     pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
-        if let Some(chr) = self.chromosomes.get(chromosome) {
-            Some(chr.contigs.clone())
-        } else {
-            None
-        }
+        self.chromosomes
+            .get(chromosome)
+            .map(|chr| chr.contigs.clone())
     }
 }
 
@@ -642,7 +640,7 @@ impl Contig {
                 write_bed(&bed_path, &d)?;
             }
             ContigRef::ChimericTriple((a, b, c)) => {
-                let d: Vec<(String, i32, i32, String)> = vec![a, b, c]
+                let d: Vec<(String, i32, i32, String)> = [a, b, c]
                     .iter()
                     .map(|r| {
                         (
@@ -730,7 +728,7 @@ impl Contig {
     }
 }
 
-fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
+fn group_mappings(mappings: &mut [Mapping]) -> Result<Vec<Vec<Mapping>>> {
     // sort alignments by query_start
     mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
 
@@ -756,13 +754,13 @@ fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
     Ok(alignments)
 }
 
-pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> {
+pub fn write_bed(path: &str, d: &[(String, i32, i32, String)]) -> Result<()> {
     let file = File::create(path).unwrap();
     let mut writer = BufWriter::new(file);
     for (chr, start, end, value) in d.iter() {
         let row = format!(
             "{}\n",
-            vec![
+            [
                 chr.to_string(),
                 start.to_string(),
                 end.to_string(),
@@ -797,7 +795,7 @@ pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
     let mut passed = Vec::new();
     for (name, sequence) in d {
         let name = name.to_string();
-        if sequence.len() == 0 {
+        if sequence.is_empty() {
             continue;
         }
         if passed.contains(&name) {
@@ -871,7 +869,7 @@ pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()>
 }
 
 pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
-    let mut reader = File::open(&path)
+    let mut reader = File::open(path)
         .map(BufReader::new)
         .map(fasta::Reader::new)?;
 
@@ -885,27 +883,38 @@ pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
 
     Ok(res)
 }
-fn get_ext_paths(dir: &str, ext: &str) -> Result<Vec<PathBuf>> {
-    let paths = std::fs::read_dir(dir)?
-        // Filter out all those directory entries which couldn't be read
-        .filter_map(|res| res.ok())
-        // Map the directory entries to paths
-        .map(|dir_entry| dir_entry.path())
-        // Filter out all paths with extensions other than `csv`
-        .filter_map(|path| {
-            if path.extension().map_or(false, |xt| xt == ext) {
-                Some(path)
-            } else {
-                None
-            }
-        })
-        .collect::<Vec<_>>();
-    Ok(paths)
+
+// fn get_ext_paths(dir: &str, ext: &str) -> Result<Vec<PathBuf>> {
+//     let paths = std::fs::read_dir(dir)?
+//         // Filter out all those directory entries which couldn't be read
+//         .filter_map(|res| res.ok())
+//         // Map the directory entries to paths
+//         .map(|dir_entry| dir_entry.path())
+//         // Filter out all paths with extensions other than `csv`
+//         .filter_map(|path| {
+//             if path.extension().map_or(false, |xt| xt == ext) {
+//                 Some(path)
+//             } else {
+//                 None
+//             }
+//         })
+//         .collect::<Vec<_>>();
+//     Ok(paths)
+// }
+
+fn get_contigs_fa_paths(dir: &str) -> Result<Vec<PathBuf>> {
+    let pattern = format!("{}/**/*_flye.fa", dir);
+    let fa_paths: Vec<PathBuf> = glob::glob(&pattern)
+        .expect("Failed to read glob pattern")
+        .filter_map(Result::ok)
+        .collect();
+
+    Ok(fa_paths)
 }
 
 pub fn dot_graph(
     graph: &StableGraph<String, String>,
-    way: &Vec<NodeIndex>,
+    way: &[NodeIndex],
     value: &str,
     color: &str,
     erase: bool,
@@ -921,7 +930,7 @@ pub fn dot_graph(
     for window in way.windows(2) {
         let edge_id = g.find_edge(window[0], window[1]).unwrap();
         let edge = g.edge_weight_mut(edge_id).unwrap();
-        let v = if edge != "" {
+        let v = if !edge.is_empty() {
             let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
             v.push(value);
             v.join(", ")
@@ -939,7 +948,8 @@ pub fn dot_graph(
 
     let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
 
-    labels.sort_by(|a, b| b.len().cmp(&a.len()));
+    labels.sort_by_key(|b| std::cmp::Reverse(b.len()));
+    // labels.sort_by(|a, b| b.len().cmp(&a.len()));
     for label in labels {
         let label_str = format!("label = \"{label}\"");
         dot = dot.replace(
@@ -953,7 +963,7 @@ pub fn dot_graph(
 /// for non overlapping ways
 pub fn dot_graph_biall(
     graph: &StableGraph<String, String>,
-    ways: &Vec<Vec<NodeIndex>>,
+    ways: &[Vec<NodeIndex>],
     values: Vec<&str>,
     colors: Vec<&str>,
     erase: bool,
@@ -969,7 +979,7 @@ pub fn dot_graph_biall(
         for window in way.windows(2) {
             let edge_id = g.find_edge(window[0], window[1]).unwrap();
             let edge = g.edge_weight_mut(edge_id).unwrap();
-            let v = if edge != "" {
+            let v = if !edge.is_empty() {
                 let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
                 v.push(value);
                 v.join(", ")
@@ -981,7 +991,8 @@ pub fn dot_graph_biall(
         }
     }
 
-    g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
+    // g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
+    g.retain_edges(|g, i| !g.edge_weight(i).unwrap().is_empty());
     g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
 
     let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
@@ -1001,15 +1012,10 @@ pub fn dot_graph_biall(
 mod tests {
     use env_logger::Env;
 
-    use dashmap::DashSet;
-    use indicatif::MultiProgress;
-    use indicatif_log_bridge::LogWrapper;
-    use pandora_lib_variants::{in_out::dict_reader::read_dict, variants::Variant};
-
     use super::*;
     use crate::{
         genomic_graph::GenomicGraph,
-        phase::{variants_phasing, write_phases_bed},
+        // phase::{variants_phasing, write_phases_bed},
     };
 
     fn init() {
@@ -1170,16 +1176,26 @@ mod tests {
 
     #[test]
     fn dir() {
-        todo!();
         init();
 
         let id = "ROBIN";
-        let chrom = vec!["chr9"];
+        let chrom = ["chr9"];
         info!("This record will be captured by `cargo test`");
 
-        let dir = format!("/data/longreads_basic_pipe/{id}/diag/scan/reads/{chrom}");
+        let dir = format!(
+            "/data/longreads_basic_pipe/{id}/diag/scan/reads/{}",
+            chrom[0]
+        );
 
         // Load from fasta in dir.
         let genome = Genome::from_contigs_sequences(&dir).unwrap();
+        genome.stats();
+        genome.iter().for_each(|(_s, chrom)| {
+            chrom
+                .iter()
+                .filter_map(|c| c.hgvs())
+                .for_each(|c| println!("{c}"))
+        });
+        // println!("{genome:#?}");
     }
 }

Some files were not shown because too many files changed in this diff