Browse Source

dist aligner

Thomas 1 year ago
parent
commit
8a5d986511
3 changed files with 80 additions and 62 deletions
  1. 24 46
      Cargo.lock
  2. 2 0
      ggg.txt
  3. 54 16
      src/lib.rs

+ 24 - 46
Cargo.lock

@@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "aho-corasick"
-version = "1.1.2"
+version = "1.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
 dependencies = [
  "memchr",
 ]
@@ -29,7 +29,7 @@ dependencies = [
 [[package]]
 name = "aligner_client"
 version = "0.1.0"
-source = "git+https://git.t0m4.fr/Thomas/aligner_client.git#5267b82f5f597d2552115a251bfb778a55e4502f"
+source = "git+https://git.t0m4.fr/Thomas/aligner_client.git#bee81e8e739aaecacb9d5e0959a7fcb2dc095154"
 dependencies = [
  "anyhow",
  "minimap2",
@@ -131,16 +131,6 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
 
-[[package]]
-name = "bzip2"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
-dependencies = [
- "bzip2-sys",
- "libc",
-]
-
 [[package]]
 name = "bzip2-sys"
 version = "0.1.11+1.0.8"
@@ -300,7 +290,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -311,7 +301,7 @@ checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -759,9 +749,9 @@ checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
 
 [[package]]
 name = "libz-sys"
-version = "1.1.15"
+version = "1.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6"
+checksum = "5e143b5e666b2695d28f6bca6497720813f699c9602dd7f5cac91008b8ada7f9"
 dependencies = [
  "cc",
  "cmake",
@@ -824,7 +814,7 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
 [[package]]
 name = "minimap2"
 version = "0.1.17+minimap2.2.27"
-source = "git+https://github.com/jguhlin/minimap2-rs#e09e3f252f4c34b7f327b9f256b4602a967d62b3"
+source = "git+https://github.com/jguhlin/minimap2-rs#05e4c36932e2123cffdc7dd91f8651a0307fc8c3"
 dependencies = [
  "libc",
  "minimap2-sys",
@@ -835,9 +825,9 @@ dependencies = [
 
 [[package]]
 name = "minimap2-sys"
-version = "0.1.16+minimap2.2.26"
+version = "0.1.18+minimap2.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3116fd091e0b499cd370475c6d03f8c333aa956b1769140dfd07e1a42101c8a"
+checksum = "185d3f931e11c1df371455a01e93a0037041d011705b5ff1d283d619b234c47c"
 dependencies = [
  "cc",
  "libz-sys",
@@ -890,10 +880,7 @@ checksum = "db05a5ab397f64070d8c998fa0fbb84e484b81f95752af317dac183a82d9295d"
 dependencies = [
  "buffer-redux",
  "bytecount",
- "bzip2",
- "flate2",
  "memchr",
- "xz2",
 ]
 
 [[package]]
@@ -983,7 +970,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1060,7 +1047,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1361,7 +1348,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1384,7 +1371,7 @@ dependencies = [
  "darling",
  "proc-macro-error",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1455,7 +1442,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1471,9 +1458,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.52"
+version = "2.0.53"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
+checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1546,7 +1533,7 @@ checksum = "c8f546451eaa38373f549093fe9fd05e7d2bade739e2ddf834b9968621d60107"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1566,7 +1553,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1611,7 +1598,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
 ]
 
 [[package]]
@@ -1726,9 +1713,9 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.7.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a"
+checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0"
 dependencies = [
  "getrandom",
  "serde",
@@ -1782,7 +1769,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
  "wasm-bindgen-shared",
 ]
 
@@ -1816,7 +1803,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.53",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -2009,12 +1996,3 @@ dependencies = [
  "cfg-if",
  "windows-sys 0.48.0",
 ]
-
-[[package]]
-name = "xz2"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
-dependencies = [
- "lzma-sys",
-]

File diff suppressed because it is too large
+ 2 - 0
ggg.txt


+ 54 - 16
src/lib.rs

@@ -1,4 +1,5 @@
 use anyhow::{anyhow, Ok, Result};
+use fasta::record::Sequence;
 use log::info;
 use minimap2::{Aligner, Mapping};
 use noodles_fasta as fasta;
@@ -79,6 +80,48 @@ impl fmt::Display for ContigRef {
 }
 
 impl ContigRef {
+    pub fn desc(&self) -> Option<String> {
+        let uk = "UNKNOWN".to_string();
+        let to_desc = |v: &mut Vec<Mapping>| -> String {
+            v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
+            let v: Vec<String> = v
+                .into_iter()
+                .map(|e| {
+                    let strand = match e.strand {
+                        minimap2::Strand::Forward => "",
+                        minimap2::Strand::Reverse => "_rev",
+                    };
+                    format!(
+                        "{}:{}_{}{}",
+                        e.target_name.clone().unwrap_or(uk.clone()),
+                        e.target_start,
+                        e.target_end,
+                        strand
+                    )
+                })
+                .collect();
+            format!("[{}]", v.join(";"))
+        };
+
+        match self {
+            ContigRef::Unique(a) => Some(format!(
+                "{}:{}_{}",
+                a.target_name.clone().unwrap_or(uk.clone()),
+                a.target_start,
+                a.target_end
+            )),
+            ContigRef::Chimeric((a, b)) => Some(to_desc(&mut vec![a.to_owned(), b.to_owned()])),
+            ContigRef::ChimericTriple((a, b, c)) => {
+                Some(to_desc(&mut vec![a.to_owned(), b.to_owned(), c.to_owned()]))
+            }
+            ContigRef::ChimericMultiple(_) => todo!(),
+            ContigRef::LeftAmbiguity(_) => todo!(),
+            ContigRef::RightAmbiguity(_) => todo!(),
+            ContigRef::Ambigous(a) => {
+                Some(to_desc(&mut a.to_owned()))
+            },
+        }
+    }
     pub fn hgvs(&self) -> Option<String> {
         let uk = "UNKNOWN".to_string();
         match self {
@@ -193,7 +236,6 @@ pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
         return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
     } else {
         let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
-        println!("{grouped:?}");
 
         if grouped.len() == 1 {
             let r = grouped.into_iter().flat_map(|e| e).collect();
@@ -224,7 +266,6 @@ pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
                 }
             }
             if first.len() == 1 && last.len() == 1 {
-                println!("bim");
                 if grouped.len() == 1 {
                     return Ok(ContigRef::ChimericTriple((
                         first.get(0).unwrap().clone(),
@@ -696,7 +737,7 @@ pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()>
     Ok(())
 }
 
-pub fn read_fasta(path: &str) -> Result<Vec<(String, Vec<u8>)>> {
+pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
     let mut reader = File::open(&path)
         .map(BufReader::new)
         .map(fasta::Reader::new)?;
@@ -705,41 +746,38 @@ pub fn read_fasta(path: &str) -> Result<Vec<(String, Vec<u8>)>> {
     for result in reader.records() {
         let record = result?;
         let u = String::from_utf8(record.name().to_vec())?;
-        let s = record.sequence().as_ref().to_vec();
+        let s = record.sequence().to_owned();
         res.push((u, s));
     }
 
     Ok(res)
 }
 
-pub fn dist_align(url: String) -> impl Fn(String) -> Result<Vec<Mapping>> {
-    move |sequence: String| -> Result<Vec<Mapping>> {
-        aligner_client::get_mappings(url.as_str(), sequence)
-    }
-}
 
 #[cfg(test)]
 mod tests {
     use super::*;
-    use test_log::test;
 
-    #[test_log::test]
+    #[test]
     fn it_works() -> Result<()> {
         let _ = env_logger::builder().is_test(true).try_init();
-        let contig_fa = "./data_test/contig_1.fa";
+        let contig_fa = "./data_test/contig_2.fa";
         let aligner_url = "http://localhost:4444/align";
 
         let mut genome = Genome::new();
-        let aligner = dist_align(aligner_url.to_string());
+        let aligner = aligner_client::dist_align(aligner_url.to_string());
 
         let sequences = read_fasta(contig_fa)?;
         for (name, seq) in sequences {
-            genome.add_contig_from_seq(name, &seq, &aligner)?;
+            genome.add_contig_from_seq(name.clone(), &seq.as_ref().to_vec(), &aligner)?;
+            let mut seqc: Vec<u8> = seq.complement().map(|e| e.unwrap()).collect();
+            seqc.reverse();
+            genome.add_contig_from_seq(format!("{name}_rev"), &seqc, &aligner)?;
             println!("Sending");
         }
-        genome.iter().for_each(|(n, c)| {
+        genome.iter().for_each(|(_, c)| {
             c.iter().for_each(|cont| {
-                println!("{}", cont.contig_ref);
+                println!("{}", cont.contig_ref.desc().unwrap());
             });
         });
 

Some files were not shown because too many files changed in this diff