Thomas 2 سال پیش
والد
کامیت
ee742bcabc
3فایلهای تغییر یافته به همراه146 افزوده شده و 113 حذف شده
  1. 3 3
      Cargo.lock
  2. 2 2
      Cargo.toml
  3. 141 108
      src/lib.rs

+ 3 - 3
Cargo.lock

@@ -293,6 +293,7 @@ dependencies = [
  "flate2",
  "libc",
  "minimap2-sys",
+ "rust-htslib",
  "simdutf8",
 ]
 
@@ -414,9 +415,9 @@ checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 
 [[package]]
 name = "rust-htslib"
-version = "0.46.0"
+version = "0.44.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aec6f9ca4601beb4ae75ff8c99144dd15de5a873f6adf058da299962c760968e"
+checksum = "7c7eb0f29fce64a4e22578905efef3d72389058016023279a58b282eb5c0c467"
 dependencies = [
  "bio-types",
  "byteorder",
@@ -426,7 +427,6 @@ dependencies = [
  "ieee754",
  "lazy_static",
  "libc",
- "libz-sys",
  "linear-map",
  "newtype_derive",
  "regex",

+ 2 - 2
Cargo.toml

@@ -6,7 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-minimap2 = "0.1.16+minimap2.2.26"
+minimap2 = { version = "0.1.16+minimap2.2.26", features = ["htslib"]}
 anyhow = "1.0.75"
 log = "0.4.19"
-rust-htslib = "0.46.0"
+rust-htslib = "0.44.1"

+ 141 - 108
src/lib.rs

@@ -1,22 +1,33 @@
-use anyhow::Result;
+use anyhow::{Result, Ok};
 use log::info;
 use minimap2::Mapping;
-use rust_htslib::bam::Record;
+use rust_htslib::bam::{self, Record};
 use std::{
     collections::{HashMap, VecDeque},
     fmt,
 };
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
+pub struct Genome {
+    pub chromosomes: HashMap<String, Chromosome>,
+}
+
+#[derive(Debug, Clone)]
+pub struct Chromosome {
+    contigs: Vec<Contig>,
+}
+
+#[derive(Debug, Clone)]
 pub struct Contig {
     pub id: String,
     pub mappings: Vec<Mapping>,
     pub supporting_records: Vec<Record>,
-    pub sequence: String
+    pub sequence: String,
+    pub contig_ref: ContigRef,
 }
 
 #[derive(Debug, Clone)]
-pub enum ContigsRef {
+pub enum ContigRef {
     Unique(Mapping),
     Chimeric((Mapping, Mapping)),
     ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
@@ -25,26 +36,26 @@ pub enum ContigsRef {
     Ambigous(Vec<Mapping>),
 }
 
-impl fmt::Display for ContigsRef {
+impl fmt::Display for ContigRef {
     fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
         let str = match self {
-            ContigsRef::Unique(m) => mapping_to_string(m),
-            ContigsRef::Chimeric((a, b)) => {
+            ContigRef::Unique(m) => mapping_to_string(m),
+            ContigRef::Chimeric((a, b)) => {
                 format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
             }
-            ContigsRef::ChimericMultiple((a, v, b)) => format!(
+            ContigRef::ChimericMultiple((a, v, b)) => format!(
                 "{}<->{}<->{}",
                 mapping_to_string(a),
                 mappings_to_string(v),
                 mapping_to_string(b)
             ),
-            ContigsRef::LeftAmbiguity((v, b)) => {
+            ContigRef::LeftAmbiguity((v, b)) => {
                 format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
             }
-            ContigsRef::RightAmbiguity((a, v)) => {
+            ContigRef::RightAmbiguity((a, v)) => {
                 format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
             }
-            ContigsRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
+            ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
         };
         fmt.write_str(&str).unwrap();
 
@@ -52,11 +63,11 @@ impl fmt::Display for ContigsRef {
     }
 }
 
-impl ContigsRef {
+impl ContigRef {
     pub fn hgvs(&self) -> Option<String> {
         match self {
-            ContigsRef::Unique(_) => None,
-            ContigsRef::Chimeric((a, b)) => {
+            ContigRef::Unique(_) => None,
+            ContigRef::Chimeric((a, b)) => {
                 if a.target_name == b.target_name {
                     let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
                     let end = a.target_end;
@@ -66,10 +77,10 @@ impl ContigsRef {
                     None
                 }
             }
-            ContigsRef::ChimericMultiple(_) => None,
-            ContigsRef::LeftAmbiguity(_) => None,
-            ContigsRef::RightAmbiguity(_) => None,
-            ContigsRef::Ambigous(_) => None,
+            ContigRef::ChimericMultiple(_) => None,
+            ContigRef::LeftAmbiguity(_) => None,
+            ContigRef::RightAmbiguity(_) => None,
+            ContigRef::Ambigous(_) => None,
         }
     }
 }
@@ -95,91 +106,82 @@ fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
     v.join("//")
 }
 
-impl Contig {
-    pub fn get_ref_pos(&mut self) -> Result<ContigsRef> {
-        if self.mappings.len() == 1 {
-            return Ok(ContigsRef::Unique(self.mappings.get(0).unwrap().clone()));
-        } else {
-            let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut self.mappings)?.into();
+pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
+    let mut mappings = mappings;
+    if mappings.len() == 1 {
+        return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
+    } else {
+        let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
 
-            if grouped.len() == 1 {
-                let r = grouped.into_iter().flat_map(|e| e).collect();
-                return Ok(ContigsRef::Ambigous(r));
-            } else if grouped.len() >= 2 {
-                let first = grouped.pop_back().unwrap();
-                let last = grouped.pop_front().unwrap();
+        if grouped.len() == 1 {
+            let r = grouped.into_iter().flat_map(|e| e).collect();
+            return Ok(ContigRef::Ambigous(r));
+        } else if grouped.len() >= 2 {
+            let first = grouped.pop_back().unwrap();
+            let last = grouped.pop_front().unwrap();
 
-                if grouped.len() == 0 {
-                    if first.len() == 1 && last.len() == 1 {
-                        return Ok(ContigsRef::Chimeric((
-                            first.get(0).unwrap().clone(),
-                            last.get(0).unwrap().clone(),
-                        )));
-                    } else if first.len() == 1 {
-                        return Ok(ContigsRef::RightAmbiguity((
-                            first.get(0).unwrap().clone(),
-                            last.clone(),
-                        )));
-                    } else if last.len() == 1 {
-                        return Ok(ContigsRef::LeftAmbiguity((
-                            first.clone(),
-                            last.get(0).unwrap().clone(),
-                        )));
-                    } else {
-                        let all: Vec<Mapping> =
-                            vec![first, last].into_iter().flat_map(|e| e).collect();
-                        return Ok(ContigsRef::Ambigous(all));
-                    }
-                } else {
-                }
+            if grouped.len() == 0 {
                 if first.len() == 1 && last.len() == 1 {
-                    return Ok(ContigsRef::ChimericMultiple((
+                    return Ok(ContigRef::Chimeric((
                         first.get(0).unwrap().clone(),
-                        grouped.into_iter().flat_map(|e| e).collect(),
                         last.get(0).unwrap().clone(),
                     )));
                 } else if first.len() == 1 {
-                    let right: Vec<Mapping> =
-                        vec![grouped.into_iter().flat_map(|e| e).collect(), last]
-                            .into_iter()
-                            .flat_map(|e| e)
-                            .collect();
-                    return Ok(ContigsRef::RightAmbiguity((
+                    return Ok(ContigRef::RightAmbiguity((
                         first.get(0).unwrap().clone(),
-                        right,
+                        last.clone(),
                     )));
                 } else if last.len() == 1 {
-                    let left: Vec<Mapping> =
-                        vec![first, grouped.into_iter().flat_map(|e| e).collect()]
-                            .into_iter()
-                            .flat_map(|e| e)
-                            .collect();
-                    return Ok(ContigsRef::LeftAmbiguity((
-                        left,
+                    return Ok(ContigRef::LeftAmbiguity((
+                        first.clone(),
                         last.get(0).unwrap().clone(),
                     )));
                 } else {
-                    let all: Vec<Mapping> =
-                        vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
-                            .into_iter()
-                            .flat_map(|e| e)
-                            .collect();
-                    return Ok(ContigsRef::Ambigous(all));
+                    let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
+                    return Ok(ContigRef::Ambigous(all));
                 }
             } else {
-                return Ok(ContigsRef::Ambigous(
+            }
+            if first.len() == 1 && last.len() == 1 {
+                return Ok(ContigRef::ChimericMultiple((
+                    first.get(0).unwrap().clone(),
                     grouped.into_iter().flat_map(|e| e).collect(),
-                ));
+                    last.get(0).unwrap().clone(),
+                )));
+            } else if first.len() == 1 {
+                let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
+                    .into_iter()
+                    .flat_map(|e| e)
+                    .collect();
+                return Ok(ContigRef::RightAmbiguity((
+                    first.get(0).unwrap().clone(),
+                    right,
+                )));
+            } else if last.len() == 1 {
+                let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
+                    .into_iter()
+                    .flat_map(|e| e)
+                    .collect();
+                return Ok(ContigRef::LeftAmbiguity((
+                    left,
+                    last.get(0).unwrap().clone(),
+                )));
+            } else {
+                let all: Vec<Mapping> =
+                    vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
+                        .into_iter()
+                        .flat_map(|e| e)
+                        .collect();
+                return Ok(ContigRef::Ambigous(all));
             }
+        } else {
+            return Ok(ContigRef::Ambigous(
+                grouped.into_iter().flat_map(|e| e).collect(),
+            ));
         }
     }
 }
 
-#[derive(Debug, Clone)]
-pub struct Genome {
-    pub chromosomes: HashMap<String, Chromosome>,
-}
-
 impl Genome {
     pub fn new() -> Self {
         Genome {
@@ -189,73 +191,84 @@ impl Genome {
     pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
         self.chromosomes.iter()
     }
-    pub fn add_contig(&mut self, id: String, mappings: Vec<Mapping>, supporting_records: Vec<Record>, sequence: String) -> Result<()> {
-        let mut new_contig = Contig { id, mappings, supporting_records, sequence };
+    pub fn add_contig(
+        &mut self,
+        id: String,
+        mappings: Vec<Mapping>,
+        supporting_records: Vec<Record>,
+        sequence: String,
+    ) -> Result<()> {
+        let new_contig = Contig {
+            id,
+            mappings: mappings.clone(),
+            supporting_records,
+            sequence,
+            contig_ref: get_ref_pos(mappings)?,
+        };
         // get the category of Mapping
-        let ref_res = new_contig.get_ref_pos()?;
-        match ref_res.clone() {
-            ContigsRef::Unique(contig_mapping) => {
+        match new_contig.contig_ref.clone() {
+            ContigRef::Unique(contig_mapping) => {
                 match self
                     .chromosomes
                     .get_mut(&contig_mapping.target_name.unwrap())
                 {
                     Some(chromosome) => {
-                        chromosome.contigs.push(ref_res);
+                        chromosome.contigs.push(new_contig);
                     }
                     None => (),
                 }
             }
-            ContigsRef::Chimeric((a, b)) => {
+            ContigRef::Chimeric((a, b)) => {
                 let a_target_name = a.target_name.unwrap();
                 let b_target_name = b.target_name.unwrap();
                 if a_target_name == b_target_name {
                     if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
-                        chromosome.contigs.push(ref_res);
+                        chromosome.contigs.push(new_contig);
                     } else {
                         self.chromosomes.insert(
                             a_target_name,
                             Chromosome {
-                                contigs: vec![ref_res],
+                                contigs: vec![new_contig],
                             },
                         );
                     }
                 } else {
                     let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
                     if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
-                        chromosome.contigs.push(ref_res);
+                        chromosome.contigs.push(new_contig);
                     } else {
                         self.chromosomes.insert(
                             chimeric_name,
                             Chromosome {
-                                contigs: vec![ref_res],
+                                contigs: vec![new_contig],
                             },
                         );
                     }
                 }
             }
-            ContigsRef::ChimericMultiple((left, _, right)) => {
+            ContigRef::ChimericMultiple((left, _, right)) => {
                 let left_target_name = left.target_name.unwrap();
                 let right_target_name = right.target_name.unwrap();
                 if left_target_name == right_target_name {
                     if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
-                        chromosome.contigs.push(ref_res);
+                        chromosome.contigs.push(new_contig);
                     } else {
                         self.chromosomes.insert(
                             left_target_name,
                             Chromosome {
-                                contigs: vec![ref_res],
+                                contigs: vec![new_contig],
                             },
                         );
                     }
                 } else {
                     let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
                     if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
-                        chromosome.contigs.push(ref_res);
+                        chromosome.contigs.push(new_contig);
                     } else {
                         self.chromosomes.insert(
                             chimeric_name,
                             Chromosome {
-                                contigs: vec![ref_res],
+                                contigs: vec![new_contig],
                             },
                         );
                     }
@@ -263,12 +276,12 @@ impl Genome {
             }
             _ => {
                 if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
-                    chromosome.contigs.push(ref_res);
+                    chromosome.contigs.push(new_contig);
                 } else {
                     self.chromosomes.insert(
                         "Ambigous".to_string(),
                         Chromosome {
-                            contigs: vec![ref_res],
+                            contigs: vec![new_contig],
                         },
                     );
                 }
@@ -283,8 +296,8 @@ impl Genome {
             info!("{}:{}", k, v.contigs.len());
         }
     }
-    
-    pub fn chromosome(&self, chromosome: &str) -> Option<Vec<ContigsRef>> {
+
+    pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
         if let Some(chr) = self.chromosomes.get(chromosome) {
             Some(chr.contigs.clone())
         } else {
@@ -292,17 +305,37 @@ impl Genome {
         }
     }
 }
-#[derive(Debug, Clone)]
-pub struct Chromosome {
-    contigs: Vec<ContigsRef>,
-}
 
 impl Chromosome {
-    pub fn iter(&self) -> std::slice::Iter<'_, ContigsRef> {
+    pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
         self.contigs.iter()
     }
 }
 
+impl Contig {
+    pub fn sort(&mut self) {
+        self.mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
+    }
+
+    pub fn write_bam(&self, path: &str) -> Result<()> {
+        let mut header = bam::Header::new();
+        let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"@SQ");
+        sq_record.push_tag(b"SN", self.id.clone());
+        sq_record.push_tag(b"LN", self.sequence.len());
+        header.push_record(&sq_record);
+
+        let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
+
+        // copy reverse reads to new BAM file
+        for mapping in self.mappings.iter() {
+            let record = minimap2::htslib::mapping_to_record(Some(mapping), self.sequence.as_bytes(), header.clone(), None, Some(self.id.as_bytes()));
+            out.write(&record)?;
+        }
+        rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 5)?;
+        Ok(())
+    }
+}
+
 fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
     // sort alignments by query_start
     mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));