| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404 |
- use anyhow::{Ok, Result};
- use log::info;
- use minimap2::{Aligner, Mapping};
- use rust_htslib::bam::{self, Record};
- use std::{
- collections::{HashMap, VecDeque},
- fmt,
- };
- #[derive(Debug, Clone)]
- pub struct Genome {
- pub chromosomes: HashMap<String, Chromosome>,
- }
- #[derive(Debug, Clone)]
- pub struct Chromosome {
- contigs: Vec<Contig>,
- }
- #[derive(Debug, Clone)]
- pub struct Contig {
- pub id: String,
- // contig seq on ref
- pub mappings: Vec<Mapping>,
- // reads on ref
- pub supporting_records: Vec<Record>,
- pub sequence: String,
- pub contig_ref: ContigRef,
- }
- #[derive(Debug, Clone)]
- pub enum ContigRef {
- Unique(Mapping),
- Chimeric((Mapping, Mapping)),
- ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
- LeftAmbiguity((Vec<Mapping>, Mapping)),
- RightAmbiguity((Mapping, Vec<Mapping>)),
- Ambigous(Vec<Mapping>),
- }
- impl fmt::Display for ContigRef {
- fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
- let str = match self {
- ContigRef::Unique(m) => mapping_to_string(m),
- ContigRef::Chimeric((a, b)) => {
- format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
- }
- ContigRef::ChimericMultiple((a, v, b)) => format!(
- "{}<->{}<->{}",
- mapping_to_string(a),
- mappings_to_string(v),
- mapping_to_string(b)
- ),
- ContigRef::LeftAmbiguity((v, b)) => {
- format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
- }
- ContigRef::RightAmbiguity((a, v)) => {
- format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
- }
- ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
- };
- fmt.write_str(&str).unwrap();
- std::result::Result::Ok(())
- }
- }
- impl ContigRef {
- pub fn hgvs(&self) -> Option<String> {
- match self {
- ContigRef::Unique(_) => None,
- ContigRef::Chimeric((a, b)) => {
- if a.target_name == b.target_name {
- let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
- let end = a.target_end;
- let start = b.target_start;
- Some(format!("{chr}:{end}_{start}"))
- } else {
- None
- }
- }
- ContigRef::ChimericMultiple(_) => None,
- ContigRef::LeftAmbiguity(_) => None,
- ContigRef::RightAmbiguity(_) => None,
- ContigRef::Ambigous(_) => None,
- }
- }
- }
- pub fn mapping_to_string(mapping: &Mapping) -> String {
- let uk = "UNKNOWN".to_string();
- format!(
- "{}:{}-{}({}:{}-{})",
- mapping.target_name.clone().unwrap_or(uk.clone()),
- mapping.target_start,
- mapping.target_end,
- mapping.query_name.clone().unwrap_or(uk),
- mapping.query_start,
- mapping.query_end
- )
- }
- fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
- let v = mappings
- .iter()
- .map(mapping_to_string)
- .collect::<Vec<String>>();
- v.join("//")
- }
- pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
- let mut mappings = mappings;
- if mappings.len() == 1 {
- return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
- } else {
- let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
- if grouped.len() == 1 {
- let r = grouped.into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(r));
- } else if grouped.len() >= 2 {
- let first = grouped.pop_back().unwrap();
- let last = grouped.pop_front().unwrap();
- if grouped.len() == 0 {
- if first.len() == 1 && last.len() == 1 {
- return Ok(ContigRef::Chimeric((
- first.get(0).unwrap().clone(),
- last.get(0).unwrap().clone(),
- )));
- } else if first.len() == 1 {
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- last.clone(),
- )));
- } else if last.len() == 1 {
- return Ok(ContigRef::LeftAmbiguity((
- first.clone(),
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(all));
- }
- } else {
- }
- if first.len() == 1 && last.len() == 1 {
- return Ok(ContigRef::ChimericMultiple((
- first.get(0).unwrap().clone(),
- grouped.into_iter().flat_map(|e| e).collect(),
- last.get(0).unwrap().clone(),
- )));
- } else if first.len() == 1 {
- let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- right,
- )));
- } else if last.len() == 1 {
- let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::LeftAmbiguity((
- left,
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> =
- vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::Ambigous(all));
- }
- } else {
- return Ok(ContigRef::Ambigous(
- grouped.into_iter().flat_map(|e| e).collect(),
- ));
- }
- }
- }
- impl Genome {
- pub fn new() -> Self {
- Genome {
- chromosomes: HashMap::new(),
- }
- }
- pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
- self.chromosomes.iter()
- }
- pub fn add_contig(
- &mut self,
- id: String,
- mappings: Vec<Mapping>,
- supporting_records: Vec<Record>,
- sequence: String,
- ) -> Result<()> {
- let new_contig = Contig {
- id,
- mappings: mappings.clone(),
- supporting_records,
- sequence,
- contig_ref: get_ref_pos(mappings)?,
- };
- // get the category of Mapping
- match new_contig.contig_ref.clone() {
- ContigRef::Unique(contig_mapping) => {
- match self
- .chromosomes
- .get_mut(&contig_mapping.target_name.unwrap())
- {
- Some(chromosome) => {
- chromosome.contigs.push(new_contig);
- }
- None => (),
- }
- }
- ContigRef::Chimeric((a, b)) => {
- let a_target_name = a.target_name.unwrap();
- let b_target_name = b.target_name.unwrap();
- if a_target_name == b_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- a_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- ContigRef::ChimericMultiple((left, _, right)) => {
- let left_target_name = left.target_name.unwrap();
- let right_target_name = right.target_name.unwrap();
- if left_target_name == right_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- left_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- _ => {
- if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- "Ambigous".to_string(),
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- };
- Ok(())
- }
- pub fn stats(&self) {
- for (k, v) in self.chromosomes.iter() {
- info!("{}:{}", k, v.contigs.len());
- }
- }
- pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
- if let Some(chr) = self.chromosomes.get(chromosome) {
- Some(chr.contigs.clone())
- } else {
- None
- }
- }
- }
- impl Chromosome {
- pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
- self.contigs.iter()
- }
- }
- impl Contig {
- pub fn sort(&mut self) {
- self.mappings
- .sort_by(|a, b| a.target_start.cmp(&b.target_start));
- }
- pub fn write_bam(&self, path: &str) -> Result<()> {
- let aligner = Aligner::builder()
- .asm5()
- .with_threads(8)
- .with_cigar()
- .with_seq(self.sequence.as_bytes())
- .expect("Unable to build index");
- let mut mappings = Vec::new();
- for record in self.supporting_records.iter() {
- let seq = record.seq().as_bytes();
- let alignment = aligner
- .map(&seq, false, false, None, None)
- .expect("Unable to align");
- mappings.push(alignment);
- }
- let mappings: Vec<_> = mappings.into_iter().flatten().collect();
- let mut header = bam::Header::new();
- let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
- sq_record.push_tag(b"SN", self.id.clone());
- sq_record.push_tag(b"LN", self.sequence.len());
- header.push_record(&sq_record);
- let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
- // copy reverse reads to new BAM file
- for mapping in mappings.iter() {
- let record = minimap2::htslib::mapping_to_record(
- Some(mapping),
- self.sequence.as_bytes(),
- header.clone(),
- None,
- Some(self.id.as_bytes()),
- );
- out.write(&record)?;
- }
- rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 5)?;
- Ok(())
- }
- pub fn hgvs(&self) -> Option<String> {
- self.contig_ref.hgvs()
- }
- }
- fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
- // sort alignments by query_start
- mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let mut alignments: Vec<Vec<Mapping>> = vec![];
- // group by overlapps > 30
- for aln in mappings.iter() {
- let last = alignments.last_mut();
- if let Some(l) = last {
- if l.iter()
- .filter(|a| a.query_end - aln.query_start > 30)
- .count()
- > 0
- {
- l.push(aln.clone());
- } else {
- alignments.push(vec![aln.clone()]);
- }
- } else {
- alignments.push(vec![aln.clone()]);
- }
- }
- Ok(alignments)
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- #[test]
- fn it_works() {
- // let result = add(2, 2);
- // assert_eq!(result, 4);
- }
- }
|