| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596 |
- use anyhow::{Ok, Result};
- use log::info;
- use minimap2::{Aligner, Mapping};
- use noodles_fasta as fasta;
- use rust_htslib::bam::{self, Record};
- use std::{
- collections::{HashMap, VecDeque},
- fmt::{self, format},
- fs::{self, File},
- io::{BufWriter, Write},
- process::{Command, Stdio},
- };
- use uuid::Uuid;
- #[derive(Debug, Clone)]
- pub struct Genome {
- pub chromosomes: HashMap<String, Chromosome>,
- }
- #[derive(Debug, Clone)]
- pub struct Chromosome {
- contigs: Vec<Contig>,
- }
- #[derive(Debug, Clone)]
- pub struct Contig {
- pub id: String,
- // contig seq on ref
- pub mappings: Vec<Mapping>,
- // reads on ref
- pub supporting_records: Vec<Record>,
- pub sequence: String,
- pub contig_ref: ContigRef,
- }
- #[derive(Debug, Clone)]
- pub enum ContigRef {
- Unique(Mapping),
- Chimeric((Mapping, Mapping)),
- ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
- LeftAmbiguity((Vec<Mapping>, Mapping)),
- RightAmbiguity((Mapping, Vec<Mapping>)),
- Ambigous(Vec<Mapping>),
- }
- impl fmt::Display for ContigRef {
- fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
- let str = match self {
- ContigRef::Unique(m) => mapping_to_string(m),
- ContigRef::Chimeric((a, b)) => {
- format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
- }
- ContigRef::ChimericMultiple((a, v, b)) => format!(
- "{}<->{}<->{}",
- mapping_to_string(a),
- mappings_to_string(v),
- mapping_to_string(b)
- ),
- ContigRef::LeftAmbiguity((v, b)) => {
- format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
- }
- ContigRef::RightAmbiguity((a, v)) => {
- format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
- }
- ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
- };
- fmt.write_str(&str).unwrap();
- std::result::Result::Ok(())
- }
- }
- impl ContigRef {
- pub fn hgvs(&self) -> Option<String> {
- match self {
- ContigRef::Unique(_) => None,
- ContigRef::Chimeric((a, b)) => {
- if a.target_name == b.target_name {
- let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
- let del_start = a.target_end;
- let del_end = b.target_start;
- let hgvs = format!("{chr}:{del_start}_{del_end}");
- Some(hgvs)
- } else {
- let a_chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string());
- let a_bp = a.target_end;
- let b_chr = b.target_name.clone().unwrap_or("UNKNOWN".to_string());
- let b_bp = b.target_end;
- let hgvs = format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]");
- Some(hgvs)
- }
- }
- ContigRef::ChimericMultiple(_) => None,
- ContigRef::LeftAmbiguity(_) => None,
- ContigRef::RightAmbiguity(_) => None,
- ContigRef::Ambigous(_) => None,
- }
- }
- }
- pub fn mapping_to_string(mapping: &Mapping) -> String {
- let uk = "UNKNOWN".to_string();
- format!(
- "{}:{}-{}({}:{}-{})",
- mapping.target_name.clone().unwrap_or(uk.clone()),
- mapping.target_start,
- mapping.target_end,
- mapping.query_name.clone().unwrap_or(uk),
- mapping.query_start,
- mapping.query_end
- )
- }
- fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
- let v = mappings
- .iter()
- .map(mapping_to_string)
- .collect::<Vec<String>>();
- v.join("//")
- }
- pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
- let mut mappings = mappings;
- if mappings.len() == 1 {
- return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
- } else {
- let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
- if grouped.len() == 1 {
- let r = grouped.into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(r));
- } else if grouped.len() >= 2 {
- let first = grouped.pop_back().unwrap();
- let last = grouped.pop_front().unwrap();
- if grouped.len() == 0 {
- if first.len() == 1 && last.len() == 1 {
- return Ok(ContigRef::Chimeric((
- first.get(0).unwrap().clone(),
- last.get(0).unwrap().clone(),
- )));
- } else if first.len() == 1 {
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- last.clone(),
- )));
- } else if last.len() == 1 {
- return Ok(ContigRef::LeftAmbiguity((
- first.clone(),
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(all));
- }
- } else {
- }
- if first.len() == 1 && last.len() == 1 {
- return Ok(ContigRef::ChimericMultiple((
- first.get(0).unwrap().clone(),
- grouped.into_iter().flat_map(|e| e).collect(),
- last.get(0).unwrap().clone(),
- )));
- } else if first.len() == 1 {
- let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- right,
- )));
- } else if last.len() == 1 {
- let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::LeftAmbiguity((
- left,
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> =
- vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::Ambigous(all));
- }
- } else {
- return Ok(ContigRef::Ambigous(
- grouped.into_iter().flat_map(|e| e).collect(),
- ));
- }
- }
- }
- impl Genome {
- pub fn new() -> Self {
- Genome {
- chromosomes: HashMap::new(),
- }
- }
- pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
- self.chromosomes.iter()
- }
- pub fn add_contig(
- &mut self,
- id: String,
- mappings: Vec<Mapping>,
- supporting_records: Vec<Record>,
- sequence: String,
- ) -> Result<()> {
- let new_contig = Contig {
- id,
- mappings: mappings.clone(),
- supporting_records,
- sequence,
- contig_ref: get_ref_pos(mappings)?,
- };
- // get the category of Mapping
- match new_contig.contig_ref.clone() {
- ContigRef::Unique(contig_mapping) => {
- match self
- .chromosomes
- .get_mut(&contig_mapping.target_name.unwrap())
- {
- Some(chromosome) => {
- chromosome.contigs.push(new_contig);
- }
- None => (),
- }
- }
- ContigRef::Chimeric((a, b)) => {
- let a_target_name = a.target_name.unwrap();
- let b_target_name = b.target_name.unwrap();
- if a_target_name == b_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- a_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- ContigRef::ChimericMultiple((left, _, right)) => {
- let left_target_name = left.target_name.unwrap();
- let right_target_name = right.target_name.unwrap();
- if left_target_name == right_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- left_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- _ => {
- if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- "Ambigous".to_string(),
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- };
- Ok(())
- }
- pub fn stats(&self) {
- for (k, v) in self.chromosomes.iter() {
- info!("{}:{}", k, v.contigs.len());
- }
- }
- pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
- if let Some(chr) = self.chromosomes.get(chromosome) {
- Some(chr.contigs.clone())
- } else {
- None
- }
- }
- }
- impl Chromosome {
- pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
- self.contigs.iter()
- }
- }
- impl Contig {
- pub fn sort(&mut self) {
- self.mappings
- .sort_by(|a, b| a.target_start.cmp(&b.target_start));
- }
- pub fn to_igv(&self, dir_path: &str) -> Result<()> {
- let contig_name = if let Some(hgvs) = self.hgvs() {
- hgvs
- } else {
- self.id.clone()
- };
- let contig_dir = format!("{dir_path}/{contig_name}");
- fs::create_dir_all(contig_dir.clone())?;
- let fasta_path = format!("{contig_dir}/contig.fa");
- write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
- write_fai(&fasta_path);
- let reads_path = format!("{contig_dir}/reads.fa");
- let n_reads = self
- .supporting_records
- .clone()
- .into_iter()
- .map(|r| {
- (
- String::from_utf8(r.qname().to_vec()).unwrap(),
- String::from_utf8(r.seq().as_bytes()).unwrap(),
- )
- })
- .collect();
- write_fasta(&reads_path, &n_reads);
- let bam_path = format!("{contig_dir}/{}.bam", self.id);
- write_bam(&fasta_path, &reads_path, &bam_path)?;
- let bed_path = format!("{contig_dir}/contig.bed");
- match &self.contig_ref {
- ContigRef::Chimeric((a, b)) => {
- let d = vec![
- (
- self.id.clone(),
- a.query_start,
- a.query_end,
- format!(
- "{}:{}-{}",
- a.target_name.clone().unwrap(),
- a.target_start,
- a.target_end
- ),
- ),
- (
- self.id.clone(),
- b.query_start,
- b.query_end,
- format!(
- "{}:{}-{}",
- b.target_name.clone().unwrap(),
- b.target_start,
- b.target_end
- ),
- ),
- ];
- write_bed(&bed_path, &d)?;
- }
- _ => (),
- }
- Ok(())
- }
- // bug cigar len != seq len
- pub fn write_bam(&self, path: &str) -> Result<()> {
- let aligner = Aligner::builder()
- .asm5()
- .with_threads(8)
- .with_cigar()
- .with_seq(self.sequence.as_bytes())
- .expect("Unable to build index");
- let mut mappings = Vec::new();
- for record in self.supporting_records.iter() {
- let seq = record.seq().as_bytes();
- let alignment = aligner
- .map(&seq, false, false, None, None)
- .expect("Unable to align");
- mappings.push(alignment);
- }
- let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
- mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
- let mut header = bam::Header::new();
- let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
- sq_record.push_tag(b"SN", self.id.clone());
- sq_record.push_tag(b"LN", self.sequence.len());
- header.push_record(&sq_record);
- let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
- // copy reverse reads to new BAM file
- for mapping in mappings.iter() {
- let record = minimap2::htslib::mapping_to_record(
- Some(mapping),
- self.sequence.as_bytes(),
- header.clone(),
- None,
- Some(Uuid::new_v4().as_bytes()),
- );
- let _ = out.write(&record);
- }
- rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
- Ok(())
- }
- pub fn hgvs(&self) -> Option<String> {
- self.contig_ref.hgvs()
- }
- }
- fn group_mappings(mappings: &mut Vec<Mapping>) -> Result<Vec<Vec<Mapping>>> {
- // sort alignments by query_start
- mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let mut alignments: Vec<Vec<Mapping>> = vec![];
- // group by overlapps > 30
- for aln in mappings.iter() {
- let last = alignments.last_mut();
- if let Some(l) = last {
- if l.iter()
- .filter(|a| a.query_end - aln.query_start > 30)
- .count()
- > 0
- {
- l.push(aln.clone());
- } else {
- alignments.push(vec![aln.clone()]);
- }
- } else {
- alignments.push(vec![aln.clone()]);
- }
- }
- Ok(alignments)
- }
- pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> {
- let file = File::create(path).unwrap();
- let mut writer = BufWriter::new(file);
- for (chr, start, end, value) in d.iter() {
- let row = format!(
- "{}\n",
- vec![
- chr.to_string(),
- start.to_string(),
- end.to_string(),
- value.to_string()
- ]
- .join("\t")
- );
- writer.write_all(row.as_bytes())?;
- }
- Ok(())
- }
- // unique
- pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
- let file = File::create(fastq_path)?;
- let mut writer = BufWriter::new(file);
- for record in d {
- let name = String::from_utf8(record.qname().to_vec()).unwrap();
- writer.write_all(format!("@{name}\n").as_bytes())?;
- let seq = record.seq().as_bytes();
- writer.write_all(&seq)?;
- writer.write_all(b"\n+\n")?;
- let qual = record.qual();
- writer.write_all(qual)?;
- }
- Ok(())
- }
- pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
- let file = File::create(fasta_path).unwrap();
- let mut writer = fasta::writer::Builder::default().build_with_writer(file);
- let mut passed = Vec::new();
- for (name, sequence) in d {
- let name = name.to_string();
- if sequence.len() == 0 {
- continue;
- }
- if passed.contains(&name) {
- continue;
- }
- passed.push(name.clone());
- let record = fasta::Record::new(
- fasta::record::Definition::new(name.as_str(), None),
- fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
- );
- writer.write_record(&record).unwrap();
- }
- }
- pub fn write_fai(path: &str) {
- let mut faidx = Command::new("samtools")
- .arg("faidx")
- .arg(path)
- .spawn()
- .expect("Samtools faidx failed to start");
- faidx.wait().unwrap();
- }
- pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
- let rg_id = uuid::Uuid::new_v4();
- let mm2 = Command::new("minimap2")
- .arg("-t")
- .arg("128")
- .arg("-ax")
- .arg("map-ont")
- .arg("-R")
- .arg(format!(
- "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
- ))
- .arg(ref_path)
- .arg(reads_path)
- .stdout(Stdio::piped())
- .stderr(Stdio::piped())
- .spawn()
- .expect("Minimap2 failed to start");
- let view = Command::new("sambamba")
- .arg("view")
- .arg("-h")
- .arg("-S")
- .arg("-t")
- .arg("20")
- .arg("--format=bam")
- .arg("/dev/stdin")
- .stdin(Stdio::from(mm2.stdout.unwrap()))
- .stdout(Stdio::piped())
- .stderr(Stdio::piped())
- .spawn()
- .expect("Sambamba view failed to start");
- let mut sort = Command::new("sambamba")
- .arg("sort")
- .arg("-t")
- .arg("20")
- .arg("/dev/stdin")
- .arg("-o")
- .arg(bam_path)
- .stderr(Stdio::piped())
- .stdin(Stdio::from(view.stdout.unwrap()))
- .spawn()
- .expect("Sambamba sort failed to start");
- sort.wait().unwrap();
- Ok(())
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- #[test]
- fn it_works() {
- // let result = add(2, 2);
- // assert_eq!(result, 4);
- }
- }
|