use anyhow::{Ok, Result}; use log::info; use minimap2::{Aligner, Mapping}; use noodles_fasta as fasta; use rust_htslib::bam::{self, Record}; use std::{ collections::{HashMap, VecDeque}, fmt::{self, format}, fs::{self, File}, io::{BufWriter, Write}, process::{Command, Stdio}, }; use uuid::Uuid; #[derive(Debug, Clone)] pub struct Genome { pub chromosomes: HashMap, } #[derive(Debug, Clone)] pub struct Chromosome { contigs: Vec, } #[derive(Debug, Clone)] pub struct Contig { pub id: String, // contig seq on ref pub mappings: Vec, // reads on ref pub supporting_records: Vec, pub sequence: String, pub contig_ref: ContigRef, } #[derive(Debug, Clone)] pub enum ContigRef { Unique(Mapping), Chimeric((Mapping, Mapping)), ChimericMultiple((Mapping, Vec, Mapping)), LeftAmbiguity((Vec, Mapping)), RightAmbiguity((Mapping, Vec)), Ambigous(Vec), } impl fmt::Display for ContigRef { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let str = match self { ContigRef::Unique(m) => mapping_to_string(m), ContigRef::Chimeric((a, b)) => { format!("{}<->{}", mapping_to_string(a), mapping_to_string(b)) } ContigRef::ChimericMultiple((a, v, b)) => format!( "{}<->{}<->{}", mapping_to_string(a), mappings_to_string(v), mapping_to_string(b) ), ContigRef::LeftAmbiguity((v, b)) => { format!("{}<->{}", mappings_to_string(v), mapping_to_string(b)) } ContigRef::RightAmbiguity((a, v)) => { format!("{}<->{}", mapping_to_string(a), mappings_to_string(v)) } ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)), }; fmt.write_str(&str).unwrap(); std::result::Result::Ok(()) } } impl ContigRef { pub fn hgvs(&self) -> Option { match self { ContigRef::Unique(_) => None, ContigRef::Chimeric((a, b)) => { if a.target_name == b.target_name { let chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string()); let del_start = a.target_end; let del_end = b.target_start; Some(format!("{chr}:{del_start}_{del_end}")) } else { let a_chr = a.target_name.clone().unwrap_or("UNKNOWN".to_string()); let a_bp = a.target_end; let b_chr = b.target_name.clone().unwrap_or("UNKNOWN".to_string()); let b_bp = b.target_end; Some(format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]")) } } ContigRef::ChimericMultiple(_) => None, ContigRef::LeftAmbiguity(_) => None, ContigRef::RightAmbiguity(_) => None, ContigRef::Ambigous(_) => None, } } } pub fn mapping_to_string(mapping: &Mapping) -> String { let uk = "UNKNOWN".to_string(); format!( "{}:{}-{}({}:{}-{})", mapping.target_name.clone().unwrap_or(uk.clone()), mapping.target_start, mapping.target_end, mapping.query_name.clone().unwrap_or(uk), mapping.query_start, mapping.query_end ) } fn mappings_to_string(mappings: &Vec) -> String { let v = mappings .iter() .map(mapping_to_string) .collect::>(); v.join("//") } pub fn get_ref_pos(mappings: Vec) -> Result { let mut mappings = mappings; if mappings.len() == 1 { return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone())); } else { let mut grouped: VecDeque> = group_mappings(&mut mappings)?.into(); if grouped.len() == 1 { let r = grouped.into_iter().flat_map(|e| e).collect(); return Ok(ContigRef::Ambigous(r)); } else if grouped.len() >= 2 { let first = grouped.pop_back().unwrap(); let last = grouped.pop_front().unwrap(); if grouped.len() == 0 { if first.len() == 1 && last.len() == 1 { return Ok(ContigRef::Chimeric(( first.get(0).unwrap().clone(), last.get(0).unwrap().clone(), ))); } else if first.len() == 1 { return Ok(ContigRef::RightAmbiguity(( first.get(0).unwrap().clone(), last.clone(), ))); } else if last.len() == 1 { return Ok(ContigRef::LeftAmbiguity(( first.clone(), last.get(0).unwrap().clone(), ))); } else { let all: Vec = vec![first, last].into_iter().flat_map(|e| e).collect(); return Ok(ContigRef::Ambigous(all)); } } else { } if first.len() == 1 && last.len() == 1 { return Ok(ContigRef::ChimericMultiple(( first.get(0).unwrap().clone(), grouped.into_iter().flat_map(|e| e).collect(), last.get(0).unwrap().clone(), ))); } else if first.len() == 1 { let right: Vec = vec![grouped.into_iter().flat_map(|e| e).collect(), last] .into_iter() .flat_map(|e| e) .collect(); return Ok(ContigRef::RightAmbiguity(( first.get(0).unwrap().clone(), right, ))); } else if last.len() == 1 { let left: Vec = vec![first, grouped.into_iter().flat_map(|e| e).collect()] .into_iter() .flat_map(|e| e) .collect(); return Ok(ContigRef::LeftAmbiguity(( left, last.get(0).unwrap().clone(), ))); } else { let all: Vec = vec![first, grouped.into_iter().flat_map(|e| e).collect(), last] .into_iter() .flat_map(|e| e) .collect(); return Ok(ContigRef::Ambigous(all)); } } else { return Ok(ContigRef::Ambigous( grouped.into_iter().flat_map(|e| e).collect(), )); } } } impl Genome { pub fn new() -> Self { Genome { chromosomes: HashMap::new(), } } pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> { self.chromosomes.iter() } pub fn add_contig( &mut self, id: String, mappings: Vec, supporting_records: Vec, sequence: String, ) -> Result<()> { let new_contig = Contig { id, mappings: mappings.clone(), supporting_records, sequence, contig_ref: get_ref_pos(mappings)?, }; // get the category of Mapping match new_contig.contig_ref.clone() { ContigRef::Unique(contig_mapping) => { match self .chromosomes .get_mut(&contig_mapping.target_name.unwrap()) { Some(chromosome) => { chromosome.contigs.push(new_contig); } None => (), } } ContigRef::Chimeric((a, b)) => { let a_target_name = a.target_name.unwrap(); let b_target_name = b.target_name.unwrap(); if a_target_name == b_target_name { if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) { chromosome.contigs.push(new_contig); } else { self.chromosomes.insert( a_target_name, Chromosome { contigs: vec![new_contig], }, ); } } else { let chimeric_name = format!("{}-{}", a_target_name, b_target_name); if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) { chromosome.contigs.push(new_contig); } else { self.chromosomes.insert( chimeric_name, Chromosome { contigs: vec![new_contig], }, ); } } } ContigRef::ChimericMultiple((left, _, right)) => { let left_target_name = left.target_name.unwrap(); let right_target_name = right.target_name.unwrap(); if left_target_name == right_target_name { if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) { chromosome.contigs.push(new_contig); } else { self.chromosomes.insert( left_target_name, Chromosome { contigs: vec![new_contig], }, ); } } else { let chimeric_name = format!("{}-{}", left_target_name, right_target_name); if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) { chromosome.contigs.push(new_contig); } else { self.chromosomes.insert( chimeric_name, Chromosome { contigs: vec![new_contig], }, ); } } } _ => { if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") { chromosome.contigs.push(new_contig); } else { self.chromosomes.insert( "Ambigous".to_string(), Chromosome { contigs: vec![new_contig], }, ); } } }; Ok(()) } pub fn stats(&self) { for (k, v) in self.chromosomes.iter() { info!("{}:{}", k, v.contigs.len()); } } pub fn chromosome(&self, chromosome: &str) -> Option> { if let Some(chr) = self.chromosomes.get(chromosome) { Some(chr.contigs.clone()) } else { None } } } impl Chromosome { pub fn iter(&self) -> std::slice::Iter<'_, Contig> { self.contigs.iter() } } impl Contig { pub fn sort(&mut self) { self.mappings .sort_by(|a, b| a.target_start.cmp(&b.target_start)); } pub fn to_igv(&self, dir_path: &str) -> Result<()> { let contig_dir = format!("{dir_path}/{}", self.id); fs::create_dir_all(contig_dir.clone())?; let fasta_path = format!("{contig_dir}/contig.fa"); write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]); write_fai(&fasta_path); let reads_path = format!("{contig_dir}/reads.fa"); let n_reads = self .supporting_records .clone() .into_iter() .map(|r| { ( String::from_utf8(r.qname().to_vec()).unwrap(), String::from_utf8(r.seq().as_bytes()).unwrap(), ) }) .collect(); write_fasta(&reads_path, &n_reads); let bam_path = format!("{contig_dir}/{}.bam", self.id); create_bam(&fasta_path, &reads_path, &bam_path)?; let bed_path = format!("{contig_dir}/contig.bed"); match &self.contig_ref { ContigRef::Chimeric((a, b)) => { let d = vec![ ( self.id.clone(), a.query_start, a.query_end, format!( "{}:{}-{}", a.target_name.clone().unwrap(), a.target_start, a.target_end ), ), ( self.id.clone(), b.query_start, b.query_end, format!( "{}:{}-{}", b.target_name.clone().unwrap(), b.target_start, b.target_end ), ), ]; write_bed(&bed_path, &d)?; } _ => (), } Ok(()) } // bug cigar len != seq len pub fn write_bam(&self, path: &str) -> Result<()> { let aligner = Aligner::builder() .asm5() .with_threads(8) .with_cigar() .with_seq(self.sequence.as_bytes()) .expect("Unable to build index"); let mut mappings = Vec::new(); for record in self.supporting_records.iter() { let seq = record.seq().as_bytes(); let alignment = aligner .map(&seq, false, false, None, None) .expect("Unable to align"); mappings.push(alignment); } let mut mappings: Vec<_> = mappings.into_iter().flatten().collect(); mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start)); let mut header = bam::Header::new(); let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ"); sq_record.push_tag(b"SN", self.id.clone()); sq_record.push_tag(b"LN", self.sequence.len()); header.push_record(&sq_record); let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap(); // copy reverse reads to new BAM file for mapping in mappings.iter() { let record = minimap2::htslib::mapping_to_record( Some(mapping), self.sequence.as_bytes(), header.clone(), None, Some(Uuid::new_v4().as_bytes()), ); let _ = out.write(&record); } rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?; Ok(()) } pub fn hgvs(&self) -> Option { self.contig_ref.hgvs() } } fn group_mappings(mappings: &mut Vec) -> Result>> { // sort alignments by query_start mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start)); let mut alignments: Vec> = vec![]; // group by overlapps > 30 for aln in mappings.iter() { let last = alignments.last_mut(); if let Some(l) = last { if l.iter() .filter(|a| a.query_end - aln.query_start > 30) .count() > 0 { l.push(aln.clone()); } else { alignments.push(vec![aln.clone()]); } } else { alignments.push(vec![aln.clone()]); } } Ok(alignments) } pub fn write_bed(path: &str, d: &Vec<(String, i32, i32, String)>) -> Result<()> { let file = File::create(path).unwrap(); let mut writer = BufWriter::new(file); for (chr, start, end, value) in d.iter() { let row = format!( "{}\n", vec![ chr.to_string(), start.to_string(), end.to_string(), value.to_string() ] .join("\t") ); writer.write_all(row.as_bytes())?; } Ok(()) } // unique pub fn write_fastq(fastq_path: &str, d: &Vec) -> Result<()> { let file = File::create(fastq_path)?; let mut writer = BufWriter::new(file); for record in d { let name = String::from_utf8(record.qname().to_vec()).unwrap(); writer.write_all(format!("@{name}\n").as_bytes())?; let seq = record.seq().as_bytes(); writer.write_all(&seq)?; writer.write_all(b"\n+\n")?; let qual = record.qual(); writer.write_all(qual)?; } Ok(()) } pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) { let file = File::create(fasta_path).unwrap(); let mut writer = fasta::writer::Builder::default().build_with_writer(file); let mut passed = Vec::new(); for (name, sequence) in d { let name = name.to_string(); if sequence.len() == 0 { continue; } if passed.contains(&name) { continue; } passed.push(name.clone()); let record = fasta::Record::new( fasta::record::Definition::new(name.as_str(), None), fasta::record::Sequence::from(sequence.as_bytes().to_vec()), ); writer.write_record(&record).unwrap(); } } pub fn write_fai(path: &str) { let mut faidx = Command::new("samtools") .arg("faidx") .arg(path) .spawn() .expect("Samtools faidx failed to start"); faidx.wait().unwrap(); } // pub fn write_aln_sh() pub fn create_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> { let rg_id = uuid::Uuid::new_v4(); let mm2 = Command::new("minimap2") .arg("-t") .arg("128") .arg("-ax") .arg("map-ont") .arg("-R") .arg(format!( "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM" )) .arg(ref_path) .arg(reads_path) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .spawn() .expect("Minimap2 failed to start"); let view = Command::new("sambamba") .arg("view") .arg("-h") .arg("-S") .arg("-t") .arg("20") .arg("--format=bam") .arg("/dev/stdin") .stdin(Stdio::from(mm2.stdout.unwrap())) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .spawn() .expect("Sambamba view failed to start"); let mut sort = Command::new("sambamba") .arg("sort") .arg("-t") .arg("20") .arg("/dev/stdin") .arg("-o") .arg(bam_path) .stderr(Stdio::piped()) .stdin(Stdio::from(view.stdout.unwrap())) .spawn() .expect("Sambamba sort failed to start"); sort.wait().unwrap(); Ok(()) } #[cfg(test)] mod tests { use super::*; #[test] fn it_works() { // let result = add(2, 2); // assert_eq!(result, 4); } }