| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201 |
- pub mod breakpoint;
- pub mod genomic_graph;
- // mod phase;
- use anyhow::{anyhow, Ok, Result};
- use fasta::record::Sequence;
- use log::info;
- use minimap2::{Aligner, Mapping};
- use noodles_fasta as fasta;
- use num_format::{CustomFormat, Grouping, ToFormattedString, WriteFormatted};
- use petgraph::{dot::Dot, prelude::*};
- use rust_htslib::bam::{self, Record};
- use std::{
- collections::HashMap,
- fmt,
- fs::{self, File},
- io::{BufReader, BufWriter, Write},
- path::PathBuf,
- process::{Command, Stdio},
- };
- use uuid::Uuid;
- #[derive(Debug, Clone)]
- pub struct Genome {
- pub chromosomes: HashMap<String, Chromosome>,
- }
- #[derive(Debug, Clone)]
- pub struct Chromosome {
- contigs: Vec<Contig>,
- }
- #[derive(Debug, Clone)]
- pub struct Contig {
- pub id: String,
- // contig seq on ref
- pub mappings: Vec<Mapping>,
- // reads on ref
- pub supporting_records: Option<Vec<Record>>,
- pub sequence: String,
- pub contig_ref: ContigRef,
- }
- #[derive(Debug, Clone)]
- pub enum ContigRef {
- Unique(Mapping),
- Chimeric((Mapping, Mapping)),
- ChimericTriple((Mapping, Mapping, Mapping)),
- ChimericMultiple((Mapping, Vec<Mapping>, Mapping)),
- LeftAmbiguity((Vec<Mapping>, Mapping)),
- RightAmbiguity((Mapping, Vec<Mapping>)),
- Ambigous(Vec<Mapping>),
- }
- impl fmt::Display for ContigRef {
- fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
- let str = match self {
- ContigRef::Unique(m) => mapping_to_string(m),
- ContigRef::Chimeric((a, b)) => {
- format!("{}<->{}", mapping_to_string(a), mapping_to_string(b))
- }
- ContigRef::ChimericMultiple((a, v, b)) => format!(
- "{}<->{}<->{}",
- mapping_to_string(a),
- mappings_to_string(v),
- mapping_to_string(b)
- ),
- ContigRef::LeftAmbiguity((v, b)) => {
- format!("{}<->{}", mappings_to_string(v), mapping_to_string(b))
- }
- ContigRef::RightAmbiguity((a, v)) => {
- format!("{}<->{}", mapping_to_string(a), mappings_to_string(v))
- }
- ContigRef::Ambigous(v) => format!("{}", mappings_to_string(v)),
- ContigRef::ChimericTriple((a, b, c)) => format!(
- "{}<->{}<->{}",
- mapping_to_string(a),
- mapping_to_string(b),
- mapping_to_string(c)
- ),
- };
- fmt.write_str(&str).unwrap();
- std::result::Result::Ok(())
- }
- }
- impl ContigRef {
- pub fn breakpoints(&self) -> Option<Vec<Mapping>> {
- match self {
- ContigRef::Unique(_) => None,
- ContigRef::Chimeric((a, b)) => Some(vec![a.clone(), b.clone()]),
- ContigRef::ChimericTriple((a, b, c)) => Some(vec![a.clone(), b.clone(), c.clone()]),
- ContigRef::ChimericMultiple(_) => None,
- ContigRef::LeftAmbiguity(_) => None,
- ContigRef::RightAmbiguity(_) => None,
- ContigRef::Ambigous(_) => None,
- }
- }
- pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
- let left = "►";
- let right = "◄";
- let bp_right = "▐";
- let bp_left = "▌";
- let format = CustomFormat::builder()
- .grouping(Grouping::Standard)
- .minus_sign("-")
- .separator("_")
- .build()
- .unwrap();
- let get_sign = |m: &Mapping| -> &str {
- match m.strand {
- minimap2::Strand::Forward => left,
- minimap2::Strand::Reverse => right,
- }
- };
- let uk = "UNKNOWN".to_string();
- let mut res = Vec::new();
- if let Some(breakpoints) = self.breakpoints() {
- for v in breakpoints.windows(2) {
- let mut bp_string = format!("{}:", v[0].target_name.clone().unwrap_or(uk.clone()));
- let _ = bp_string
- .write_formatted(&v[0].target_end, &format)
- .unwrap();
- bp_string = format!(
- "{bp_string}{}{bp_left}{bp_right}{}{}:",
- get_sign(&v[0]),
- get_sign(&v[1]),
- v[1].target_name.clone().unwrap_or(uk.clone())
- );
- let _ = bp_string
- .write_formatted(&v[1].target_start, &format)
- .unwrap();
- res.push(bp_string);
- }
- }
- if !res.is_empty() {
- Some(res)
- } else {
- None
- }
- }
- pub fn desc(&self) -> Option<String> {
- let uk = "UNKNOWN".to_string();
- let to_desc = |v: &mut Vec<Mapping>| -> String {
- v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let v: Vec<String> = v
- .into_iter()
- .map(|e| {
- let strand = match e.strand {
- minimap2::Strand::Forward => "",
- minimap2::Strand::Reverse => "_rev",
- };
- format!(
- "{}:{}_{}{}",
- e.target_name.clone().unwrap_or(uk.clone()),
- e.target_start,
- e.target_end,
- strand
- )
- })
- .collect();
- format!("[{}]", v.join(";"))
- };
- match self {
- ContigRef::Unique(a) => Some(format!(
- "{}:{}_{}",
- a.target_name.clone().unwrap_or(uk.clone()),
- a.target_start,
- a.target_end
- )),
- ContigRef::Chimeric((a, b)) => Some(to_desc(&mut vec![a.to_owned(), b.to_owned()])),
- ContigRef::ChimericTriple((a, b, c)) => {
- Some(to_desc(&mut vec![a.to_owned(), b.to_owned(), c.to_owned()]))
- }
- ContigRef::ChimericMultiple(_) => None,
- ContigRef::LeftAmbiguity(_) => None,
- ContigRef::RightAmbiguity(_) => None,
- ContigRef::Ambigous(a) => Some(to_desc(&mut a.to_owned())),
- }
- }
- pub fn hgvs(&self) -> Option<String> {
- let uk = "UNKNOWN".to_string();
- match self {
- ContigRef::Unique(_) => None,
- ContigRef::Chimeric((a, b)) => {
- if a.target_name == b.target_name {
- let chr = a.target_name.clone().unwrap_or(uk.clone());
- let del_start = a.target_end;
- let del_end = b.target_start;
- let hgvs = format!("{chr}:{del_start}_{del_end}");
- Some(hgvs)
- } else {
- let a_chr = a.target_name.clone().unwrap_or(uk.clone());
- let a_bp = a.target_end;
- let b_chr = b.target_name.clone().unwrap_or(uk.clone());
- let b_bp = b.target_end;
- let hgvs = format!("{a_chr}:{a_bp}delins[{b_chr}:{b_bp}]");
- Some(hgvs)
- }
- }
- ContigRef::ChimericMultiple(_) => None,
- ContigRef::LeftAmbiguity(_) => None,
- ContigRef::RightAmbiguity(_) => None,
- ContigRef::Ambigous(_) => None,
- ContigRef::ChimericTriple((a, b, c)) => {
- let mut v = [a, b, c];
- v.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let (a, b, c) = (
- *v.get(0).clone().unwrap(),
- *v.get(1).clone().unwrap(),
- *v.get(2).clone().unwrap(),
- );
- let a_target_name = a.target_name.clone().unwrap_or(uk.clone());
- let b_target_name = b.target_name.clone().unwrap_or(uk.clone());
- let c_target_name = c.target_name.clone().unwrap_or(uk.clone());
- // if a_target_name != b_target_name {}
- // Insertions
- // prioritize first len
- let (bp_a_1, bp_a_2) = if a.query_end <= b.query_end {
- // TODO add inserted nt
- (
- (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
- (b.target_name.clone().unwrap_or(uk.clone()), b.target_start),
- )
- } else {
- let diff = a.query_end - b.query_start;
- (
- (a.target_name.clone().unwrap_or(uk.clone()), a.target_end),
- (
- b.target_name.clone().unwrap_or(uk.clone()),
- b.target_start + diff,
- ),
- )
- };
- let (bp_b_1, bp_b_2) = if b.query_end <= c.query_end {
- // TODO add inserted nt
- (
- (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
- (c.target_name.clone().unwrap_or(uk.clone()), c.target_start),
- )
- } else {
- let diff = b.query_end - c.query_start;
- (
- (b.target_name.clone().unwrap_or(uk.clone()), b.target_end),
- (
- c.target_name.clone().unwrap_or(uk.clone()),
- c.target_start + diff,
- ),
- )
- };
- if bp_a_1.0 == bp_b_2.0 {
- let hgvs = format!(
- "{}:{}_{}ins[{}:{}_{}]",
- bp_a_1.0, bp_a_1.1, bp_b_2.1, bp_a_2.0, bp_a_2.1, bp_b_1.1
- );
- Some(hgvs)
- } else {
- None
- }
- }
- }
- }
- }
- pub fn mapping_to_string(mapping: &Mapping) -> String {
- let uk = "UNKNOWN".to_string();
- format!(
- "{}:{}-{}({}:{}-{})",
- mapping.target_name.clone().unwrap_or(uk.clone()),
- mapping.target_start,
- mapping.target_end,
- mapping.query_name.clone().unwrap_or(uk),
- mapping.query_start,
- mapping.query_end
- )
- }
- fn mappings_to_string(mappings: &Vec<Mapping>) -> String {
- let v = mappings
- .iter()
- .map(mapping_to_string)
- .collect::<Vec<String>>();
- v.join("//")
- }
- pub fn get_ref_pos(mappings: Vec<Mapping>) -> Result<ContigRef> {
- let mut mappings = mappings;
- mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- if mappings.len() == 1 {
- return Ok(ContigRef::Unique(mappings.get(0).unwrap().clone()));
- } else {
- let mut grouped: Vec<Vec<Mapping>> = group_mappings(&mut mappings)?;
- // let mut grouped: VecDeque<Vec<Mapping>> = group_mappings(&mut mappings)?.into();
- if grouped.len() == 1 {
- let r = grouped.into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(r));
- } else if grouped.len() >= 2 {
- // let first = grouped.pop_back().unwrap();
- // let last = grouped.pop_front().unwrap();
- let first = grouped.first().unwrap().to_vec();
- let last = grouped.last().unwrap().to_vec();
- grouped.remove(0);
- grouped.remove(grouped.len() - 1);
- assert!(first[0].query_start < last[0].query_start);
- if grouped.len() == 0 {
- if first.len() == 1 && last.len() == 1 {
- return Ok(ContigRef::Chimeric((
- first.get(0).unwrap().clone(),
- last.get(0).unwrap().clone(),
- )));
- } else if first.len() == 1 {
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- last.clone(),
- )));
- } else if last.len() == 1 {
- return Ok(ContigRef::LeftAmbiguity((
- first.clone(),
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> = vec![first, last].into_iter().flat_map(|e| e).collect();
- return Ok(ContigRef::Ambigous(all));
- }
- }
- if first.len() == 1 && last.len() == 1 {
- if grouped.len() == 1 {
- return Ok(ContigRef::ChimericTriple((
- first.get(0).unwrap().clone(),
- grouped.get(0).unwrap().get(0).unwrap().clone(),
- last.get(0).unwrap().clone(),
- )));
- } else {
- return Ok(ContigRef::ChimericMultiple((
- first.get(0).unwrap().clone(),
- grouped.into_iter().flat_map(|e| e).collect(),
- last.get(0).unwrap().clone(),
- )));
- }
- } else if first.len() == 1 {
- let right: Vec<Mapping> = vec![grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::RightAmbiguity((
- first.get(0).unwrap().clone(),
- right,
- )));
- } else if last.len() == 1 {
- let left: Vec<Mapping> = vec![first, grouped.into_iter().flat_map(|e| e).collect()]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::LeftAmbiguity((
- left,
- last.get(0).unwrap().clone(),
- )));
- } else {
- let all: Vec<Mapping> =
- vec![first, grouped.into_iter().flat_map(|e| e).collect(), last]
- .into_iter()
- .flat_map(|e| e)
- .collect();
- return Ok(ContigRef::Ambigous(all));
- }
- } else {
- return Ok(ContigRef::Ambigous(
- grouped.into_iter().flat_map(|e| e).collect(),
- ));
- }
- }
- }
- impl Genome {
- pub fn new() -> Self {
- Genome {
- chromosomes: HashMap::new(),
- }
- }
- pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, Chromosome> {
- self.chromosomes.iter()
- }
- pub fn contigs(&self) -> impl Iterator<Item = &Contig> {
- self.chromosomes.iter().flat_map(|(_, e)| e.iter())
- }
- pub fn add_contig(
- &mut self,
- id: String,
- mappings: Vec<Mapping>,
- supporting_records: Option<Vec<Record>>,
- sequence: String,
- ) -> Result<()> {
- let mut mappings = mappings;
- mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let new_contig = Contig {
- id,
- mappings: mappings.clone(),
- supporting_records,
- sequence,
- contig_ref: get_ref_pos(mappings)?,
- };
- // get the category of Mapping
- match new_contig.contig_ref.clone() {
- ContigRef::Unique(contig_mapping) => {
- match self
- .chromosomes
- .get_mut(&contig_mapping.target_name.unwrap())
- {
- Some(chromosome) => {
- chromosome.contigs.push(new_contig);
- }
- None => (),
- }
- }
- ContigRef::Chimeric((a, b)) => {
- let a_target_name = a.target_name.unwrap();
- let b_target_name = b.target_name.unwrap();
- if a_target_name == b_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&a_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- a_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", a_target_name, b_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- ContigRef::ChimericMultiple((left, _, right)) => {
- let left_target_name = left.target_name.unwrap();
- let right_target_name = right.target_name.unwrap();
- if left_target_name == right_target_name {
- if let Some(chromosome) = self.chromosomes.get_mut(&left_target_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- left_target_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- } else {
- let chimeric_name = format!("{}-{}", left_target_name, right_target_name);
- if let Some(chromosome) = self.chromosomes.get_mut(&chimeric_name) {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- chimeric_name,
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- }
- _ => {
- if let Some(chromosome) = self.chromosomes.get_mut("Ambigous") {
- chromosome.contigs.push(new_contig);
- } else {
- self.chromosomes.insert(
- "Ambigous".to_string(),
- Chromosome {
- contigs: vec![new_contig],
- },
- );
- }
- }
- };
- Ok(())
- }
- pub fn add_contig_from_seq(
- &mut self,
- name: String,
- sequence: &[u8],
- aligner: impl Fn(String) -> Result<Vec<Mapping>>,
- ) -> Result<()> {
- let mappings = aligner(String::from_utf8(sequence.to_vec())?)?;
- // println!("{mappings:?}");
- self.add_contig(name, mappings, None, String::from_utf8(sequence.to_vec())?)?;
- Ok(())
- }
- pub fn write_contigs_sequences(&self, dir: &str) {
- for contig in self.contigs() {
- contig.write_fasta(&format!("{dir}/{}.fasta", contig.id))
- }
- // self.iter().for_each(|(_, chr)| {
- // chr.iter().for_each(|c| c.write_fasta(&format!("{dir}/{}.fasta", c.id)))
- // });
- }
- pub fn from_contigs_sequences(dir: &str) -> Result<Self> {
- let aligner_url = "http://localhost:4444/align";
- let aligner = aligner_client::dist_align(aligner_url.to_string());
- let mut genome = Self::new();
- // let paths = get_ext_paths(dir, "fasta")?;
- let paths = get_contigs_fa_paths(dir)?;
- for path in paths {
- let fa = read_fasta(path.to_str().unwrap())?;
- for (name, sequence) in fa {
- genome.add_contig_from_seq(name, sequence.as_ref(), &aligner)?;
- }
- }
- Ok(genome)
- }
- //
- // pub fn from_dir_bed(dir: &str) {
- //
- // }
- // pub fn write_records(&self, file: &str) {
- // let mut records = Vec::new();
- // for (name, chromosome) in self.chromosomes.iter() {
- // for contig in chromosome.iter() {
- // if let Some(rec) = &contig.supporting_records {
- // records.extend(rec.iter());
- // };
- // }
- // }
- // // header
- //
- // // writer
- // rust_htslib::bam::Writer::from_path(path, header, format)
- // }
- pub fn stats(&self) {
- for (k, v) in self.chromosomes.iter() {
- info!("{}:{}", k, v.contigs.len());
- }
- }
- pub fn chromosome(&self, chromosome: &str) -> Option<Vec<Contig>> {
- self.chromosomes
- .get(chromosome)
- .map(|chr| chr.contigs.clone())
- }
- }
- impl Chromosome {
- pub fn iter(&self) -> std::slice::Iter<'_, Contig> {
- self.contigs.iter()
- }
- }
- impl Contig {
- // pub fn sort(&mut self) {
- // // sorting by target order
- // self.mappings
- // .sort_by(|a, b| a.target_start.cmp(&b.target_start));
- // }
- pub fn to_igv(&self, dir_path: &str) -> Result<()> {
- let supporting_records = self.supporting_records.clone().ok_or(anyhow!("no reads"))?;
- let contig_name = if let Some(hgvs) = self.hgvs() {
- hgvs
- } else {
- self.id.clone()
- };
- let contig_dir = format!("{dir_path}/{contig_name}");
- fs::create_dir_all(contig_dir.clone())?;
- let fasta_path = format!("{contig_dir}/contig.fa");
- write_fasta(&fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
- write_fai(&fasta_path);
- let reads_path = format!("{contig_dir}/reads.fa");
- let n_reads = supporting_records
- .clone()
- .into_iter()
- .map(|r| {
- (
- String::from_utf8(r.qname().to_vec()).unwrap(),
- String::from_utf8(r.seq().as_bytes()).unwrap(),
- )
- })
- .collect();
- write_fasta(&reads_path, &n_reads);
- let bam_path = format!("{contig_dir}/{}.bam", self.id);
- write_bam(&fasta_path, &reads_path, &bam_path)?;
- let bed_path = format!("{contig_dir}/contig.bed");
- match &self.contig_ref {
- ContigRef::Chimeric((a, b)) => {
- let d = vec![
- (
- self.id.clone(),
- a.query_start,
- a.query_end,
- format!(
- "{}:{}-{}",
- a.target_name.clone().unwrap(),
- a.target_start,
- a.target_end
- ),
- ),
- (
- self.id.clone(),
- b.query_start,
- b.query_end,
- format!(
- "{}:{}-{}",
- b.target_name.clone().unwrap(),
- b.target_start,
- b.target_end
- ),
- ),
- ];
- write_bed(&bed_path, &d)?;
- }
- ContigRef::ChimericTriple((a, b, c)) => {
- let d: Vec<(String, i32, i32, String)> = [a, b, c]
- .iter()
- .map(|r| {
- (
- self.id.clone(),
- r.query_start,
- r.query_end,
- format!(
- "{}:{}-{}",
- r.target_name.clone().unwrap(),
- r.target_start,
- r.target_end
- ),
- )
- })
- .collect();
- write_bed(&bed_path, &d)?;
- }
- _ => (),
- }
- Ok(())
- }
- // bug cigar len != seq len
- // pub fn write_bam(&self, path: &str) -> Result<()> {
- // let aligner = Aligner::builder()
- // .asm5()
- // .with_threads(8)
- // .with_cigar()
- // .with_seq(self.sequence.as_bytes())
- // .expect("Unable to build index");
- //
- // let mut mappings = Vec::new();
- // let supporting_records = self
- // .supporting_records
- // .clone()
- // .ok_or(anyhow!("no supporting records"))?;
- // for record in supporting_records.iter() {
- // let seq = record.seq().as_bytes();
- // let alignment = aligner
- // .map(&seq, false, false, None, None)
- // .expect("Unable to align");
- // mappings.push(alignment);
- // }
- // let mut mappings: Vec<_> = mappings.into_iter().flatten().collect();
- // mappings.sort_by(|a, b| a.target_start.cmp(&b.target_start));
- //
- // let mut header = bam::Header::new();
- // let mut sq_record = rust_htslib::bam::header::HeaderRecord::new(b"SQ");
- // sq_record.push_tag(b"SN", self.id.clone());
- // sq_record.push_tag(b"LN", self.sequence.len());
- // header.push_record(&sq_record);
- //
- // let mut out = bam::Writer::from_path(path, &header, bam::Format::Bam).unwrap();
- //
- // // copy reverse reads to new BAM file
- // for mapping in mappings.iter() {
- // let record = minimap2::htslib::mapping_to_record(
- // Some(mapping),
- // self.sequence.as_bytes(),
- // header.clone(),
- // None,
- // Some(Uuid::new_v4().as_bytes()),
- // );
- // let _ = out.write(&record);
- // }
- // rust_htslib::bam::index::build(path, None, rust_htslib::bam::index::Type::Bai, 1)?;
- // Ok(())
- // }
- pub fn hgvs(&self) -> Option<String> {
- self.contig_ref.hgvs()
- }
- pub fn desc(&self) -> Option<String> {
- self.contig_ref.desc()
- }
- pub fn breakpoints_repr(&self) -> Option<Vec<String>> {
- self.contig_ref.breakpoints_repr()
- }
- pub fn write_fasta(&self, fasta_path: &str) {
- write_fasta(fasta_path, &vec![(self.id.clone(), self.sequence.clone())]);
- }
- }
- fn group_mappings(mappings: &mut [Mapping]) -> Result<Vec<Vec<Mapping>>> {
- // sort alignments by query_start
- mappings.sort_by(|a, b| a.query_start.cmp(&b.query_start));
- let mut alignments: Vec<Vec<Mapping>> = vec![];
- // group by overlapps > 30
- for aln in mappings.iter() {
- let last = alignments.last_mut();
- if let Some(l) = last {
- if l.iter()
- .filter(|a| a.query_end - aln.query_start > 30)
- .count()
- > 0
- {
- l.push(aln.clone());
- } else {
- alignments.push(vec![aln.clone()]);
- }
- } else {
- alignments.push(vec![aln.clone()]);
- }
- }
- Ok(alignments)
- }
- pub fn write_bed(path: &str, d: &[(String, i32, i32, String)]) -> Result<()> {
- let file = File::create(path).unwrap();
- let mut writer = BufWriter::new(file);
- for (chr, start, end, value) in d.iter() {
- let row = format!(
- "{}\n",
- [
- chr.to_string(),
- start.to_string(),
- end.to_string(),
- value.to_string()
- ]
- .join("\t")
- );
- writer.write_all(row.as_bytes())?;
- }
- Ok(())
- }
- // unique
- pub fn write_fastq(fastq_path: &str, d: &Vec<Record>) -> Result<()> {
- let file = File::create(fastq_path)?;
- let mut writer = BufWriter::new(file);
- for record in d {
- let name = String::from_utf8(record.qname().to_vec()).unwrap();
- writer.write_all(format!("@{name}\n").as_bytes())?;
- let seq = record.seq().as_bytes();
- writer.write_all(&seq)?;
- writer.write_all(b"\n+\n")?;
- let qual = record.qual();
- writer.write_all(qual)?;
- }
- Ok(())
- }
- pub fn write_fasta(fasta_path: &str, d: &Vec<(String, String)>) {
- let file = File::create(fasta_path).unwrap();
- let mut writer = fasta::writer::Builder::default().build_with_writer(file);
- let mut passed = Vec::new();
- for (name, sequence) in d {
- let name = name.to_string();
- if sequence.is_empty() {
- continue;
- }
- if passed.contains(&name) {
- continue;
- }
- passed.push(name.clone());
- let record = fasta::Record::new(
- fasta::record::Definition::new(name.as_str(), None),
- fasta::record::Sequence::from(sequence.as_bytes().to_vec()),
- );
- writer.write_record(&record).unwrap();
- }
- }
- pub fn write_fai(path: &str) {
- let mut faidx = Command::new("samtools")
- .arg("faidx")
- .arg(path)
- .spawn()
- .expect("Samtools faidx failed to start");
- faidx.wait().unwrap();
- }
- pub fn write_bam(ref_path: &str, reads_path: &str, bam_path: &str) -> Result<()> {
- let rg_id = uuid::Uuid::new_v4();
- let mm2 = Command::new("minimap2")
- .arg("-t")
- .arg("128")
- .arg("-ax")
- .arg("map-ont")
- .arg("-R")
- .arg(format!(
- "@RG\\tPL:ONTASM_PROM\\tID:ONTASM_{rg_id}\\tSM:{rg_id}\\tLB:ONTASM_NB_PROM"
- ))
- .arg(ref_path)
- .arg(reads_path)
- .stdout(Stdio::piped())
- .stderr(Stdio::piped())
- .spawn()
- .expect("Minimap2 failed to start");
- let view = Command::new("sambamba")
- .arg("view")
- .arg("-h")
- .arg("-S")
- .arg("-t")
- .arg("20")
- .arg("--format=bam")
- .arg("/dev/stdin")
- .stdin(Stdio::from(mm2.stdout.unwrap()))
- .stdout(Stdio::piped())
- .stderr(Stdio::piped())
- .spawn()
- .expect("Sambamba view failed to start");
- let mut sort = Command::new("sambamba")
- .arg("sort")
- .arg("-t")
- .arg("20")
- .arg("/dev/stdin")
- .arg("-o")
- .arg(bam_path)
- .stderr(Stdio::piped())
- .stdin(Stdio::from(view.stdout.unwrap()))
- .spawn()
- .expect("Sambamba sort failed to start");
- sort.wait().unwrap();
- Ok(())
- }
- pub fn read_fasta(path: &str) -> Result<Vec<(String, Sequence)>> {
- let mut reader = File::open(path)
- .map(BufReader::new)
- .map(fasta::Reader::new)?;
- let mut res = Vec::new();
- for result in reader.records() {
- let record = result?;
- let u = String::from_utf8(record.name().to_vec())?;
- let s = record.sequence().to_owned();
- res.push((u, s));
- }
- Ok(res)
- }
- // fn get_ext_paths(dir: &str, ext: &str) -> Result<Vec<PathBuf>> {
- // let paths = std::fs::read_dir(dir)?
- // // Filter out all those directory entries which couldn't be read
- // .filter_map(|res| res.ok())
- // // Map the directory entries to paths
- // .map(|dir_entry| dir_entry.path())
- // // Filter out all paths with extensions other than `csv`
- // .filter_map(|path| {
- // if path.extension().map_or(false, |xt| xt == ext) {
- // Some(path)
- // } else {
- // None
- // }
- // })
- // .collect::<Vec<_>>();
- // Ok(paths)
- // }
- fn get_contigs_fa_paths(dir: &str) -> Result<Vec<PathBuf>> {
- let pattern = format!("{}/**/*_flye.fa", dir);
- let fa_paths: Vec<PathBuf> = glob::glob(&pattern)
- .expect("Failed to read glob pattern")
- .filter_map(Result::ok)
- .collect();
- Ok(fa_paths)
- }
- pub fn dot_graph(
- graph: &StableGraph<String, String>,
- way: &[NodeIndex],
- value: &str,
- color: &str,
- erase: bool,
- ) -> String {
- let mut g = graph.clone();
- // erase labels
- if erase {
- g.edge_weights_mut().for_each(|e| *e = "".to_string());
- }
- let mut labels = Vec::new();
- for window in way.windows(2) {
- let edge_id = g.find_edge(window[0], window[1]).unwrap();
- let edge = g.edge_weight_mut(edge_id).unwrap();
- let v = if !edge.is_empty() {
- let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
- v.push(value);
- v.join(", ")
- } else {
- value.to_string()
- };
- labels.push(v.clone());
- *edge = v;
- }
- if erase {
- g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
- g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
- }
- let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
- labels.sort_by_key(|b| std::cmp::Reverse(b.len()));
- // labels.sort_by(|a, b| b.len().cmp(&a.len()));
- for label in labels {
- let label_str = format!("label = \"{label}\"");
- dot = dot.replace(
- &format!("{label_str} ]\n"),
- &format!("{label_str}, color = \"{color}\" ]\n"),
- );
- }
- dot
- }
- /// for non overlapping ways
- pub fn dot_graph_biall(
- graph: &StableGraph<String, String>,
- ways: &[Vec<NodeIndex>],
- values: Vec<&str>,
- colors: Vec<&str>,
- erase: bool,
- ) -> String {
- let mut g = graph.clone();
- // erase labels
- if erase {
- g.edge_weights_mut().for_each(|e| *e = "".to_string());
- }
- let mut labels = Vec::new();
- for ((way, value), color) in ways.iter().zip(values.into_iter()).zip(colors.into_iter()) {
- for window in way.windows(2) {
- let edge_id = g.find_edge(window[0], window[1]).unwrap();
- let edge = g.edge_weight_mut(edge_id).unwrap();
- let v = if !edge.is_empty() {
- let mut v: Vec<&str> = edge.split(",").filter(|e| !e.is_empty()).collect();
- v.push(value);
- v.join(", ")
- } else {
- value.to_string()
- };
- labels.push((v.clone(), color));
- *edge = v;
- }
- }
- // g.retain_edges(|g, i| g.edge_weight(i).unwrap().to_string() != "".to_string());
- g.retain_edges(|g, i| !g.edge_weight(i).unwrap().is_empty());
- g.retain_nodes(|g, n| g.neighbors_undirected(n).count() > 0);
- let mut dot = Dot::new(&g).to_string().replace("\\\"", "");
- labels.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
- for (label, color) in labels {
- let label_str = format!("label = \"{label}\"");
- dot = dot.replace(
- &format!("{label_str} ]\n"),
- &format!("{label_str}, color = \"{color}\" ]\n"),
- );
- }
- dot
- }
- #[cfg(test)]
- mod tests {
- use env_logger::Env;
- use super::*;
- use crate::{
- genomic_graph::GenomicGraph,
- // phase::{variants_phasing, write_phases_bed},
- };
- fn init() {
- let _ = env_logger::Builder::from_env(Env::default().default_filter_or("info"))
- .is_test(true)
- .try_init();
- }
- #[test]
- fn it_works() -> Result<()> {
- let _ = env_logger::builder().is_test(true).try_init();
- let contig_fa = "./data_test/contig_2.fa";
- let aligner_url = "http://localhost:4444/align";
- let mut genome = Genome::new();
- let aligner = aligner_client::dist_align(aligner_url.to_string());
- let sequences = read_fasta(contig_fa)?;
- for (name, seq) in sequences {
- genome.add_contig_from_seq(name.clone(), &seq.as_ref().to_vec(), &aligner)?;
- let mut seqc: Vec<u8> = seq.complement().map(|e| e.unwrap()).collect();
- seqc.reverse();
- genome.add_contig_from_seq(format!("{name}_rev"), &seqc, &aligner)?;
- println!("Sending");
- }
- genome.iter().for_each(|(_, c)| {
- c.iter().for_each(|cont| {
- println!("{}", cont.contig_ref.desc().unwrap());
- });
- });
- Ok(())
- }
- #[test]
- fn test_graph() -> Result<()> {
- init();
- let case = "SALICETTO";
- let chrom = vec!["chr10"];
- info!("This record will be captured by `cargo test`");
- let dir = format!("/data/longreads_basic_pipe/{case}/diag/asm_contis");
- // Load from fasta in dir.
- let genome = Genome::from_contigs_sequences(&dir)?;
- genome.stats();
- let mut genomic_graph = GenomicGraph::from_genome(&genome);
- let sens = vec![true, false];
- let pos = vec![0, i32::MAX];
- let mut all_ways = Vec::new();
- if chrom.len() > 1 {
- (0..4).into_iter().for_each(|i| {
- let start_pos = if i < 2 { 0 } else { i32::MAX };
- let end_pos = pos[i % 2];
- (0..4).into_iter().for_each(|i| {
- let start_sens = if i < 2 { true } else { false };
- let end_sens = sens[i % 2];
- (0..4).into_iter().for_each(|i| {
- let start_chr = if i < 2 { chrom[0] } else { chrom[1] };
- let end_chr = chrom[i % 2];
- let start = (start_sens, start_chr, start_pos);
- let end = (end_sens, end_chr, end_pos);
- let (oriented_graph, _integrated_graph, ways) =
- genomic_graph.ways(start, end);
- let dot = oriented_graph.dot_graph();
- println!("dot\n{dot}");
- for (_i, way) in ways.iter().enumerate() {
- let s = way
- .iter()
- .map(|(_, _, _, s)| s.to_string())
- .collect::<Vec<String>>()
- .join("");
- all_ways.push(s);
- }
- });
- });
- });
- } else {
- let start_chr = chrom[0];
- let end_chr = chrom[0];
- let start = (true, start_chr, 0);
- let end = (true, end_chr, i32::MAX);
- let (oriented_graph, _integrated_graph, ways) = genomic_graph.ways(start, end);
- let dot = oriented_graph.dot_graph();
- println!("dot\n{dot}");
- for (_i, way) in ways.iter().enumerate() {
- let s = way
- .iter()
- .map(|(_, _, _, s)| s.to_string())
- .collect::<Vec<String>>()
- .join("");
- all_ways.push(s);
- }
- }
- all_ways.dedup();
- all_ways
- .iter()
- .enumerate()
- .for_each(|(i, s)| println!("{i}.\t{s}"));
- // let s = Dot::new(&integrated_graph).to_string().replace("\\\"", "");
- // let x11_colors: Vec<String> = vec![
- // String::from("Red"),
- // String::from("Green"),
- // String::from("Blue"),
- // String::from("Cyan"),
- // String::from("Magenta"),
- // String::from("Yellow"),
- // String::from("DarkRed"),
- // String::from("DarkGreen"),
- // String::from("DarkBlue"),
- // String::from("DarkCyan"),
- // String::from("DarkMagenta"),
- // String::from("DarkYellow"),
- // String::from("LightRed"),
- // String::from("LightGreen"),
- // String::from("LightBlue"),
- // String::from("LightCyan"),
- // String::from("LightMagenta"),
- // String::from("LightYellow"),
- // String::from("Orange"),
- // String::from("Brown"),
- // String::from("Beige"),
- // ];
- // let mut s = s.clone();
- // ways.iter().enumerate().for_each(|(i, _)| {
- // s = s.replace(
- // &format!("[ label = \"{}\" ]", i + 1),
- // &format!(
- // "[ label = \"{}\" color = \"{}\" ]",
- // i + 1,
- // x11_colors[i].to_string()
- // ),
- // );
- // });
- // println!("{s}");
- //
- // for (i, way) in ways.iter().enumerate() {
- // let s = way
- // .iter()
- // .map(|(_, _, _, s)| s.to_string())
- // .collect::<Vec<String>>()
- // .join("");
- // println!("{}.\t{s}", i + 1);
- // }
- Ok(())
- }
- #[test]
- fn dir() {
- init();
- let id = "ROBIN";
- info!("This record will be captured by `cargo test`");
- let dir = format!("/data/longreads_basic_pipe/{id}/diag/scan/reads",);
- // Load from fasta in dir.
- let genome = Genome::from_contigs_sequences(&dir).unwrap();
- genome.stats();
- let mut res: Vec<String> = genome
- .iter()
- .flat_map(|(_s, chrom)| {
- chrom.iter().filter_map(|c| c.hgvs())
- // .map(|c| println!("{c}"))
- })
- .collect();
- res.sort();
- res.dedup();
- res.iter().for_each(|s| println!("{s}"));
- // println!("{genome:#?}");
- }
- }
|