|
|
@@ -1,824 +1,11 @@
|
|
|
pub mod assembler;
|
|
|
pub mod io;
|
|
|
|
|
|
-use anyhow::{anyhow, Context};
|
|
|
-
|
|
|
-use io::bed::read_bed;
|
|
|
-use log::{info, warn};
|
|
|
-use nom::AsBytes;
|
|
|
-use pandora_lib_blastn::BlastResult;
|
|
|
-use pandora_lib_igv::{BamTrack, BedTrack, Track};
|
|
|
-use petgraph::{dot::Dot, graph::NodeIndex, stable_graph::StableUnGraph};
|
|
|
-use rayon::prelude::*;
|
|
|
-use rust_htslib::bam::{Read, Reader, Record};
|
|
|
-use std::{
|
|
|
- collections::{HashMap, HashSet, VecDeque},
|
|
|
- fs::{self, File},
|
|
|
- io::{BufRead, BufReader, Write},
|
|
|
- path::{Path, PathBuf},
|
|
|
- str::FromStr,
|
|
|
-};
|
|
|
-
|
|
|
-//
|
|
|
-// pub fn make_graph(d: Vec<BlastResult>, records: &Vec<Record>) {
|
|
|
-// let mut g: StableUnGraph<String, Option<BlastResult>> = StableUnGraph::default();
|
|
|
-// let mut name_to_id: HashMap<String, NodeIndex> = HashMap::new();
|
|
|
-//
|
|
|
-// for blast_result in d {
|
|
|
-// let nid_a = if let Some(nid_a) = name_to_id.get(&blast_result.query_id) {
|
|
|
-// *nid_a
|
|
|
-// } else {
|
|
|
-// let k = blast_result.query_id.clone();
|
|
|
-// let nid_a = g.add_node(k.clone());
|
|
|
-// name_to_id.insert(k, nid_a);
|
|
|
-// nid_a
|
|
|
-// };
|
|
|
-// let nid_b = if let Some(nid_b) = name_to_id.get(&blast_result.subject_id) {
|
|
|
-// *nid_b
|
|
|
-// } else {
|
|
|
-// let k = blast_result.subject_id.clone();
|
|
|
-// let nid_b = g.add_node(k.clone());
|
|
|
-// name_to_id.insert(k, nid_b);
|
|
|
-// nid_b
|
|
|
-// };
|
|
|
-// g.add_edge(nid_a, nid_b, Some(blast_result));
|
|
|
-// }
|
|
|
-//
|
|
|
-// println!(
|
|
|
-// "{:?}",
|
|
|
-// Dot::with_config(&g, &[petgraph::dot::Config::EdgeNoLabel])
|
|
|
-// );
|
|
|
-// // let order = dijkstra(&g, start_node, None, |_| 0);
|
|
|
-// // println!("{order:#?}");
|
|
|
-//
|
|
|
-// // begin with start node
|
|
|
-// let mut n_neigh: Vec<(usize, NodeIndex)> = g
|
|
|
-// .node_indices()
|
|
|
-// .map(|nid| (g.neighbors(nid).count(), nid))
|
|
|
-// .collect();
|
|
|
-// n_neigh.sort_by_key(|v| v.0);
|
|
|
-// let start_node = n_neigh.last().unwrap().1;
|
|
|
-// let start_name = g.node_weight(start_node).unwrap().to_string();
|
|
|
-//
|
|
|
-// let neighbors = g.neighbors(start_node);
|
|
|
-// let mut neighbors_blast = Vec::new();
|
|
|
-// for neighbor in neighbors {
|
|
|
-// let e: Vec<BlastResult> = g
|
|
|
-// .edges_connecting(start_node, neighbor)
|
|
|
-// .map(|e| e.weight().clone().unwrap())
|
|
|
-// .collect();
|
|
|
-// neighbors_blast.extend(e);
|
|
|
-// }
|
|
|
-// let mut neighbors_blast: Vec<(BlastResult, usize)> = neighbors_blast
|
|
|
-// .iter()
|
|
|
-// .map(|b| {
|
|
|
-// if b.subject_id != start_name {
|
|
|
-// b.swap_query_subject()
|
|
|
-// } else {
|
|
|
-// b.clone()
|
|
|
-// }
|
|
|
-// })
|
|
|
-// .map(|b| (b.clone(), get_seq_len(records, &b.query_id)))
|
|
|
-// .collect();
|
|
|
-// neighbors_blast.sort_by_key(|b| b.0.s_end);
|
|
|
-//
|
|
|
-// let start_len = get_seq_len(records, &start_name);
|
|
|
-// println!("{start_len}");
|
|
|
-// println!("{neighbors_blast:#?}");
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn get_seq_len(records: &[Record], id: &str) -> usize {
|
|
|
-// let lens: Vec<usize> = records
|
|
|
-// .iter()
|
|
|
-// .filter(|r| String::from_utf8(r.qname().to_vec()).unwrap() == id)
|
|
|
-// .map(|r| r.seq_len())
|
|
|
-// .collect();
|
|
|
-// lens[0]
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn order_sequences_with_positions(blast_results: &[BlastResult]) -> Vec<(String, usize)> {
|
|
|
-// let mut graph: HashMap<String, HashSet<String>> = HashMap::new();
|
|
|
-// let mut in_degree: HashMap<String, usize> = HashMap::new();
|
|
|
-// let mut all_sequences: HashSet<String> = HashSet::new();
|
|
|
-//
|
|
|
-// // Build the graph
|
|
|
-// for result in blast_results {
|
|
|
-// all_sequences.insert(result.query_id.clone());
|
|
|
-// all_sequences.insert(result.subject_id.clone());
|
|
|
-//
|
|
|
-// if result.q_start < result.s_start {
|
|
|
-// graph
|
|
|
-// .entry(result.query_id.clone())
|
|
|
-// .or_insert_with(HashSet::new)
|
|
|
-// .insert(result.subject_id.clone());
|
|
|
-// *in_degree.entry(result.subject_id.clone()).or_insert(0) += 1;
|
|
|
-// } else {
|
|
|
-// graph
|
|
|
-// .entry(result.subject_id.clone())
|
|
|
-// .or_insert_with(HashSet::new)
|
|
|
-// .insert(result.query_id.clone());
|
|
|
-// *in_degree.entry(result.query_id.clone()).or_insert(0) += 1;
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Topological sort
|
|
|
-// let mut queue: VecDeque<String> = all_sequences
|
|
|
-// .iter()
|
|
|
-// .filter(|&seq| !in_degree.contains_key(seq))
|
|
|
-// .cloned()
|
|
|
-// .collect();
|
|
|
-// let mut ordered_sequences: Vec<String> = Vec::new();
|
|
|
-//
|
|
|
-// while let Some(seq) = queue.pop_front() {
|
|
|
-// ordered_sequences.push(seq.clone());
|
|
|
-//
|
|
|
-// if let Some(neighbors) = graph.get(&seq) {
|
|
|
-// for neighbor in neighbors {
|
|
|
-// if let Some(degree) = in_degree.get_mut(neighbor) {
|
|
|
-// *degree -= 1;
|
|
|
-// if *degree == 0 {
|
|
|
-// queue.push_back(neighbor.clone());
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Handle any remaining sequences (in case of cycles)
|
|
|
-// for seq in all_sequences {
|
|
|
-// if !ordered_sequences.contains(&seq) {
|
|
|
-// ordered_sequences.push(seq);
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Calculate absolute positions
|
|
|
-// let mut positions: HashMap<String, usize> = HashMap::new();
|
|
|
-// for (current_position, seq) in ordered_sequences.iter().enumerate() {
|
|
|
-// positions.insert(seq.clone(), current_position);
|
|
|
-// }
|
|
|
-//
|
|
|
-// let mut res: Vec<_> = ordered_sequences
|
|
|
-// .into_iter()
|
|
|
-// .map(|seq| (seq.clone(), *positions.get(&seq).unwrap()))
|
|
|
-// .collect();
|
|
|
-// res.sort_by(|a, b| a.1.cmp(&b.1));
|
|
|
-// res
|
|
|
-// }
|
|
|
-//
|
|
|
-//
|
|
|
-//
|
|
|
-// #[derive(Debug, Default)]
|
|
|
-// pub struct FlyeAsm {
|
|
|
-// pub input_id: String,
|
|
|
-// pub asm_dir: String,
|
|
|
-// pub input_bam: String,
|
|
|
-// pub input_records: Vec<bam::Record>,
|
|
|
-// pub on_contig_bam: String,
|
|
|
-// pub contigs: Option<Vec<String>>,
|
|
|
-// pub flye_asm_params: FlyeAsmParams,
|
|
|
-// }
|
|
|
-//
|
|
|
-// #[derive(Debug)]
|
|
|
-// pub struct FlyeAsmParams {
|
|
|
-// pub min_cov: u16,
|
|
|
-// }
|
|
|
-//
|
|
|
-// impl Default for FlyeAsmParams {
|
|
|
-// fn default() -> Self {
|
|
|
-// Self { min_cov: 2 }
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// impl FlyeAsm {
|
|
|
-// pub fn new(asm_dir: &str, file_stem: &str) -> anyhow::Result<Self> {
|
|
|
-// let input_bam = format!("{asm_dir}/{file_stem}.bam");
|
|
|
-// let on_contig_bam = format!("{asm_dir}/{file_stem}_contig.bam");
|
|
|
-//
|
|
|
-// let reader = bam::BamReader::from_path(&input_bam, 3)?;
|
|
|
-// let mut input_records = Vec::new();
|
|
|
-// for rec in reader {
|
|
|
-// input_records.push(rec?);
|
|
|
-// }
|
|
|
-// Ok(FlyeAsm {
|
|
|
-// asm_dir: asm_dir.to_string(),
|
|
|
-// input_id: file_stem.to_string(),
|
|
|
-// on_contig_bam,
|
|
|
-// contigs: None,
|
|
|
-// flye_asm_params: FlyeAsmParams::default(),
|
|
|
-// input_records,
|
|
|
-// input_bam,
|
|
|
-// })
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn assemble(mut self) -> anyhow::Result<Self> {
|
|
|
-// let min_cov = self.flye_asm_params.min_cov;
|
|
|
-//
|
|
|
-// let tmp_dir = format!("/tmp/ass_{}", uuid::Uuid::new_v4());
|
|
|
-// info!("Creating tmp directory {tmp_dir}");
|
|
|
-// fs::create_dir(&tmp_dir).unwrap();
|
|
|
-// let input_fa = format!("{}/{}.fasta", self.asm_dir, self.input_id);
|
|
|
-//
|
|
|
-// if !Path::new(&input_fa).exists() {
|
|
|
-// records_to_fasta(&self.input_records, &input_fa)?;
|
|
|
-// }
|
|
|
-//
|
|
|
-// // let input_fa = format!("{tmp_dir}/input.fa");
|
|
|
-// // records_to_fasta(&self.input_records, &input_fa)?;
|
|
|
-//
|
|
|
-// let flye_tmp_dir = format!("{tmp_dir}/flye");
|
|
|
-// fs::create_dir(&flye_tmp_dir).unwrap();
|
|
|
-//
|
|
|
-// run_flye(&input_fa, &flye_tmp_dir, "10M");
|
|
|
-//
|
|
|
-// let contigs_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
-//
|
|
|
-// if Path::new(&contigs_path).exists() {
|
|
|
-// let assembly_path = format!("{flye_tmp_dir}/assembly.fasta");
|
|
|
-// let flye_cov_path = format!("{flye_tmp_dir}/40-polishing/base_coverage.bed.gz");
|
|
|
-//
|
|
|
-// // Read the assembly fasta
|
|
|
-// let mut reader = File::open(assembly_path)
|
|
|
-// .map(BufReader::new)
|
|
|
-// .map(noodles_fasta::Reader::new)?;
|
|
|
-//
|
|
|
-// let mut contigs = Vec::new();
|
|
|
-// for result in reader.records() {
|
|
|
-// let record = result?;
|
|
|
-// let contig_name = String::from_utf8(record.name().to_vec()).unwrap();
|
|
|
-// let (s, e) = read_flye_coverage(&flye_cov_path, min_cov.into(), &contig_name);
|
|
|
-// let seq = record.sequence().as_ref();
|
|
|
-// let seq = String::from_utf8(seq.to_vec()).unwrap();
|
|
|
-// let seq: String = seq[s..e].into();
|
|
|
-// contigs.push(seq);
|
|
|
-// }
|
|
|
-// self.contigs = Some(contigs);
|
|
|
-// } else {
|
|
|
-// warn!("❌ No reads assembled for {}", self.input_id);
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Cleaning
|
|
|
-// fs::remove_dir_all(tmp_dir)?;
|
|
|
-//
|
|
|
-// Ok(self)
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn save(self) -> anyhow::Result<()> {
|
|
|
-// if self.contigs.is_none() {
|
|
|
-// return Err(anyhow!("No reads were assembled"));
|
|
|
-// }
|
|
|
-//
|
|
|
-// for (i, contig) in self.contigs.unwrap().iter().enumerate() {
|
|
|
-// let suffixe = if i == 0 {
|
|
|
-// "".to_string()
|
|
|
-// } else {
|
|
|
-// format!("_{i}")
|
|
|
-// };
|
|
|
-//
|
|
|
-// let contig_id = format!("{}{suffixe}_flye", self.input_id);
|
|
|
-//
|
|
|
-// let contig_fa = format!("{}/{contig_id}.fa", self.asm_dir);
|
|
|
-//
|
|
|
-// info!("Saving contig {contig_id} in {contig_fa}");
|
|
|
-// write_fasta(&contig_fa, &vec![(contig_id.clone(), contig.clone())]);
|
|
|
-// fai(&contig_fa)?;
|
|
|
-//
|
|
|
-// // Writing bed file from best blastn results
|
|
|
-// let bed_path = format!("{}/{contig_id}.bed", self.asm_dir);
|
|
|
-// let bed = pandora_lib_blastn::Blast::init(&contig_fa)?
|
|
|
-// .run()?
|
|
|
-// .keep_best()
|
|
|
-// .to_bed()?;
|
|
|
-// let mut f = File::create(bed_path)?;
|
|
|
-// f.write_all(bed.as_bytes())?;
|
|
|
-//
|
|
|
-// // Remaping input bam to contig
|
|
|
-// info!("Mapping input reads to {contig_id}");
|
|
|
-// let new_bam = format!("{}/{contig_id}.bam", self.asm_dir);
|
|
|
-// duct::cmd!("bwa", "index", contig_fa.clone()).run()?;
|
|
|
-// let input_fa = format!("{}/{}.fasta", self.asm_dir, self.input_id);
|
|
|
-// let bwa = format!("bwa mem {contig_fa} {input_fa}");
|
|
|
-// let samtools = "samtools sort /dev/stdin";
|
|
|
-// let pipe = format!("{bwa} | {samtools} > {new_bam}");
|
|
|
-// duct::cmd!("bash", "-c", pipe).run()?;
|
|
|
-//
|
|
|
-// // Copy modified base tags to new bam
|
|
|
-// cp_mod_tags(&self.input_records, &new_bam)?;
|
|
|
-//
|
|
|
-// // Run modkit
|
|
|
-// let modkit_pileup = format!("{}/{contig_id}_mod.bed", self.asm_dir);
|
|
|
-// duct::cmd!("modkit", "pileup", new_bam, modkit_pileup).run()?;
|
|
|
-// }
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-// }
|
|
|
-
|
|
|
-// fn cluster_reads(input_path: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
-// // Open the input BAM file
|
|
|
-// let mut reader = Reader::from_path(input_path)
|
|
|
-// .with_context(|| format!("Failed to open input BAM file: {}", input_path))?;
|
|
|
-// let header = reader.header().clone();
|
|
|
-// let header = rust_htslib::bam::Header::from_template(&header);
|
|
|
-//
|
|
|
-// // Read all records into a vector
|
|
|
-// let mut records: HashMap<i32, Vec<Record>> = HashMap::new();
|
|
|
-// for record in reader.records() {
|
|
|
-// let record = record?;
|
|
|
-// records.entry(record.tid()).or_default().push(record);
|
|
|
-// }
|
|
|
-//
|
|
|
-// records
|
|
|
-// .values_mut()
|
|
|
-// .for_each(|v| v.sort_by_key(|a| a.pos()));
|
|
|
-//
|
|
|
-// let threshold = 500;
|
|
|
-// let records: Vec<Vec<Record>> = records
|
|
|
-// .values()
|
|
|
-// .flat_map(|v| {
|
|
|
-// v.iter().fold(vec![vec![]], |mut acc: Vec<Vec<Record>>, r| {
|
|
|
-// if let Some(last_group) = acc.last_mut() {
|
|
|
-// if last_group.is_empty()
|
|
|
-// || r.pos() - last_group.last().unwrap().pos() < threshold
|
|
|
-// {
|
|
|
-// last_group.push(r.clone());
|
|
|
-// } else {
|
|
|
-// acc.push(vec![r.clone()]);
|
|
|
-// }
|
|
|
-// }
|
|
|
-// acc
|
|
|
-// })
|
|
|
-// })
|
|
|
-// .collect();
|
|
|
-//
|
|
|
-// let file_stem = PathBuf::from(input_path)
|
|
|
-// .file_stem()
|
|
|
-// .unwrap()
|
|
|
-// .to_str()
|
|
|
-// .unwrap()
|
|
|
-// .to_string();
|
|
|
-// let dir = PathBuf::from(input_path).parent().unwrap().to_owned();
|
|
|
-// let mut output_paths = Vec::new();
|
|
|
-//
|
|
|
-// // Iterate through records, creating a new file for each omitted record
|
|
|
-// for (i, recs) in records.iter().enumerate() {
|
|
|
-// if recs.len() < 3 {
|
|
|
-// continue;
|
|
|
-// }
|
|
|
-// let output_filename = format!("{}/{file_stem}_{i}.bam", dir.display());
|
|
|
-// let output_path = PathBuf::from(&output_filename);
|
|
|
-// info!("Creating bam {}", output_path.display());
|
|
|
-//
|
|
|
-// let mut writer = rust_htslib::bam::Writer::from_path(
|
|
|
-// &output_path,
|
|
|
-// &header,
|
|
|
-// rust_htslib::bam::Format::Bam,
|
|
|
-// )
|
|
|
-// .with_context(|| {
|
|
|
-// format!(
|
|
|
-// "Failed to create output BAM file: {}",
|
|
|
-// output_path.display()
|
|
|
-// )
|
|
|
-// })?;
|
|
|
-//
|
|
|
-// // Write all records except the omitted one
|
|
|
-// for record in recs.iter() {
|
|
|
-// writer.write(record).with_context(|| {
|
|
|
-// format!("Failed to write record to file: {}", output_path.display())
|
|
|
-// })?;
|
|
|
-// }
|
|
|
-//
|
|
|
-// output_paths.push(output_path);
|
|
|
-// }
|
|
|
-// Ok(output_paths)
|
|
|
-// }
|
|
|
-//
|
|
|
-// fn two_by_tow(input_path: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
-// // Open the input BAM file
|
|
|
-// let mut reader = Reader::from_path(input_path)
|
|
|
-// .with_context(|| format!("Failed to open input BAM file: {}", input_path))?;
|
|
|
-// let header = reader.header().clone();
|
|
|
-// let header = rust_htslib::bam::Header::from_template(&header);
|
|
|
-//
|
|
|
-// // Read all records into a vector
|
|
|
-// let mut records = Vec::new();
|
|
|
-// for record in reader.records() {
|
|
|
-// records.push(record?);
|
|
|
-// }
|
|
|
-//
|
|
|
-// if records.len() < 3 {
|
|
|
-// return Err(anyhow!("Not enough reads"));
|
|
|
-// }
|
|
|
-//
|
|
|
-// let file_stem = PathBuf::from(input_path)
|
|
|
-// .file_stem()
|
|
|
-// .unwrap()
|
|
|
-// .to_str()
|
|
|
-// .unwrap()
|
|
|
-// .to_string();
|
|
|
-// let dir = PathBuf::from(input_path).parent().unwrap().to_owned();
|
|
|
-// let mut output_paths = Vec::new();
|
|
|
-//
|
|
|
-// for i in 0..records.len() {
|
|
|
-// let output_filename = format!("{}/{file_stem}_{i}.bam", dir.display());
|
|
|
-// let output_path = PathBuf::from(&output_filename);
|
|
|
-// info!("Creating bam {}", output_path.display());
|
|
|
-//
|
|
|
-// let mut writer = rust_htslib::bam::Writer::from_path(
|
|
|
-// &output_path,
|
|
|
-// &header,
|
|
|
-// rust_htslib::bam::Format::Bam,
|
|
|
-// )
|
|
|
-// .with_context(|| {
|
|
|
-// format!(
|
|
|
-// "Failed to create output BAM file: {}",
|
|
|
-// output_path.display()
|
|
|
-// )
|
|
|
-// })?;
|
|
|
-//
|
|
|
-// // Write two records to the file, skipping the i-th record
|
|
|
-// for (j, record) in records.iter().enumerate() {
|
|
|
-// if j != i && j != (i + 1) % records.len() {
|
|
|
-// writer.write(record)?;
|
|
|
-// } else {
|
|
|
-// warn!(
|
|
|
-// "{} removing {}",
|
|
|
-// output_path.display(),
|
|
|
-// String::from_utf8(record.qname().to_vec()).unwrap()
|
|
|
-// );
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// output_paths.push(output_path);
|
|
|
-// }
|
|
|
-// Ok(output_paths)
|
|
|
-// }
|
|
|
-//
|
|
|
-// fn two_by_tow_new(input_path: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
-// // Open the input BAM file
|
|
|
-// let mut reader = Reader::from_path(input_path)
|
|
|
-// .with_context(|| format!("Failed to open input BAM file: {}", input_path))?;
|
|
|
-// let header = reader.header().clone();
|
|
|
-// let header = rust_htslib::bam::Header::from_template(&header);
|
|
|
-//
|
|
|
-// // Read all records into a vector
|
|
|
-// let mut records = Vec::new();
|
|
|
-// for record in reader.records() {
|
|
|
-// records.push(record?);
|
|
|
-// }
|
|
|
-//
|
|
|
-// if records.len() < 3 {
|
|
|
-// return Err(anyhow!("Not enough reads"));
|
|
|
-// }
|
|
|
-//
|
|
|
-// let file_stem = PathBuf::from(input_path)
|
|
|
-// .file_stem()
|
|
|
-// .unwrap()
|
|
|
-// .to_str()
|
|
|
-// .unwrap()
|
|
|
-// .to_string();
|
|
|
-// let dir = PathBuf::from(input_path).parent().unwrap().to_owned();
|
|
|
-// let mut output_paths = Vec::new();
|
|
|
-//
|
|
|
-// // Iterate through records, creating a new file for each pair of records
|
|
|
-// for i in (0..records.len()).step_by(2) {
|
|
|
-// let output_filename = format!("{}/{file_stem}_{}.bam", dir.display(), i / 2);
|
|
|
-// let output_path = PathBuf::from(&output_filename);
|
|
|
-// info!("Creating bam {}", output_path.display());
|
|
|
-//
|
|
|
-// let mut writer = rust_htslib::bam::Writer::from_path(
|
|
|
-// &output_path,
|
|
|
-// &header,
|
|
|
-// rust_htslib::bam::Format::Bam,
|
|
|
-// )
|
|
|
-// .with_context(|| {
|
|
|
-// format!(
|
|
|
-// "Failed to create output BAM file: {}",
|
|
|
-// output_path.display()
|
|
|
-// )
|
|
|
-// })?;
|
|
|
-//
|
|
|
-// // Write two records to the file
|
|
|
-// writer.write(&records[i])?;
|
|
|
-// if i + 1 < records.len() {
|
|
|
-// writer.write(&records[i + 1])?;
|
|
|
-// }
|
|
|
-//
|
|
|
-// output_paths.push(output_path);
|
|
|
-// }
|
|
|
-// Ok(output_paths)
|
|
|
-// }
|
|
|
-//
|
|
|
-// fn create_bam_files_with_omissions(input_path: &str) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
-// // Open the input BAM file
|
|
|
-// let mut reader = Reader::from_path(input_path)
|
|
|
-// .with_context(|| format!("Failed to open input BAM file: {}", input_path))?;
|
|
|
-// let header = reader.header().clone();
|
|
|
-// let header = rust_htslib::bam::Header::from_template(&header);
|
|
|
-//
|
|
|
-// // Read all records into a vector
|
|
|
-// let mut records = Vec::new();
|
|
|
-// for record in reader.records() {
|
|
|
-// records.push(record?);
|
|
|
-// }
|
|
|
-//
|
|
|
-// if records.len() < 3 {
|
|
|
-// return Err(anyhow!("Not enough reads"));
|
|
|
-// }
|
|
|
-//
|
|
|
-// let file_stem = PathBuf::from(input_path)
|
|
|
-// .file_stem()
|
|
|
-// .unwrap()
|
|
|
-// .to_str()
|
|
|
-// .unwrap()
|
|
|
-// .to_string();
|
|
|
-// let dir = PathBuf::from(input_path).parent().unwrap().to_owned();
|
|
|
-// let mut output_paths = Vec::new();
|
|
|
-//
|
|
|
-// // Iterate through records, creating a new file for each omitted record
|
|
|
-// for (i, omitted_record) in records.iter().enumerate() {
|
|
|
-// let output_filename = format!("{}/{file_stem}_{i}.bam", dir.display());
|
|
|
-// let output_path = PathBuf::from(&output_filename);
|
|
|
-// info!("Creating bam {}", output_path.display());
|
|
|
-//
|
|
|
-// let mut writer = rust_htslib::bam::Writer::from_path(
|
|
|
-// &output_path,
|
|
|
-// &header,
|
|
|
-// rust_htslib::bam::Format::Bam,
|
|
|
-// )
|
|
|
-// .with_context(|| {
|
|
|
-// format!(
|
|
|
-// "Failed to create output BAM file: {}",
|
|
|
-// output_path.display()
|
|
|
-// )
|
|
|
-// })?;
|
|
|
-//
|
|
|
-// // Write all records except the omitted one
|
|
|
-// for record in records.iter() {
|
|
|
-// if record != omitted_record {
|
|
|
-// writer.write(record).with_context(|| {
|
|
|
-// format!("Failed to write record to file: {}", output_path.display())
|
|
|
-// })?;
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// output_paths.push(output_path);
|
|
|
-// }
|
|
|
-//
|
|
|
-// Ok(output_paths)
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn rescue(asm_dir: &str, file_stem: &str) -> anyhow::Result<()> {
|
|
|
-// let input_bam = format!("{}/{}.bam", asm_dir, file_stem);
|
|
|
-// // let paths = cluster_reads(&input_bam)?;
|
|
|
-// let paths = two_by_tow(&input_bam)?;
|
|
|
-// // let paths = create_bam_files_with_omissions(&input_bam)?;
|
|
|
-// paths.par_iter().for_each(|path| {
|
|
|
-// let file_stem = path.file_stem().unwrap().to_str().unwrap().to_string();
|
|
|
-// info!("Trying to assemble {file_stem}");
|
|
|
-// match FlyeAsm::new(asm_dir, &file_stem)
|
|
|
-// .unwrap()
|
|
|
-// .assemble()
|
|
|
-// .unwrap()
|
|
|
-// .save()
|
|
|
-// {
|
|
|
-// Ok(_) => {
|
|
|
-// info!("{file_stem} has been assembled ✅");
|
|
|
-// let link = igv_link(asm_dir, &file_stem).unwrap();
|
|
|
-// info!("{link}");
|
|
|
-// }
|
|
|
-// Err(err) => {
|
|
|
-// warn!("❌ {file_stem} {err}");
|
|
|
-// fs::remove_file(path).unwrap();
|
|
|
-// }
|
|
|
-// }
|
|
|
-// });
|
|
|
-//
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-//
|
|
|
-//
|
|
|
-//
|
|
|
-// pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
|
|
|
-// let pattern = format!("{}/*.bam", dir);
|
|
|
-// // let re = Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.bam$")?;
|
|
|
-//
|
|
|
-// for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
|
|
|
-// match entry {
|
|
|
-// Ok(path) => {
|
|
|
-// if let Some(_file_name) = path.file_name().and_then(|name| name.to_str()) {
|
|
|
-// if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
|
|
|
-// if PathBuf::from(format!("{dir}/{input_id}_flye.bam")).exists() && !force {
|
|
|
-// warn!("{input_id} already assembled: {}", igv_link(dir, input_id)?);
|
|
|
-// } else {
|
|
|
-// let f = FlyeAsm::new(dir, input_id)?;
|
|
|
-// match f.assemble()?.save() {
|
|
|
-// // Ok(flye_asm) => match flye_asm.save() {
|
|
|
-// Ok(_) => {
|
|
|
-// info!("✅ Assembled: {}", igv_link(dir, input_id)?);
|
|
|
-// }
|
|
|
-// // Err(e) => warn!("❌ {e}"),
|
|
|
-// // },
|
|
|
-// Err(err) => {
|
|
|
-// warn!("❌ {input_id} has not been assembled {err}");
|
|
|
-// warn!("Rescuing");
|
|
|
-// rescue(dir, input_id)?;
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// Err(e) => warn!("{:?}", e),
|
|
|
-// }
|
|
|
-// }
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn dedup_dir(dir: &str) -> anyhow::Result<()> {
|
|
|
-// // load bed
|
|
|
-// let pattern = format!("{}/*_flye.bed", dir);
|
|
|
-// // let re =
|
|
|
-// // Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}_flye\.bed$")?;
|
|
|
-//
|
|
|
-// let mut bed_hm: HashMap<String, Vec<String>> = HashMap::new();
|
|
|
-// for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
|
|
|
-// match entry {
|
|
|
-// Ok(path) => {
|
|
|
-// println!("{path:?}");
|
|
|
-// if let Some(_file_name) = path.file_name().and_then(|name| name.to_str()) {
|
|
|
-// // if re.is_match(file_name) {
|
|
|
-// if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
|
|
|
-// let mut bed_blast: Vec<String> = read_blastn_bed(path.to_str().unwrap())?
|
|
|
-// .iter()
|
|
|
-// .map(|r| format!("{}{}", r.name, r.strand))
|
|
|
-// .collect();
|
|
|
-// bed_blast.sort();
|
|
|
-// let key = bed_blast.join("|");
|
|
|
-// bed_hm
|
|
|
-// .entry(key)
|
|
|
-// .or_default()
|
|
|
-// .push(input_id.to_owned().replace("_flye", ""));
|
|
|
-// }
|
|
|
-// // }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// Err(e) => println!("{:?}", e),
|
|
|
-// }
|
|
|
-// }
|
|
|
-// println!("{bed_hm:#?}");
|
|
|
-//
|
|
|
-// for group in bed_hm.values() {
|
|
|
-// if group.len() < 2 {
|
|
|
-// continue;
|
|
|
-// }
|
|
|
-// let new_locus_id = uuid::Uuid::new_v4();
|
|
|
-// let output_bam_path = format!("{dir}/{new_locus_id}.bam");
|
|
|
-// info!("Merging {} bams into {}", group.len(), output_bam_path);
|
|
|
-// let input_bam_paths = group.iter().map(|s| format!("{dir}/{s}.bam")).collect();
|
|
|
-// merge_bam_files(input_bam_paths, &output_bam_path)?;
|
|
|
-// dir_flye(dir, false)?;
|
|
|
-// }
|
|
|
-//
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub struct BlastnBedRow {
|
|
|
-// pub contig: String,
|
|
|
-// pub start: u32,
|
|
|
-// pub end: u32,
|
|
|
-// pub name: String,
|
|
|
-// pub score: String,
|
|
|
-// pub strand: String,
|
|
|
-// }
|
|
|
-//
|
|
|
-// impl FromStr for BlastnBedRow {
|
|
|
-// type Err = anyhow::Error;
|
|
|
-//
|
|
|
-// fn from_str(line: &str) -> anyhow::Result<Self> {
|
|
|
-// let fields: Vec<&str> = line.split('\t').collect();
|
|
|
-//
|
|
|
-// if fields.len() != 6 {
|
|
|
-// return Err(anyhow!(
|
|
|
-// "Error while parsing bed row, number of fields doesn't match {}",
|
|
|
-// line
|
|
|
-// ));
|
|
|
-// }
|
|
|
-//
|
|
|
-// let row = BlastnBedRow {
|
|
|
-// contig: fields[0].to_string(),
|
|
|
-// start: fields[1].parse()?,
|
|
|
-// end: fields[2].parse()?,
|
|
|
-// name: fields[3].to_string(),
|
|
|
-// score: fields[4].to_string(),
|
|
|
-// strand: fields[5].to_string(),
|
|
|
-// };
|
|
|
-//
|
|
|
-// Ok(row)
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// fn read_blastn_bed(file_path: &str) -> anyhow::Result<Vec<BlastnBedRow>> {
|
|
|
-// let file = File::open(file_path)?;
|
|
|
-// let reader = BufReader::new(file);
|
|
|
-//
|
|
|
-// reader
|
|
|
-// .lines()
|
|
|
-// .map_while(Result::ok)
|
|
|
-// .map(|s| BlastnBedRow::from_str(&s))
|
|
|
-// .collect()
|
|
|
-// }
|
|
|
-//
|
|
|
-// pub fn igv_link(dir: &str, contig_id: &str) -> anyhow::Result<String> {
|
|
|
-// let contig_fa = format!("{dir}/{contig_id}_flye.fa");
|
|
|
-// let bam_track = Track::Bam(BamTrack::new(
|
|
|
-// "Assembled reads",
|
|
|
-// &format!("{dir}/{contig_id}_flye.bam"),
|
|
|
-// ));
|
|
|
-// let bed_track = Track::Bed(BedTrack::new(
|
|
|
-// "Blastn",
|
|
|
-// &format!("{dir}/{contig_id}_flye.bed"),
|
|
|
-// ));
|
|
|
-// let mod_track = Track::Bed(BedTrack::new(
|
|
|
-// "Modified bases",
|
|
|
-// &format!("{dir}/{contig_id}_flye_mod.bed"),
|
|
|
-// ));
|
|
|
-//
|
|
|
-// let reference = pandora_lib_igv::ReferenceValues {
|
|
|
-// id: format!("{contig_id}_flye.fa"),
|
|
|
-// name: format!("{contig_id}_flye.fa"),
|
|
|
-// index_url: format!("{contig_fa}.fai"),
|
|
|
-// fasta_url: contig_fa,
|
|
|
-// ..Default::default()
|
|
|
-// };
|
|
|
-// let igv = pandora_lib_igv::Session::default()
|
|
|
-// .with_reference(reference)
|
|
|
-// .add_track(bam_track)?
|
|
|
-// .add_track(bed_track)?
|
|
|
-// .add_track(mod_track)?;
|
|
|
-// let link = igv.link("http://store-desktop.local/igv/")?;
|
|
|
-// Ok(link)
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Function to merge two BAM files into a new BAM file
|
|
|
-// fn merge_bam_files(input_bam_paths: Vec<String>, output_bam_path: &str) -> anyhow::Result<()> {
|
|
|
-// // Ensure there is at least one BAM file to merge
|
|
|
-// if input_bam_paths.is_empty() {
|
|
|
-// return Err(anyhow!("No input BAM files provided"));
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Open the first input BAM file to get the header
|
|
|
-// let mut bam1 = rust_htslib::bam::Reader::from_path(&input_bam_paths[0])?;
|
|
|
-// let header = rust_htslib::bam::Header::from_template(bam1.header());
|
|
|
-//
|
|
|
-// // Create a new BAM writer with the header from the first BAM file
|
|
|
-// let mut output_bam = rust_htslib::bam::Writer::from_path(
|
|
|
-// output_bam_path,
|
|
|
-// &header,
|
|
|
-// rust_htslib::bam::Format::Bam,
|
|
|
-// )?;
|
|
|
-//
|
|
|
-// // Write records from the first BAM file to the output BAM file
|
|
|
-// for result in bam1.records() {
|
|
|
-// let record = result?;
|
|
|
-// output_bam.write(&record)?;
|
|
|
-// }
|
|
|
-//
|
|
|
-// // Iterate over the remaining BAM files and write their records to the output BAM file
|
|
|
-// for bam_path in &input_bam_paths[1..] {
|
|
|
-// let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
|
|
|
-// for result in bam.records() {
|
|
|
-// let record = result?;
|
|
|
-// output_bam.write(&record)?;
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-
|
|
|
-// pub fn par_whole_assemble(reads_dir: &str, dict_path: &str, force: bool) -> anyhow::Result<()> {
|
|
|
-// read_dict(dict_path)?.par_iter().for_each(|(contig, _)| {
|
|
|
-// dir_flye(&format!("{reads_dir}/{contig}"), force).unwrap();
|
|
|
-// });
|
|
|
-// Ok(())
|
|
|
-// }
|
|
|
-
|
|
|
-pub fn relevant_links(dir_path: &str) -> anyhow::Result<()> {
|
|
|
- let pattern = format!("{}/**/*_flye.bed", dir_path);
|
|
|
- let bed_paths: Vec<PathBuf> = glob::glob(&pattern)
|
|
|
- .expect("Failed to read glob pattern")
|
|
|
- .filter_map(Result::ok)
|
|
|
- .collect();
|
|
|
- for bed_path in bed_paths {
|
|
|
- let bed_rows = read_bed(&bed_path)?;
|
|
|
- if bed_rows.len() > 1 {
|
|
|
- println!("{bed_rows:#?}");
|
|
|
- }
|
|
|
- }
|
|
|
- Ok(())
|
|
|
-}
|
|
|
-
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
- use crate::assembler::spades::SpadesConfig;
|
|
|
+ use std::path::PathBuf;
|
|
|
+
|
|
|
+ use crate::assembler::{spades::SpadesConfig, wtdbg2::Wtdbg2Config};
|
|
|
|
|
|
use self::assembler::{assemble_records, flye::FlyeConfig, AssembleConfig};
|
|
|
|
|
|
@@ -842,6 +29,7 @@ mod tests {
|
|
|
let res = assemble_records(
|
|
|
&PathBuf::from(bam),
|
|
|
&vec![AssembleConfig::Flye(FlyeConfig::new("/tmp/test_ass_flye"))],
|
|
|
+ 1
|
|
|
);
|
|
|
println!("{res:#?}");
|
|
|
}
|
|
|
@@ -860,6 +48,7 @@ mod tests {
|
|
|
let res = assemble_records(
|
|
|
&PathBuf::from(bam),
|
|
|
&vec![AssembleConfig::Flye(FlyeConfig::new("/tmp/test_ass_flye"))],
|
|
|
+ 1
|
|
|
);
|
|
|
println!("{res:#?}");
|
|
|
}
|
|
|
@@ -867,88 +56,50 @@ mod tests {
|
|
|
#[test]
|
|
|
fn spades_bam() {
|
|
|
init();
|
|
|
- let id = "ROBIN";
|
|
|
- let input_id = "143063957-143063957_8a24";
|
|
|
+ // let id = "ROBIN";
|
|
|
+ // let input_id = "143063957-143063957_8a24";
|
|
|
+ // let chr = "chr10";
|
|
|
+
|
|
|
+ let id = "LEVASSEUR";
|
|
|
+ let input_id = "12677262-102018167_2c45";
|
|
|
+ let chr = "chr10";
|
|
|
|
|
|
let res_dir = "/data/longreads_basic_pipe";
|
|
|
- let asm_dir = format!("{res_dir}/{id}/diag/scan/reads/chr9");
|
|
|
+ let asm_dir = format!("{res_dir}/{id}/diag/scan/reads/{chr}");
|
|
|
let bam = format!("{asm_dir}/{input_id}.bam");
|
|
|
|
|
|
let res = assemble_records(
|
|
|
&PathBuf::from(bam),
|
|
|
- &vec![AssembleConfig::Spades(SpadesConfig::new("/tmp/test_ass_spades"))],
|
|
|
+ &vec![AssembleConfig::Spades(SpadesConfig::new(
|
|
|
+ "/data/spades",
|
|
|
+ ))],
|
|
|
+ 1
|
|
|
);
|
|
|
println!("{res:#?}");
|
|
|
}
|
|
|
|
|
|
-
|
|
|
- // #[test]
|
|
|
- // fn flye_test() -> anyhow::Result<()> {
|
|
|
- // init();
|
|
|
- // let id = "ROBIN";
|
|
|
- // let input_id = "143063957-143063957_8a24";
|
|
|
- //
|
|
|
- // // let id = "SALICETTO";
|
|
|
- // // let input_id = "cd8e4a8a-27ed-4045-af88-9838201f164f";
|
|
|
- //
|
|
|
- // let res_dir = "/data/longreads_basic_pipe";
|
|
|
- // let asm_dir = format!("{res_dir}/{id}/diag/scan/reads/chr9");
|
|
|
- //
|
|
|
- // match FlyeAsm::new(&asm_dir, input_id)?.assemble()?.save() {
|
|
|
- // Ok(_) => (),
|
|
|
- // Err(err) => {
|
|
|
- // warn!("{input_id} has not been assembled ❌ {err}");
|
|
|
- // warn!("Rescuing");
|
|
|
- // rescue(&asm_dir, input_id)?;
|
|
|
- // }
|
|
|
- // };
|
|
|
- // // let l = igv_link(&asm_dir, input_id)?;
|
|
|
- // // info!("{l}");
|
|
|
- // Ok(())
|
|
|
- // }
|
|
|
- //
|
|
|
- // #[test]
|
|
|
- // fn flye_d() -> anyhow::Result<()> {
|
|
|
- // init();
|
|
|
- // let id = "LEVASSEUR";
|
|
|
- // let res_dir = "/data/longreads_basic_pipe";
|
|
|
- // let asm_dir = format!("{res_dir}/{id}/diag/asm_contigs");
|
|
|
- // dir_flye(&asm_dir, true)
|
|
|
- // }
|
|
|
- //
|
|
|
- // #[test]
|
|
|
- // fn tmp() {
|
|
|
- // init();
|
|
|
- // dir_flye(
|
|
|
- // "/data/longreads_basic_pipe/LEVASSEUR/diag/scan/reads/chr10",
|
|
|
- // true,
|
|
|
- // )
|
|
|
- // .unwrap();
|
|
|
- // }
|
|
|
- //
|
|
|
- // #[test]
|
|
|
- // fn dedup() {
|
|
|
- // init();
|
|
|
- // dedup_dir("/data/tmp/scan_7ed2f43c-d16d-4dcc-bdb4-fb619d082991").unwrap();
|
|
|
- // }
|
|
|
- //
|
|
|
- // #[test]
|
|
|
- // fn whole() {
|
|
|
- // init();
|
|
|
- // let id = "ROBIN";
|
|
|
- // if let Err(err) = par_whole_assemble(
|
|
|
- // &format!("/data/longreads_basic_pipe/{id}/diag/scan/reads"),
|
|
|
- // "/data/ref/hs1/chm13v2.0.dict",
|
|
|
- // false,
|
|
|
- // ) {
|
|
|
- // println!("{err}");
|
|
|
- // }
|
|
|
- // }
|
|
|
- //
|
|
|
#[test]
|
|
|
- fn links() {
|
|
|
+ fn wtdbg2_bam() {
|
|
|
init();
|
|
|
+ // let id = "LEVASSEUR";
|
|
|
+ // let input_id = "12677262-102018167_2c45";
|
|
|
+ // let chr = "chr10";
|
|
|
+
|
|
|
let id = "ROBIN";
|
|
|
- relevant_links(&format!("/data/longreads_basic_pipe/{id}/diag/scan/reads")).unwrap();
|
|
|
+ let input_id = "143063957-143063957_8a24";
|
|
|
+ let chr = "chr9";
|
|
|
+
|
|
|
+ let res_dir = "/data/longreads_basic_pipe";
|
|
|
+ let asm_dir = format!("{res_dir}/{id}/diag/scan/reads/{chr}");
|
|
|
+ let bam = format!("{asm_dir}/{input_id}.bam");
|
|
|
+
|
|
|
+ let res = assemble_records(
|
|
|
+ &PathBuf::from(bam),
|
|
|
+ &vec![AssembleConfig::Wtdbg2(Wtdbg2Config::new(
|
|
|
+ "/data/wtdbg2",
|
|
|
+ ))],
|
|
|
+ 1
|
|
|
+ );
|
|
|
+ println!("{res:#?}");
|
|
|
}
|
|
|
}
|