|
|
@@ -24,7 +24,7 @@ use rust_htslib::bam::{Read, Reader, Record};
|
|
|
use std::{
|
|
|
collections::{HashMap, HashSet, VecDeque},
|
|
|
fs::{self, File},
|
|
|
- io::{BufReader, Cursor, Write},
|
|
|
+ io::{BufRead, BufReader, Cursor, Write},
|
|
|
path::{Path, PathBuf},
|
|
|
};
|
|
|
use uuid::Uuid;
|
|
|
@@ -386,11 +386,8 @@ pub fn order_sequences_with_positions(blast_results: &[BlastResult]) -> Vec<(Str
|
|
|
|
|
|
// Calculate absolute positions
|
|
|
let mut positions: HashMap<String, usize> = HashMap::new();
|
|
|
- let mut current_position = 0;
|
|
|
-
|
|
|
- for seq in &ordered_sequences {
|
|
|
+ for (current_position, seq) in ordered_sequences.iter().enumerate() {
|
|
|
positions.insert(seq.clone(), current_position);
|
|
|
- current_position += 1; // Increment position for each sequence
|
|
|
}
|
|
|
|
|
|
let mut res: Vec<_> = ordered_sequences
|
|
|
@@ -401,74 +398,6 @@ pub fn order_sequences_with_positions(blast_results: &[BlastResult]) -> Vec<(Str
|
|
|
res
|
|
|
}
|
|
|
|
|
|
-// fn order_sequences_with_positions_bam(records: &Vec<bam::Record>, header: &Header) -> anyhow::Result<Vec<(String, usize)>> {
|
|
|
-// let mut graph: HashMap<String, HashSet<String>> = HashMap::new();
|
|
|
-// let mut in_degree: HashMap<String, usize> = HashMap::new();
|
|
|
-// let mut all_sequences: HashSet<String> = HashSet::new();
|
|
|
-//
|
|
|
-// for record in records {
|
|
|
-// let query_id = String::from_utf8(record.name().to_vec());
|
|
|
-// let subject_id = record.ref_id().clone();
|
|
|
-// let q_start = record.aligned_query_start();
|
|
|
-// let s_start = record.start();
|
|
|
-//
|
|
|
-// all_sequences.insert(query_id.clone());
|
|
|
-// all_sequences.insert(subject_id.clone());
|
|
|
-//
|
|
|
-// if q_start < s_start {
|
|
|
-// graph.entry(query_id.clone())
|
|
|
-// .or_insert_with(HashSet::new())
|
|
|
-// .insert(subject_id.clone());
|
|
|
-// *in_degree.entry(subject_id.clone()).or_insert(0) += 1;
|
|
|
-// } else {
|
|
|
-// graph.entry(subject_id.clone())
|
|
|
-// .or_insert_with(HashSet::new())
|
|
|
-// .insert(query_id.clone());
|
|
|
-// *in_degree.entry(query_id.clone()).or_insert(0) += 1;
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// let mut queue: VecDeque<String> = all_sequences.iter()
|
|
|
-// .filter(|&seq| !in_degree.contains_key(seq))
|
|
|
-// .cloned()
|
|
|
-// .collect();
|
|
|
-//
|
|
|
-// let mut ordered_sequences: Vec<String> = Vec::new();
|
|
|
-//
|
|
|
-// while let Some(seq) = queue.pop_front() {
|
|
|
-// ordered_sequences.push(seq.clone());
|
|
|
-//
|
|
|
-// if let Some(neighbors) = graph.get(&seq) {
|
|
|
-// for neighbor in neighbors {
|
|
|
-// if let Some(degree) = in_degree.get_mut(neighbor) {
|
|
|
-// *degree -= 1;
|
|
|
-// if *degree == 0 {
|
|
|
-// queue.push_back(neighbor.clone());
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// for seq in all_sequences {
|
|
|
-// if !ordered_sequences.contains(&seq) {
|
|
|
-// ordered_sequences.push(seq.clone());
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// let mut positions: HashMap<String, usize> = HashMap::new();
|
|
|
-// let mut current_position = 0;
|
|
|
-//
|
|
|
-// for seq in &ordered_sequences {
|
|
|
-// positions.insert(seq.clone(), current_position);
|
|
|
-// current_position += 1;
|
|
|
-// }
|
|
|
-//
|
|
|
-// ordered_sequences.into_iter()
|
|
|
-// .map(|seq| (seq.clone(), *positions.get(&seq).unwrap()))
|
|
|
-// .collect()
|
|
|
-// }
|
|
|
-
|
|
|
pub fn fai(path: &str) -> Result<std::process::Output, std::io::Error> {
|
|
|
duct::cmd!("samtools", "faidx", path).run()
|
|
|
}
|
|
|
@@ -524,6 +453,10 @@ impl FlyeAsm {
|
|
|
fs::create_dir(&tmp_dir).unwrap();
|
|
|
let input_fa = format!("{}/{}.fasta", self.asm_dir, self.input_id);
|
|
|
|
|
|
+ if !Path::new(&input_fa).exists() {
|
|
|
+ records_to_fasta(&self.input_records, &input_fa)?;
|
|
|
+ }
|
|
|
+
|
|
|
// let input_fa = format!("{tmp_dir}/input.fa");
|
|
|
// records_to_fasta(&self.input_records, &input_fa)?;
|
|
|
|
|
|
@@ -588,7 +521,7 @@ impl FlyeAsm {
|
|
|
let bed_path = format!("{}/{contig_id}.bed", self.asm_dir);
|
|
|
let bed = pandora_lib_blastn::Blast::init(&contig_fa)?
|
|
|
.run()?
|
|
|
- .keep_best()
|
|
|
+ // .keep_best()
|
|
|
.to_bed()?;
|
|
|
let mut f = File::create(bed_path)?;
|
|
|
f.write_all(bed.as_bytes())?;
|
|
|
@@ -635,8 +568,7 @@ pub fn records_to_fasta(r: &Vec<bam::Record>, fa_path: &str) -> anyhow::Result<(
|
|
|
|
|
|
pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
|
|
|
let pattern = format!("{}/*.bam", dir);
|
|
|
- let re =
|
|
|
- Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.bam$").unwrap();
|
|
|
+ let re = Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.bam$")?;
|
|
|
|
|
|
for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
|
|
|
match entry {
|
|
|
@@ -644,7 +576,9 @@ pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
|
|
|
if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) {
|
|
|
if re.is_match(file_name) {
|
|
|
if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
|
|
|
- if PathBuf::from(format!("{dir}/{input_id}_flye.bam")).exists() && !force {
|
|
|
+ if PathBuf::from(format!("{dir}/{input_id}_flye.bam")).exists()
|
|
|
+ && !force
|
|
|
+ {
|
|
|
warn!("{input_id} already assembled: {}", igv_link(dir, input_id)?);
|
|
|
} else {
|
|
|
let f = FlyeAsm::new(dir, input_id)?;
|
|
|
@@ -667,6 +601,102 @@ pub fn dir_flye(dir: &str, force: bool) -> anyhow::Result<()> {
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
+pub fn dedup_dir(dir: &str) -> anyhow::Result<()> {
|
|
|
+ // load bed
|
|
|
+ let pattern = format!("{}/*.bed", dir);
|
|
|
+ let re =
|
|
|
+ Regex::new(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}_flye\.bed$")?;
|
|
|
+
|
|
|
+ let mut bed_hm: HashMap<String, Vec<String>> = HashMap::new();
|
|
|
+ for entry in glob::glob(&pattern).expect("Failed to read glob pattern") {
|
|
|
+ match entry {
|
|
|
+ Ok(path) => {
|
|
|
+ println!("{path:?}");
|
|
|
+ if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) {
|
|
|
+ if re.is_match(file_name) {
|
|
|
+ if let Some(input_id) = path.file_stem().and_then(|n| n.to_str()) {
|
|
|
+ let mut bed_blast: Vec<String> = read_tsv_file(path.to_str().unwrap())?
|
|
|
+ .iter()
|
|
|
+ .map(|r| format!("{}{}", r.name, r.strand))
|
|
|
+ .collect();
|
|
|
+ bed_blast.sort();
|
|
|
+ let key = bed_blast.join("|");
|
|
|
+ bed_hm.entry(key).or_default().push(input_id.to_owned().replace("_flye", ""));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Err(e) => println!("{:?}", e),
|
|
|
+ }
|
|
|
+ }
|
|
|
+ println!("{bed_hm:#?}");
|
|
|
+
|
|
|
+ for group in bed_hm.values() {
|
|
|
+ if group.len() < 2 {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ let new_locus_id = uuid::Uuid::new_v4();
|
|
|
+ let output_bam_path = format!("{dir}/{new_locus_id}.bam");
|
|
|
+ info!("Merging {} bams into {}", group.len(), output_bam_path);
|
|
|
+ let input_bam_paths = group.iter().map(|s| format!("{dir}/{s}.bam")).collect();
|
|
|
+ merge_bam_files(input_bam_paths, &output_bam_path)?;
|
|
|
+ dir_flye(dir, false)?; // NOTE(review): re-assembles the entire directory on every merged group — consider calling once after the loop
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+pub struct BedRow {
|
|
|
+ pub contig: String,
|
|
|
+ pub start: u32,
|
|
|
+ pub end: u32,
|
|
|
+ pub name: String,
|
|
|
+ pub score: String,
|
|
|
+ pub strand: String,
|
|
|
+}
|
|
|
+
|
|
|
+// Reads a 6-column BED-style TSV file and returns a Vec of BedRow structs; lines without exactly 6 tab-separated fields are skipped.
|
|
|
+fn read_tsv_file(file_path: &str) -> anyhow::Result<Vec<BedRow>> {
|
|
|
+ // Open the file
|
|
|
+ let file = File::open(file_path)?;
|
|
|
+ let reader = BufReader::new(file);
|
|
|
+
|
|
|
+ // Create a vector to store the rows
|
|
|
+ let mut rows = Vec::new();
|
|
|
+
|
|
|
+ // Iterate over each line in the file
|
|
|
+ for line in reader.lines() {
|
|
|
+ // Propagate any read error to the caller via `?` (errors are not skipped)
|
|
|
+ let line = line?;
|
|
|
+
|
|
|
+ // Split the line by tabs
|
|
|
+ let fields: Vec<&str> = line.split('\t').collect();
|
|
|
+
|
|
|
+ // Ensure the line has the correct number of fields
|
|
|
+ if fields.len() != 6 {
|
|
|
+ continue; // Skip lines with incorrect number of fields
|
|
|
+ }
|
|
|
+
|
|
|
+ // Parse the fields and create a Row struct
|
|
|
+ let row = BedRow {
|
|
|
+ contig: fields[0].to_string(),
|
|
|
+ start: fields[1].parse()?,
|
|
|
+ end: fields[2].parse()?,
|
|
|
+ name: fields[3].to_string(),
|
|
|
+ score: fields[4].to_string(),
|
|
|
+ strand: fields[5].to_string(),
|
|
|
+ };
|
|
|
+
|
|
|
+ // Add the row to the vector
|
|
|
+ rows.push(row);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Return the vector of rows
|
|
|
+ Ok(rows)
|
|
|
+}
|
|
|
+
|
|
|
pub fn igv_link(dir: &str, contig_id: &str) -> anyhow::Result<String> {
|
|
|
let contig_fa = format!("{dir}/{contig_id}_flye.fa");
|
|
|
let bam_track = Track::Bam(BamTrack::new(
|
|
|
@@ -698,6 +728,39 @@ pub fn igv_link(dir: &str, contig_id: &str) -> anyhow::Result<String> {
|
|
|
Ok(link)
|
|
|
}
|
|
|
|
|
|
+// Merges any number of BAM files into a new BAM file, reusing the first input's header.
|
|
|
+fn merge_bam_files(input_bam_paths: Vec<String>, output_bam_path: &str) -> anyhow::Result<()> {
|
|
|
+ // Ensure there is at least one BAM file to merge
|
|
|
+ if input_bam_paths.is_empty() {
|
|
|
+ return Err(anyhow!("No input BAM files provided"));
|
|
|
+ }
|
|
|
+
|
|
|
+ // Open the first input BAM file to get the header
|
|
|
+ let mut bam1 = rust_htslib::bam::Reader::from_path(&input_bam_paths[0])?;
|
|
|
+ let header = rust_htslib::bam::Header::from_template(bam1.header());
|
|
|
+
|
|
|
+ // Create a new BAM writer with the header from the first BAM file
|
|
|
+ let mut output_bam = rust_htslib::bam::Writer::from_path(output_bam_path, &header, rust_htslib::bam::Format::Bam)?;
|
|
|
+
|
|
|
+ // Write records from the first BAM file to the output BAM file
|
|
|
+ for result in bam1.records() {
|
|
|
+ let record = result?;
|
|
|
+ output_bam.write(&record)?;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Iterate over the remaining BAM files and write their records to the output BAM file. NOTE(review): records are written under the FIRST file's header; if the inputs' reference lists differ, tids will be wrong — confirm all inputs share a header
|
|
|
+ for bam_path in &input_bam_paths[1..] {
|
|
|
+ let mut bam = rust_htslib::bam::Reader::from_path(bam_path)?;
|
|
|
+ for result in bam.records() {
|
|
|
+ let record = result?;
|
|
|
+ output_bam.write(&record)?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // The Writer will automatically close when it goes out of scope
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use std::fs;
|
|
|
@@ -766,9 +829,22 @@ mod tests {
|
|
|
#[test]
|
|
|
fn flye_d() -> anyhow::Result<()> {
|
|
|
init();
|
|
|
- let id = "SALICETTO";
|
|
|
+ let id = "LEVASSEUR";
|
|
|
let res_dir = "/data/longreads_basic_pipe";
|
|
|
let asm_dir = format!("{res_dir}/{id}/diag/asm_contigs");
|
|
|
dir_flye(&asm_dir, true)
|
|
|
}
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn tmp() {
|
|
|
+ init();
|
|
|
+ dir_flye("/data/tmp/scan_ca67d4bc-a18e-40ab-9e0a-af90116ca20b/reads/chr9", true).unwrap();
|
|
|
+ }
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn dedup() {
|
|
|
+ init();
|
|
|
+ dedup_dir("/data/tmp/scan_7ed2f43c-d16d-4dcc-bdb4-fb619d082991").unwrap();
|
|
|
+ }
|
|
|
+
|
|
|
}
|