|
|
@@ -0,0 +1,202 @@
|
|
|
+use std::{
|
|
|
+ fs::{self, File},
|
|
|
+ io::Write,
|
|
|
+ process::{Command, Stdio},
|
|
|
+};
|
|
|
+
|
|
|
+use anyhow::{Context, Ok};
|
|
|
+use log::info;
|
|
|
+use serde::{Deserialize, Serialize};
|
|
|
+
|
|
|
+// cat /home/prom/Documents/Programmes/desc_seq_lib/data_test/419b7353-bc8c-4ffe-8ad6-0bbfac8c0cfa.fasta | blastn -db hs1_simple_chr.fa -outfmt 6
|
|
|
+pub fn run_blastn(query_fa_path: &str) -> anyhow::Result<String> {
|
|
|
+ info!("Running blastn in {query_fa_path}");
|
|
|
+ let blastn_bin = "/usr/bin/blastn";
|
|
|
+ let blast_db = "/data/ref/hs1/hs1_simple_chr.fa";
|
|
|
+
|
|
|
+ let file_contents = fs::read_to_string(query_fa_path).context("Failed to read the file")?;
|
|
|
+
|
|
|
+ let mut cmd = Command::new(blastn_bin)
|
|
|
+ .arg("-db")
|
|
|
+ .arg(blast_db)
|
|
|
+ .arg("-outfmt")
|
|
|
+ .arg("6")
|
|
|
+ .arg("-num_threads")
|
|
|
+ .arg("6")
|
|
|
+ .stdin(Stdio::piped())
|
|
|
+ .stdout(Stdio::piped())
|
|
|
+ .spawn()
|
|
|
+ .expect("Blastn failed to start");
|
|
|
+
|
|
|
+ {
|
|
|
+ let stdin = cmd.stdin.as_mut().expect("Failed to open stdin");
|
|
|
+ stdin
|
|
|
+ .write_all(file_contents.as_bytes())
|
|
|
+ .expect("Failed to write to stdin");
|
|
|
+ }
|
|
|
+
|
|
|
+ let output = cmd.wait_with_output().expect("Failed to read stdout");
|
|
|
+
|
|
|
+ Ok(String::from_utf8_lossy(&output.stdout).to_string())
|
|
|
+}
|
|
|
+
|
|
|
+pub fn parse_blastn_touput(output: String) -> Vec<BlastResult> {
|
|
|
+ let mut lines: Vec<&str> = output.split("\n").collect();
|
|
|
+
|
|
|
+ let mut results = Vec::new();
|
|
|
+ for line in lines.drain(..) {
|
|
|
+ if let std::result::Result::Ok(r) = line.to_string().try_into() {
|
|
|
+ results.push(r);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ results
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
+pub struct BlastResult {
|
|
|
+ pub query_id: String,
|
|
|
+ pub subject_id: String,
|
|
|
+ pub percent_identity: f64,
|
|
|
+ pub alignment_length: usize,
|
|
|
+ pub mismatches: usize,
|
|
|
+ pub gap_opens: usize,
|
|
|
+ pub q_start: usize,
|
|
|
+ pub q_end: usize,
|
|
|
+ pub s_start: usize,
|
|
|
+ pub s_end: usize,
|
|
|
+ pub e_value: f64,
|
|
|
+ pub bit_score: f64,
|
|
|
+}
|
|
|
+
|
|
|
+impl TryFrom<String> for BlastResult {
|
|
|
+ type Error = anyhow::Error;
|
|
|
+
|
|
|
+ fn try_from(line: String) -> anyhow::Result<Self> {
|
|
|
+ let fields: Vec<&str> = line.split('\t').collect();
|
|
|
+
|
|
|
+ if fields.len() != 12 {
|
|
|
+ return Err(anyhow::anyhow!(
|
|
|
+ "Expected 12 fields, but got {}",
|
|
|
+ fields.len()
|
|
|
+ ));
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(BlastResult {
|
|
|
+ query_id: fields[0].to_string(),
|
|
|
+ subject_id: fields[1].to_string(),
|
|
|
+ percent_identity: fields[2]
|
|
|
+ .parse()
|
|
|
+ .context("Failed to parse percent_identity")?,
|
|
|
+ alignment_length: fields[3]
|
|
|
+ .parse()
|
|
|
+ .context("Failed to parse alignment_length")?,
|
|
|
+ mismatches: fields[4].parse().context("Failed to parse mismatches")?,
|
|
|
+ gap_opens: fields[5].parse().context("Failed to parse gap_opens")?,
|
|
|
+ q_start: fields[6].parse().context("Failed to parse q_start")?,
|
|
|
+ q_end: fields[7].parse().context("Failed to parse q_end")?,
|
|
|
+ s_start: fields[8].parse().context("Failed to parse s_start")?,
|
|
|
+ s_end: fields[9].parse().context("Failed to parse s_end")?,
|
|
|
+ e_value: fields[10].parse().context("Failed to parse e_value")?,
|
|
|
+ bit_score: fields[11].parse().context("Failed to parse bit_score")?,
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn blastn(fa_path: &str, output_json: &str) -> anyhow::Result<()> {
|
|
|
+ let res = run_blastn(fa_path)?;
|
|
|
+ let mut res = parse_blastn_touput(res);
|
|
|
+ res.sort_by(|a, b| b.bit_score.total_cmp(&a.bit_score));
|
|
|
+ let json = serde_json::to_string(&res)?;
|
|
|
+ let mut file = File::create(output_json)?;
|
|
|
+ file.write_all(json.as_bytes())?;
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Debug)]
|
|
|
+struct BlastPile {
|
|
|
+ x: usize,
|
|
|
+ length: usize,
|
|
|
+ y: usize,
|
|
|
+ data: BlastResult,
|
|
|
+}
|
|
|
+
|
|
|
+impl BlastPile {
|
|
|
+ pub fn new(d: &BlastResult) -> BlastPile {
|
|
|
+ BlastPile {
|
|
|
+ x: d.q_start,
|
|
|
+ length: d.q_end - d.q_start,
|
|
|
+ y: 0,
|
|
|
+ data: d.clone(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+fn sort_blast_pile(objects: &mut Vec<BlastPile>) {
|
|
|
+ objects.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap());
|
|
|
+}
|
|
|
+
|
|
|
+fn allocate_y_positions(objects: &mut Vec<BlastPile>) {
|
|
|
+ sort_blast_pile(objects);
|
|
|
+
|
|
|
+ let mut active_intervals: Vec<(usize, usize)> = vec![]; // Stores (x + length, y) tuples
|
|
|
+
|
|
|
+ for obj in objects.iter_mut() {
|
|
|
+ let mut y_position = 0;
|
|
|
+
|
|
|
+ // Find the lowest y position where it does not overlap
|
|
|
+ loop {
|
|
|
+ let overlap = active_intervals
|
|
|
+ .iter()
|
|
|
+ .any(|(end_x, y)| *y == y_position && obj.x < *end_x);
|
|
|
+ if !overlap {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ y_position += 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Assign the y position
|
|
|
+ obj.y = y_position;
|
|
|
+
|
|
|
+ // Add the current object to active intervals
|
|
|
+ active_intervals.push((obj.x + obj.length, obj.y));
|
|
|
+
|
|
|
+ // Remove intervals that are no longer active
|
|
|
+ active_intervals.retain(|(end_x, _)| obj.x < *end_x);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+pub fn best_blastn(fa_path: &str) -> anyhow::Result<Vec<BlastResult>> {
|
|
|
+ let res = run_blastn(fa_path)?;
|
|
|
+ let mut results: Vec<BlastPile> = parse_blastn_touput(res)
|
|
|
+ .iter()
|
|
|
+ .take(1000)
|
|
|
+ .map(|v| BlastPile::new(v))
|
|
|
+ .collect();
|
|
|
+ allocate_y_positions(&mut results);
|
|
|
+ let results: Vec<BlastResult> = results
|
|
|
+ .into_iter()
|
|
|
+ .filter(|r| r.y == 0)
|
|
|
+ .map(|r| r.data)
|
|
|
+ .collect();
|
|
|
+ Ok(results)
|
|
|
+}
|
|
|
+
|
|
|
+#[cfg(test)]
|
|
|
+mod tests {
|
|
|
+ use super::*;
|
|
|
+
|
|
|
+ #[test]
|
|
|
+ fn best() -> anyhow::Result<()> {
|
|
|
+ let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"))
|
|
|
+ .build();
|
|
|
+
|
|
|
+ let fa = "test_data/419b7353-bc8c-4ffe-8ad6-0bbfac8c0cfa.fasta";
|
|
|
+
|
|
|
+ let res = best_blastn(fa)?;
|
|
|
+
|
|
|
+ assert_eq!(res[0].q_end, 2838);
|
|
|
+ assert_eq!(res[1].q_start, 2935);
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|