|
|
@@ -1,195 +1,676 @@
|
|
|
+//! Helpers to run common `bcftools` operations behind the generic
|
|
|
+//! `Command` / `Runner` / `SlurmRunner` traits.
|
|
|
+//!
|
|
|
+//! All commands are configured from a [`Config`] and expose a `from_config`
|
|
|
+//! constructor plus a generated shell `cmd()` string suitable for local
|
|
|
+//! or Slurm execution.
|
|
|
+
|
|
|
use anyhow::Context;
|
|
|
use log::info;
|
|
|
-use std::{fs, path::Path};
|
|
|
+use std::{
|
|
|
+ fs,
|
|
|
+ path::{Path, PathBuf},
|
|
|
+};
|
|
|
use uuid::Uuid;
|
|
|
|
|
|
-use crate::runners::{run_wait, CommandRun, RunReport};
|
|
|
+use crate::commands::SlurmParams;
|
|
|
+use crate::config::Config;
|
|
|
+
|
|
|
+/// Sorts, normalizes and keeps only `FILTER=PASS` variants with `bcftools`.
|
|
|
+///
|
|
|
+/// The command sequence is:
|
|
|
+///
|
|
|
+/// 1. `bcftools sort` on `input`
|
|
|
+/// 2. `bcftools norm` with `--atom-overlaps .`
|
|
|
+/// 3. `bcftools view -i "FILTER='PASS'" -Oz --write-index`
|
|
|
+///
|
|
|
+/// The final output is a BGZF-compressed VCF (`.vcf.gz`) plus its index.
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BcftoolsKeepPass {
|
|
|
+ /// Path to the `bcftools` binary.
|
|
|
+ pub bin: String,
|
|
|
+ /// Number of threads to pass to `bcftools` where supported.
|
|
|
+ pub threads: u8,
|
|
|
+ /// Input VCF/BCF file.
|
|
|
+ pub input: PathBuf,
|
|
|
+ /// Output VCF/BCF file with only `FILTER=PASS` records.
|
|
|
+ pub output: PathBuf,
|
|
|
+ /// Temporary dir from a global [`Config`].
|
|
|
+ pub tmp_dir: PathBuf,
|
|
|
+ /// Temporary sorted VCF/BCF path.
|
|
|
+ tmp_sort: PathBuf,
|
|
|
+ /// Temporary normalized VCF/BCF path.
|
|
|
+ tmp_norm: PathBuf,
|
|
|
+}
|
|
|
+
|
|
|
+impl BcftoolsKeepPass {
|
|
|
+ /// Builds a [`BcftoolsKeepPass`] from a global [`Config`].
|
|
|
+ ///
|
|
|
+ /// The `input` and `output` paths are not accessed at construction time;
|
|
|
+ /// they are validated in [`Command::init`].
|
|
|
+ pub fn from_config(config: &Config, input: impl AsRef<Path>, output: impl AsRef<Path>) -> Self {
|
|
|
+ Self {
|
|
|
+ bin: config.bcftools_bin.clone(),
|
|
|
+ threads: config.bcftools_threads,
|
|
|
+ input: input.as_ref().into(),
|
|
|
+ output: output.as_ref().into(),
|
|
|
+ tmp_dir: config.tmp_dir.clone().into(),
|
|
|
+ tmp_sort: PathBuf::new(),
|
|
|
+ tmp_norm: PathBuf::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Command for BcftoolsKeepPass {
|
|
|
+ /// Validates the input file and allocates random temporary paths.
|
|
|
+ fn init(&mut self) -> anyhow::Result<()> {
|
|
|
+ if !self.input.exists() {
|
|
|
+ anyhow::bail!("File doesnt exist {}", self.input.display());
|
|
|
+ }
|
|
|
+ self.tmp_sort = self.tmp_dir.join(Uuid::new_v4().to_string());
|
|
|
+ self.tmp_norm = self.tmp_dir.join(Uuid::new_v4().to_string());
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns the shell command that performs sort, norm and `FILTER=PASS` view.
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{bin} sort {input} -o {tmp_sort} && \
|
|
|
+ {bin} norm --threads {threads} -a --atom-overlaps . {tmp_sort} -o {tmp_norm} && \
|
|
|
+ {bin} view --write-index --threads {threads} -i \"FILTER='PASS'\" {tmp_norm} -Oz -o {output}",
|
|
|
+ bin = self.bin,
|
|
|
+ threads = self.threads,
|
|
|
+ input = self.input.display(),
|
|
|
+ tmp_sort = self.tmp_sort.display(),
|
|
|
+ tmp_norm = self.tmp_norm.display(),
|
|
|
+ output = self.output.display(),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Removes temporary files created during [`Command::init`] / [`Command::cmd`].
|
|
|
+ fn clean_up(&self) -> anyhow::Result<()> {
|
|
|
+ if self.tmp_sort.exists() {
|
|
|
+ fs::remove_file(&self.tmp_sort)
|
|
|
+ .with_context(|| format!("Failed to remove {}", self.tmp_sort.display()))?;
|
|
|
+ }
|
|
|
+ if self.tmp_norm.exists() {
|
|
|
+ fs::remove_file(&self.tmp_norm)
|
|
|
+ .with_context(|| format!("Failed to remove {}", self.tmp_norm.display()))?;
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Runner for BcftoolsKeepPass {}
|
|
|
+
|
|
|
+impl super::SlurmRunner for BcftoolsKeepPass {
|
|
|
+ /// Slurm resource request for the `bcftools keep pass` job.
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("bcftools_keep_pass".into()),
|
|
|
+ cpus_per_task: Some(self.threads as u32),
|
|
|
+ mem: Some("45G".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Sorts, normalizes and keeps only *precise* `FILTER=PASS` variants.
|
|
|
+///
|
|
|
+/// Differs from [`BcftoolsKeepPass`] by excluding imprecise calls using
|
|
|
+/// `-e "INFO/IMPRECISE==1 || FILTER!='PASS'"`.
|
|
|
+///
|
|
|
+/// The final output is a BGZF-compressed VCF (`.vcf.gz`) plus its index.
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BcftoolsKeepPassPrecise {
|
|
|
+ /// Path to the `bcftools` binary.
|
|
|
+ pub bin: String,
|
|
|
+ /// Number of threads to pass to `bcftools` where supported.
|
|
|
+ pub threads: u8,
|
|
|
+ /// Input VCF/BCF file.
|
|
|
+ pub input: PathBuf,
|
|
|
+ /// Output VCF/BCF file with precise `FILTER=PASS` records.
|
|
|
+ pub output: PathBuf,
|
|
|
+ /// Temporary dir from a global [`Config`].
|
|
|
+ pub tmp_dir: PathBuf,
|
|
|
+ /// Temporary sorted VCF/BCF path.
|
|
|
+ tmp_sort: PathBuf,
|
|
|
+ /// Temporary normalized VCF/BCF path.
|
|
|
+ tmp_norm: PathBuf,
|
|
|
+}
|
|
|
+
|
|
|
+impl BcftoolsKeepPassPrecise {
|
|
|
+ /// Builds a [`BcftoolsKeepPassPrecise`] from a global [`Config`].
|
|
|
+ pub fn from_config(config: &Config, input: impl AsRef<Path>, output: impl AsRef<Path>) -> Self {
|
|
|
+ Self {
|
|
|
+ bin: config.bcftools_bin.clone(),
|
|
|
+ threads: config.bcftools_threads,
|
|
|
+ input: input.as_ref().into(),
|
|
|
+ output: output.as_ref().into(),
|
|
|
+ tmp_dir: config.tmp_dir.clone().into(),
|
|
|
+ tmp_sort: PathBuf::new(),
|
|
|
+ tmp_norm: PathBuf::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Command for BcftoolsKeepPassPrecise {
|
|
|
+ /// Validates the input file and allocates random temporary paths.
|
|
|
+ fn init(&mut self) -> anyhow::Result<()> {
|
|
|
+ if !self.input.exists() {
|
|
|
+ anyhow::bail!("File doesnt exist {}", self.input.display());
|
|
|
+ }
|
|
|
+ self.tmp_sort = self.tmp_dir.join(Uuid::new_v4().to_string());
|
|
|
+ self.tmp_norm = self.tmp_dir.join(Uuid::new_v4().to_string());
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns the shell command that performs sort, norm and precise PASS filtering.
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{bin} sort {input} -o {tmp_sort} && \
|
|
|
+ {bin} norm --threads {threads} -a --atom-overlaps . {tmp_sort} -o {tmp_norm} && \
|
|
|
+ {bin} view --write-index --threads {threads} -e \"INFO/IMPRECISE==1 || FILTER!='PASS'\" {tmp_norm} -Oz -o {output}",
|
|
|
+ bin = self.bin,
|
|
|
+ threads = self.threads,
|
|
|
+ input = self.input.display(),
|
|
|
+ tmp_sort = self.tmp_sort.display(),
|
|
|
+ tmp_norm = self.tmp_norm.display(),
|
|
|
+ output = self.output.display(),
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Removes temporary files created during [`Command::init`] / [`Command::cmd`].
|
|
|
+ fn clean_up(&self) -> anyhow::Result<()> {
|
|
|
+ if self.tmp_sort.exists() {
|
|
|
+ fs::remove_file(&self.tmp_sort)
|
|
|
+ .with_context(|| format!("Failed to remove {}", self.tmp_sort.display()))?;
|
|
|
+ }
|
|
|
+ if self.tmp_norm.exists() {
|
|
|
+ fs::remove_file(&self.tmp_norm)
|
|
|
+ .with_context(|| format!("Failed to remove {}", self.tmp_norm.display()))?;
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Runner for BcftoolsKeepPassPrecise {}
|
|
|
+
|
|
|
+impl super::SlurmRunner for BcftoolsKeepPassPrecise {
|
|
|
+ /// Slurm resource request for the `bcftools keep pass precise` job.
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("bcftools_keep_pass_precise".into()),
|
|
|
+ cpus_per_task: Some(self.threads as u32),
|
|
|
+ mem: Some("45G".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Concatenates multiple VCF/BCF files with `bcftools concat`.
|
|
|
+///
|
|
|
+/// Builds a temporary list file containing all inputs, then runs
|
|
|
+/// `bcftools concat -a -D -f list --write-index` to produce `output`.
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BcftoolsConcat {
|
|
|
+ /// Path to the `bcftools` binary.
|
|
|
+ pub bin: String,
|
|
|
+ /// Number of threads to pass to `bcftools`.
|
|
|
+ pub threads: u8,
|
|
|
+ /// List of input VCF/BCF files to concatenate.
|
|
|
+ pub inputs: Vec<PathBuf>,
|
|
|
+ /// Output concatenated VCF/BCF file.
|
|
|
+ pub output: PathBuf,
|
|
|
+ /// Temporary dir from a global [`Config`].
|
|
|
+ pub tmp_dir: PathBuf,
|
|
|
+ /// Temporary list file path for `bcftools concat -f`.
|
|
|
+ tmp_list: PathBuf,
|
|
|
+}
|
|
|
+
|
|
|
+impl BcftoolsConcat {
|
|
|
+ /// Builds a [`BcftoolsConcat`] from a global [`Config`].
|
|
|
+ ///
|
|
|
+ /// Input paths are converted to [`PathBuf`] immediately, but existence
|
|
|
+ /// is validated in [`Command::init`].
|
|
|
+ pub fn from_config(
|
|
|
+ config: &Config,
|
|
|
+ inputs: Vec<impl AsRef<Path>>,
|
|
|
+ output: impl AsRef<Path>,
|
|
|
+ ) -> Self {
|
|
|
+ Self {
|
|
|
+ bin: config.bcftools_bin.clone(),
|
|
|
+ threads: config.bcftools_threads,
|
|
|
+ inputs: inputs.into_iter().map(|p| p.as_ref().into()).collect(),
|
|
|
+ output: output.as_ref().into(),
|
|
|
+ tmp_dir: config.tmp_dir.clone().into(),
|
|
|
+ tmp_list: PathBuf::new(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Command for BcftoolsConcat {
|
|
|
+ /// Validates input VCFs and writes the temporary concat list file
|
|
|
+ fn init(&mut self) -> anyhow::Result<()> {
|
|
|
+ if self.inputs.is_empty() {
|
|
|
+ anyhow::bail!("No VCFs provided");
|
|
|
+ }
|
|
|
+ for p in &self.inputs {
|
|
|
+ if !p.exists() {
|
|
|
+ anyhow::bail!("VCF doesnt exist: {}", p.display());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ self.tmp_list = self.tmp_dir.join(Uuid::new_v4().to_string());
|
|
|
+ let list = self
|
|
|
+ .inputs
|
|
|
+ .iter()
|
|
|
+ .map(|p| p.display().to_string())
|
|
|
+ .collect::<Vec<_>>()
|
|
|
+ .join("\n");
|
|
|
+
|
|
|
+ info!("Writing bcftools concat list: {}", self.tmp_list.display());
|
|
|
+ fs::write(&self.tmp_list, list)?;
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns the shell command that runs `bcftools concat` and removes the list file.
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{bin} concat --write-index --threads {threads} -a -D -f {list} -Oz -o {output} && rm -f {list}",
|
|
|
+ bin = self.bin,
|
|
|
+ threads = self.threads,
|
|
|
+ list = self.tmp_list.display(),
|
|
|
+ output = self.output.display(),
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Runner for BcftoolsConcat {}
|
|
|
+
|
|
|
+impl super::SlurmRunner for BcftoolsConcat {
|
|
|
+ // Slurm resource request for the `bcftools concat` job.
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("bcftools_concat".into()),
|
|
|
+ cpus_per_task: Some(self.threads as u32),
|
|
|
+ mem: Some("45G".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Indexes a VCF/BCF file with `bcftools index`.
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BcftoolsIndex {
|
|
|
+ /// Path to the `bcftools` binary.
|
|
|
+ pub bin: String,
|
|
|
+ /// Number of threads to pass to `bcftools index`.
|
|
|
+ pub threads: u8,
|
|
|
+ /// VCF/BCF file to index.
|
|
|
+ pub vcf: PathBuf,
|
|
|
+}
|
|
|
+
|
|
|
+impl BcftoolsIndex {
|
|
|
+ /// Builds a [`BcftoolsIndex`] from a global [`Config`].
|
|
|
+ pub fn from_config(config: &Config, vcf: impl AsRef<Path>) -> Self {
|
|
|
+ Self {
|
|
|
+ bin: config.bcftools_bin.clone(),
|
|
|
+ threads: config.bcftools_threads,
|
|
|
+ vcf: vcf.as_ref().into(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Command for BcftoolsIndex {
|
|
|
+ /// Validates that the VCF/BCF file exists.
|
|
|
+ fn init(&mut self) -> anyhow::Result<()> {
|
|
|
+ if !self.vcf.exists() {
|
|
|
+ anyhow::bail!("VCF doesnt exist: {}", self.vcf.display());
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns the shell command that runs `bcftools index`.
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{bin} index --threads {threads} {vcf}",
|
|
|
+ bin = self.bin,
|
|
|
+ threads = self.threads,
|
|
|
+ vcf = self.vcf.display()
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Runner for BcftoolsIndex {}
|
|
|
+impl super::SlurmRunner for BcftoolsIndex {
|
|
|
+ /// Slurm resource request for the `bcftools index` job.
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("bcftools_index".into()),
|
|
|
+ cpus_per_task: Some(self.threads as u32),
|
|
|
+ mem: Some("45G".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
|
|
|
+/// Compresses a VCF/BCF file to BGZF using `bcftools view -Oz`.
|
|
|
#[derive(Debug)]
|
|
|
-pub struct BcftoolsConfig {
|
|
|
+pub struct BcftoolsCompress {
|
|
|
+ /// Path to the `bcftools` binary.
|
|
|
pub bin: String,
|
|
|
+ /// Number of threads to pass to `bcftools`.
|
|
|
pub threads: u8,
|
|
|
+ /// Input uncompressed VCF/BCF file.
|
|
|
+ pub in_vcf: PathBuf,
|
|
|
+ /// Output compressed VCF/BCF (`.vcf.gz` / `.bcf`) file.
|
|
|
+ pub out_vcf: PathBuf,
|
|
|
}
|
|
|
|
|
|
-impl Default for BcftoolsConfig {
|
|
|
- fn default() -> Self {
|
|
|
+impl BcftoolsCompress {
|
|
|
+ /// Builds a [`BcftoolsCompress`] from a global [`Config`].
|
|
|
+ ///
|
|
|
+ /// Existence of `in_vcf` and non-existence of `out_vcf` are enforced
|
|
|
+ /// in [`Command::init`].
|
|
|
+ pub fn from_config(
|
|
|
+ config: &Config,
|
|
|
+ in_vcf: impl AsRef<Path>,
|
|
|
+ out_vcf: impl AsRef<Path>,
|
|
|
+ ) -> Self {
|
|
|
Self {
|
|
|
- bin: "/data/tools/bcftools-1.21/bcftools".to_string(),
|
|
|
- threads: 20,
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_keep_pass(
|
|
|
- input: &str,
|
|
|
- output: &str,
|
|
|
- config: BcftoolsConfig,
|
|
|
-) -> anyhow::Result<RunReport> {
|
|
|
- if !Path::new(input).exists() {
|
|
|
- anyhow::bail!("File doesnt exist {input}")
|
|
|
- }
|
|
|
- // First sort
|
|
|
- let tmp_file = format!("/tmp/{}", Uuid::new_v4());
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &["sort", input, "-o", &tmp_file]);
|
|
|
- let _ = run_wait(&mut cmd_run)?;
|
|
|
-
|
|
|
- // 2. norm
|
|
|
- let tmp2_file = format!("/tmp/{}", Uuid::new_v4());
|
|
|
- let mut cmd_run = CommandRun::new(
|
|
|
- &config.bin,
|
|
|
- &[
|
|
|
- "norm",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- "-a",
|
|
|
- "--atom-overlaps",
|
|
|
- ".",
|
|
|
- &tmp_file,
|
|
|
- "-o",
|
|
|
- &tmp2_file,
|
|
|
- ],
|
|
|
- );
|
|
|
- let _ = run_wait(&mut cmd_run)?;
|
|
|
- fs::remove_file(tmp_file)?;
|
|
|
-
|
|
|
- // Then filter
|
|
|
- let mut cmd_run = CommandRun::new(
|
|
|
- &config.bin,
|
|
|
- &[
|
|
|
- "view",
|
|
|
- "--write-index",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- "-i",
|
|
|
- "FILTER='PASS'",
|
|
|
- &tmp2_file,
|
|
|
- "-o",
|
|
|
- output,
|
|
|
- ],
|
|
|
- );
|
|
|
- let res = run_wait(&mut cmd_run)?;
|
|
|
- fs::remove_file(tmp2_file)?;
|
|
|
- Ok(res)
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_keep_pass_precise(
|
|
|
- input: &str,
|
|
|
- output: &str,
|
|
|
- config: BcftoolsConfig,
|
|
|
-) -> anyhow::Result<RunReport> {
|
|
|
- if !Path::new(input).exists() {
|
|
|
- anyhow::bail!("File doesnt exist {input}")
|
|
|
- }
|
|
|
- // First sort
|
|
|
- let tmp_file = format!("/tmp/{}", Uuid::new_v4());
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &["sort", input, "-o", &tmp_file]);
|
|
|
- let _ = run_wait(&mut cmd_run)?;
|
|
|
-
|
|
|
- // 2. norm
|
|
|
- let tmp2_file = format!("/tmp/{}", Uuid::new_v4());
|
|
|
- let mut cmd_run = CommandRun::new(
|
|
|
- &config.bin,
|
|
|
- &[
|
|
|
- "norm",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- "-a",
|
|
|
- "--atom-overlaps",
|
|
|
- ".",
|
|
|
- &tmp_file,
|
|
|
- "-o",
|
|
|
- &tmp2_file,
|
|
|
- ],
|
|
|
- );
|
|
|
- let _ = run_wait(&mut cmd_run)?;
|
|
|
- fs::remove_file(tmp_file)?;
|
|
|
-
|
|
|
- // Then filter
|
|
|
- let mut cmd_run = CommandRun::new(
|
|
|
- &config.bin,
|
|
|
- &[
|
|
|
- "view",
|
|
|
- "--write-index",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- "-e",
|
|
|
- "INFO/IMPRECISE==1 || FILTER!=\"PASS\"",
|
|
|
- &tmp2_file,
|
|
|
- "-o",
|
|
|
- output,
|
|
|
- ],
|
|
|
- );
|
|
|
- let res = run_wait(&mut cmd_run)?;
|
|
|
- fs::remove_file(tmp2_file)?;
|
|
|
- Ok(res)
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_concat(
|
|
|
- inputs: Vec<String>,
|
|
|
- output: &str,
|
|
|
- config: BcftoolsConfig,
|
|
|
-) -> anyhow::Result<RunReport> {
|
|
|
- info!("Concatening vcf with bcftools: {}", inputs.join(", "));
|
|
|
- let tmp_file = format!("/tmp/{}", Uuid::new_v4());
|
|
|
- fs::write(&tmp_file, inputs.join("\n"))?;
|
|
|
-
|
|
|
- let args = [
|
|
|
- "concat",
|
|
|
- "--write-index",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- "-a",
|
|
|
- "-D",
|
|
|
- "-f",
|
|
|
- &tmp_file,
|
|
|
- "-o",
|
|
|
- output,
|
|
|
- ];
|
|
|
- // Then filter
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &args);
|
|
|
- let res = run_wait(&mut cmd_run)
|
|
|
- .context(format!("Error while running `bcftools {}`", args.join(" ")))?;
|
|
|
- fs::remove_file(tmp_file)?;
|
|
|
- Ok(res)
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_keep_only_in_a(
|
|
|
- a: &str,
|
|
|
- b: &str,
|
|
|
- out: &str,
|
|
|
- config: &BcftoolsConfig,
|
|
|
-) -> anyhow::Result<()> {
|
|
|
- let args = ["isec", "-C", "-w", "1", a, b, "-o", out];
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &args);
|
|
|
- let _ = run_wait(&mut cmd_run)
|
|
|
- .context(format!("Error while running `bcftools {}`", args.join(" ")))?;
|
|
|
- Ok(())
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_index(vcf: &str, config: &BcftoolsConfig) -> anyhow::Result<()> {
|
|
|
- let args = ["index", "--threads", &config.threads.to_string(), vcf];
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &args);
|
|
|
- let _ = run_wait(&mut cmd_run)
|
|
|
- .context(format!("Error while running `bcftools {}`", args.join(" ")))?;
|
|
|
- Ok(())
|
|
|
-}
|
|
|
-
|
|
|
-pub fn bcftools_compress(
|
|
|
- in_vcf: &str,
|
|
|
- out_vcf: &str,
|
|
|
- config: &BcftoolsConfig,
|
|
|
-) -> anyhow::Result<()> {
|
|
|
- let args = [
|
|
|
- "view",
|
|
|
- "--threads",
|
|
|
- &config.threads.to_string(),
|
|
|
- in_vcf,
|
|
|
- "-Oz",
|
|
|
- "-o",
|
|
|
- out_vcf,
|
|
|
- ];
|
|
|
- let mut cmd_run = CommandRun::new(&config.bin, &args);
|
|
|
- let _ = run_wait(&mut cmd_run)
|
|
|
- .context(format!("Error while running `bcftools {}`", args.join(" ")))?;
|
|
|
- Ok(())
|
|
|
+ bin: config.bcftools_bin.clone(),
|
|
|
+ threads: config.bcftools_threads,
|
|
|
+ in_vcf: in_vcf.as_ref().into(),
|
|
|
+ out_vcf: out_vcf.as_ref().into(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Command for BcftoolsCompress {
|
|
|
+ /// Validates input/output VCF paths before running the command.
|
|
|
+ fn init(&mut self) -> anyhow::Result<()> {
|
|
|
+ if !self.in_vcf.exists() {
|
|
|
+ anyhow::bail!("VCF doesnt exist: {}", self.in_vcf.display());
|
|
|
+ }
|
|
|
+ if self.out_vcf.exists() {
|
|
|
+ anyhow::bail!("Output VCF exists: {}", self.out_vcf.display());
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Returns the shell command that compresses the VCF with `bcftools view -Oz`.
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{bin} view --threads {threads} {in_vcf} -Oz -o {out_vcf}",
|
|
|
+ bin = self.bin,
|
|
|
+ threads = self.threads,
|
|
|
+ in_vcf = self.in_vcf.display(),
|
|
|
+ out_vcf = self.out_vcf.display()
|
|
|
+ )
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl super::Runner for BcftoolsCompress {}
|
|
|
+impl super::SlurmRunner for BcftoolsCompress {
|
|
|
+ /// Slurm resource request for the `bcftools compress` job.
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("bcftools_compress".into()),
|
|
|
+ cpus_per_task: Some(self.threads as u32),
|
|
|
+ mem: Some("45G".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        fs,
        path::{Path, PathBuf},
    };

    use log::info;
    use uuid::Uuid;

    use crate::{
        commands::SlurmRunner,
        config::Config,
        helpers::{test_init, TempDirGuard},
    };

    /// Writes a small valid VCF (full header, 3 variants) to `path`, creating
    /// the parent directory when needed.
    ///
    /// The records are: one plain `PASS`, one `q10`-filtered, and one `PASS`
    /// flagged `IMPRECISE=1`.
    fn write_small_vcf(path: &Path) -> anyhow::Result<()> {
        let vcf = "\
##fileformat=VCFv4.2
##contig=<ID=1,length=1000000>
##INFO=<ID=IMPRECISE,Number=1,Type=Integer,Description=\"Imprecise call indicator\">
##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">
##FILTER=<ID=q10,Description=\"Quality <10\">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ts1
1\t100\t.\tA\tC\t50\tPASS\t.\tGT\t0/1
1\t200\t.\tG\tT\t50\tq10\t.\tGT\t0/1
1\t300\t.\tT\tG\t50\tPASS\tIMPRECISE=1\tGT\t0/1
";
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent)?;
        }
        info!("Writing small test VCF to {}", path.display());
        fs::write(path, vcf)?;
        Ok(())
    }

    /// Returns `true` when `<vcf>.tbi` or `<vcf>.csi` exists.
    ///
    /// The suffix is appended to the full file name. `Path::with_extension`
    /// must not be used here: on `a.vcf.gz` it replaces only the trailing
    /// `gz`, so `with_extension("vcf.gz.tbi")` yields `a.vcf.vcf.gz.tbi`.
    fn has_index(vcf: &Path) -> bool {
        ["tbi", "csi"].iter().any(|ext| {
            let mut candidate = vcf.as_os_str().to_owned();
            candidate.push(".");
            candidate.push(ext);
            Path::new(&candidate).exists()
        })
    }

    /// Compresses `raw` into `gz`, then indexes `gz`, both through Slurm,
    /// asserting after each step that the expected file exists.
    fn compress_and_index(config: &Config, raw: &Path, gz: &Path) -> anyhow::Result<()> {
        // 1) compress raw -> gz
        let mut compress = BcftoolsCompress::from_config(config, raw, gz);
        let out_compress = SlurmRunner::run(&mut compress)?;
        println!("{out_compress}");
        assert!(
            gz.exists(),
            "compressed VCF should exist after compress: {}",
            gz.display()
        );

        // 2) index gz
        let mut index = BcftoolsIndex::from_config(config, gz);
        let out_index = SlurmRunner::run(&mut index)?;
        println!("{out_index}");
        assert!(
            has_index(gz),
            "index file (.tbi or .csi) should exist for {}",
            gz.display()
        );

        Ok(())
    }

    #[test]
    fn bcftools_compress_index_slurm() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let base = PathBuf::from(&config.tmp_dir).join(Uuid::new_v4().to_string());
        fs::create_dir_all(&base)?;
        let _guard = TempDirGuard::new(base.clone());

        let raw = base.join("raw.vcf");
        let raw_gz = base.join("raw.vcf.gz");

        write_small_vcf(&raw)?;
        compress_and_index(&config, &raw, &raw_gz)?;

        Ok(())
    }

    #[test]
    fn bcftools_compress_index_then_keep_pass_slurm() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let base = PathBuf::from(&config.tmp_dir).join(Uuid::new_v4().to_string());
        fs::create_dir_all(&base)?;
        let _guard = TempDirGuard::new(base.clone());

        let raw = base.join("raw.vcf");
        let raw_gz = base.join("raw.vcf.gz");
        let pass_vcf_gz = base.join("pass.vcf.gz");

        write_small_vcf(&raw)?;
        compress_and_index(&config, &raw, &raw_gz)?;

        // 3) keep only FILTER=PASS from raw.vcf.gz -> pass.vcf.gz (with index)
        let mut keep = BcftoolsKeepPass::from_config(&config, &raw_gz, &pass_vcf_gz);
        let out_keep = SlurmRunner::run(&mut keep)?;
        println!("{out_keep}");

        assert!(
            pass_vcf_gz.exists(),
            "PASS-only BGZF VCF should exist after keep-pass step"
        );
        assert!(
            has_index(&pass_vcf_gz),
            "index for pass.vcf.gz should exist (written by --write-index)"
        );

        Ok(())
    }

    #[test]
    fn bcftools_compress_index_then_keep_pass_precise_slurm() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let base = PathBuf::from(&config.tmp_dir).join(Uuid::new_v4().to_string());
        fs::create_dir_all(&base)?;
        let _guard = TempDirGuard::new(base.clone());

        let raw = base.join("raw.vcf");
        let raw_gz = base.join("raw.vcf.gz");
        let pass_precise_vcf_gz = base.join("pass_precise.vcf.gz");

        write_small_vcf(&raw)?;
        compress_and_index(&config, &raw, &raw_gz)?;

        // 3) keep only precise FILTER=PASS from raw.vcf.gz
        let mut keep_precise =
            BcftoolsKeepPassPrecise::from_config(&config, &raw_gz, &pass_precise_vcf_gz);
        let out_keep_precise = SlurmRunner::run(&mut keep_precise)?;
        println!("{out_keep_precise}");

        assert!(
            pass_precise_vcf_gz.exists(),
            "precise PASS-only BGZF VCF should exist after keep-pass-precise step"
        );
        assert!(
            has_index(&pass_precise_vcf_gz),
            "index for pass_precise.vcf.gz should exist (written by --write-index)"
        );

        Ok(())
    }

    #[test]
    fn bcftools_concat_compress_index_slurm() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let base = PathBuf::from(&config.tmp_dir).join(Uuid::new_v4().to_string());
        fs::create_dir_all(&base)?;
        let _guard = TempDirGuard::new(base.clone());

        let concat_gz = base.join("concat.vcf.gz");

        // 1) + 2) write, compress and index three small VCFs; every index is
        // checked, not just the first one.
        let mut compressed = Vec::with_capacity(3);
        for stem in ["a", "b", "c"] {
            let raw = base.join(format!("{stem}.vcf"));
            let gz = base.join(format!("{stem}.vcf.gz"));
            write_small_vcf(&raw)?;
            compress_and_index(&config, &raw, &gz)?;
            compressed.push(gz);
        }

        // 3) concat compressed VCFs (bcftools can concat bgzip VCFs)
        let mut concat = BcftoolsConcat::from_config(&config, compressed, &concat_gz);
        let out_concat = SlurmRunner::run(&mut concat)?;
        println!("{out_concat}");
        assert!(concat_gz.exists(), "concatenated BGZF VCF should exist");
        assert!(
            has_index(&concat_gz),
            "index for concatenated VCF should exist"
        );

        Ok(())
    }
}
|