|
|
@@ -2,12 +2,13 @@ use crate::{
|
|
|
annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
|
|
|
collection::vcf::Vcf,
|
|
|
commands::{
|
|
|
- bcftools::{BcftoolsConcat, BcftoolsKeepPass},
|
|
|
- CapturedOutput, Command as JobCommand, Runner as LocalRunner, SbatchRunner, SlurmParams,
|
|
|
- SlurmRunner,
|
|
|
+ CapturedOutput, Command as JobCommand, Runner as LocalRunner, SbatchRunner, SlurmParams, SlurmRunner, bcftools::{BcftoolsConcat, BcftoolsKeepPass}, run_many_sbatch
|
|
|
},
|
|
|
config::Config,
|
|
|
- helpers::{is_file_older, remove_dir_if_exists, temp_file_path},
|
|
|
+ helpers::{
|
|
|
+ get_genome_sizes, is_file_older, remove_dir_if_exists, split_genome_into_n_regions,
|
|
|
+ temp_file_path,
|
|
|
+ },
|
|
|
io::vcf::read_vcf,
|
|
|
pipes::{Initialize, ShouldRun, Version},
|
|
|
runners::Run,
|
|
|
@@ -21,8 +22,11 @@ use anyhow::Context;
|
|
|
use log::{debug, info, warn};
|
|
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
|
|
use regex::Regex;
|
|
|
+use rust_htslib::bam::{self, Read};
|
|
|
use std::{
|
|
|
- fmt, fs, path::Path, process::{Command as ProcessCommand, Stdio}
|
|
|
+ fmt, fs,
|
|
|
+ path::Path,
|
|
|
+ process::{Command as ProcessCommand, Stdio},
|
|
|
};
|
|
|
|
|
|
/// A pipeline runner for executing ClairS on paired tumor and normal samples.
|
|
|
@@ -41,6 +45,12 @@ pub struct ClairS {
|
|
|
/// Optional list of regions passed as repeated `-r REGION` args.
|
|
|
/// When empty, ClairS runs genome-wide.
|
|
|
pub regions: Vec<String>,
|
|
|
+
|
|
|
+ /// Optional part index for chunked parallel runs (1-indexed).
|
|
|
+ ///
|
|
|
+ /// When `Some(n)`, output files go into a `part{n}` subdirectory and
|
|
|
+ /// PASS VCFs are per-part, later merged into the canonical VCF.
|
|
|
+ pub part_index: Option<usize>,
|
|
|
}
|
|
|
|
|
|
impl fmt::Display for ClairS {
|
|
|
@@ -55,6 +65,12 @@ impl fmt::Display for ClairS {
|
|
|
format!("{} regions", self.regions.len())
|
|
|
}
|
|
|
)?;
|
|
|
+ writeln!(
|
|
|
+ f,
|
|
|
+ " Part : {}",
|
|
|
+ self.part_index
|
|
|
+ .map_or("full".into(), |n| format!("part{n}"))
|
|
|
+ )?;
|
|
|
writeln!(f, " Log dir : {}", self.log_dir)
|
|
|
}
|
|
|
}
|
|
|
@@ -71,6 +87,7 @@ impl Initialize for ClairS {
|
|
|
log_dir,
|
|
|
config: config.clone(),
|
|
|
regions: Vec::new(),
|
|
|
+ part_index: None,
|
|
|
};
|
|
|
|
|
|
if clairs.config.clairs_force {
|
|
|
@@ -99,7 +116,7 @@ impl ShouldRun for ClairS {
|
|
|
|
|
|
impl JobCommand for ClairS {
|
|
|
fn init(&mut self) -> anyhow::Result<()> {
|
|
|
- let output_dir = self.config.clairs_output_dir(&self.id);
|
|
|
+ let output_dir = self.part_output_dir();
|
|
|
|
|
|
fs::create_dir_all(&output_dir)
|
|
|
.with_context(|| format!("Failed create dir: {output_dir}"))?;
|
|
|
@@ -111,7 +128,7 @@ impl JobCommand for ClairS {
|
|
|
}
|
|
|
|
|
|
fn cmd(&self) -> String {
|
|
|
- let output_dir = self.config.clairs_output_dir(&self.id);
|
|
|
+ let output_dir = self.part_output_dir();
|
|
|
|
|
|
// Build repeated -r REGION args if any regions were set (for batched runs)
|
|
|
let region_args = if self.regions.is_empty() {
|
|
|
@@ -190,38 +207,56 @@ impl Run for ClairS {
|
|
|
|
|
|
impl ClairS {
|
|
|
fn postprocess_local(&self) -> anyhow::Result<()> {
|
|
|
- // Germline PASS
|
|
|
- let clair3_germline_passed = self.config.clairs_germline_passed_vcf(&self.id);
|
|
|
- if !Path::new(&clair3_germline_passed).exists() {
|
|
|
- let clair3_germline_normal = self.config.clairs_germline_normal_vcf(&self.id);
|
|
|
-
|
|
|
- let mut cmd = BcftoolsKeepPass::from_config(
|
|
|
- &self.config,
|
|
|
- clair3_germline_normal,
|
|
|
- clair3_germline_passed.clone(),
|
|
|
- );
|
|
|
- let report = <BcftoolsKeepPass as LocalRunner>::run(&mut cmd).with_context(|| {
|
|
|
- format!(
|
|
|
- "Failed to run `bcftools keep PASS` for {}.",
|
|
|
- clair3_germline_passed
|
|
|
- )
|
|
|
- })?;
|
|
|
-
|
|
|
- let log_file = format!("{}/bcftools_germline_pass_", self.log_dir);
|
|
|
- report
|
|
|
- .save_to_file(&log_file)
|
|
|
- .with_context(|| format!("Error while writing logs into {log_file}"))?;
|
|
|
- } else {
|
|
|
- debug!(
|
|
|
- "ClairS Germline PASSED VCF already exists for {}, skipping.",
|
|
|
- self.id
|
|
|
- );
|
|
|
+ // Germline PASS only once (full run, not per-part)
|
|
|
+ if self.part_index.is_none() {
|
|
|
+ let clair3_germline_passed = self.config.clairs_germline_passed_vcf(&self.id);
|
|
|
+ if !Path::new(&clair3_germline_passed).exists() {
|
|
|
+ let clair3_germline_normal = self.config.clairs_germline_normal_vcf(&self.id);
|
|
|
+
|
|
|
+ let mut cmd = BcftoolsKeepPass::from_config(
|
|
|
+ &self.config,
|
|
|
+ clair3_germline_normal,
|
|
|
+ clair3_germline_passed.clone(),
|
|
|
+ );
|
|
|
+ let report =
|
|
|
+ <BcftoolsKeepPass as LocalRunner>::run(&mut cmd).with_context(|| {
|
|
|
+ format!(
|
|
|
+ "Failed to run `bcftools keep PASS` for {}.",
|
|
|
+ clair3_germline_passed
|
|
|
+ )
|
|
|
+ })?;
|
|
|
+
|
|
|
+ let log_file = format!("{}/bcftools_germline_pass_", self.log_dir);
|
|
|
+ report
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .with_context(|| format!("Error while writing logs into {log_file}"))?;
|
|
|
+ } else {
|
|
|
+ debug!(
|
|
|
+ "ClairS Germline PASSED VCF already exists for {}, skipping.",
|
|
|
+ self.id
|
|
|
+ );
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
- // Somatic concat + PASS
|
|
|
- let passed_vcf = self.config.clairs_passed_vcf(&self.id);
|
|
|
+ // Somatic concat + PASS (per-part or full)
|
|
|
+ let passed_vcf = self.somatic_passed_vcf_path();
|
|
|
if !Path::new(&passed_vcf).exists() {
|
|
|
let (output_vcf, output_indels_vcf) = self.config.clairs_output_vcfs(&self.id);
|
|
|
+ let output_dir = self.part_output_dir();
|
|
|
+ let output_vcf = format!(
|
|
|
+ "{output_dir}/{}",
|
|
|
+ Path::new(&output_vcf)
|
|
|
+ .file_name()
|
|
|
+ .unwrap()
|
|
|
+ .to_string_lossy()
|
|
|
+ );
|
|
|
+ let output_indels_vcf = format!(
|
|
|
+ "{output_dir}/{}",
|
|
|
+ Path::new(&output_indels_vcf)
|
|
|
+ .file_name()
|
|
|
+ .unwrap()
|
|
|
+ .to_string_lossy()
|
|
|
+ );
|
|
|
|
|
|
let tmp_file = temp_file_path(".vcf.gz")?.to_str().unwrap().to_string();
|
|
|
|
|
|
@@ -258,8 +293,8 @@ impl ClairS {
|
|
|
.with_context(|| format!("Failed to remove temporary file {tmp_file}"))?;
|
|
|
} else {
|
|
|
debug!(
|
|
|
- "ClairS PASSED VCF already exists for {}, skipping.",
|
|
|
- self.id
|
|
|
+ "ClairS PASSED VCF already exists for {}, part {:?}, skipping.",
|
|
|
+ self.id, self.part_index
|
|
|
);
|
|
|
}
|
|
|
|
|
|
@@ -267,34 +302,51 @@ impl ClairS {
|
|
|
}
|
|
|
|
|
|
fn postprocess_sbatch(&self) -> anyhow::Result<()> {
|
|
|
- // Germline PASS via Slurm
|
|
|
- let clair3_germline_passed = self.config.clairs_germline_passed_vcf(&self.id);
|
|
|
- if !Path::new(&clair3_germline_passed).exists() {
|
|
|
- let clair3_germline_normal = self.config.clairs_germline_normal_vcf(&self.id);
|
|
|
-
|
|
|
- let mut cmd = BcftoolsKeepPass::from_config(
|
|
|
- &self.config,
|
|
|
- clair3_germline_normal,
|
|
|
- clair3_germline_passed.clone(),
|
|
|
- );
|
|
|
- let report = SlurmRunner::run(&mut cmd)
|
|
|
- .context("Failed to run `bcftools keep PASS` on Slurm")?;
|
|
|
-
|
|
|
- let log_file = format!("{}/bcftools_germline_pass_", self.log_dir);
|
|
|
- report
|
|
|
- .save_to_file(&log_file)
|
|
|
- .context("Error while writing logs")?;
|
|
|
- } else {
|
|
|
- debug!(
|
|
|
- "ClairS Germline PASSED VCF already exists for {}, skipping.",
|
|
|
- self.id
|
|
|
- );
|
|
|
+ // Germline PASS only once
|
|
|
+ if self.part_index.is_none() {
|
|
|
+ let clair3_germline_passed = self.config.clairs_germline_passed_vcf(&self.id);
|
|
|
+ if !Path::new(&clair3_germline_passed).exists() {
|
|
|
+ let clair3_germline_normal = self.config.clairs_germline_normal_vcf(&self.id);
|
|
|
+
|
|
|
+ let mut cmd = BcftoolsKeepPass::from_config(
|
|
|
+ &self.config,
|
|
|
+ clair3_germline_normal,
|
|
|
+ clair3_germline_passed.clone(),
|
|
|
+ );
|
|
|
+ let report = SlurmRunner::run(&mut cmd)
|
|
|
+ .context("Failed to run `bcftools keep PASS` on Slurm")?;
|
|
|
+
|
|
|
+ let log_file = format!("{}/bcftools_germline_pass_", self.log_dir);
|
|
|
+ report
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context("Error while writing logs")?;
|
|
|
+ } else {
|
|
|
+ debug!(
|
|
|
+ "ClairS Germline PASSED VCF already exists for {}, skipping.",
|
|
|
+ self.id
|
|
|
+ );
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
- // Somatic concat + PASS via Slurm
|
|
|
- let passed_vcf = self.config.clairs_passed_vcf(&self.id);
|
|
|
+ // Somatic concat + PASS (per-part or full)
|
|
|
+ let passed_vcf = self.somatic_passed_vcf_path();
|
|
|
if !Path::new(&passed_vcf).exists() {
|
|
|
let (output_vcf, output_indels_vcf) = self.config.clairs_output_vcfs(&self.id);
|
|
|
+ let output_dir = self.part_output_dir();
|
|
|
+ let output_vcf = format!(
|
|
|
+ "{output_dir}/{}",
|
|
|
+ Path::new(&output_vcf)
|
|
|
+ .file_name()
|
|
|
+ .unwrap()
|
|
|
+ .to_string_lossy()
|
|
|
+ );
|
|
|
+ let output_indels_vcf = format!(
|
|
|
+ "{output_dir}/{}",
|
|
|
+ Path::new(&output_indels_vcf)
|
|
|
+ .file_name()
|
|
|
+ .unwrap()
|
|
|
+ .to_string_lossy()
|
|
|
+ );
|
|
|
|
|
|
let tmp_file = temp_file_path(".vcf.gz")?.to_str().unwrap().to_string();
|
|
|
|
|
|
@@ -324,8 +376,8 @@ impl ClairS {
|
|
|
fs::remove_file(&tmp_file).context("Failed to remove temporary merged VCF")?;
|
|
|
} else {
|
|
|
debug!(
|
|
|
- "ClairS PASSED VCF already exists for {}, skipping.",
|
|
|
- self.id
|
|
|
+ "ClairS PASSED VCF already exists for {}, part {:?}, skipping.",
|
|
|
+ self.id, self.part_index
|
|
|
);
|
|
|
}
|
|
|
|
|
|
@@ -365,6 +417,34 @@ impl ClairS {
|
|
|
self.postprocess_sbatch()?;
|
|
|
Ok(out)
|
|
|
}
|
|
|
+
|
|
|
+ /// Per-part output directory.
|
|
|
+ ///
|
|
|
+ /// For chunked runs, this is `{clairs_output_dir(id)}/part{idx}`.
|
|
|
+ /// For full-genome runs, just `clairs_output_dir(id)`.
|
|
|
+ fn part_output_dir(&self) -> String {
|
|
|
+ let base_dir = self.config.clairs_output_dir(&self.id);
|
|
|
+ match self.part_index {
|
|
|
+ Some(idx) => format!("{base_dir}/part{idx}"),
|
|
|
+ None => base_dir,
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Somatic PASS VCF path for this run.
|
|
|
+ ///
|
|
|
+ /// - When `part_index.is_some()`: per-part intermediate PASS VCF
|
|
|
+ /// (inside the part dir), later merged.
|
|
|
+ /// - When `None`: canonical final path from `Config::clairs_passed_vcf`.
|
|
|
+ fn somatic_passed_vcf_path(&self) -> String {
|
|
|
+ match self.part_index {
|
|
|
+ Some(idx) => {
|
|
|
+ // Example: {clairs_output_dir(id)}/part{idx}/clairs.part{idx}.pass.vcf.gz
|
|
|
+ let outdir = self.part_output_dir();
|
|
|
+ format!("{outdir}/clairs.part{idx}.pass.vcf.gz")
|
|
|
+ }
|
|
|
+ None => self.config.clairs_passed_vcf(&self.id),
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/* ---------------- Variant / Label / Version impls ------------------------ */
|
|
|
@@ -503,6 +583,92 @@ impl Version for ClairS {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/// Merge N chunked ClairS PASS VCFs into the final clairs_passed_vcf().
|
|
|
+fn merge_clairs_parts(base: &ClairS, n_parts: usize) -> anyhow::Result<()> {
|
|
|
+ use std::path::PathBuf;
|
|
|
+
|
|
|
+ let mut part_pass_paths: Vec<PathBuf> = Vec::with_capacity(n_parts);
|
|
|
+
|
|
|
+ for i in 1..=n_parts {
|
|
|
+ let mut part = base.clone();
|
|
|
+ part.part_index = Some(i);
|
|
|
+ let part_pass = part.somatic_passed_vcf_path();
|
|
|
+
|
|
|
+ anyhow::ensure!(
|
|
|
+ Path::new(&part_pass).exists(),
|
|
|
+ "Missing ClairS part {i} PASS VCF: {part_pass}"
|
|
|
+ );
|
|
|
+
|
|
|
+ part_pass_paths.push(PathBuf::from(part_pass));
|
|
|
+ }
|
|
|
+
|
|
|
+ let final_passed_vcf = base.config.clairs_passed_vcf(&base.id);
|
|
|
+ let final_tmp = format!("{final_passed_vcf}.tmp");
|
|
|
+
|
|
|
+ if let Some(parent) = Path::new(&final_passed_vcf).parent() {
|
|
|
+ fs::create_dir_all(parent)?;
|
|
|
+ }
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "Concatenating {} ClairS part VCFs into {}",
|
|
|
+ n_parts, final_passed_vcf
|
|
|
+ );
|
|
|
+
|
|
|
+ let mut concat = BcftoolsConcat::from_config(&base.config, part_pass_paths, &final_tmp);
|
|
|
+ SlurmRunner::run(&mut concat).context("Failed to run bcftools concat for ClairS parts")?;
|
|
|
+
|
|
|
+ fs::rename(&final_tmp, &final_passed_vcf).context("Failed to rename merged ClairS PASS VCF")?;
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "Successfully merged {} ClairS parts into {}",
|
|
|
+ n_parts, final_passed_vcf
|
|
|
+ );
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+pub fn run_clairs_chunked_sbatch_with_merge(
|
|
|
+ id: &str,
|
|
|
+ config: &Config,
|
|
|
+ n_parts: usize,
|
|
|
+) -> anyhow::Result<Vec<CapturedOutput>> {
|
|
|
+ let base = ClairS::initialize(id, config)?;
|
|
|
+
|
|
|
+ // If final VCF already up-to-date, skip (uses full run ShouldRun logic)
|
|
|
+ if !base.should_run() {
|
|
|
+ debug!("ClairS PASS VCF already up-to-date for {id}, skipping.");
|
|
|
+ return Ok(Vec::new());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Genome sizes from normal BAM header
|
|
|
+ let normal_bam = config.normal_bam(id);
|
|
|
+ let reader =
|
|
|
+ bam::Reader::from_path(&normal_bam).with_context(|| format!("Opening BAM {normal_bam}"))?;
|
|
|
+ let header = bam::Header::from_template(reader.header());
|
|
|
+ let genome_sizes = get_genome_sizes(&header)?;
|
|
|
+ let region_chunks = split_genome_into_n_regions(&genome_sizes, n_parts);
|
|
|
+ let n_parts = region_chunks.len();
|
|
|
+
|
|
|
+ // Build jobs
|
|
|
+ let mut jobs = Vec::with_capacity(n_parts);
|
|
|
+ for (i, regions) in region_chunks.into_iter().enumerate() {
|
|
|
+ let mut job = base.clone();
|
|
|
+ job.part_index = Some(i + 1);
|
|
|
+ job.regions = regions;
|
|
|
+ job.log_dir = format!("{}/part{}", base.log_dir, i + 1);
|
|
|
+ info!("Planned ClairS job:\n{job}");
|
|
|
+ jobs.push(job);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Run all parts via Slurm
|
|
|
+ let outputs = run_many_sbatch(jobs)?;
|
|
|
+
|
|
|
+ // Merge somatic PASS VCFs into final clairs_passed_vcf()
|
|
|
+ merge_clairs_parts(&base, n_parts)?;
|
|
|
+
|
|
|
+ Ok(outputs)
|
|
|
+}
|
|
|
+
|
|
|
#[cfg(test)]
|
|
|
mod tests {
|
|
|
use super::*;
|