|
@@ -1,19 +1,57 @@
|
|
|
//! Longphase haplotagging, phasing, and modcall runners.
|
|
//! Longphase haplotagging, phasing, and modcall runners.
|
|
|
//!
|
|
//!
|
|
|
-//! All steps use the shared runner traits (local/Slurm) driven by the global `Config`.
|
|
|
|
|
|
|
+//! This module provides runners for [longphase](https://github.com/twolinin/longphase),
|
|
|
|
|
+//! a tool for SNP phasing and haplotagging using long-read sequencing data.
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Overview
|
|
|
|
|
+//!
|
|
|
|
|
+//! The module implements three main operations for somatic cancer genomics workflows:
|
|
|
|
|
+//!
|
|
|
|
|
+//! - **Modcall** ([`LongphaseModcallSolo`]): Extracts methylation information from BAM files
|
|
|
|
|
+//! to assist phasing. Uses adaptive thresholds derived from modkit summary.
|
|
|
|
|
+//! - **Phase** ([`LongphasePhase`]): Phases [`ClairS`] germline variants using read-backed phasing
|
|
|
|
|
+//! with methylation support.
|
|
|
|
|
+//! - **Haplotag** ([`LongphaseHap`]): Tags BAM reads with haplotype assignments (HP tags)
|
|
|
|
|
+//! based on phased variants.
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Execution Modes
|
|
|
|
|
+//!
|
|
|
|
|
+//! All runners support both local and SLURM cluster execution via the shared runner traits.
|
|
|
|
|
+//! [`LongphaseHap`] additionally supports chunked per-chromosome execution on SLURM for
|
|
|
|
|
+//! improved parallelism on large BAMs.
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Workflow
|
|
|
|
|
+//!
|
|
|
|
|
+//! The typical somatic workflow is orchestrated by [`run_phasing_somatic`]:
|
|
|
|
|
+//!
|
|
|
|
|
+//! ```text
|
|
|
|
|
+//! 1. Modcall (normal) ─┐
|
|
|
|
|
+//! 2. Modcall (tumor) ─┼─> 3. Phase (germline) ─┬─> 4. Haplotag (normal)
|
|
|
|
|
+//! └─> 5. Haplotag (tumor)
|
|
|
|
|
+//! ```
|
|
|
|
|
+//!
|
|
|
|
|
+//! # Example
|
|
|
|
|
+//!
|
|
|
|
|
+//! ```no_run
|
|
|
|
|
+//! use crate::config::Config;
|
|
|
|
|
+//! use crate::commands::longphase::run_phasing_somatic;
|
|
|
|
|
+//!
|
|
|
|
|
+//! let config = Config::load("config.toml")?;
|
|
|
|
|
+//! run_phasing_somatic("SAMPLE_001", &config)?;
|
|
|
|
|
+//! ```
|
|
|
use crate::{
|
|
use crate::{
|
|
|
commands::{
|
|
commands::{
|
|
|
bcftools::{BcftoolsCompress, BcftoolsIndex, BcftoolsKeepPass},
|
|
bcftools::{BcftoolsCompress, BcftoolsIndex, BcftoolsKeepPass},
|
|
|
samtools::{SamtoolsIndex, SamtoolsMergeMany},
|
|
samtools::{SamtoolsIndex, SamtoolsMergeMany},
|
|
|
},
|
|
},
|
|
|
config::Config,
|
|
config::Config,
|
|
|
- helpers::{get_genome_sizes, is_file_older, path_prefix},
|
|
|
|
|
|
|
+ helpers::{get_genome_sizes, is_file_older, path_prefix, TempDirGuard},
|
|
|
|
|
+ locker::SampleLock,
|
|
|
pipes::{Initialize, InitializeSolo, ShouldRun},
|
|
pipes::{Initialize, InitializeSolo, ShouldRun},
|
|
|
run, run_many,
|
|
run, run_many,
|
|
|
runners::Run,
|
|
runners::Run,
|
|
|
};
|
|
};
|
|
|
use anyhow::Context;
|
|
use anyhow::Context;
|
|
|
-use log::warn;
|
|
|
|
|
use rust_htslib::bam::{self, Read};
|
|
use rust_htslib::bam::{self, Read};
|
|
|
use std::{
|
|
use std::{
|
|
|
fs,
|
|
fs,
|
|
@@ -24,7 +62,39 @@ use uuid::Uuid;
|
|
|
|
|
|
|
|
use super::modkit::ModkitSummary;
|
|
use super::modkit::ModkitSummary;
|
|
|
|
|
|
|
|
|
|
+/// Runs the complete somatic phasing pipeline for a sample.
|
|
|
|
|
+///
|
|
|
|
|
+/// This function orchestrates the full phasing workflow for tumor/normal pairs:
|
|
|
|
|
+///
|
|
|
|
|
+/// 1. Runs modcall on normal BAM to extract methylation calls
|
|
|
|
|
+/// 2. Runs modcall on tumor BAM to extract methylation calls
|
|
|
|
|
+/// 3. Phases germline variants from [`ClairS`] using normal BAM and methylation data
|
|
|
|
|
+/// 4. Haplotags normal BAM with phased haplotype assignments
|
|
|
|
|
+/// 5. Haplotags tumor BAM with phased haplotype assignments
|
|
|
|
|
+///
|
|
|
|
|
+/// # Arguments
|
|
|
|
|
+///
|
|
|
|
|
+/// * `id` - Sample identifier
|
|
|
|
|
+/// * `config` - Pipeline configuration containing paths and parameters
|
|
|
|
|
+///
|
|
|
|
|
+/// # Errors
|
|
|
|
|
+///
|
|
|
|
|
+/// Returns an error if:
|
|
|
|
|
+/// - Sample lock cannot be acquired (another process is working on this sample)
|
|
|
|
|
+/// - Any pipeline step fails
|
|
|
|
|
+///
|
|
|
|
|
+/// # Example
|
|
|
|
|
+///
|
|
|
|
|
+/// ```no_run
|
|
|
|
|
+/// let config = Config::load("config.toml")?;
|
|
|
|
|
+/// run_phasing_somatic("SAMPLE_001", &config)?;
|
|
|
|
|
+/// ```
|
|
|
pub fn run_phasing_somatic(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
pub fn run_phasing_somatic(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
|
|
+ // Acquire lock before any work
|
|
|
|
|
+ let lock_dir = format!("{}/locks", config.result_dir);
|
|
|
|
|
+ let _lock = SampleLock::acquire(&lock_dir, id, "phasing_somatic")
|
|
|
|
|
+ .with_context(|| format!("Cannot start somatic phasing for {id}"))?;
|
|
|
|
|
+
|
|
|
LongphaseModcallSolo::initialize(id, &config.normal_name, config)?.run()?;
|
|
LongphaseModcallSolo::initialize(id, &config.normal_name, config)?.run()?;
|
|
|
LongphaseModcallSolo::initialize(id, &config.tumoral_name, config)?.run()?;
|
|
LongphaseModcallSolo::initialize(id, &config.tumoral_name, config)?.run()?;
|
|
|
|
|
|
|
@@ -36,14 +106,40 @@ pub fn run_phasing_somatic(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
Ok(())
|
|
Ok(())
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Haplotags BAM reads using phased variant calls.
|
|
|
|
|
+///
|
|
|
|
|
+/// Assigns HP (haplotype) tags to reads based on their overlap with phased
|
|
|
|
|
+/// heterozygous variants. This enables downstream haplotype-aware analyses
|
|
|
|
|
+/// such as allele-specific expression or somatic variant phasing.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Execution Modes
|
|
|
|
|
+///
|
|
|
|
|
+/// - **Local**: Processes entire BAM in a single job
|
|
|
|
|
+/// - **SLURM**: Splits by chromosome, processes in parallel, then merges
|
|
|
|
|
+///
|
|
|
|
|
+/// The SLURM chunked mode significantly improves runtime for large BAMs
|
|
|
|
|
+/// by parallelizing across chromosomes.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Output
|
|
|
|
|
+///
|
|
|
|
|
+/// Produces a haplotagged BAM file with HP:i:1 or HP:i:2 tags on reads
|
|
|
|
|
+/// that overlap phased variants. Also tags supplementary alignments
|
|
|
|
|
+/// via `--tagSupplementary`.
|
|
|
#[derive(Debug, Clone)]
|
|
#[derive(Debug, Clone)]
|
|
|
pub struct LongphaseHap {
|
|
pub struct LongphaseHap {
|
|
|
|
|
+ /// Sample identifier.
|
|
|
pub id: String,
|
|
pub id: String,
|
|
|
|
|
+ /// Path to phased VCF file.
|
|
|
pub vcf: String,
|
|
pub vcf: String,
|
|
|
|
|
+ /// Input BAM file path.
|
|
|
pub bam: PathBuf,
|
|
pub bam: PathBuf,
|
|
|
|
|
+ /// Output haplotagged BAM prefix (longphase appends `.bam`).
|
|
|
pub bam_hp: PathBuf,
|
|
pub bam_hp: PathBuf,
|
|
|
|
|
+ /// Pipeline configuration.
|
|
|
pub config: Config,
|
|
pub config: Config,
|
|
|
|
|
+ /// Directory for log files.
|
|
|
pub log_dir: String,
|
|
pub log_dir: String,
|
|
|
|
|
+ /// Command arguments (built at runtime).
|
|
|
job_args: Vec<String>,
|
|
job_args: Vec<String>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -69,24 +165,6 @@ impl InitializeSolo for LongphaseHap {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
impl LongphaseHap {
|
|
impl LongphaseHap {
|
|
|
- pub fn new(id: &str, bam: &str, phased_vcf: &str, config: Config) -> Self {
|
|
|
|
|
- let log_dir = format!("{}/{}/log/longphase", config.result_dir, id);
|
|
|
|
|
-
|
|
|
|
|
- let bam = Path::new(bam);
|
|
|
|
|
- let new_fn = format!("{}_HP", bam.file_stem().unwrap().to_str().unwrap());
|
|
|
|
|
- let bam_hp = bam.with_file_name(new_fn);
|
|
|
|
|
-
|
|
|
|
|
- Self {
|
|
|
|
|
- id: id.to_string(),
|
|
|
|
|
- bam: bam.to_path_buf(),
|
|
|
|
|
- config,
|
|
|
|
|
- log_dir,
|
|
|
|
|
- vcf: phased_vcf.to_string(),
|
|
|
|
|
- bam_hp: bam_hp.to_path_buf(),
|
|
|
|
|
- job_args: Vec::new(),
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
/// Build haplotag arguments, optionally restricted to a given region.
|
|
/// Build haplotag arguments, optionally restricted to a given region.
|
|
|
fn build_job_args(&self, region: Option<&str>, out_prefix: &Path) -> Vec<String> {
|
|
fn build_job_args(&self, region: Option<&str>, out_prefix: &Path) -> Vec<String> {
|
|
|
let mut args = vec![
|
|
let mut args = vec![
|
|
@@ -115,6 +193,23 @@ impl LongphaseHap {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
impl LongphaseHap {
|
|
impl LongphaseHap {
|
|
|
|
|
+ /// Executes haplotagging in chunked mode on SLURM.
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// Splits the BAM by chromosome, submits parallel SLURM jobs for each,
|
|
|
|
|
+ /// then merges results into the final haplotagged BAM.
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// # Process
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// 1. Read chromosome list from BAM header
|
|
|
|
|
+ /// 2. Create temporary directory for per-chromosome outputs
|
|
|
|
|
+ /// 3. Submit parallel haplotag jobs via SLURM
|
|
|
|
|
+ /// 4. Merge all chromosome BAMs with samtools
|
|
|
|
|
+ /// 5. Index final merged BAM
|
|
|
|
|
+ /// 6. Clean up temporary files
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// # Errors
|
|
|
|
|
+ ///
|
|
|
|
|
+ /// Returns an error if any SLURM job fails, merge fails, or indexing fails.
|
|
|
fn run_chunked_slurm(&mut self) -> anyhow::Result<()> {
|
|
fn run_chunked_slurm(&mut self) -> anyhow::Result<()> {
|
|
|
// Read contigs from BAM header
|
|
// Read contigs from BAM header
|
|
|
let reader = bam::Reader::from_path(&self.bam)
|
|
let reader = bam::Reader::from_path(&self.bam)
|
|
@@ -134,6 +229,7 @@ impl LongphaseHap {
|
|
|
Path::new(&self.config.tmp_dir).join(format!("longphase_hap_{}", Uuid::new_v4()));
|
|
Path::new(&self.config.tmp_dir).join(format!("longphase_hap_{}", Uuid::new_v4()));
|
|
|
fs::create_dir_all(&tmp_dir)
|
|
fs::create_dir_all(&tmp_dir)
|
|
|
.with_context(|| format!("Failed to create tmp dir {}", tmp_dir.display()))?;
|
|
.with_context(|| format!("Failed to create tmp dir {}", tmp_dir.display()))?;
|
|
|
|
|
+ let _guard = TempDirGuard::new(tmp_dir.clone());
|
|
|
|
|
|
|
|
let mut jobs = Vec::with_capacity(chroms.len());
|
|
let mut jobs = Vec::with_capacity(chroms.len());
|
|
|
let mut tmp_bams = Vec::with_capacity(chroms.len());
|
|
let mut tmp_bams = Vec::with_capacity(chroms.len());
|
|
@@ -179,11 +275,9 @@ impl LongphaseHap {
|
|
|
run!(&self.config, &mut merge)
|
|
run!(&self.config, &mut merge)
|
|
|
.context("samtools merge failed for Longphase haplotag chunks")?;
|
|
.context("samtools merge failed for Longphase haplotag chunks")?;
|
|
|
|
|
|
|
|
- // (Optional) cleanup temp BAMs
|
|
|
|
|
for bam in tmp_bams {
|
|
for bam in tmp_bams {
|
|
|
- if let Err(e) = fs::remove_file(&bam) {
|
|
|
|
|
- warn!("Failed to remove temp BAM {}: {e}", bam.display());
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ let _ = fs::remove_file(&bam);
|
|
|
|
|
+ let _ = fs::remove_file(format!("{}.bai", bam.display()));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Index final BAM
|
|
// Index final BAM
|
|
@@ -203,7 +297,7 @@ impl Run for LongphaseHap {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
if !Path::new(&self.log_dir).exists() {
|
|
if !Path::new(&self.log_dir).exists() {
|
|
|
- fs::create_dir_all(&self.log_dir).expect("Failed to create output directory.");
|
|
|
|
|
|
|
+ fs::create_dir_all(&self.log_dir).context("Failed to create output directory.")?;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
if self.bam_hp.exists() {
|
|
if self.bam_hp.exists() {
|
|
@@ -282,16 +376,40 @@ impl crate::commands::SbatchRunner for LongphaseHap {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-// /data/tools/longphase_linux-x64 phase -s ClairS/clair3_normal_tumoral_germline_output.vcf.gz -b CUNY_diag_hs1_hp.bam -r /data/ref/hs1/chm13v2.0.fa -t 155 --ont -o ClairS/clair3_normal_tumoral_germline_output_PS
|
|
|
|
|
|
|
+/// Phases germline variants using read-backed phasing with methylation support.
|
|
|
|
|
+///
|
|
|
|
|
+/// Uses longphase's `phase` subcommand to assign phase sets (PS tags) to
|
|
|
|
|
+/// heterozygous germline variants. Incorporates methylation data from modcall
|
|
|
|
|
+/// to improve phasing accuracy and block lengths.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Input Requirements
|
|
|
|
|
+///
|
|
|
|
|
+/// - Germline VCF (from ClairS constitutional calling)
|
|
|
|
|
+/// - Normal BAM file
|
|
|
|
|
+/// - Modcall VCF (methylation calls)
|
|
|
|
|
+///
|
|
|
|
|
+/// # Output
|
|
|
|
|
+///
|
|
|
|
|
+/// Produces a phased VCF with:
|
|
|
|
|
+/// - GT fields updated with phased genotypes (e.g., `0|1` instead of `0/1`)
|
|
|
|
|
+/// - PS (phase set) tags indicating contiguous phased blocks
|
|
|
#[derive(Debug)]
|
|
#[derive(Debug)]
|
|
|
pub struct LongphasePhase {
|
|
pub struct LongphasePhase {
|
|
|
|
|
+ /// Sample identifier.
|
|
|
pub id: String,
|
|
pub id: String,
|
|
|
|
|
+ /// Input germline VCF path.
|
|
|
pub vcf: String,
|
|
pub vcf: String,
|
|
|
|
|
+ /// Output file prefix.
|
|
|
pub out_prefix: String,
|
|
pub out_prefix: String,
|
|
|
|
|
+ /// Normal BAM file path.
|
|
|
pub bam: String,
|
|
pub bam: String,
|
|
|
|
|
+ /// Pipeline configuration.
|
|
|
pub config: Config,
|
|
pub config: Config,
|
|
|
|
|
+ /// Directory for log files.
|
|
|
pub log_dir: String,
|
|
pub log_dir: String,
|
|
|
|
|
+ /// Modcall VCF with methylation data.
|
|
|
pub modcall_vcf: String,
|
|
pub modcall_vcf: String,
|
|
|
|
|
+ /// Command arguments (built at runtime).
|
|
|
job_args: Vec<String>,
|
|
job_args: Vec<String>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -363,7 +481,8 @@ impl crate::commands::SbatchRunner for LongphasePhase {
|
|
|
impl Run for LongphasePhase {
|
|
impl Run for LongphasePhase {
|
|
|
fn run(&mut self) -> anyhow::Result<()> {
|
|
fn run(&mut self) -> anyhow::Result<()> {
|
|
|
if !self.should_run() {
|
|
if !self.should_run() {
|
|
|
- info!("Germline phased vcf is up-to-date for {}.", self.id)
|
|
|
|
|
|
|
+ info!("Germline phased vcf is up-to-date for {}.", self.id);
|
|
|
|
|
+ return Ok(());
|
|
|
}
|
|
}
|
|
|
info!("Running longphase phase for: {}", self.vcf);
|
|
info!("Running longphase phase for: {}", self.vcf);
|
|
|
info!("Saving longphase phase results in: {}", self.out_prefix);
|
|
info!("Saving longphase phase results in: {}", self.out_prefix);
|
|
@@ -419,17 +538,44 @@ impl Run for LongphasePhase {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/// Extracts methylation calls from a BAM file for phasing support.
|
|
|
|
|
+///
|
|
|
|
|
+/// Uses longphase's `modcall` subcommand to identify methylated CpG sites
|
|
|
|
|
+/// from Oxford Nanopore modified base calls (MM/ML tags). The resulting
|
|
|
|
|
+/// methylation information improves phasing accuracy by providing additional
|
|
|
|
|
+/// haplotype-informative signal.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Threshold Selection
|
|
|
|
|
+///
|
|
|
|
|
+/// The methylation threshold is automatically determined from the modkit
|
|
|
|
|
+/// summary file, ensuring optimal sensitivity/specificity for the specific
|
|
|
|
|
+/// sample's modification calling quality.
|
|
|
|
|
+///
|
|
|
|
|
+/// # Output
|
|
|
|
|
+///
|
|
|
|
|
+/// Produces a VCF file with methylation calls at CpG sites, filtered to
|
|
|
|
|
+/// PASS-only variants for downstream phasing use.
|
|
|
#[derive(Debug)]
|
|
#[derive(Debug)]
|
|
|
pub struct LongphaseModcallSolo {
|
|
pub struct LongphaseModcallSolo {
|
|
|
|
|
+ /// Sample identifier.
|
|
|
pub id: String,
|
|
pub id: String,
|
|
|
|
|
+ /// Time point (normal/tumor identifier).
|
|
|
pub time: String,
|
|
pub time: String,
|
|
|
|
|
+ /// Input BAM file path.
|
|
|
pub bam: String,
|
|
pub bam: String,
|
|
|
|
|
+ /// Output file prefix.
|
|
|
pub prefix: String,
|
|
pub prefix: String,
|
|
|
|
|
+ /// Reference genome path.
|
|
|
pub reference: String,
|
|
pub reference: String,
|
|
|
|
|
+ /// Number of threads for modcall.
|
|
|
pub threads: u8,
|
|
pub threads: u8,
|
|
|
|
|
+ /// Directory for log files.
|
|
|
pub log_dir: String,
|
|
pub log_dir: String,
|
|
|
|
|
+ /// Methylation probability threshold from modkit.
|
|
|
pub mod_threshold: f64,
|
|
pub mod_threshold: f64,
|
|
|
|
|
+ /// Pipeline configuration.
|
|
|
pub config: Config,
|
|
pub config: Config,
|
|
|
|
|
+ /// Command arguments (built at runtime).
|
|
|
job_args: Vec<String>,
|
|
job_args: Vec<String>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -512,7 +658,7 @@ impl crate::commands::SlurmRunner for LongphaseModcallSolo {
|
|
|
impl crate::commands::SbatchRunner for LongphaseModcallSolo {
|
|
impl crate::commands::SbatchRunner for LongphaseModcallSolo {
|
|
|
fn slurm_params(&self) -> super::SlurmParams {
|
|
fn slurm_params(&self) -> super::SlurmParams {
|
|
|
crate::commands::SlurmParams {
|
|
crate::commands::SlurmParams {
|
|
|
- job_name: Some(format!("longphase_modcall_{}", self.id)),
|
|
|
|
|
|
|
+ job_name: Some(format!("longphase_modcall_{}_{}", self.id, self.time)),
|
|
|
cpus_per_task: Some(self.threads as u32),
|
|
cpus_per_task: Some(self.threads as u32),
|
|
|
mem: Some("60G".into()),
|
|
mem: Some("60G".into()),
|
|
|
partition: Some("shortq".into()),
|
|
partition: Some("shortq".into()),
|