|
|
@@ -0,0 +1,867 @@
|
|
|
+//! # Straglr Short Tandem Repeat Genotyper
|
|
|
+//!
|
|
|
+//! This module provides wrappers for [Straglr](https://github.com/bcgsc/straglr),
|
|
|
+//! a genotyper for short tandem repeats (STRs) optimized for long-read sequencing data.
|
|
|
+//!
|
|
|
+//! ## Overview
|
|
|
+//!
|
|
|
+//! Straglr detects and genotypes STR expansions from long-read data, including:
|
|
|
+//! - Pathogenic repeat expansions (Huntington's disease, SCAs, FXS, etc.)
|
|
|
+//! - Genome-wide STR profiling
|
|
|
+//! - De novo repeat expansion detection
|
|
|
+//! - Support for both known and novel STR loci
|
|
|
+//!
|
|
|
+//! ## Key Features
|
|
|
+//!
|
|
|
+//! - **Pathogenic repeat detection** - Identifies disease-causing STR expansions
|
|
|
+//! - **Long-read optimized** - Leverages full-length reads spanning repeat regions
|
|
|
+//! - **Locus annotation** - Uses BED file of known pathogenic loci
|
|
|
+//! - **VCF output** - Optional variant-style output for downstream analysis
|
|
|
+//! - **Solo and paired modes** - Single-sample or tumor-normal analysis
|
|
|
+//!
|
|
|
+//! ## Requirements
|
|
|
+//!
|
|
|
+//! Before running Straglr, ensure:
|
|
|
+//! - BAM file is indexed (`.bai` file present)
|
|
|
+//! - Reference genome is accessible
|
|
|
+//! - STR loci BED file is configured (`config.straglr_loci_bed`)
|
|
|
+//! - Python environment with Straglr is available
|
|
|
+//!
|
|
|
+//! ## Output Files
|
|
|
+//!
|
|
|
+//! Paired mode TSV output:
|
|
|
+//! ```text
|
|
|
+//! {result_dir}/{id}/straglr/{id}_straglr.tsv
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! Solo mode TSV output:
|
|
|
+//! ```text
|
|
|
+//! {result_dir}/{id}/straglr_solo/{time_point}/{id}_{time_point}_straglr.tsv
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ## Usage
|
|
|
+//!
|
|
|
+//! ### Paired (Tumor-Normal) Mode
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::callers::straglr::Straglr;
|
|
|
+//! use pandora_lib_promethion::config::Config;
|
|
|
+//! use pandora_lib_promethion::pipes::Initialize;
|
|
|
+//! use pandora_lib_promethion::runners::Run;
|
|
|
+//!
|
|
|
+//! let config = Config::default();
|
|
|
+//! let mut caller = Straglr::initialize("sample_001", &config)?;
|
|
|
+//!
|
|
|
+//! if caller.should_run() {
|
|
|
+//! caller.run()?;
|
|
|
+//! }
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Solo Mode
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::callers::straglr::StraglrSolo;
|
|
|
+//! use pandora_lib_promethion::pipes::InitializeSolo;
|
|
|
+//!
|
|
|
+//! let config = Config::default();
|
|
|
+//! let mut caller = StraglrSolo::initialize("sample_001", "norm", &config)?;
|
|
|
+//! caller.run()?;
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Chunked Parallel Mode (Genome-Wide)
|
|
|
+//!
|
|
|
+//! For whole-genome STR genotyping, use the chunked execution mode to parallelize:
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::callers::straglr::run_straglr_chunked;
|
|
|
+//! use pandora_lib_promethion::config::Config;
|
|
|
+//!
|
|
|
+//! let config = Config::default();
|
|
|
+//!
|
|
|
+//! // Run genome-wide genotyping with 20 parallel jobs
|
|
|
+//! run_straglr_chunked("sample_001", "norm", &config, 20)?;
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ### Loading and Analyzing Results
|
|
|
+//!
|
|
|
+//! ```ignore
|
|
|
+//! use pandora_lib_promethion::callers::straglr::Straglr;
|
|
|
+//! use pandora_lib_promethion::pipes::Initialize;
|
|
|
+//!
|
|
|
+//! let config = Config::default();
|
|
|
+//! let caller = Straglr::initialize("sample_001", &config)?;
|
|
|
+//!
|
|
|
+//! // Load results from both samples
|
|
|
+//! let (normal, tumor) = caller.load_results()?;
|
|
|
+//!
|
|
|
+//! // Find pathogenic expansions (e.g., >40 repeats)
|
|
|
+//! for str_locus in &normal {
|
|
|
+//! if str_locus.is_expanded(40) {
|
|
|
+//! println!("Expanded repeat at {}: {} copies ({})",
|
|
|
+//! str_locus.location_string(),
|
|
|
+//! str_locus.max_copy_number().unwrap(),
|
|
|
+//! str_locus.repeat_unit);
|
|
|
+//! }
|
|
|
+//! }
|
|
|
+//!
|
|
|
+//! // Find somatic STR changes
|
|
|
+//! let changes = caller.find_somatic_changes(2)?;
|
|
|
+//! for (location, normal, tumor, diff) in changes {
|
|
|
+//! println!("{}: Normal={:?}, Tumor={:?}, Diff={}",
|
|
|
+//! location, normal.copy_numbers, tumor.copy_numbers, diff);
|
|
|
+//! }
|
|
|
+//! # Ok::<(), anyhow::Error>(())
|
|
|
+//! ```
|
|
|
+//!
|
|
|
+//! ## References
|
|
|
+//!
|
|
|
+//! - [Straglr GitHub](https://github.com/bcgsc/straglr)
|
|
|
+//! - [Straglr Paper](https://doi.org/10.1186/s13059-021-02447-3)
|
|
|
+use crate::{
|
|
|
+ commands::{Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner},
|
|
|
+ config::Config,
|
|
|
+ helpers::{is_file_older, remove_dir_if_exists},
|
|
|
+ io::straglr::{read_straglr_tsv, StraglrRow},
|
|
|
+ pipes::{Initialize, InitializeSolo, ShouldRun, Version},
|
|
|
+ run, run_many,
|
|
|
+ runners::Run,
|
|
|
+};
|
|
|
+use anyhow::Context;
|
|
|
+use log::{debug, info};
|
|
|
+use std::{
|
|
|
+ fs::{self, File},
|
|
|
+ io::{BufRead, BufReader, Write},
|
|
|
+ path::Path,
|
|
|
+};
|
|
|
+
|
|
|
/// Straglr paired (tumor-normal) STR genotyper.
///
/// Executes Straglr for STR genotyping on both tumor and normal samples,
/// enabling detection of somatic STR expansions or contractions.
///
/// Construct via [`Initialize::initialize`]; outputs land in
/// `config.straglr_output_dir(id)` as one TSV per sample.
///
/// # Fields
///
/// - `id` - Sample identifier (e.g., "34528")
/// - `config` - Global pipeline configuration
/// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/straglr")
#[derive(Debug)]
pub struct Straglr {
    /// Sample identifier
    pub id: String,
    /// Global pipeline configuration (owned clone, so the caller's `Config` is not borrowed)
    pub config: Config,
    /// Directory for log file storage
    pub log_dir: String,
}
|
|
|
+
|
|
|
+impl Initialize for Straglr {
|
|
|
+ /// Initializes a new Straglr instance for a given sample ID and configuration.
|
|
|
+ ///
|
|
|
+ /// Creates the output log directory path and optionally cleans up previous output files
|
|
|
+ /// if `straglr_force` is set.
|
|
|
+ ///
|
|
|
+ /// # Arguments
|
|
|
+ ///
|
|
|
+ /// * `id` - The sample ID
|
|
|
+ /// * `config` - The execution configuration
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ ///
|
|
|
+ /// A `Straglr` instance wrapped in `Ok`, or an error if setup fails
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ ///
|
|
|
+ /// Returns an error if:
|
|
|
+ /// - `config.straglr_force` is true and output directory cannot be removed
|
|
|
+ /// - Directory deletion fails due to permissions or I/O errors
|
|
|
+ fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
|
|
|
+ info!("Initialize Straglr for {id}.");
|
|
|
+
|
|
|
+ let log_dir = format!("{}/{}/log/straglr", config.result_dir, id);
|
|
|
+ let straglr = Self {
|
|
|
+ id: id.to_string(),
|
|
|
+ config: config.clone(),
|
|
|
+ log_dir,
|
|
|
+ };
|
|
|
+
|
|
|
+ if straglr.config.straglr_force {
|
|
|
+ remove_dir_if_exists(&straglr.config.straglr_output_dir(id))?;
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(straglr)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl ShouldRun for Straglr {
|
|
|
+ /// Determines whether Straglr should re-run based on whether the output TSV
|
|
|
+ /// is older than either the tumor or normal BAM file.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ ///
|
|
|
+ /// `true` if Straglr needs to be re-run, otherwise `false`
|
|
|
+ fn should_run(&self) -> bool {
|
|
|
+ let normal_tsv = &self.config.straglr_normal_tsv(&self.id);
|
|
|
+ let tumor_tsv = &self.config.straglr_tumor_tsv(&self.id);
|
|
|
+
|
|
|
+ let result = is_file_older(normal_tsv, &self.config.normal_bam(&self.id), true)
|
|
|
+ .unwrap_or(true)
|
|
|
+ || is_file_older(normal_tsv, &self.config.tumoral_bam(&self.id), true).unwrap_or(true)
|
|
|
+ || is_file_older(tumor_tsv, &self.config.normal_bam(&self.id), true).unwrap_or(true)
|
|
|
+ || is_file_older(tumor_tsv, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
|
|
|
+
|
|
|
+ if result {
|
|
|
+ info!("Straglr should run for: {}.", self.id);
|
|
|
+ }
|
|
|
+ result
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl Run for Straglr {
|
|
|
+ /// Runs the Straglr STR genotyper on both normal and tumor BAM files.
|
|
|
+ ///
|
|
|
+ /// Executes Straglr separately for normal and tumor samples, producing
|
|
|
+ /// TSV files with STR genotypes for each.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ ///
|
|
|
+ /// `Ok(())` if everything runs successfully; otherwise, an error with context.
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ ///
|
|
|
+ /// Returns an error if:
|
|
|
+ /// - Straglr is already up-to-date (`should_run()` returns false)
|
|
|
+ /// - Output directory cannot be created
|
|
|
+ /// - Tumor or normal BAM files are missing or corrupted
|
|
|
+ /// - Reference genome is missing
|
|
|
+ /// - STR loci BED file is missing or malformed
|
|
|
+ /// - Straglr execution fails for either sample
|
|
|
+ /// - Log files cannot be written
|
|
|
+ fn run(&mut self) -> anyhow::Result<()> {
|
|
|
+ if !self.should_run() {
|
|
|
+ anyhow::bail!("Straglr is up-to-date");
|
|
|
+ }
|
|
|
+
|
|
|
+ info!("Running Straglr v{}", Straglr::version(&self.config)?);
|
|
|
+
|
|
|
+ let id = &self.id;
|
|
|
+ let output_dir = self.config.straglr_output_dir(id);
|
|
|
+ fs::create_dir_all(&output_dir).context("Failed to create Straglr output directory")?;
|
|
|
+
|
|
|
+ // Run on normal sample
|
|
|
+ let normal_tsv = self.config.straglr_normal_tsv(id);
|
|
|
+ if !Path::new(&normal_tsv).exists() {
|
|
|
+ info!("Running Straglr on normal sample: {}", id);
|
|
|
+ let mut job = StraglrJob {
|
|
|
+ conda_sh: self.config.conda_sh.clone(),
|
|
|
+ straglr_bin: self.config.straglr_bin.clone(),
|
|
|
+ bam: self.config.normal_bam(id),
|
|
|
+ reference: self.config.reference.clone(),
|
|
|
+ loci_bed: self.config.straglr_loci_bed.clone(),
|
|
|
+ output_prefix: format!("{}/{}_normal", output_dir, id),
|
|
|
+ min_support: self.config.straglr_min_support,
|
|
|
+ min_cluster_size: self.config.straglr_min_cluster_size,
|
|
|
+ genotype_in_size: self.config.straglr_genotype_in_size,
|
|
|
+ };
|
|
|
+
|
|
|
+ let output = run!(&self.config, &mut job)
|
|
|
+ .context("Error while running Straglr on normal sample")?;
|
|
|
+
|
|
|
+ let log_file = format!("{}/straglr_normal_", self.log_dir);
|
|
|
+ output
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context(format!("Error while writing Straglr logs into {log_file}"))?;
|
|
|
+ } else {
|
|
|
+ debug!(
|
|
|
+ "Straglr normal TSV already exists for {}, skipping execution.",
|
|
|
+ self.id
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ // Run on tumor sample
|
|
|
+ let tumor_tsv = self.config.straglr_tumor_tsv(id);
|
|
|
+ if !Path::new(&tumor_tsv).exists() {
|
|
|
+ info!("Running Straglr on tumor sample: {}", id);
|
|
|
+ let mut job = StraglrJob {
|
|
|
+ conda_sh: self.config.conda_sh.clone(),
|
|
|
+ straglr_bin: self.config.straglr_bin.clone(),
|
|
|
+ bam: self.config.tumoral_bam(id),
|
|
|
+ reference: self.config.reference.clone(),
|
|
|
+ loci_bed: self.config.straglr_loci_bed.clone(),
|
|
|
+ output_prefix: format!("{}/{}_tumor", output_dir, id),
|
|
|
+ min_support: self.config.straglr_min_support,
|
|
|
+ min_cluster_size: self.config.straglr_min_cluster_size,
|
|
|
+ genotype_in_size: self.config.straglr_genotype_in_size,
|
|
|
+ };
|
|
|
+
|
|
|
+ let output = run!(&self.config, &mut job)
|
|
|
+ .context("Error while running Straglr on tumor sample")?;
|
|
|
+
|
|
|
+ let log_file = format!("{}/straglr_tumor_", self.log_dir);
|
|
|
+ output
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context(format!("Error while writing Straglr logs into {log_file}"))?;
|
|
|
+ } else {
|
|
|
+ debug!(
|
|
|
+ "Straglr tumor TSV already exists for {}, skipping execution.",
|
|
|
+ self.id
|
|
|
+ );
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl Straglr {
|
|
|
+ /// Loads and parses the normal sample Straglr TSV results.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// Vector of STR loci from the normal sample
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if the TSV file cannot be read or parsed.
|
|
|
+ pub fn load_normal_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
+ let tsv_path = self.config.straglr_normal_tsv(&self.id);
|
|
|
+ read_straglr_tsv(&tsv_path)
|
|
|
+ .context(format!("Failed to read normal Straglr results from {}", tsv_path))
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Loads and parses the tumor sample Straglr TSV results.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// Vector of STR loci from the tumor sample
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if the TSV file cannot be read or parsed.
|
|
|
+ pub fn load_tumor_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
+ let tsv_path = self.config.straglr_tumor_tsv(&self.id);
|
|
|
+ read_straglr_tsv(&tsv_path)
|
|
|
+ .context(format!("Failed to read tumor Straglr results from {}", tsv_path))
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Loads both normal and tumor results as a tuple.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// `(normal_results, tumor_results)` tuple
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if either TSV file cannot be read or parsed.
|
|
|
+ pub fn load_results(&self) -> anyhow::Result<(Vec<StraglrRow>, Vec<StraglrRow>)> {
|
|
|
+ Ok((self.load_normal_results()?, self.load_tumor_results()?))
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Finds STR loci that differ between tumor and normal samples.
|
|
|
+ ///
|
|
|
+ /// Compares copy numbers at matching loci to identify somatic STR changes.
|
|
|
+ ///
|
|
|
+ /// # Arguments
|
|
|
+ /// * `min_difference` - Minimum copy number difference to report (default: 2)
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// Vector of tuples: `(locus_id, normal_row, tumor_row, copy_number_diff)`
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if results cannot be loaded.
|
|
|
+ pub fn find_somatic_changes(
|
|
|
+ &self,
|
|
|
+ min_difference: u32,
|
|
|
+ ) -> anyhow::Result<Vec<(String, StraglrRow, StraglrRow, i64)>> {
|
|
|
+ let (normal, tumor) = self.load_results()?;
|
|
|
+
|
|
|
+ let mut changes = Vec::new();
|
|
|
+
|
|
|
+ for normal_row in &normal {
|
|
|
+ let location = normal_row.location_string();
|
|
|
+
|
|
|
+ // Find matching locus in tumor
|
|
|
+ if let Some(tumor_row) = tumor.iter().find(|t| {
|
|
|
+ t.chrom == normal_row.chrom && t.start == normal_row.start && t.end == normal_row.end
|
|
|
+ }) {
|
|
|
+ // Compare max copy numbers
|
|
|
+ if let (Some(normal_cn), Some(tumor_cn)) =
|
|
|
+ (normal_row.max_copy_number(), tumor_row.max_copy_number())
|
|
|
+ {
|
|
|
+ let diff = tumor_cn as i64 - normal_cn as i64;
|
|
|
+ if diff.abs() >= min_difference as i64 {
|
|
|
+ changes.push((location, normal_row.clone(), tumor_row.clone(), diff));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(changes)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// Parameters for a single Straglr invocation (one BAM against one loci BED).
///
/// Implements [`JobCommand`] to render the shell command line, plus the
/// local and SLURM runner traits so it can be dispatched through the
/// `run!` / `run_many!` macros.
#[derive(Debug, Clone)]
struct StraglrJob {
    // Path to conda.sh; sourced so `conda activate` works in a non-interactive shell.
    conda_sh: String,
    // Path to the Straglr executable (or entry-point script).
    straglr_bin: String,
    // Input BAM file to genotype.
    bam: String,
    // Reference genome path.
    reference: String,
    // BED file of STR loci to genotype (passed via `--loci`).
    loci_bed: String,
    // Value passed to `--output`; Straglr derives its output file names from it.
    output_prefix: String,
    // Value passed to `--min_support`.
    min_support: u32,
    // Value passed to `--min_cluster_size`.
    min_cluster_size: u32,
    // When true, `--genotype_in_size` is appended to the command line.
    genotype_in_size: bool,
}
|
|
|
+
|
|
|
+impl JobCommand for StraglrJob {
|
|
|
+ fn cmd(&self) -> String {
|
|
|
+ let mut cmd = format!(
|
|
|
+ "source {conda_sh} && conda activate straglr_env && {straglr} {bam} {reference} --loci {loci} --min_support {min_sup} --min_cluster_size {min_clust} --output {output}",
|
|
|
+ conda_sh = self.conda_sh,
|
|
|
+ straglr = self.straglr_bin,
|
|
|
+ bam = self.bam,
|
|
|
+ reference = self.reference,
|
|
|
+ loci = self.loci_bed,
|
|
|
+ min_sup = self.min_support,
|
|
|
+ min_clust = self.min_cluster_size,
|
|
|
+ output = self.output_prefix
|
|
|
+ );
|
|
|
+
|
|
|
+ if self.genotype_in_size {
|
|
|
+ cmd.push_str(" --genotype_in_size");
|
|
|
+ }
|
|
|
+
|
|
|
+ cmd
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
/// Allows a single Straglr job to run directly on the local machine.
impl LocalRunner for StraglrJob {}

/// Allows several Straglr jobs to run locally as a batch (used by `run_many!`).
impl LocalBatchRunner for StraglrJob {}
|
|
|
+
|
|
|
+impl SlurmRunner for StraglrJob {
|
|
|
+ fn slurm_args(&self) -> Vec<String> {
|
|
|
+ SlurmParams {
|
|
|
+ job_name: Some("straglr".into()),
|
|
|
+ partition: Some("shortq".into()),
|
|
|
+ cpus_per_task: Some(4),
|
|
|
+ mem: Some("16G".into()),
|
|
|
+ gres: None,
|
|
|
+ }
|
|
|
+ .to_args()
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
impl SbatchRunner for StraglrJob {
    /// SLURM resource requirements for `sbatch` submission of a Straglr job.
    ///
    /// 4 CPUs / 16G on the "shortq" partition; no GPU resources requested.
    fn slurm_params(&self) -> SlurmParams {
        SlurmParams {
            job_name: Some("straglr".into()),
            partition: Some("shortq".into()),
            cpus_per_task: Some(4),
            mem: Some("16G".into()),
            gres: None,
        }
    }
}
|
|
|
+
|
|
|
impl Version for Straglr {
    /// Retrieves the Straglr version by running `straglr --version`.
    ///
    /// # Errors
    /// Returns an error if command execution fails or version parsing fails.
    fn version(config: &Config) -> anyhow::Result<String> {
        // Local throwaway job type: same conda activation as StraglrJob, but
        // the command only asks for the version.
        struct VersionJob {
            conda_sh: String,
            straglr_bin: String,
        }

        impl JobCommand for VersionJob {
            fn cmd(&self) -> String {
                format!(
                    "source {} && conda activate straglr_env && {} --version",
                    self.conda_sh, self.straglr_bin
                )
            }
        }

        impl LocalRunner for VersionJob {}

        impl SlurmRunner for VersionJob {
            fn slurm_args(&self) -> Vec<String> {
                // Minimal resources: this only prints a version string.
                SlurmParams {
                    job_name: Some("straglr_version".into()),
                    partition: Some("shortq".into()),
                    cpus_per_task: Some(1),
                    mem: Some("1G".into()),
                    gres: None,
                }
                .to_args()
            }
        }

        let mut version_job = VersionJob {
            conda_sh: config.conda_sh.clone(),
            straglr_bin: config.straglr_bin.clone(),
        };

        let out =
            run!(&config, &mut version_job).context("Error while running `straglr --version`")?;

        // Search stdout and stderr together — presumably the tool may print
        // its version on either stream; TODO confirm for the installed build.
        let combined = format!("{}{}", out.stdout, out.stderr);
        let v = combined
            .lines()
            .find(|line| line.contains("straglr") || line.contains("version"))
            .map(|line| line.trim().to_string())
            .ok_or_else(|| anyhow::anyhow!("Could not parse straglr version from output"))?;

        Ok(v)
    }
}
|
|
|
+
|
|
|
/// Straglr solo (single-sample) STR genotyper.
///
/// Executes Straglr for STR genotyping on a single BAM file.
/// Useful for germline STR analysis or when no matched normal is available.
///
/// Construct via [`InitializeSolo::initialize`]; output lands in
/// `config.straglr_solo_output_dir(id, time)`.
///
/// # Fields
///
/// - `id` - Sample identifier (e.g., "34528")
/// - `time` - Time point label: typically `config.normal_name` ("norm") or `config.tumoral_name` ("diag")
/// - `config` - Global pipeline configuration
/// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/straglr_solo")
#[derive(Debug)]
pub struct StraglrSolo {
    /// Sample identifier
    pub id: String,
    /// Time point identifier (e.g., "norm" or "diag")
    pub time: String,
    /// Global pipeline configuration (owned clone)
    pub config: Config,
    /// Directory for log file storage
    pub log_dir: String,
}
|
|
|
+
|
|
|
+impl InitializeSolo for StraglrSolo {
|
|
|
+ /// Initializes Straglr solo analysis for a sample at a specific time point.
|
|
|
+ ///
|
|
|
+ /// Creates necessary log directory.
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if directory creation fails.
|
|
|
+ fn initialize(id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
|
|
|
+ let log_dir = format!("{}/{}/log/straglr_solo", config.result_dir, id);
|
|
|
+ if !Path::new(&log_dir).exists() {
|
|
|
+ fs::create_dir_all(&log_dir)
|
|
|
+ .context(format!("Failed to create {log_dir} directory"))?;
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(StraglrSolo {
|
|
|
+ id: id.to_string(),
|
|
|
+ time: time.to_string(),
|
|
|
+ config: config.clone(),
|
|
|
+ log_dir,
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl Run for StraglrSolo {
|
|
|
+ /// Runs the Straglr pipeline for a single sample.
|
|
|
+ ///
|
|
|
+ /// Skips if output file already exists.
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if Straglr execution or log writing fails.
|
|
|
+ fn run(&mut self) -> anyhow::Result<()> {
|
|
|
+ let id = &self.id;
|
|
|
+ let time = &self.time;
|
|
|
+
|
|
|
+ let output_tsv = &self.config.straglr_solo_tsv(id, time);
|
|
|
+
|
|
|
+ if !Path::new(output_tsv).exists() {
|
|
|
+ let output_dir = self.config.straglr_solo_output_dir(id, time);
|
|
|
+ fs::create_dir_all(&output_dir)
|
|
|
+ .context("Failed to create Straglr solo output directory")?;
|
|
|
+
|
|
|
+ let mut job = StraglrJob {
|
|
|
+ conda_sh: self.config.conda_sh.clone(),
|
|
|
+ straglr_bin: self.config.straglr_bin.clone(),
|
|
|
+ bam: self.config.solo_bam(id, time),
|
|
|
+ reference: self.config.reference.clone(),
|
|
|
+ loci_bed: self.config.straglr_loci_bed.clone(),
|
|
|
+ output_prefix: format!("{}/{}_{}", output_dir, id, time),
|
|
|
+ min_support: self.config.straglr_min_support,
|
|
|
+ min_cluster_size: self.config.straglr_min_cluster_size,
|
|
|
+ genotype_in_size: self.config.straglr_genotype_in_size,
|
|
|
+ };
|
|
|
+
|
|
|
+ let report =
|
|
|
+ run!(&self.config, &mut job).context("Error while running straglr solo")?;
|
|
|
+
|
|
|
+ let log_file = format!("{}/straglr_", self.log_dir);
|
|
|
+ report
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context(format!("Error while writing logs into {log_file}"))?;
|
|
|
+ } else {
|
|
|
+ debug!("Straglr output TSV already exists.");
|
|
|
+ }
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl StraglrSolo {
|
|
|
+ /// Loads and parses the Straglr TSV results for this solo sample.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// Vector of STR loci from the solo sample
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if the TSV file cannot be read or parsed.
|
|
|
+ pub fn load_results(&self) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
+ let tsv_path = self.config.straglr_solo_tsv(&self.id, &self.time);
|
|
|
+ read_straglr_tsv(&tsv_path)
|
|
|
+ .context(format!("Failed to read Straglr results from {}", tsv_path))
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Filters results to show only expanded repeats above a threshold.
|
|
|
+ ///
|
|
|
+ /// # Arguments
|
|
|
+ /// * `min_copy_number` - Minimum copy number threshold
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// Vector of STR loci with copy numbers >= threshold
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// Returns an error if results cannot be loaded.
|
|
|
+ pub fn load_expanded_repeats(&self, min_copy_number: u32) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
+ let results = self.load_results()?;
|
|
|
+ Ok(results
|
|
|
+ .into_iter()
|
|
|
+ .filter(|row| row.is_expanded(min_copy_number))
|
|
|
+ .collect())
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Runs Straglr in parallel chunks for genome-wide STR genotyping.
|
|
|
+///
|
|
|
+/// Splits the genome into `n_parts` regions, creates temporary BED files for each region,
|
|
|
+/// runs Straglr in parallel, and merges the results into a single TSV file.
|
|
|
+///
|
|
|
+/// This function is designed for whole-genome STR genotyping where processing the entire
|
|
|
+/// genome at once would be too slow. By splitting into chunks, multiple Straglr instances
|
|
|
+/// can run in parallel.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `id` - Sample identifier
|
|
|
+/// * `time_point` - Time point label (e.g., "norm", "diag")
|
|
|
+/// * `config` - Global pipeline configuration
|
|
|
+/// * `n_parts` - Number of parallel chunks (will be adjusted if genome is smaller)
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+///
|
|
|
+/// `Ok(())` if all chunks complete successfully and merge succeeds
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if:
|
|
|
+/// - `n_parts` is 0
|
|
|
+/// - BAM file cannot be opened or has no header
|
|
|
+/// - Temporary BED files cannot be created
|
|
|
+/// - Any Straglr chunk fails to execute
|
|
|
+/// - TSV merging fails
|
|
|
+/// - Final output file cannot be written
|
|
|
+///
|
|
|
+/// # Implementation Details
|
|
|
+///
|
|
|
+/// 1. Reads BAM header to determine genome sizes
|
|
|
+/// 2. Splits genome into approximately equal-sized regions
|
|
|
+/// 3. Creates temporary BED file for each region in `{tmp_dir}/straglr_chunk_{id}_{time}_{i}.bed`
|
|
|
+/// 4. Runs Straglr in parallel via `run_many!` macro
|
|
|
+/// 5. Concatenates all output TSV files (skipping headers from parts 2+)
|
|
|
+/// 6. Removes temporary BED and partial TSV files
|
|
|
+///
|
|
|
+/// # Example
|
|
|
+///
|
|
|
+/// ```ignore
|
|
|
+/// use pandora_lib_promethion::callers::straglr::run_straglr_chunked;
|
|
|
+/// use pandora_lib_promethion::config::Config;
|
|
|
+///
|
|
|
+/// let config = Config::default();
|
|
|
+///
|
|
|
+/// // Run genome-wide STR genotyping with 10 parallel jobs
|
|
|
+/// run_straglr_chunked("sample_001", "norm", &config, 10)?;
|
|
|
+/// # Ok::<(), anyhow::Error>(())
|
|
|
+/// ```
|
|
|
+pub fn run_straglr_chunked(
|
|
|
+ id: &str,
|
|
|
+ time_point: &str,
|
|
|
+ config: &Config,
|
|
|
+ n_parts: usize,
|
|
|
+) -> anyhow::Result<()> {
|
|
|
+ anyhow::ensure!(n_parts > 0, "n_parts must be > 0");
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "Running Straglr in {} parallel chunks for {} {}",
|
|
|
+ n_parts, id, time_point
|
|
|
+ );
|
|
|
+
|
|
|
+ // Get genome sizes from BAM header
|
|
|
+ let bam_path = config.solo_bam(id, time_point);
|
|
|
+ let reader = bam::Reader::from_path(&bam_path)
|
|
|
+ .with_context(|| format!("Failed to open BAM: {}", bam_path))?;
|
|
|
+ let header = bam::Header::from_template(reader.header());
|
|
|
+ let genome_sizes = get_genome_sizes(&header)?;
|
|
|
+
|
|
|
+ // Split genome into regions
|
|
|
+ let region_chunks = split_genome_into_n_regions_exact(&genome_sizes, n_parts);
|
|
|
+ let actual_n_parts = region_chunks.len();
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "Split genome into {} chunks for Straglr processing",
|
|
|
+ actual_n_parts
|
|
|
+ );
|
|
|
+
|
|
|
+ // Create output directory
|
|
|
+ let output_dir = config.straglr_solo_output_dir(id, time_point);
|
|
|
+ fs::create_dir_all(&output_dir)
|
|
|
+ .context(format!("Failed to create output directory: {}", output_dir))?;
|
|
|
+
|
|
|
+ // Create temporary BED files and jobs
|
|
|
+ let mut jobs = Vec::with_capacity(actual_n_parts);
|
|
|
+ let mut temp_bed_files = Vec::with_capacity(actual_n_parts);
|
|
|
+ let mut temp_tsv_files = Vec::with_capacity(actual_n_parts);
|
|
|
+
|
|
|
+ for (i, regions) in region_chunks.into_iter().enumerate() {
|
|
|
+ let part_num = i + 1;
|
|
|
+
|
|
|
+ // Create temporary BED file for this chunk
|
|
|
+ let bed_path = format!(
|
|
|
+ "{}/straglr_chunk_{}_{}_part{}.bed",
|
|
|
+ config.tmp_dir, id, time_point, part_num
|
|
|
+ );
|
|
|
+
|
|
|
+ let mut bed_file = File::create(&bed_path)
|
|
|
+ .context(format!("Failed to create temporary BED file: {}", bed_path))?;
|
|
|
+
|
|
|
+ // Write regions to BED file (format: chr\tstart\tend)
|
|
|
+ for region_str in ®ions {
|
|
|
+ // Parse region format: "chr1:1000-2000" -> "chr1\t1000\t2000"
|
|
|
+ if let Some((chr, range)) = region_str.split_once(':') {
|
|
|
+ if let Some((start, end)) = range.split_once('-') {
|
|
|
+ writeln!(bed_file, "{}\t{}\t{}", chr, start, end)
|
|
|
+ .context("Failed to write to BED file")?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ bed_file.flush().context("Failed to flush BED file")?;
|
|
|
+
|
|
|
+ temp_bed_files.push(bed_path.clone());
|
|
|
+
|
|
|
+ // Create job for this chunk
|
|
|
+ let output_prefix = format!("{}/{}_{}_part{}", output_dir, id, time_point, part_num);
|
|
|
+ let output_tsv = format!("{}_straglr.tsv", output_prefix);
|
|
|
+ temp_tsv_files.push(output_tsv.clone());
|
|
|
+
|
|
|
+ let job = StraglrJob {
|
|
|
+ conda_sh: config.conda_sh.clone(),
|
|
|
+ straglr_bin: config.straglr_bin.clone(),
|
|
|
+ bam: bam_path.clone(),
|
|
|
+ reference: config.reference.clone(),
|
|
|
+ loci_bed: bed_path, // Use the chunk BED file instead of global loci
|
|
|
+ output_prefix,
|
|
|
+ min_support: config.straglr_min_support,
|
|
|
+ min_cluster_size: config.straglr_min_cluster_size,
|
|
|
+ genotype_in_size: config.straglr_genotype_in_size,
|
|
|
+ };
|
|
|
+
|
|
|
+ jobs.push(job);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Run all chunks in parallel
|
|
|
+ info!("Executing {} Straglr jobs in parallel", actual_n_parts);
|
|
|
+ let outputs = run_many!(config, jobs)?;
|
|
|
+
|
|
|
+ // Save logs
|
|
|
+ let log_dir = format!("{}/{}/log/straglr_chunked", config.result_dir, id);
|
|
|
+ fs::create_dir_all(&log_dir).context("Failed to create log directory")?;
|
|
|
+
|
|
|
+ for (i, output) in outputs.iter().enumerate() {
|
|
|
+ let log_file = format!("{}/straglr_part{}_", log_dir, i + 1);
|
|
|
+ output
|
|
|
+ .save_to_file(&log_file)
|
|
|
+ .context(format!("Failed to save logs for part {}", i + 1))?;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Merge TSV files
|
|
|
+ info!("Merging {} TSV files", actual_n_parts);
|
|
|
+ let final_tsv = config.straglr_solo_tsv(id, time_point);
|
|
|
+ merge_tsv_files(&temp_tsv_files, &final_tsv)
|
|
|
+ .context("Failed to merge Straglr TSV files")?;
|
|
|
+
|
|
|
+ // Clean up temporary files
|
|
|
+ info!("Cleaning up temporary files");
|
|
|
+ for bed_file in &temp_bed_files {
|
|
|
+ if let Err(e) = fs::remove_file(bed_file) {
|
|
|
+ debug!("Failed to remove temporary BED file {}: {}", bed_file, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for tsv_file in &temp_tsv_files {
|
|
|
+ if let Err(e) = fs::remove_file(tsv_file) {
|
|
|
+ debug!("Failed to remove temporary TSV file {}: {}", tsv_file, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ info!(
|
|
|
+ "Straglr chunked execution completed for {} {} (merged into {})",
|
|
|
+ id, time_point, final_tsv
|
|
|
+ );
|
|
|
+
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+/// Merges multiple TSV files into a single output file.
|
|
|
+///
|
|
|
+/// Concatenates TSV files while preserving the header from the first file
|
|
|
+/// and skipping headers from subsequent files.
|
|
|
+///
|
|
|
+/// # Arguments
|
|
|
+///
|
|
|
+/// * `input_files` - Paths to input TSV files
|
|
|
+/// * `output_file` - Path to merged output TSV file
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if any file cannot be read or the output cannot be written.
|
|
|
+fn merge_tsv_files(input_files: &[String], output_file: &str) -> anyhow::Result<()> {
|
|
|
+ let mut output = File::create(output_file)
|
|
|
+ .context(format!("Failed to create output file: {}", output_file))?;
|
|
|
+
|
|
|
+ let mut first_file = true;
|
|
|
+
|
|
|
+ for (i, input_path) in input_files.iter().enumerate() {
|
|
|
+ if !Path::new(input_path).exists() {
|
|
|
+ debug!("Skipping non-existent file: {}", input_path);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ let content = fs::read_to_string(input_path)
|
|
|
+ .context(format!("Failed to read input file: {}", input_path))?;
|
|
|
+
|
|
|
+ if first_file {
|
|
|
+ // Write entire first file including header
|
|
|
+ output
|
|
|
+ .write_all(content.as_bytes())
|
|
|
+ .context("Failed to write to output file")?;
|
|
|
+ first_file = false;
|
|
|
+ } else {
|
|
|
+ // Skip header line(s) for subsequent files
|
|
|
+ for line in content.lines() {
|
|
|
+ if !line.starts_with('#') && !line.trim().is_empty() {
|
|
|
+ writeln!(output, "{}", line).context("Failed to write line to output")?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ debug!("Merged part {} from {}", i + 1, input_path);
|
|
|
+ }
|
|
|
+
|
|
|
+ output.flush().context("Failed to flush output file")?;
|
|
|
+ Ok(())
|
|
|
+}
|