Thomas 4 gün önce
ebeveyn
işleme
d6250eeae7

+ 60 - 0
pandora-config.example.toml

@@ -365,6 +365,66 @@ straglr_solo_output_dir = "{result_dir}/{id}/{time}/straglr"
 # Force Straglr recomputation.
 straglr_force = false
 
+#######################################
+# CoRAL
+#######################################
+# Number of CPU threads for the CoRAL reconstruction job.
+#
+# CoRAL is CPU-bound during breakpoint graph construction and quadratic
+# programming cycle extraction. 8–16 threads is sufficient for most
+# focal amplification cases; increase for highly complex ecDNA with
+# many amplicons.
+coral_threads = 16
+
+# Memory allocation for the CoRAL SLURM job (e.g. `"32G"`).
+#
+# Memory usage scales with amplicon complexity and BAM depth.
+# 32G is sufficient for typical WGS at 30–60×; increase to 64G
+# for highly rearranged genomes (chromothripsis, high ecDNA copy number).
+coral_slurm_mem = "64G"
+
+# SLURM partition to use for CoRAL jobs.
+#
+# CoRAL requires only CPU — do not submit to a GPU partition.
+coral_slurm_partition = "shortq"
+
+# Minimum copy number gain threshold for a segment to be considered
+# a focal amplification seed (CoRAL `--gain`).
+#
+# CoRAL applies this threshold to the raw absolute CN values from the
+# cn_segs BED — do NOT pre-correct for purity or ploidy, as this may
+# cause entire chromosome arms to exceed the threshold in aneuploid tumours.
+#
+# Default in CoRAL is 6.0 (diploid assumption). For hyperdiploid tumours
+# (e.g. hyperploid ALL, CML blast crisis) consider lowering to 4.0–5.0.
+coral_seed_gain = 6.0
+
+# Minimum size in base pairs for a CN segment to qualify as a seed
+# (CoRAL `--min-seed-size`).
+#
+# Segments below this size are discarded even if they exceed `coral_seed_gain`.
+# Two merged proximal segments (see `coral_max_seg_gap`) are evaluated
+# against this threshold as a single combined interval.
+#
+# Default in CoRAL is 100000 (100 kb). Reducing this risks including
+# artefactual short high-copy segments; increasing it misses small focal
+# amplifications (e.g. narrow EGFR or MYC peaks).
+coral_min_seed_size = 100000
+
+# Maximum gap in base pairs between two proximal CN segments to allow
+# merging into a single seed candidate (CoRAL `--max-seg-gap`).
+#
+# If two amplified segments are separated by a gap smaller than this value,
+# they are merged before the `coral_min_seed_size` filter is applied.
+# This handles cases where a single focal amplicon is split by a low-coverage
+# or diploid bin.
+#
+# Default in CoRAL is 300000 (300 kb). For haematological cancers with
+# compact focal amplifications (e.g. NUP214::ABL1, ABL1 amplification in
+# CML blast crisis) a tighter value such as 100000 reduces spurious merging
+# of adjacent independent amplicons.
+coral_max_seg_gap = 100000
+
 #######################################
 # Marlin
 #######################################

+ 341 - 0
src/callers/coral.rs

@@ -0,0 +1,341 @@
+//! # CoRAL — Focal Amplification Reconstruction
+//!
+//! Wraps [CoRAL](https://github.com/AmpliconSuite/CoRAL) for ecDNA and focal
+//! amplification cycle reconstruction from long-read WGS data.
+//!
+//! CoRAL takes as input:
+//! - A tumour BAM (haplotagged recommended)
+//! - A CN segments BED derived from SAVANA absolute CN output
+//! - A seeds BED of amplified intervals (CN >= 4 * ploidy)
+//!
+//! ## Pipeline position
+//!
+//! CoRAL runs **after** SAVANA CNA fitting. The converter
+//! [`savana_to_coral`] produces both required BED files from
+//! `{sample}_segmented_absolute_copy_number.tsv` and
+//! `{sample}_fitted_purity_ploidy.tsv`.
+//!
+//! ## Output files
+//!
+//! | File | Description |
+//! |------|-------------|
+//! | `coral_out/amplicon*_graph.txt` | Breakpoint graph per amplicon |
+//! | `coral_out/amplicon*_cycles.txt` | Cycle decomposition per amplicon |
+//!
+//! ## Implemented traits
+//!
+//! - [`Initialize`] — Directory creation, SAVANA → CoRAL BED conversion
+//! - [`ShouldRun`] — Skip if output directory already contains graph files
+//! - [`JobCommand`] — CoRAL CLI command string
+//! - [`LocalRunner`] — Local execution (conda env)
+//! - [`SbatchRunner`] — SLURM submission
+//! - [`Run`] — Unified execution via `run!` macro
+
+use std::fs;
+use std::path::Path;
+
+use anyhow::Context;
+use log::debug;
+use tracing::info;
+
+use crate::callers::savana::SavanaCN;
+use crate::commands::{Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner};
+use crate::config::Config;
+use crate::pipes::{Initialize, ShouldRun};
+use crate::run;
+use crate::runners::Run;
+
+// ── CoRALSeed ─────────────────────────────────────────────────────────────────
+
+/// CoRAL seed interval identification.
+///
+/// Runs `coral seed` to identify focally amplified genomic intervals
+/// from SAVANA absolute CN segments. Produces a `*_CNV_SEEDS.bed` file
+/// consumed by [`CoRALReconstruct`].
+#[derive(Debug)]
+pub struct CoRALSeed {
+    pub id: String,
+    pub config: Config,
+    pub cn_segs_bed: String,
+    pub output_dir: String,
+    pub log_dir: String,
+}
+
+impl Initialize for CoRALSeed {
+    fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
+        let id = id.to_string();
+        let work_dir = format!("{}/{}/coral/work", config.result_dir, id);
+        let output_dir = format!("{}/{}/coral/seed", config.result_dir, id);
+        let log_dir = format!("{}/{}/log/coral", config.result_dir, id);
+        let cn_segs_bed = format!("{work_dir}/cn_segs.bed");
+
+        for dir in [&work_dir, &output_dir, &log_dir] {
+            fs::create_dir_all(dir)
+                .with_context(|| format!("Failed to create directory: {dir}"))?;
+        }
+
+        // Write SAVANA absolute CN → CoRAL cn_segs BED (no purity correction)
+        let cn = SavanaCN::parse_file(&id, config)
+            .with_context(|| format!("Failed to parse SAVANA CN segments for {id}"))?;
+        cn.write_coral_cn_segs(Path::new(&cn_segs_bed))?;
+
+        Ok(CoRALSeed {
+            id,
+            config: config.clone(),
+            cn_segs_bed,
+            output_dir,
+            log_dir,
+        })
+    }
+}
+
+impl ShouldRun for CoRALSeed {
+    /// Skip if the seeds BED already exists.
+    fn should_run(&self) -> bool {
+        !Path::new(&self.seeds_bed()).exists()
+    }
+}
+
+impl CoRALSeed {
+    /// Path of the seeds BED produced by `coral seed`.
+    pub fn seeds_bed(&self) -> String {
+        format!("{}/seeds_CNV_SEEDS.bed", self.output_dir)
+    }
+}
+
+impl JobCommand for CoRALSeed {
+    fn cmd(&self) -> String {
+        format!(
+            "source {conda_sh} && conda activate coral_env && \
+                coral seed \
+                --cn_segs         {cn_segs} \
+                --gain            {gain} \
+                --min-seed-size   {min_seed_size} \
+                --max-seg-gap     {max_seg_gap} \
+                --output-prefix   {}/seeds",
+            self.output_dir,
+            conda_sh      = self.config.conda_sh,
+            cn_segs       = self.cn_segs_bed,
+            gain          = self.config.coral_seed_gain,
+            min_seed_size = self.config.coral_min_seed_size,
+            max_seg_gap   = self.config.coral_max_seg_gap,
+        )
+    }
+}
+
+impl LocalRunner for CoRALSeed {}
+impl SbatchRunner for CoRALSeed {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some(format!("coral_seed_{}", self.id)),
+            partition: Some(self.config.coral_slurm_partition.clone()),
+            cpus_per_task: Some(1), // seed step is lightweight
+            mem: Some("8G".into()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for CoRALSeed {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl Run for CoRALSeed {
+    fn run(&mut self) -> anyhow::Result<()> {
+        if !self.should_run() {
+            debug!("CoRAL seed already exists for {}; skipping.", self.id);
+            return Ok(());
+        }
+
+        let report = run!(self.config, self)
+            .with_context(|| format!("CoRAL seed failed for: {}", self.id))?;
+
+        let log_file = format!("{}/coral_seed", self.log_dir);
+        report
+            .save_to_file(&log_file)
+            .with_context(|| format!("Failed to write CoRAL seed log: {log_file}"))?;
+
+        // Guard: bail if no seeds were produced (no focal amplifications)
+        anyhow::ensure!(
+            Path::new(&self.seeds_bed()).exists(),
+            "CoRAL seed produced no output for {} — no focal amplifications detected \
+             at gain={:.1}, min-seed-size={}",
+            self.id,
+            self.config.coral_seed_gain,
+            self.config.coral_min_seed_size,
+        );
+
+        Ok(())
+    }
+}
+
+// ── CoRALReconstruct ──────────────────────────────────────────────────────────
+
+/// CoRAL focal amplification reconstruction.
+///
+/// Runs `coral reconstruct` from seed intervals produced by [`CoRALSeed`],
+/// building a breakpoint graph and extracting ecDNA cycle candidates.
+///
+/// Output files per amplicon:
+/// - `amplicon*_graph.txt`  — breakpoint graph
+/// - `amplicon*_cycles.txt` — cycle decomposition
+#[derive(Debug)]
+pub struct CoRALReconstruct {
+    pub id: String,
+    pub config: Config,
+    pub tumour_bam: String,
+    pub cn_segs_bed: String,
+    pub seeds_bed: String,
+    pub output_dir: String,
+    pub log_dir: String,
+}
+
+impl Initialize for CoRALReconstruct {
+    fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
+        let seed = CoRALSeed::initialize(id, config)?;
+
+        // seeds_bed must exist — CoRALSeed::run() must have completed first
+        anyhow::ensure!(
+            Path::new(&seed.seeds_bed()).exists(),
+            "CoRAL seeds BED not found for {id}: {}. \
+             Run CoRALSeed first.",
+            seed.seeds_bed()
+        );
+
+        let output_dir = format!("{}/{}/coral/reconstruct", config.result_dir, id);
+        fs::create_dir_all(&output_dir)
+            .with_context(|| format!("Failed to create reconstruct dir: {output_dir}"))?;
+
+        Ok(CoRALReconstruct {
+            id: id.to_string(),
+            config: config.clone(),
+            tumour_bam: config.tumoral_bam(id),
+            cn_segs_bed: seed.cn_segs_bed.clone(),
+            seeds_bed: seed.seeds_bed(),
+            output_dir,
+            log_dir: seed.log_dir,
+        })
+    }
+}
+
+impl ShouldRun for CoRALReconstruct {
+    /// Skip if at least one graph file already exists in the output directory.
+    fn should_run(&self) -> bool {
+        let out = Path::new(&self.output_dir);
+        if !out.exists() {
+            return true;
+        }
+        let has_graph = fs::read_dir(out)
+            .map(|entries| {
+                entries
+                    .filter_map(|e| e.ok())
+                    .any(|e| e.file_name().to_string_lossy().ends_with("_graph.txt"))
+            })
+            .unwrap_or(true);
+        !has_graph
+    }
+}
+
+impl JobCommand for CoRALReconstruct {
+    fn cmd(&self) -> String {
+        format!(
+            "source {conda_sh} && conda activate coral_env && \
+                coral reconstruct \
+                --lr-bam          {bam} \
+                --cnv-seed        {seeds} \
+                --cn_segs         {cn_segs} \
+                --ref_genome      {reference} \
+                --output-prefix   {}/amplicon \
+                --threads         {}",
+            self.output_dir,
+            self.config.coral_threads,
+            conda_sh  = self.config.conda_sh,
+            bam       = self.tumour_bam,
+            seeds     = self.seeds_bed,
+            cn_segs   = self.cn_segs_bed,
+            reference = self.config.reference,
+        )
+    }
+}
+
+impl LocalRunner for CoRALReconstruct {}
+impl SbatchRunner for CoRALReconstruct {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some(format!("coral_reconstruct_{}", self.id)),
+            partition: Some(self.config.coral_slurm_partition.clone()),
+            cpus_per_task: Some(self.config.coral_threads.into()),
+            mem: Some(self.config.coral_slurm_mem.clone()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for CoRALReconstruct {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl Run for CoRALReconstruct {
+    fn run(&mut self) -> anyhow::Result<()> {
+        if !self.should_run() {
+            debug!("CoRAL reconstruct already done for {}; skipping.", self.id);
+            return Ok(());
+        }
+
+        let report = run!(self.config, self)
+            .with_context(|| format!("CoRAL reconstruct failed for: {}", self.id))?;
+
+        let log_file = format!("{}/coral_reconstruct", self.log_dir);
+        report
+            .save_to_file(&log_file)
+            .with_context(|| format!("Failed to write CoRAL reconstruct log: {log_file}"))?;
+
+        Ok(())
+    }
+}
+
+// ── Top-level pipe ────────────────────────────────────────────────────────────
+
+/// Runs the full CoRAL pipeline: seed identification → amplicon reconstruction.
+///
+/// Steps:
+/// 1. [`CoRALSeed`]        — identify focally amplified seed intervals
+/// 2. [`CoRALReconstruct`] — build breakpoint graph and extract ecDNA cycles
+///
+/// Returns `Ok(())` early (without error) if the seed step finds no amplified
+/// intervals, since absence of focal amplifications is a valid biological result.
+///
+/// # Arguments
+/// * `id`     - Sample identifier
+/// * `config` - Pipeline configuration
+pub fn run_coral(id: &str, config: &Config) -> anyhow::Result<()> {
+    info!("CoRAL pipeline starting for {id}");
+
+    // Step 1: seed
+    let mut seed = CoRALSeed::initialize(id, config)
+        .with_context(|| format!("CoRALSeed::initialize failed for {id}"))?;
+
+    match seed.run() {
+        Ok(()) => {}
+        Err(e) if e.to_string().contains("no focal amplifications") => {
+            info!("CoRAL: no focal amplifications for {id}; pipeline skipped.");
+            return Ok(());
+        }
+        Err(e) => return Err(e),
+    }
+
+    // Step 2: reconstruct
+    let mut reconstruct = CoRALReconstruct::initialize(id, config)
+        .with_context(|| format!("CoRALReconstruct::initialize failed for {id}"))?;
+
+    reconstruct
+        .run()
+        .with_context(|| format!("CoRALReconstruct::run failed for {id}"))?;
+
+    info!("CoRAL pipeline complete for {id}");
+    Ok(())
+}

+ 14 - 3
src/callers/deep_somatic.rs

@@ -498,12 +498,23 @@ impl Version for DeepSomatic {
 }
 
 fn parse_deepsomatic_version(output: &str) -> anyhow::Result<String> {
-    let re = Regex::new(r"(?m)DeepVariant version\s+([^\s]+)").expect("Version regex is valid");
+    let deepsomatic_re =
+        Regex::new(r"(?mi)\bdeepsomatic(?:\s+version)?\s*[:=]?\s*([^\s]+)")
+            .expect("DeepSomatic version regex is valid");
+    if let Some(caps) = deepsomatic_re.captures(output) {
+        return Ok(caps
+            .get(1)
+            .expect("Regex has capture group 1")
+            .as_str()
+            .to_string());
+    }
 
-    let caps = re
+    // Fallback for containers that only expose the embedded DeepVariant version.
+    let deepvariant_re =
+        Regex::new(r"(?m)DeepVariant version\s+([^\s]+)").expect("DeepVariant regex is valid");
+    let caps = deepvariant_re
         .captures(output)
         .context("Could not parse DeepSomatic version from output")?;
-
     Ok(caps
         .get(1)
         .expect("Regex has capture group 1")

+ 3 - 1
src/callers/mod.rs

@@ -156,6 +156,7 @@ pub mod nanomonsv;
 pub mod savana;
 pub mod severus;
 pub mod straglr;
+pub mod coral;
 
 /// Runs all somatic variant callers sequentially for comprehensive multi-caller analysis.
 ///
@@ -269,7 +270,8 @@ pub fn run_somatic_callers(id: &str, config: &Config) -> anyhow::Result<()> {
         ];
 
         for h in handles {
-            h.join().expect("thread panicked")?;
+            h.join()
+                .map_err(|_| anyhow::anyhow!("somatic caller thread panicked"))??;
         }
     } else {
         Severus::initialize(id, config)?.run()?;

+ 2 - 3
src/callers/nanomonsv.rs

@@ -705,9 +705,8 @@ fn somatic_parse(
     config: &Config,
     log_dir: &str,
 ) -> anyhow::Result<()> {
-    let tumoral_out_prefix = format!("{diag_out_dir}/{id}_diag");
-    let norm_out_prefix = format!("{norm_out_dir}/{id}_norm"); // TODO: will break if norm
-                                                               // name change
+    let tumoral_out_prefix = format!("{diag_out_dir}/{id}_{}", config.tumoral_name);
+    let norm_out_prefix = format!("{norm_out_dir}/{id}_{}", config.normal_name);
 
     let diag_info_vcf = format!("{tumoral_out_prefix}.bp_info.sorted.bed.gz");
     let norm_info_vcf = format!("{norm_out_prefix}.bp_info.sorted.bed.gz");

+ 135 - 12
src/callers/savana.rs

@@ -97,9 +97,10 @@ use itertools::Itertools;
 use log::{debug, info, warn};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
+use std::io::Write;
 use std::{
-    fs::{self},
-    io::BufRead,
+    fs::{self, File},
+    io::{BufRead, BufWriter},
     path::Path,
     str::FromStr,
 };
@@ -291,8 +292,8 @@ impl Run for Savana {
             }
 
             if !Path::new(&format!("{tumoral_hp_bam}.bai")).exists() {
-                let mut normal_index = SamtoolsIndex::from_config(&self.config, &normal_hp_bam);
-                normal_index.run()?;
+                let mut tumoral_index = SamtoolsIndex::from_config(&self.config, &tumoral_hp_bam);
+                tumoral_index.run()?;
             }
 
             self.job_args = vec![
@@ -820,15 +821,13 @@ impl SavanaCN {
                     as u64,
                 weight: parse_field(parts[6], "weight", line_num)?,
                 copy_number: parse_field(parts[7], "copy_number", line_num)?,
-                minor_allele_copy_number: if parts[8].is_empty() {
-                    None
-                } else {
-                    Some(parse_field(parts[8], "minor_allele_copy_number", line_num)?)
+                minor_allele_copy_number: match parts[8] {
+                    "" | "NA" => None,
+                    v => Some(parse_field(v, "minor_allele_copy_number", line_num)?),
                 },
-                mean_baf: if parts[9].is_empty() {
-                    None
-                } else {
-                    Some(parse_field(parts[9], "mean_baf", line_num)?)
+                mean_baf: match parts[9] {
+                    "" | "NA" => None,
+                    v => Some(parse_field(v, "mean_baf", line_num)?),
                 },
                 no_het_snps: if parts[10].is_empty() {
                     0
@@ -842,6 +841,130 @@ impl SavanaCN {
 
         Ok(Self { segments })
     }
+
+    /// Writes CoRAL `--cn_segs` BED: all segments with absolute CN.
+    ///
+    /// Format: `chrom\tstart\tend\tcopy_number`
+    pub fn write_coral_cn_segs(&self, path: &Path) -> anyhow::Result<()> {
+        let mut w = BufWriter::new(
+            File::create(path)
+                .with_context(|| format!("Cannot create cn_segs BED: {}", path.display()))?,
+        );
+        for seg in &self.segments {
+            writeln!(
+                w,
+                "{}\t{}\t{}\t{:.4}",
+                seg.chromosome, seg.start, seg.end, seg.copy_number
+            )?;
+        }
+        Ok(())
+    }
+
+    /// Writes CoRAL `--seeds` BED: amplified segments only, centromeres excluded.
+    ///
+    /// Uses a purity-aware threshold so that tumour CN dilution by normal
+    /// contamination is accounted for:
+    ///
+    /// ```text
+    /// fitted_CN = purity * tumour_CN + (1 - purity) * ploidy
+    /// seed_threshold = purity * min_tumour_cn + (1 - purity) * ploidy
+    /// ```
+    ///
+    /// Centromeric segments (stain `acen` in the cytoband BED) are excluded
+    /// regardless of copy number, as they produce artifactual high-depth signal
+    /// due to repetitive satellite sequences.
+    ///
+    /// # Arguments
+    /// * `path` - Output BED path
+    /// * `purity` - Tumour purity (0..1) from `_fitted_purity_ploidy.tsv`
+    /// * `ploidy` - Tumour ploidy from `_fitted_purity_ploidy.tsv`
+    /// * `min_tumour_cn` - Minimum tumour CN to qualify as a seed (typically `4 * ploidy`)
+    /// * `centromeres` - Parsed centromere intervals from `parse_centromere_intervals()`
+    pub fn write_coral_seeds(
+        &self,
+        path: &Path,
+        purity: f64,
+        ploidy: f64,
+        min_tumour_cn: f64,
+        centromeres: &[(String, u64, u64)],
+    ) -> anyhow::Result<()> {
+        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
+        let mut w = BufWriter::new(
+            File::create(path)
+                .with_context(|| format!("Cannot create seeds BED: {}", path.display()))?,
+        );
+        for seg in &self.segments {
+            if seg.copy_number >= threshold
+                && !overlaps_centromere(&seg.chromosome, seg.start, seg.end, centromeres)
+            {
+                writeln!(w, "{}\t{}\t{}", seg.chromosome, seg.start, seg.end)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns the number of amplified non-centromeric seed segments.
+    pub fn n_seeds(
+        &self,
+        purity: f64,
+        ploidy: f64,
+        min_tumour_cn: f64,
+        centromeres: &[(String, u64, u64)],
+    ) -> usize {
+        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
+        self.segments
+            .iter()
+            .filter(|s| {
+                s.copy_number >= threshold
+                    && !overlaps_centromere(&s.chromosome, s.start, s.end, centromeres)
+            })
+            .count()
+    }
+}
+
+/// Returns true if [seg_start, seg_end) overlaps any centromeric interval
+/// on the same chromosome.
+fn overlaps_centromere(
+    chrom: &str,
+    start: u64,
+    end: u64,
+    centromeres: &[(String, u64, u64)],
+) -> bool {
+    centromeres
+        .iter()
+        .any(|(c, cs, ce)| c == chrom && start < *ce && end > *cs)
+}
+
+/// Reads SAVANA fitted_purity_ploidy.tsv.
+/// Expected format:
+///   purity  ploidy  distance  rank
+///   0.48    1.98    0.105     1
+pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
+    let content = std::fs::read_to_string(path)?;
+    let mut lines = content.lines();
+
+    // Skip header
+    lines.next().context("purity/ploidy file is empty")?;
+
+    let data = lines.next().context("purity/ploidy file has no data row")?;
+    let fields: Vec<&str> = data.split('\t').collect();
+
+    anyhow::ensure!(
+        fields.len() >= 2,
+        "expected at least 2 columns, got {}",
+        fields.len()
+    );
+
+    let purity = fields[0]
+        .trim()
+        .parse::<f64>()
+        .with_context(|| format!("cannot parse purity: '{}'", fields[0]))?;
+    let ploidy = fields[1]
+        .trim()
+        .parse::<f64>()
+        .with_context(|| format!("cannot parse ploidy: '{}'", fields[1]))?;
+
+    Ok((purity, ploidy))
 }
 
 #[cfg(test)]

+ 120 - 1
src/config.rs

@@ -223,7 +223,7 @@ pub struct Config {
     /// Typically forwarded to:
     ///   - `--native-pair-hmm-threads`
     ///   - `--reader-threads`
-    /// Should match available cores on the node.
+    ///     Should match available cores on the node.
     pub gatk_threads: usize, // e.g. 32
 
     /// Local single-run memory limit in GB.
@@ -282,6 +282,95 @@ pub struct Config {
     /// Template for constitutional phased VCF (`{result_dir}`, `{id}`).
     pub germline_phased_vcf: String,
 
+    // === CoRAL ===
+    /// Number of CPU threads for the CoRAL reconstruction job.
+    ///
+    /// CoRAL is CPU-bound during breakpoint graph construction and quadratic
+    /// programming cycle extraction. 8–16 threads is sufficient for most
+    /// focal amplification cases; increase for highly complex ecDNA with
+    /// many amplicons.
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_threads = 16
+    /// ```
+    pub coral_threads: u8,
+
+    /// Memory allocation for the CoRAL SLURM job (e.g. `"32G"`).
+    ///
+    /// Memory usage scales with amplicon complexity and BAM depth.
+    /// 32G is sufficient for typical WGS at 30–60×; increase to 64G
+    /// for highly rearranged genomes (chromothripsis, high ecDNA copy number).
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_slurm_mem = "32G"
+    /// ```
+    pub coral_slurm_mem: String,
+
+    /// SLURM partition to use for CoRAL jobs.
+    ///
+    /// CoRAL requires only CPU — do not submit to a GPU partition.
+    /// Use the same partition as other CPU-bound callers (e.g. Severus, NanomonSV).
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_slurm_partition = "cpuq"
+    /// ```
+    pub coral_slurm_partition: String,
+
+    /// Minimum copy number gain threshold for a segment to be considered
+    /// a focal amplification seed (CoRAL `--gain`).
+    ///
+    /// CoRAL applies this threshold to the raw absolute CN values from the
+    /// cn_segs BED — do NOT pre-correct for purity or ploidy, as this may
+    /// cause entire chromosome arms to exceed the threshold in aneuploid tumours.
+    ///
+    /// Default in CoRAL is 6.0 (diploid assumption). For hyperdiploid tumours
+    /// (e.g. hyperploid ALL, CML blast crisis) consider lowering to 4.0–5.0.
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_seed_gain = 6.0
+    /// ```
+    pub coral_seed_gain: f64,
+
+    /// Minimum size in base pairs for a CN segment to qualify as a seed
+    /// (CoRAL `--min-seed-size`).
+    ///
+    /// Segments below this size are discarded even if they exceed `coral_seed_gain`.
+    /// Two merged proximal segments (see `coral_max_seg_gap`) are evaluated
+    /// against this threshold as a single combined interval.
+    ///
+    /// Default in CoRAL is 100000 (100 kb). Reducing this risks including
+    /// artefactual short high-copy segments; increasing it misses small focal
+    /// amplifications (e.g. narrow EGFR or MYC peaks).
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_min_seed_size = 100000
+    /// ```
+    pub coral_min_seed_size: u32,
+
+    /// Maximum gap in base pairs between two proximal CN segments to allow
+    /// merging into a single seed candidate (CoRAL `--max-seg-gap`).
+    ///
+    /// If two amplified segments are separated by a gap smaller than this value,
+    /// they are merged before the `coral_min_seed_size` filter is applied.
+    /// This handles cases where a single focal amplicon is split by a low-coverage
+    /// or diploid bin.
+    ///
+    /// Default in CoRAL is 300000 (300 kb). For haematological cancers with
+    /// compact focal amplifications (e.g. NUP214::ABL1, ABL1 amplification in
+    /// CML blast crisis) a tighter value such as 100000 reduces spurious merging
+    /// of adjacent independent amplicons.
+    ///
+    /// # TOML
+    /// ```toml
+    /// coral_max_seg_gap = 100000
+    /// ```
+    pub coral_max_seg_gap: u32,
+
     // === Severus configuration ===
     /// Path to Severus main script (`severus.py`).
     pub severus_bin: String,
@@ -927,6 +1016,36 @@ impl Config {
             .replace("{haplotagged_bam_tag_name}", &self.haplotagged_bam_tag_name)
     }
 
+    /// CoRAl
+    pub fn coral_output_dir(&self, id: &str) -> String {
+        format!(
+            "{}/{}/{}/coral/output",
+            self.result_dir, id, self.tumoral_name
+        )
+    }
+
+    pub fn coral_work_dir(&self, id: &str) -> String {
+        format!(
+            "{}/{}/{}/coral/work",
+            self.result_dir, id, self.tumoral_name
+        )
+    }
+
+    pub fn savana_fitted_purity_ploidy(&self, id: &str) -> String {
+        // adjust the glob pattern to match your actual SAVANA output naming
+        format!(
+            "{}/{}/{}/savana/{}_*_fitted_purity_ploidy.tsv",
+            self.result_dir, id, self.tumoral_name, id
+        )
+    }
+
+    pub fn savana_segmented_absolute_cn(&self, id: &str) -> String {
+        format!(
+            "{}/{}/{}/savana/{}_*_segmented_absolute_copy_number.tsv",
+            self.result_dir, id, self.tumoral_name, id
+        )
+    }
+
     /// Severus paired output directory.
     pub fn severus_output_dir(&self, id: &str) -> String {
         self.severus_output_dir

+ 33 - 0
src/io/bed.rs

@@ -372,6 +372,39 @@ pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::R
     Ok(())
 }
 
+/// Returns a set of (chrom, start, end) intervals for centromeric regions
+/// parsed from a UCSC-format cytoband BED.
+///
+/// Centromeres are identified by the `acen` stain field (column 4).
+///
+/// Format: `chrom\tstart\tend\tband_name\tstain`
+pub fn parse_centromere_intervals(path: &str) -> anyhow::Result<Vec<(String, u64, u64)>> {
+    let reader = BufReader::new(get_reader(path)
+        .with_context(|| format!("Cannot open cytobands BED: {path}"))?);
+
+    let mut intervals = Vec::new();
+    for line in reader.lines() {
+        let line = line?;
+        if line.starts_with('#') || line.is_empty() {
+            continue;
+        }
+        let f: Vec<&str> = line.split('\t').collect();
+        anyhow::ensure!(
+            f.len() >= 5,
+            "Expected 5 columns in cytoband BED, got {}: {line}",
+            f.len()
+        );
+        if f[4].trim() == "acen" {
+            intervals.push((
+                f[0].to_string(),
+                f[1].parse::<u64>().with_context(|| format!("start: {}", f[1]))?,
+                f[2].parse::<u64>().with_context(|| format!("end: {}", f[2]))?,
+            ));
+        }
+    }
+    Ok(intervals)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

+ 10 - 2
src/variant/variants_stats.rs

@@ -620,9 +620,17 @@ pub fn somatic_depth_quality_ranges(
                 match (n_ok, t_ok) {
                     (false, false) => break,
                     (true, false) => {
-                        anyhow::bail!("{} has extra lines at {}", normal_path, line_no)
+                        anyhow::bail!(
+                            "{normal_path} has extra lines at {line_no}; last normal record = {:?}",
+                            String::from_utf8_lossy(n_rec.as_slice())
+                        )
+                    }
+                    (false, true) => {
+                        anyhow::bail!(
+                            "{tumor_path} has extra lines at {line_no}; last tumor record = {:?}",
+                            String::from_utf8_lossy(t_rec.as_slice())
+                        )
                     }
-                    (false, true) => anyhow::bail!("{} has extra lines at {}", tumor_path, line_no),
                     (true, true) => {
                         let (n_start, n_depths, n_lowq) =
                             parse_bin_record_into(&n_rec, &mut n_buf, &contig)