//! # Savana Haplotype-Aware Somatic Variant Caller
//!
//! This module provides wrappers for [Savana](https://github.com/cortes-ciriano-lab/savana),
//! a haplotype-aware somatic variant caller for structural variants and copy number alterations
//! from long-read sequencing data.
//!
//! ## Overview
//!
//! Savana detects:
//! - Structural variants (SVs): deletions, duplications, inversions, translocations
//! - Copy number variations (CNVs) with allele-specific information
//! - Complex genomic rearrangements
//!
//! ## Key Features
//!
//! - **Haplotype-aware calling** - Uses phased germline variants and haplotagged BAMs
//! - **Integrated phasing** - Automatically runs LongPhase if needed
//! - **CNV segmentation** - Provides detailed copy number profiles
//!
//! ## Requirements
//!
//! Before running Savana, ensure:
//! - Tumor and normal BAMs are indexed
//! - Reference genome is accessible
//! - Conda environment with Savana is configured (environment name: `savana_env`)
//!
//! ## Output Files
//!
//! PASS-filtered somatic variants:
//! ```text
//! {result_dir}/{id}/savana/somatic_sv_PASS.vcf.gz
//! ```
//!
//! Copy number segmentation:
//! ```text
//! {result_dir}/{id}/savana/copy_number_segmentation.txt.gz
//! ```
//!
//! Read count bins:
//! ```text
//! {result_dir}/{id}/savana/read_counts.txt.gz
//! ```
//!
//! ## Usage
//!
//! ```ignore
//! use pandora_lib_promethion::callers::savana::{Savana, SavanaCN};
//! use pandora_lib_promethion::config::Config;
//! use pandora_lib_promethion::pipes::{Initialize, ShouldRun};
//! use pandora_lib_promethion::runners::Run;
//!
//! let config = Config::default();
//! let mut caller = Savana::initialize("sample_001", &config)?;
//!
//! if caller.should_run() {
//!     caller.run()?; // Automatically handles phasing and haplotagging
//! }
//!
//! // Load SV/CNV variants (`annotations` is the shared annotation registry)
//! let variants = caller.variants(&annotations)?;
//! println!("Found {} somatic SVs/CNVs", variants.variants.len());
//!
//! // Load copy number data
//! let cn_data = SavanaCN::parse_file("sample_001", &config)?;
//! println!("Copy number segments: {}", cn_data.segments.len());
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ## References
//!
//! - [Savana GitHub](https://github.com/cortes-ciriano-lab/savana)
//! - [Savana Paper](https://doi.org/10.1038/s41467-022-34590-2)

use crate::{
    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
    collection::vcf::Vcf,
    commands::{
        bcftools::BcftoolsKeepPass,
        longphase::{LongphaseHap, LongphasePhase},
        samtools::SamtoolsIndex,
        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams,
        SlurmRunner,
    },
    config::Config,
    helpers::{is_file_older, remove_dir_if_exists},
    io::{readers::get_gz_reader, vcf::read_vcf},
    locker::SampleLock,
    pipes::{Initialize, InitializeSolo, ShouldRun, Version},
    positions::{num_to_contig, GenomeRange},
    run,
    runners::Run,
    variant::{
        variant_collection::VariantCollection,
        vcf_variant::{Label, Variants},
    },
};
use anyhow::Context;
use itertools::Itertools;
use log::{debug, info, warn};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::io::Write;
use std::{
    fs::{self, File},
    io::{BufRead, BufWriter},
    path::Path,
    str::FromStr,
};

/// Savana haplotype-aware somatic SV and CNV caller.
///
/// Orchestrates the Savana pipeline, including the prerequisite phasing and haplotagging steps.
/// Automatically invokes LongPhase if the required phased VCFs or haplotagged BAMs are missing.
///
/// # Fields
///
/// - `id` - Sample identifier (e.g., "34528")
/// - `config` - Global pipeline configuration
/// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/savana")
/// - `job_args` - Internal command-line arguments passed to Savana (populated in `run()`)
#[derive(Debug)]
pub struct Savana {
    /// Sample identifier
    pub id: String,
    /// Global pipeline configuration
    pub config: Config,
    /// Directory for log file storage
    pub log_dir: String,
    /// Command-line arguments for the Savana executable (populated during `run()`)
    job_args: Vec<String>,
}

impl Initialize for Savana {
    /// Initializes a new `Savana` instance for a given sample ID and configuration.
    ///
    /// If `savana_force` is enabled in [`Config`], any existing output directory is
    /// removed so that Savana re-runs from scratch.
    ///
    /// # Arguments
    ///
    /// * `id` - Sample identifier
    /// * `config` - Pipeline execution configuration
    ///
    /// # Returns
    ///
    /// A new `Savana` instance, or an error if cleanup fails.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config.savana_force` is true and the output directory cannot be removed
    /// - Directory deletion fails due to permissions or I/O errors
    fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
        info!("Initialize Savana for {id}.");
        let log_dir = format!("{}/{}/log/savana", config.result_dir, id);
        let savana = Self {
            id: id.to_string(),
            config: config.clone(),
            log_dir,
            job_args: Vec::new(),
        };

        // If a forced re-run is requested, remove the old output directory
        if savana.config.savana_force {
            remove_dir_if_exists(&savana.config.savana_output_dir(id))?;
        }

        Ok(savana)
    }
}

impl ShouldRun for Savana {
    /// Determines whether Savana should be re-run based on whether
    /// the filtered PASS VCF is older than the input BAMs.
    ///
    /// If either input BAM (normal or tumor) is newer than the PASS VCF,
    /// Savana is considered out of date and should be re-executed.
    ///
    /// # Returns
    /// `true` if an update is needed, or if timestamps cannot be checked (e.g., the file does not exist)
    fn should_run(&self) -> bool {
        let passed_vcf = &self.config.savana_passed_vcf(&self.id);
        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true)
            .unwrap_or(true)
            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
        if result {
            warn!("Savana should run for id: {}.", self.id);
        }
        result
    }
}

impl JobCommand for Savana {
    /// Renders the shell command that activates the Savana conda environment and
    /// invokes the Savana executable with the arguments assembled in `run()`.
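    ///
    /// A sketch of the rendered command, assuming illustrative `Config` paths
    /// (`conda_sh`, `savana_bin`) and the arguments set by `run()`:
    ///
    /// ```text
    /// source /path/to/conda.sh && conda activate savana_env && savana \
    ///     --tumour tumor.hp.bam --normal normal.hp.bam --outdir .../savana \
    ///     --ref ref.fa --snp_vcf phased.vcf.gz --no_blacklist \
    ///     --threads 16 --cna_threads 16
    /// ```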
    fn cmd(&self) -> String {
        format!(
            "source {conda_sh} && conda activate savana_env && {bin} {args}",
            conda_sh = self.config.conda_sh,
            bin = self.config.savana_bin,
            args = self.job_args.join(" ")
        )
    }
}

impl LocalRunner for Savana {}

impl SlurmRunner for Savana {
    fn slurm_args(&self) -> Vec<String> {
        SlurmParams {
            job_name: Some(format!("savana_{}", self.id)),
            cpus_per_task: Some(self.config.savana_threads as u32),
            mem: Some(format!("{}G", self.config.savana_mem)),
            partition: Some("mediumq".into()),
            gres: None,
        }
        .to_args()
    }
}

impl SbatchRunner for Savana {
    fn slurm_params(&self) -> SlurmParams {
        SlurmParams {
            job_name: Some(format!("savana_{}", self.id)),
            cpus_per_task: Some(self.config.savana_threads as u32),
            mem: Some(format!("{}G", self.config.savana_mem)),
            partition: Some("mediumq".into()),
            gres: None,
        }
    }
}

impl Run for Savana {
    /// Executes the Savana pipeline, including the prerequisite phasing and haplotagging steps.
    ///
    /// If the output VCF does not already exist, it ensures that the phased VCF and
    /// haplotagged BAMs exist (running the corresponding modules if necessary), then runs
    /// Savana. Finally, it filters the resulting VCF to retain only PASS variants.
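    ///
    /// The overall flow, as an illustrative sketch:
    ///
    /// ```text
    /// acquire sample lock
    ///   -> ensure phased germline VCF (LongphasePhase, if missing)
    ///   -> ensure haplotagged BAMs    (LongphaseHap + SamtoolsIndex, if missing)
    ///   -> run savana                 (--tumour/--normal/--snp_vcf/...)
    ///   -> keep PASS records          (BcftoolsKeepPass -> savana_passed_vcf)
    /// ```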
    ///
    /// # Returns
    ///
    /// `Ok(())` if the run completes successfully, or an error otherwise.
    fn run(&mut self) -> anyhow::Result<()> {
        let lock_dir = format!("{}/locks", self.config.result_dir);
        let _lock = SampleLock::acquire(&lock_dir, &self.id, "savana")
            .with_context(|| format!("Cannot acquire Savana lock for {}", self.id))?;

        if !self.should_run() {
            info!("Savana is up-to-date.");
            return Ok(());
        }

        let output_vcf = &self.config.savana_output_vcf(&self.id);
        if !Path::new(&output_vcf).exists() {
            let output_dir = self.config.savana_output_dir(&self.id);
            fs::create_dir_all(&output_dir).with_context(|| {
                format!("Failed to create output dir for Savana run: {output_dir}")
            })?;

            // Ensure the phased germline VCF exists.
            // No longer strictly required by Savana >= 1.3.0, but still passed via --snp_vcf.
            let phased_germline_vcf = self.config.germline_phased_vcf(&self.id);
            if !Path::new(&phased_germline_vcf).exists() {
                let mut phase = LongphasePhase::initialize(&self.id, &self.config)?;
                run!(&self.config, &mut phase)?;
            } else {
                info!("Phased germline is already up-to-date.");
            }

            let normal_hp_bam = self.config.normal_haplotagged_bam(&self.id);
            let tumoral_hp_bam = self.config.tumoral_haplotagged_bam(&self.id);

            // Ensure the haplotagged BAMs (and their indexes) exist
            if !Path::new(&normal_hp_bam).exists() {
                let mut normal_hap =
                    LongphaseHap::initialize(&self.id, &self.config.normal_name, &self.config)?;
                normal_hap.run()?;
            } else {
                info!("Haplotagged normal BAM is already up-to-date.");
            }
            if !Path::new(&format!("{normal_hp_bam}.bai")).exists() {
                let mut normal_index = SamtoolsIndex::from_config(&self.config, &normal_hp_bam);
                normal_index.run()?;
            }

            if !Path::new(&tumoral_hp_bam).exists() {
                let mut tumoral_hap =
                    LongphaseHap::initialize(&self.id, &self.config.tumoral_name, &self.config)?;
                tumoral_hap.run()?;
            } else {
                info!("Haplotagged tumoral BAM is already up-to-date.");
            }
            if !Path::new(&format!("{tumoral_hp_bam}.bai")).exists() {
                let mut tumoral_index = SamtoolsIndex::from_config(&self.config, &tumoral_hp_bam);
                tumoral_index.run()?;
            }

            self.job_args = vec![
                "--tumour".to_string(),
                tumoral_hp_bam.clone(),
                "--normal".to_string(),
                normal_hp_bam.clone(),
                "--outdir".to_string(),
                self.config.savana_output_dir(&self.id),
                "--ref".to_string(),
                self.config.reference.clone(),
                "--snp_vcf".to_string(),
                phased_germline_vcf.clone(),
                "--no_blacklist".to_string(),
                "--threads".to_string(),
                self.config.savana_threads.to_string(),
                "--cna_threads".to_string(),
                self.config.savana_threads.to_string(),
            ];

            let output =
                run!(&self.config, self).context("Error while running Savana (local or Slurm)")?;
            let log_file = format!("{}/savana_", self.log_dir);
            output
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        } else {
            debug!(
                "Savana output already exists for {}, skipping execution.",
                self.id
            );
        }

        // Keep only PASS variants
        let passed_vcf = self.config.savana_passed_vcf(&self.id);
        if !Path::new(&passed_vcf).exists() && Path::new(output_vcf).exists() {
            let mut keep_pass = BcftoolsKeepPass::from_config(&self.config, output_vcf, &passed_vcf);
            let report = run!(&self.config, &mut keep_pass).context(format!(
                "Error while running bcftools keep PASS for {output_vcf}"
            ))?;
            let log_file = format!("{}/bcftools_pass", self.log_dir);
            report
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        }
        Ok(())
    }
}

impl Version for Savana {
    /// Retrieves the Savana version by running `savana --version` in its conda environment.
    ///
    /// # Errors
    /// Returns an error if command execution fails or `"Version "` is not found in the output.
    fn version(config: &Config) -> anyhow::Result<String> {
        let mut job = Savana {
            id: "version".into(),
            config: config.clone(),
            log_dir: config.tmp_dir.clone(),
            job_args: vec!["--version".to_string()],
        };
        let out =
            run!(&config, &mut job).context("Error while running `savana --version` (local)")?;
        parse_savana_version(&out)
    }
}

/// Extracts the Savana version token from a captured `savana --version` output.
///
/// Concatenates stdout, stderr, and any Slurm epilog, then returns the
/// whitespace-delimited token that follows the literal marker `"Version "`.
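///
/// Illustrative sketch (the exact banner text is assumed, not verified; only the
/// `"Version "` marker is relied upon):
///
/// ```text
/// input : "SAVANA Version 1.3.0"
/// output: "1.3.0"
/// ```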
fn parse_savana_version(out: &CapturedOutput) -> anyhow::Result<String> {
    let mut log = out.stdout.clone();
    if !out.stderr.is_empty() {
        log.push_str(&out.stderr);
    }
    if let Some(epilog) = &out.slurm_epilog {
        log.push_str(&format!("\n{epilog:#?}"));
    }
    let start = log
        .find("Version ")
        .context("Failed to find 'Version ' in the log")?;
    let start_index = start + "Version ".len();
    let end = log[start_index..]
        .find(|c: char| c.is_whitespace())
        .map(|idx| start_index + idx)
        .unwrap_or_else(|| log.len());
    Ok(log[start_index..end].to_string())
}

impl CallerCat for Savana {
    /// Tags the caller identity for downstream variant classification and traceability.
    /// Identifies this tool as the Savana caller producing somatic variants.
    fn caller_cat(&self) -> Annotation {
        Annotation::Callers(Caller::Savana, Sample::Somatic)
    }
}

impl Variants for Savana {
    /// Loads variants from the PASS-filtered VCF produced by Savana and annotates them.
    ///
    /// Reads the VCF file, applies the caller tag via the `Annotations` object, and
    /// wraps the parsed data into a `VariantCollection`.
    ///
    /// # Arguments
    /// * `annotations` - Global annotation registry shared across callers.
    ///
    /// # Returns
    /// A populated `VariantCollection`, or an error if the VCF fails to parse.
    fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
        let caller = self.caller_cat();
        let add = vec![caller.clone()];
        let passed_vcf = self.config.savana_passed_vcf(&self.id);
        info!("Loading variants from {}: {}", caller, &passed_vcf);
        let variants = read_vcf(&passed_vcf)
            .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?;
        variants.par_iter().for_each(|v| {
            annotations.insert_update(v.hash(), &add);
        });
        info!("{}, {} variants loaded.", caller, variants.len());
        Ok(VariantCollection {
            variants,
            vcf: Vcf::new(passed_vcf.clone().into())?,
            caller,
        })
    }
}

impl Label for Savana {
    /// Returns the string label for this caller.
    fn label(&self) -> String {
        self.caller_cat().to_string()
    }
}

/// A row from Savana copy number segmentation output.
///
/// Represents a genomic segment with associated copy number information,
/// including optional allele-specific data.
pub struct SavanaCNRow {
    /// The genomic range (chromosome, start, end) of this segment
    pub range: GenomeRange,
    /// Unique identifier for this segment
    pub segment_id: String,
    /// Number of bins included in this segment
    pub bin_count: u32,
    /// Sum of the lengths of all bins in this segment
    pub sum_of_bin_lengths: u32,
    /// Weight assigned to this segment in the analysis
    pub weight: f32,
    /// Estimated copy number for this segment
    pub copy_number: f32,
    /// Minor allele copy number, if available (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f32>,
    /// Mean B-allele frequency for heterozygous SNPs in this segment
    pub mean_baf: Option<f32>,
    /// Number of heterozygous SNPs in this segment
    pub n_het_snps: u32,
}

impl FromStr for SavanaCNRow {
    type Err = anyhow::Error;

    /// Parses a tab-separated line from Savana copy number output.
    ///
    /// # Format
    /// Expected columns: chromosome, start, end, segment_id, bin_count,
    /// sum_of_bin_lengths, weight, copy_number, minor_allele_copy_number,
    /// mean_baf, n_het_snps
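    ///
    /// Illustrative sketch of an input line (column values are made up):
    ///
    /// ```ignore
    /// let row: SavanaCNRow =
    ///     "chr1\t1\t1000000\tseg_1\t1000\t999999\t0.9\t2.1\t1.0\t0.48\t512".parse()?;
    /// assert_eq!(row.bin_count, 1000);
    /// ```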
    ///
    /// # Errors
    /// Returns an error if any field is missing or cannot be parsed to the expected type.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split('\t').collect();
        let range = GenomeRange::from_1_inclusive_str(
            cells
                .first()
                .context(format!("Error while parsing contig {s}."))?,
            cells
                .get(1)
                .context(format!("Error while parsing start {s}."))?,
            cells
                .get(2)
                .context(format!("Error while parsing end {s}."))?,
        )
        .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;
        Ok(Self {
            range,
            segment_id: cells
                .get(3)
                .context(format!("Error while parsing segment_id {s}."))?
                .to_string(),
            bin_count: cells
                .get(4)
                .context(format!("Error while parsing bin_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
            sum_of_bin_lengths: cells
                .get(5)
                .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
            weight: cells
                .get(6)
                .context(format!("Error while parsing weight {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
            copy_number: cells
                .get(7)
                .context(format!("Error while parsing copy_number {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
            // Allele-specific fields may be absent (e.g., "NA"); unparsable values become None
            minor_allele_copy_number: cells
                .get(8)
                .context(format!("Error while parsing minor_allele_copy_number {s}."))?
                .parse::<f32>()
                .ok(),
            mean_baf: cells
                .get(9)
                .context(format!("Error while parsing mean_baf {s}."))?
                .parse::<f32>()
                .ok(),
            n_het_snps: cells
                .get(10)
                .context(format!("Error while parsing n_het_snps {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
        })
    }
}

/// Container for Savana copy number segmentation data.
///
/// Holds all copy number segments for a sample, loaded from Savana output files.
pub struct SavanaCopyNumber {
    /// Vector of copy number segments
    pub data: Vec<SavanaCNRow>,
}

impl SavanaCopyNumber {
    /// Loads copy number data for a given sample ID.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            // Skip the header line
            if line.starts_with("chromosome") {
                continue;
            }
            let cn: SavanaCNRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }
}

/// A row from Savana read count output.
///
/// Contains read depth information for tumor and normal samples in genomic bins.
pub struct SavanaRCRow {
    /// The genomic range (chromosome, start, end) of this bin
    pub range: GenomeRange,
    /// Percentage of bases in the bin with known sequence (not N)
    pub perc_known_bases: f32,
    /// Whether this bin should be used in analysis
    pub use_bin: bool,
    /// Number of reads from the tumor sample in this bin
    pub tumour_read_count: u32,
    /// Number of reads from the normal sample in this bin
    pub normal_read_count: u32,
}

impl FromStr for SavanaRCRow {
    type Err = anyhow::Error;

    /// Parses a tab-separated line from Savana read count output.
    ///
    /// # Format
    /// Expected columns: bin (format: `chr:start_end`), chromosome, start, end,
    /// perc_known_bases, use_bin, tumour_read_count, normal_read_count
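    ///
    /// Illustrative sketch of an input line (values are made up; the genomic range
    /// is taken from the first `bin` column, not from columns 2-4):
    ///
    /// ```ignore
    /// let row: SavanaRCRow = "chr1:1_1000\tchr1\t1\t1000\t100.0\tTrue\t42\t37".parse()?;
    /// assert!(row.use_bin);
    /// ```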
    ///
    /// # Errors
    /// Returns an error if any field is missing or cannot be parsed.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split('\t').collect();
        let bin = cells
            .first()
            .context(format!("Error while parsing range {s}."))?;
        // The bin column has the form "chr:start_end"
        let range = bin
            .split_once(':')
            .and_then(|(a, b)| {
                b.split_once('_').map(|(b, c)| {
                    GenomeRange::from_1_inclusive_str(a, b, c)
                        .context(format!("Error while parsing range {bin}"))
                })
            })
            .context(format!("Invalid range format: {bin}"))??;

        Ok(Self {
            range,
            perc_known_bases: cells
                .get(4)
                .context(format!("Error while parsing perc_known_bases {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
            use_bin: cells
                .get(5)
                .context(format!("Error while parsing use_bin {s}."))?
                .to_lowercase()
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
            tumour_read_count: cells
                .get(6)
                .context(format!("Error while parsing tumour_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
            normal_read_count: cells
                .get(7)
                .context(format!("Error while parsing normal_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
        })
    }
}

/// Container for Savana read count data.
///
/// Holds read depth information across genomic bins for tumor-normal pairs.
pub struct SavanaReadCounts {
    /// Vector of read count bins
    pub data: Vec<SavanaRCRow>,
}

impl SavanaReadCounts {
    /// Loads read count data for a given sample ID.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_read_counts(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            // Skip the header line
            if line.starts_with("bin") {
                continue;
            }
            let rc: SavanaRCRow = line.parse()?;
            data.push(rc);
        }
        Ok(Self { data })
    }

    /// Calculates the total number of tumor reads across all bins.
    ///
    /// # Returns
    /// Sum of tumor read counts from all bins.
    pub fn n_tumoral_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.tumour_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates the total number of normal reads across all bins.
    ///
    /// # Returns
    /// Sum of normal read counts from all bins.
    pub fn n_normal_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.normal_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates normalized tumor read counts per chromosome.
    ///
    /// Normalizes each chromosome's tumor read count by the count expected if reads
    /// were distributed uniformly across bins, i.e. proportionally to the
    /// chromosome's share of all bins.
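    ///
    /// In formula form, with `n_tum` the total tumor reads, `n_bins` the total
    /// number of bins, and `chr_n_bins` the bins on one chromosome:
    ///
    /// ```text
    /// norm = chr_counts / (chr_n_bins * n_tum / n_bins)
    /// ```
    ///
    /// A value near 1.0 means roughly genome-average tumor coverage for that
    /// chromosome; values well above 1.0 suggest gain.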
    ///
    /// # Returns
    /// Vector of tuples containing (chromosome_name, normalized_count) pairs.
    pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
        let n_tum = self.n_tumoral_reads();
        let n_bins = self.data.len();
        self.data
            .iter()
            .chunk_by(|c| c.range.contig)
            .into_iter()
            .map(|(contig_num, chr_counts)| {
                let chr_counts: Vec<_> = chr_counts.collect();
                let chr_n_bins = chr_counts.len();
                let chr_counts = chr_counts
                    .par_iter()
                    .map(|c| c.tumour_read_count as usize)
                    .sum::<usize>();
                // Observed count divided by the count expected from the bin share
                let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
                (num_to_contig(contig_num), r)
            })
            .collect()
    }
}

/// Savana copy number data with serialization support.
///
/// Alternative representation of copy number segments optimized for
/// serialization and deserialization.
#[derive(Debug, Serialize, Deserialize)]
pub struct SavanaCN {
    /// Vector of copy number segments
    pub segments: Vec<CNSegment>,
}

/// A copy number segment with full metadata.
///
/// Contains all information about a genomic segment including copy number,
/// allelic information, and quality metrics.
#[derive(Debug, Serialize, Deserialize)]
pub struct CNSegment {
    /// Chromosome name
    pub chromosome: String,
    /// Start position (1-based, inclusive)
    pub start: u64,
    /// End position (1-based, inclusive)
    pub end: u64,
    /// Unique segment identifier
    pub segment_id: String,
    /// Number of bins merged into this segment
    pub bin_count: u32,
    /// Total length of all bins in base pairs
    pub sum_of_bin_lengths: u64,
    /// Statistical weight of this segment
    pub weight: f64,
    /// Estimated total copy number
    pub copy_number: f64,
    /// Minor allele copy number (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f64>,
    /// Mean B-allele frequency across heterozygous SNPs
    pub mean_baf: Option<f64>,
    /// Number of heterozygous SNPs in this segment
    pub no_het_snps: u32,
}

impl SavanaCN {
    /// Parses a Savana copy number file into structured segments.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Returns
    /// Parsed copy number data with all segments.
    ///
    /// # Errors
    /// Returns an error if:
    /// * File cannot be opened or read
    /// * Line has an incorrect number of columns (expected 11)
    /// * Any field cannot be parsed to the expected type
    pub fn parse_file(id: &str, config: &Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader =
            get_gz_reader(&path).with_context(|| format!("Error while reading: {path}"))?;

        let mut segments = Vec::new();

        for (index, line) in reader.lines().enumerate() {
            let line = line.context("Failed to read line from file")?;

            // Skip header line
            if index == 0 {
                continue;
            }

            let parts: Vec<&str> = line.split('\t').collect();
            if parts.len() != 11 {
                return Err(anyhow::anyhow!(
                    "Invalid number of columns at line {} (expected 11, got {})",
                    index + 1,
                    parts.len()
                ));
            }

            // Helper closures for parsing with context
            let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
                field
                    .parse()
                    .context(format!("Failed to parse {name} at line {line_num}"))
            };
            let parse_field_u32 =
                |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
                    field
                        .parse()
                        .context(format!("Failed to parse {name} at line {line_num}"))
                };

            let line_num = index + 1;
            let segment = CNSegment {
                chromosome: parts[0].to_string(),
                start: parse_field_u32(parts[1], "start", line_num)? as u64,
                end: parse_field_u32(parts[2], "end", line_num)? as u64,
                segment_id: parts[3].to_string(),
                bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
                sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
                    as u64,
                weight: parse_field(parts[6], "weight", line_num)?,
                copy_number: parse_field(parts[7], "copy_number", line_num)?,
                minor_allele_copy_number: match parts[8] {
                    "" | "NA" => None,
                    v => Some(parse_field(v, "minor_allele_copy_number", line_num)?),
                },
                mean_baf: match parts[9] {
                    "" | "NA" => None,
                    v => Some(parse_field(v, "mean_baf", line_num)?),
                },
                no_het_snps: if parts[10].is_empty() {
                    0
                } else {
                    parse_field_u32(parts[10], "no_het_snps", line_num)?
                },
            };

            segments.push(segment);
        }

        Ok(Self { segments })
    }

    /// Writes a CoRAL `--cn_segs` BED file: all segments with absolute copy number.
    ///
    /// Format: `chrom\tstart\tend\tcopy_number`
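    ///
    /// Illustrative sketch of the emitted lines (coordinates and CN values made up):
    ///
    /// ```text
    /// chr1    1        1000000  2.0134
    /// chr1    1000001  2500000  5.2501
    /// ```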
    pub fn write_coral_cn_segs(&self, path: &Path) -> anyhow::Result<()> {
        let mut w = BufWriter::new(
            File::create(path)
                .with_context(|| format!("Cannot create cn_segs BED: {}", path.display()))?,
        );
        for seg in &self.segments {
            writeln!(
                w,
                "{}\t{}\t{}\t{:.4}",
                seg.chromosome, seg.start, seg.end, seg.copy_number
            )?;
        }
        Ok(())
    }

    /// Writes a CoRAL `--seeds` BED file: amplified segments only, centromeres excluded.
    ///
    /// Uses a purity-aware threshold so that tumour CN dilution by normal
    /// contamination is accounted for:
    ///
    /// ```text
    /// fitted_CN      = purity * tumour_CN + (1 - purity) * ploidy
    /// seed_threshold = purity * min_tumour_cn + (1 - purity) * ploidy
    /// ```
    ///
    /// Centromeric segments (stain `acen` in the cytoband BED) are excluded
    /// regardless of copy number, as they produce artifactual high-depth signal
    /// due to repetitive satellite sequences.
    ///
    /// # Arguments
    /// * `path` - Output BED path
    /// * `purity` - Tumour purity (0..1) from `_fitted_purity_ploidy.tsv`
    /// * `ploidy` - Tumour ploidy from `_fitted_purity_ploidy.tsv`
    /// * `min_tumour_cn` - Minimum tumour CN to qualify as a seed (typically `4 * ploidy`)
    /// * `centromeres` - Parsed centromere intervals from `parse_centromere_intervals()`
    pub fn write_coral_seeds(
        &self,
        path: &Path,
        purity: f64,
        ploidy: f64,
        min_tumour_cn: f64,
        centromeres: &[(String, u64, u64)],
    ) -> anyhow::Result<()> {
        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
        let mut w = BufWriter::new(
            File::create(path)
                .with_context(|| format!("Cannot create seeds BED: {}", path.display()))?,
        );
        for seg in &self.segments {
            if seg.copy_number >= threshold
                && !overlaps_centromere(&seg.chromosome, seg.start, seg.end, centromeres)
            {
                writeln!(w, "{}\t{}\t{}", seg.chromosome, seg.start, seg.end)?;
            }
        }
        Ok(())
    }

    /// Returns the number of amplified non-centromeric seed segments.
    pub fn n_seeds(
        &self,
        purity: f64,
        ploidy: f64,
        min_tumour_cn: f64,
        centromeres: &[(String, u64, u64)],
    ) -> usize {
        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
        self.segments
            .iter()
            .filter(|s| {
                s.copy_number >= threshold
                    && !overlaps_centromere(&s.chromosome, s.start, s.end, centromeres)
            })
            .count()
    }
}

/// Returns true if `[start, end)` overlaps any centromeric interval
/// on the same chromosome.
fn overlaps_centromere(
    chrom: &str,
    start: u64,
    end: u64,
    centromeres: &[(String, u64, u64)],
) -> bool {
    centromeres
        .iter()
        .any(|(c, cs, ce)| c == chrom && start < *ce && end > *cs)
}

/// Reads a SAVANA `fitted_purity_ploidy.tsv` file.
///
/// Expected format (tab-separated, header then a data row):
/// purity  ploidy  distance  rank
/// 0.48    1.98    0.105     1
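///
/// A minimal end-to-end sketch, assuming hypothetical paths and a `centromeres`
/// list already parsed elsewhere:
///
/// ```ignore
/// let (purity, ploidy) = read_savana_purity_ploidy(Path::new("fitted_purity_ploidy.tsv"))?;
/// let cn = SavanaCN::parse_file("sample_001", &config)?;
/// // Typical seed threshold: segments above 4x ploidy, corrected for purity.
/// cn.write_coral_seeds(Path::new("seeds.bed"), purity, ploidy, 4.0 * ploidy, &centromeres)?;
/// ```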
pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
    let content = std::fs::read_to_string(path)?;
    let mut lines = content.lines();

    // Skip header
    lines.next().context("purity/ploidy file is empty")?;

    let data = lines.next().context("purity/ploidy file has no data row")?;
    let fields: Vec<&str> = data.split('\t').collect();
    anyhow::ensure!(
        fields.len() >= 2,
        "expected at least 2 columns, got {}",
        fields.len()
    );

    let purity = fields[0]
        .trim()
        .parse::<f64>()
        .with_context(|| format!("cannot parse purity: '{}'", fields[0]))?;
    let ploidy = fields[1]
        .trim()
        .parse::<f64>()
        .with_context(|| format!("cannot parse ploidy: '{}'", fields[1]))?;

    Ok((purity, ploidy))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::helpers::test_init;

    #[test]
    fn savana_version() -> anyhow::Result<()> {
        test_init();
        let vl = Savana::version(&Config::default())?;
        info!("Savana version: {vl}");
        Ok(())
    }

    #[test]
    fn savana_run() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let mut caller = Savana::initialize("DUMCO", &config)?;
        caller.run()?;
        Ok(())
    }
}
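
// A small, self-contained check of the half-open overlap logic used for seed
// filtering; the centromere coordinates below are illustrative, not real annotation.
#[cfg(test)]
mod centromere_overlap_tests {
    use super::overlaps_centromere;

    #[test]
    fn detects_and_rejects_overlaps() {
        let cen = vec![("chr1".to_string(), 121_700_000_u64, 125_100_000_u64)];
        // Overlapping segment on the same chromosome
        assert!(overlaps_centromere("chr1", 121_000_000, 122_000_000, &cen));
        // A segment ending exactly at the centromere start does not overlap (half-open)
        assert!(!overlaps_centromere("chr1", 1_000, 121_700_000, &cen));
        // Same coordinates on another chromosome never overlap
        assert!(!overlaps_centromere("chr2", 121_800_000, 121_900_000, &cen));
    }
}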