| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992 |
- //! # Savana Haplotype-Aware Somatic Variant Caller
- //!
- //! This module provides wrappers for [Savana](https://github.com/cortes-ciriano-lab/savana),
- //! a haplotype-aware somatic variant caller for structural variants and copy number alterations
- //! from long-read sequencing data.
- //!
- //! ## Overview
- //!
- //! Savana detects:
- //! - Structural variants (SVs): deletions, duplications, inversions, translocations
- //! - Copy number variations (CNVs) with allele-specific information
- //! - Complex genomic rearrangements
- //!
- //! ## Key Features
- //!
- //! - **Haplotype-aware calling** - Uses phased germline variants and haplotagged BAMs
- //! - **Integrated phasing** - Automatically runs LongPhase if needed
- //! - **CNV segmentation** - Provides detailed copy number profiles
- //!
- //! ## Requirements
- //!
- //! Before running Savana, ensure:
- //! - Tumor and normal BAMs are indexed
- //! - Reference genome is accessible
- //! - Conda environment with Savana is configured (environment name: savana_env)
- //!
- //! ## Output Files
- //!
- //! PASS-filtered somatic variants:
- //! ```text
- //! {result_dir}/{id}/savana/somatic_sv_PASS.vcf.gz
- //! ```
- //!
- //! Copy number segmentation:
- //! ```text
- //! {result_dir}/{id}/savana/copy_number_segmentation.txt.gz
- //! ```
- //!
- //! Read count bins:
- //! ```text
- //! {result_dir}/{id}/savana/read_counts.txt.gz
- //! ```
- //!
- //! ## Usage
- //!
- //! ```ignore
- //! use pandora_lib_promethion::callers::savana::Savana;
- //! use pandora_lib_promethion::config::Config;
- //! use pandora_lib_promethion::pipes::Initialize;
- //! use pandora_lib_promethion::runners::Run;
- //!
- //! let config = Config::default();
- //! let mut caller = Savana::initialize("sample_001", &config)?;
- //!
- //! if caller.should_run() {
- //! caller.run()?; // Automatically handles phasing and haplotagging
- //! }
- //!
- //! // Load SV/CNV variants
- //! let variants = caller.variants(&annotations)?;
- //! println!("Found {} somatic SVs/CNVs", variants.variants.len());
- //!
- //! // Load copy number data
- //! let cn_data = SavanaCN::parse_file("sample_001", &config)?;
- //! println!("Copy number segments: {}", cn_data.segments.len());
- //! # Ok::<(), anyhow::Error>(())
- //! ```
- //!
- //! ## References
- //!
- //! - [Savana GitHub](https://github.com/cortes-ciriano-lab/savana)
- //! - [Savana Paper](https://doi.org/10.1038/s41467-022-34590-2)
- use crate::{
- annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
- collection::vcf::Vcf,
- commands::{
- bcftools::BcftoolsKeepPass,
- longphase::{LongphaseHap, LongphasePhase},
- samtools::SamtoolsIndex,
- CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
- },
- config::Config,
- helpers::{is_file_older, remove_dir_if_exists},
- io::{readers::get_gz_reader, vcf::read_vcf},
- locker::SampleLock,
- pipes::{Initialize, InitializeSolo, ShouldRun, Version},
- positions::{num_to_contig, GenomeRange},
- run,
- runners::Run,
- variant::{
- variant_collection::VariantCollection,
- vcf_variant::{Label, Variants},
- },
- };
- use anyhow::Context;
- use itertools::Itertools;
- use log::{debug, info, warn};
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- use std::io::Write;
- use std::{
- fs::{self, File},
- io::{BufRead, BufWriter},
- path::Path,
- str::FromStr,
- };
/// Savana haplotype-aware somatic SV and CNV caller.
///
/// Orchestrates the Savana pipeline including prerequisite phasing and haplotagging steps.
/// Automatically invokes LongPhase if required phased VCFs or haplotagged BAMs are missing.
///
/// # Fields
///
/// - `id` - Sample identifier (e.g., "34528")
/// - `config` - Global pipeline configuration
/// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/savana")
/// - `job_args` - Internal command-line arguments passed to Savana (populated in `run()`)
#[derive(Debug)]
pub struct Savana {
    /// Sample identifier
    pub id: String,
    /// Global pipeline configuration
    pub config: Config,
    /// Directory for log file storage
    pub log_dir: String,
    /// Command-line arguments for the Savana executable (populated during `run()`,
    /// private because callers never set it directly)
    job_args: Vec<String>,
}
- impl Initialize for Savana {
- /// Initializes a new `Savana` instance for a given sample ID and configuration.
- ///
- /// If `savana_force` is enabled in [`Config`] and the output VCF exists, or if the run
- /// should be re-triggered (based on file freshness), the output directory
- /// will be cleaned.
- ///
- /// # Arguments
- ///
- /// * `id` - Sample identifier
- /// * `config` - Pipeline execution configuration
- ///
- /// # Returns
- ///
- /// A new `Savana` instance, or an error if cleanup fails.
- ///
- /// # Errors
- ///
- /// Returns an error if:
- /// - `config.savana_force` is true and output directory cannot be removed
- /// - Directory deletion fails due to permissions or I/O errors
- fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
- info!("Initialize Savana for {id}.");
- let log_dir = format!("{}/{}/log/savana", config.result_dir, id);
- let savana = Self {
- id: id.to_string(),
- config: config.clone(),
- log_dir,
- job_args: Vec::new(),
- };
- // If forced re-run is enabled or a run is needed, remove old output directory
- if savana.config.savana_force {
- remove_dir_if_exists(&savana.config.savana_output_dir(id))?;
- }
- Ok(savana)
- }
- }
- impl ShouldRun for Savana {
- /// Determines whether Savana should be re-run based on whether
- /// the filtered PASS VCF is older than the input BAMs.
- ///
- /// If either input BAM (normal or tumor) is newer than the PASS VCF,
- /// Savana is considered out of date and should be re-executed.
- ///
- /// # Returns
- /// `true` if an update is needed, or if timestamps can't be checked (file doesn't exist)
- fn should_run(&self) -> bool {
- let passed_vcf = &self.config.savana_passed_vcf(&self.id);
- let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true)
- .unwrap_or(true)
- || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
- if result {
- warn!("Savana should run for id: {}.", self.id);
- }
- result
- }
- }
- impl JobCommand for Savana {
- fn cmd(&self) -> String {
- format!(
- "source {conda_sh} && conda activate savana_env && {bin} {args}",
- conda_sh = self.config.conda_sh,
- bin = self.config.savana_bin,
- args = self.job_args.join(" ")
- )
- }
- }
// Marker impl: enables running Savana synchronously on the local machine with
// the default `LocalRunner` behaviour (no overrides needed).
impl LocalRunner for Savana {}
- impl SlurmRunner for Savana {
- fn slurm_args(&self) -> Vec<String> {
- SlurmParams {
- job_name: Some(format!("savana_{}", self.id)),
- cpus_per_task: Some(self.config.savana_threads as u32),
- mem: Some(format!("{}G", self.config.savana_mem)),
- partition: Some("mediumq".into()),
- gres: None,
- }
- .to_args()
- }
- }
impl SbatchRunner for Savana {
    /// Slurm batch-submission parameters for a Savana job.
    fn slurm_params(&self) -> SlurmParams {
        SlurmParams {
            // One job per sample: encode the sample id in the job name.
            job_name: Some(format!("savana_{}", self.id)),
            cpus_per_task: Some(self.config.savana_threads as u32),
            // Memory request in gigabytes, taken from configuration.
            mem: Some(format!("{}G", self.config.savana_mem)),
            partition: Some("mediumq".into()),
            // No GPU resources required.
            gres: None,
        }
    }
}
- impl Run for Savana {
- /// Executes the Savana pipeline, including prerequisite phasing and haplotagging steps.
- ///
- /// If the output VCF does not already exist, it ensures that the phased VCF and
- /// haplotagged BAMs exist (or runs the modules to produce them), then runs Savana.
- /// Finally, it filters the resulting VCF to retain only PASS variants.
- ///
- /// # Returns
- ///
- /// `Ok(())` if the run completes successfully, or an error otherwise.
- fn run(&mut self) -> anyhow::Result<()> {
- let lock_dir = format!("{}/locks", self.config.result_dir);
- let _lock = SampleLock::acquire(&lock_dir, &self.id, "savana")
- .with_context(|| format!("Cannot start SAVANA chunked for {}", self.id))?;
- if !self.should_run() {
- info!("Savana is up-to-date.");
- return Ok(());
- }
- let output_vcf = &self.config.savana_output_vcf(&self.id);
- if !Path::new(&output_vcf).exists() {
- let output_dir = self.config.savana_output_dir(&self.id);
- fs::create_dir_all(&output_dir).with_context(|| {
- format!("Failed to create output dir for Savana run: {output_dir}")
- })?;
- // Check for phased germline vcf
- // not required anymore since >= 1.3.0
- let phased_germline_vcf = self.config.germline_phased_vcf(&self.id);
- if !Path::new(&phased_germline_vcf).exists() {
- let mut phase = LongphasePhase::initialize(&self.id, &self.config.clone())?;
- run!(&self.config, &mut phase)?;
- } else {
- info!("Phased germline is already up-to-date.");
- }
- let normal_hp_bam = self.config.normal_haplotagged_bam(&self.id);
- let tumoral_hp_bam = self.config.tumoral_haplotagged_bam(&self.id);
- // Check for haplotagged bam
- if !Path::new(&normal_hp_bam).exists() {
- let mut normal_hap =
- LongphaseHap::initialize(&self.id, &self.config.normal_name, &self.config)?;
- normal_hap.run()?;
- } else {
- info!("Haplotagged normal BAM is already up-to-date");
- }
- if !Path::new(&format!("{normal_hp_bam}.bai")).exists() {
- let mut normal_index = SamtoolsIndex::from_config(&self.config, &normal_hp_bam);
- normal_index.run()?;
- }
- if !Path::new(&tumoral_hp_bam).exists() {
- let mut tumoral_hap =
- LongphaseHap::initialize(&self.id, &self.config.tumoral_name, &self.config)?;
- tumoral_hap.run()?;
- } else {
- info!("Haplotagged tumoral BAM is already up-to-date");
- }
- if !Path::new(&format!("{tumoral_hp_bam}.bai")).exists() {
- let mut tumoral_index = SamtoolsIndex::from_config(&self.config, &tumoral_hp_bam);
- tumoral_index.run()?;
- }
- self.job_args = vec![
- "--tumour".to_string(),
- tumoral_hp_bam.clone(),
- "--normal".to_string(),
- normal_hp_bam.clone(),
- "--outdir".to_string(),
- self.config.savana_output_dir(&self.id),
- "--ref".to_string(),
- self.config.reference.clone(),
- "--snp_vcf".to_string(),
- phased_germline_vcf.clone(),
- "--no_blacklist".to_string(),
- "--threads".to_string(),
- self.config.savana_threads.to_string(),
- "--cna_threads".to_string(),
- self.config.savana_threads.to_string(),
- ];
- let output =
- run!(&self.config, self).context("Error while running Savana (local or Slurm)")?;
- let log_file = format!("{}/savana_", self.log_dir);
- output
- .save_to_file(&log_file)
- .context(format!("Error while writing logs into {log_file}"))?;
- } else {
- debug!(
- "Savana output already exists for {}, skipping execution.",
- self.id
- );
- }
- // Keep PASS
- let passed_vcf = self.config.savana_passed_vcf(&self.id);
- if !Path::new(&passed_vcf).exists() && Path::new(output_vcf).exists() {
- let mut keep_pass =
- BcftoolsKeepPass::from_config(&self.config, output_vcf, &passed_vcf);
- let report = run!(&self.config, &mut keep_pass).context(format!(
- "Error while running bcftools keep PASS for {output_vcf}"
- ))?;
- let log_file = format!("{}/bcftools_pass", self.log_dir);
- report
- .save_to_file(&log_file)
- .context(format!("Error while writing logs into {log_file}"))?;
- }
- Ok(())
- }
- }
- impl Version for Savana {
- /// Retrieves the Savana version by running `savana --version` in its conda environment.
- ///
- /// # Errors
- /// Returns an error if command execution fails or "Version " not found in output.
- fn version(config: &Config) -> anyhow::Result<String> {
- let mut job = Savana {
- id: "version".into(),
- config: config.clone(),
- log_dir: config.tmp_dir.clone(),
- job_args: vec!["--version".to_string()],
- };
- let out =
- run!(&config, &mut job).context("Error while running `savana --version` (local)")?;
- parse_savana_version(&out)
- }
- }
- fn parse_savana_version(out: &CapturedOutput) -> anyhow::Result<String> {
- let mut log = out.stdout.clone();
- if !out.stderr.is_empty() {
- log.push_str(&out.stderr);
- }
- if let Some(epilog) = &out.slurm_epilog {
- log.push_str(&format!("\n{epilog:#?}"));
- }
- let start = log
- .find("Version ")
- .context("Failed to find 'Version ' in the log")?;
- let start_index = start + "Version ".len();
- let end = log[start_index..]
- .find(|c: char| c.is_whitespace())
- .map(|idx| start_index + idx)
- .unwrap_or_else(|| log.len());
- Ok(log[start_index..end].to_string())
- }
impl CallerCat for Savana {
    /// Tags the caller identity for downstream variant classification and traceability.
    /// Identifies this tool as the Savana caller producing somatic variants.
    fn caller_cat(&self) -> Annotation {
        Annotation::Callers(Caller::Savana, Sample::Somatic)
    }
}
- impl Variants for Savana {
- /// Loads variants from the PASS-filtered VCF produced by Savana and annotates them.
- ///
- /// Reads the VCF file, applies the caller tag via the `Annotations` object, and
- /// wraps the parsed data into a `VariantCollection`.
- ///
- /// # Arguments
- /// * `annotations` - Global annotation registry shared across callers.
- ///
- /// # Returns
- /// A populated `VariantCollection`, or an error if the VCF fails to parse.
- fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
- let caller = self.caller_cat();
- let add = vec![caller.clone()];
- let passed_vcf = self.config.savana_passed_vcf(&self.id);
- info!("Loading variants from {}: {}", caller, &passed_vcf);
- let variants = read_vcf(&passed_vcf)
- .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?;
- variants.par_iter().for_each(|v| {
- annotations.insert_update(v.hash(), &add);
- });
- info!("{}, {} variants loaded.", caller, variants.len());
- Ok(VariantCollection {
- variants,
- vcf: Vcf::new(passed_vcf.clone().into())?,
- caller,
- })
- }
- }
impl Label for Savana {
    /// Returns the string label for this caller (the display form of its
    /// `caller_cat()` annotation).
    fn label(&self) -> String {
        self.caller_cat().to_string()
    }
}
/// A row from Savana copy number segmentation output.
///
/// Represents a genomic segment with associated copy number information,
/// including optional allele-specific data.
pub struct SavanaCNRow {
    /// The genomic range (chromosome, start, end) of this segment
    pub range: GenomeRange,
    /// Unique identifier for this segment
    pub segment_id: String,
    /// Number of bins included in this segment
    pub bin_count: u32,
    /// Sum of the lengths of all bins in this segment
    pub sum_of_bin_lengths: u32,
    /// Weight assigned to this segment in the analysis
    pub weight: f32,
    /// Estimated copy number for this segment
    pub copy_number: f32,
    /// Minor allele copy number, if available (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f64>,
    /// Mean B-allele frequency for heterozygous SNPs in this segment
    pub mean_baf: Option<f64>,
    /// Number of heterozygous SNPs in this segment
    pub n_het_snps: u32,
}
- impl FromStr for SavanaCNRow {
- type Err = anyhow::Error;
- /// Parses a tab-separated line from Savana copy number output.
- ///
- /// # Format
- /// Expected columns: chromosome, start, end, segment_id, bin_count,
- /// sum_of_bin_lengths, weight, copy_number, minor_allele_copy_number,
- /// mean_baf, n_het_snps
- ///
- /// # Errors
- /// Returns an error if any field is missing or cannot be parsed to the expected type.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let cells: Vec<&str> = s.split("\t").collect();
- let range = GenomeRange::from_1_inclusive_str(
- cells
- .first()
- .context(format!("Error while parsing contig {s}."))?,
- cells
- .get(1)
- .context(format!("Error while parsing start {s}."))?,
- cells
- .get(2)
- .context(format!("Error while parsing end {s}."))?,
- )
- .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;
- Ok(Self {
- range,
- segment_id: cells
- .get(3)
- .context(format!("Error while parsing segment_id {s}."))?
- .to_string(),
- bin_count: cells
- .get(4)
- .context(format!("Error while parsing bin_count {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
- sum_of_bin_lengths: cells
- .get(5)
- .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
- weight: cells
- .get(6)
- .context(format!("Error while parsing weight {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
- copy_number: cells
- .get(7)
- .context(format!("Error while parsing copy_number {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
- minor_allele_copy_number: cells
- .get(8)
- .context(format!("Error while parsing minor_allele_copy_number {s}."))?
- .parse::<f64>()
- .map(Some)
- .unwrap_or(None),
- mean_baf: cells
- .get(9)
- .context(format!("Error while parsing mean_baf {s}."))?
- .parse::<f64>()
- .map(Some)
- .unwrap_or(None),
- n_het_snps: cells
- .get(10)
- .context(format!("Error while parsing n_het_snps {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
- })
- }
- }
/// Container for Savana copy number segmentation data.
///
/// Holds all copy number segments for a sample, loaded from Savana output files.
pub struct SavanaCopyNumber {
    /// Vector of copy number segments
    pub data: Vec<SavanaCNRow>,
}
- impl SavanaCopyNumber {
- /// Loads copy number data for a given sample ID.
- ///
- /// # Arguments
- /// * `id` - Sample identifier
- /// * `config` - Configuration containing path information
- ///
- /// # Errors
- /// Returns an error if the file cannot be read or parsed.
- pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
- let path = config.savana_copy_number(id);
- let reader = get_gz_reader(&path)?;
- let mut data = Vec::new();
- for line in reader.lines() {
- let line = line?;
- if line.starts_with("chromosome") {
- continue;
- }
- let cn: SavanaCNRow = line.parse()?;
- data.push(cn);
- }
- Ok(Self { data })
- }
- }
/// A row from Savana read count output.
///
/// Contains read depth information for tumor and normal samples in genomic bins.
pub struct SavanaRCRow {
    /// The genomic range (chromosome, start, end) of this bin
    pub range: GenomeRange,
    /// Percentage of bases in the bin with known sequence (not N)
    pub perc_known_bases: f32,
    /// Whether this bin should be used in analysis
    pub use_bin: bool,
    /// Number of reads from the tumor sample in this bin
    pub tumour_read_count: u32,
    /// Number of reads from the normal sample in this bin
    pub normal_read_count: u32,
}
- impl FromStr for SavanaRCRow {
- type Err = anyhow::Error;
- /// Parses a tab-separated line from Savana read count output.
- ///
- /// # Format
- /// Expected columns: bin (format: chr:start_end), chromosome, start, end,
- /// perc_known_bases, use_bin, tumour_read_count, normal_read_count
- ///
- /// # Errors
- /// Returns an error if any field is missing or cannot be parsed.
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let cells: Vec<&str> = s.split("\t").collect();
- let bin = cells
- .first()
- .context(format!("Error while parsing range {s}."))?;
- let range = bin
- .split_once(':')
- .and_then(|(a, b)| {
- b.split_once('_').map(|(b, c)| {
- GenomeRange::from_1_inclusive_str(a, b, c)
- .context(format!("Error while parsing range {bin}"))
- })
- })
- .context(format!("Invalid range format: {bin}"))??;
- Ok(Self {
- range,
- perc_known_bases: cells
- .get(4)
- .context(format!("Error while parsing perc_known_bases {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
- use_bin: cells
- .get(5)
- .context(format!("Error while parsing use_bin {s}."))?
- .to_lowercase()
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
- tumour_read_count: cells
- .get(6)
- .context(format!("Error while parsing tumour_read_count {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
- normal_read_count: cells
- .get(7)
- .context(format!("Error while parsing normal_read_count {s}."))?
- .parse()
- .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
- })
- }
- }
/// Container for Savana read count data.
///
/// Holds read depth information across genomic bins for tumor-normal pairs.
pub struct SavanaReadCounts {
    /// Vector of read count bins
    pub data: Vec<SavanaRCRow>,
}
impl SavanaReadCounts {
    /// Loads read count data for a given sample ID.
    ///
    /// Skips the header line (starting with "bin") and parses each remaining
    /// line into a [`SavanaRCRow`].
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_read_counts(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            // Header row begins with the literal column name "bin".
            if line.starts_with("bin") {
                continue;
            }
            let cn: SavanaRCRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }

    /// Calculates the total number of tumor reads across all bins.
    ///
    /// # Returns
    /// Sum of tumor read counts from all bins.
    pub fn n_tumoral_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.tumour_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates the total number of normal reads across all bins.
    ///
    /// # Returns
    /// Sum of normal read counts from all bins.
    pub fn n_normal_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.normal_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates normalized read counts per chromosome.
    ///
    /// Normalizes each chromosome's read count by the expected number of reads
    /// based on the number of bins in that chromosome relative to total bins.
    ///
    /// NOTE(review): `chunk_by` groups only *consecutive* bins sharing a contig,
    /// so this assumes `data` is ordered by chromosome — confirm input ordering.
    ///
    /// # Returns
    /// Vector of tuples containing (chromosome_name, normalized_count) pairs.
    pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
        let n_tum = self.n_tumoral_reads();
        let n_bins = self.data.len();
        self.data
            .iter()
            .chunk_by(|c| c.range.contig)
            .into_iter()
            .map(|(contig_num, chr_counts)| {
                let chr_counts: Vec<_> = chr_counts.collect();
                let chr_n_bins = chr_counts.len();
                // Tumour reads that fall in this chromosome's bins.
                let chr_counts = chr_counts
                    .par_iter()
                    .map(|c| c.tumour_read_count as usize)
                    .sum::<usize>();
                // Observed / expected, where expected scales the genome-wide total
                // by this chromosome's share of bins.
                let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
                (num_to_contig(contig_num), r)
            })
            .collect()
    }
}
/// Savana copy number data with serialization support.
///
/// Alternative representation of copy number segments optimized for
/// serialization and deserialization.
#[derive(Debug, Serialize, Deserialize)]
pub struct SavanaCN {
    /// Vector of copy number segments
    pub segments: Vec<CNSegment>,
}
/// A copy number segment with full metadata.
///
/// Contains all information about a genomic segment including copy number,
/// allelic information, and quality metrics.
#[derive(Debug, Serialize, Deserialize)]
pub struct CNSegment {
    /// Chromosome name
    pub chromosome: String,
    /// Start position (1-based, inclusive)
    pub start: u64,
    /// End position (1-based, inclusive)
    pub end: u64,
    /// Unique segment identifier
    pub segment_id: String,
    /// Number of bins merged into this segment
    pub bin_count: u32,
    /// Total length of all bins in base pairs
    pub sum_of_bin_lengths: u64,
    /// Statistical weight of this segment
    pub weight: f64,
    /// Estimated total copy number
    pub copy_number: f64,
    /// Minor allele copy number (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f64>,
    /// Mean B-allele frequency across heterozygous SNPs
    pub mean_baf: Option<f64>,
    /// Number of heterozygous SNPs in this segment
    pub no_het_snps: u32,
}
- impl SavanaCN {
- /// Parses a Savana copy number file into structured segments.
- ///
- /// # Arguments
- /// * `id` - Sample identifier
- /// * `config` - Configuration containing path information
- ///
- /// # Returns
- /// Parsed copy number data with all segments.
- ///
- /// # Errors
- /// Returns an error if:
- /// * File cannot be opened or read
- /// * Line has incorrect number of columns (expected 11)
- /// * Any field cannot be parsed to the expected type
- pub fn parse_file(id: &str, config: &Config) -> anyhow::Result<Self> {
- let path = config.savana_copy_number(id);
- let reader =
- get_gz_reader(&path).context(anyhow::anyhow!("Error while reading: {path}"))?;
- let mut segments = Vec::new();
- for (index, line) in reader.lines().enumerate() {
- let line = line.context("Failed to read line from file")?;
- // Skip header line
- if index == 0 {
- continue;
- }
- let parts: Vec<&str> = line.split('\t').collect();
- if parts.len() != 11 {
- return Err(anyhow::anyhow!(
- "Invalid number of columns at line {} (expected 11, got {})",
- index + 1,
- parts.len()
- ));
- }
- // Helper closure for parsing with context
- let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
- field
- .parse()
- .context(format!("Failed to parse {name} at line {line_num}"))
- };
- let parse_field_u32 =
- |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
- field
- .parse()
- .context(format!("Failed to parse {name} at line {line_num}"))
- };
- let line_num = index + 1;
- let segment = CNSegment {
- chromosome: parts[0].to_string(),
- start: parse_field_u32(parts[1], "start", line_num)? as u64,
- end: parse_field_u32(parts[2], "end", line_num)? as u64,
- segment_id: parts[3].to_string(),
- bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
- sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
- as u64,
- weight: parse_field(parts[6], "weight", line_num)?,
- copy_number: parse_field(parts[7], "copy_number", line_num)?,
- minor_allele_copy_number: match parts[8] {
- "" | "NA" => None,
- v => Some(parse_field(v, "minor_allele_copy_number", line_num)?),
- },
- mean_baf: match parts[9] {
- "" | "NA" => None,
- v => Some(parse_field(v, "mean_baf", line_num)?),
- },
- no_het_snps: if parts[10].is_empty() {
- 0
- } else {
- parse_field_u32(parts[10], "no_het_snps", line_num)?
- },
- };
- segments.push(segment);
- }
- Ok(Self { segments })
- }
- /// Writes CoRAL `--cn_segs` BED: all segments with absolute CN.
- ///
- /// Format: `chrom\tstart\tend\tcopy_number`
- pub fn write_coral_cn_segs(&self, path: &Path) -> anyhow::Result<()> {
- let mut w = BufWriter::new(
- File::create(path)
- .with_context(|| format!("Cannot create cn_segs BED: {}", path.display()))?,
- );
- for seg in &self.segments {
- writeln!(
- w,
- "{}\t{}\t{}\t{:.4}",
- seg.chromosome, seg.start, seg.end, seg.copy_number
- )?;
- }
- Ok(())
- }
- /// Writes CoRAL `--seeds` BED: amplified segments only, centromeres excluded.
- ///
- /// Uses a purity-aware threshold so that tumour CN dilution by normal
- /// contamination is accounted for:
- ///
- /// ```text
- /// fitted_CN = purity * tumour_CN + (1 - purity) * ploidy
- /// seed_threshold = purity * min_tumour_cn + (1 - purity) * ploidy
- /// ```
- ///
- /// Centromeric segments (stain `acen` in the cytoband BED) are excluded
- /// regardless of copy number, as they produce artifactual high-depth signal
- /// due to repetitive satellite sequences.
- ///
- /// # Arguments
- /// * `path` - Output BED path
- /// * `purity` - Tumour purity (0..1) from `_fitted_purity_ploidy.tsv`
- /// * `ploidy` - Tumour ploidy from `_fitted_purity_ploidy.tsv`
- /// * `min_tumour_cn` - Minimum tumour CN to qualify as a seed (typically `4 * ploidy`)
- /// * `centromeres` - Parsed centromere intervals from `parse_centromere_intervals()`
- pub fn write_coral_seeds(
- &self,
- path: &Path,
- purity: f64,
- ploidy: f64,
- min_tumour_cn: f64,
- centromeres: &[(String, u64, u64)],
- ) -> anyhow::Result<()> {
- let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
- let mut w = BufWriter::new(
- File::create(path)
- .with_context(|| format!("Cannot create seeds BED: {}", path.display()))?,
- );
- for seg in &self.segments {
- if seg.copy_number >= threshold
- && !overlaps_centromere(&seg.chromosome, seg.start, seg.end, centromeres)
- {
- writeln!(w, "{}\t{}\t{}", seg.chromosome, seg.start, seg.end)?;
- }
- }
- Ok(())
- }
- /// Returns the number of amplified non-centromeric seed segments.
- pub fn n_seeds(
- &self,
- purity: f64,
- ploidy: f64,
- min_tumour_cn: f64,
- centromeres: &[(String, u64, u64)],
- ) -> usize {
- let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
- self.segments
- .iter()
- .filter(|s| {
- s.copy_number >= threshold
- && !overlaps_centromere(&s.chromosome, s.start, s.end, centromeres)
- })
- .count()
- }
- }
/// Returns true when the `[start, end)` segment intersects any centromeric
/// interval recorded for the same chromosome.
fn overlaps_centromere(
    chrom: &str,
    start: u64,
    end: u64,
    centromeres: &[(String, u64, u64)],
) -> bool {
    for (cen_chrom, cen_start, cen_end) in centromeres {
        // Half-open interval intersection test, restricted to the matching chromosome.
        if cen_chrom == chrom && start < *cen_end && end > *cen_start {
            return true;
        }
    }
    false
}
- /// Reads SAVANA fitted_purity_ploidy.tsv.
- /// Expected format:
- /// purity ploidy distance rank
- /// 0.48 1.98 0.105 1
- pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
- let content = std::fs::read_to_string(path)?;
- let mut lines = content.lines();
- // Skip header
- lines.next().context("purity/ploidy file is empty")?;
- let data = lines.next().context("purity/ploidy file has no data row")?;
- let fields: Vec<&str> = data.split('\t').collect();
- anyhow::ensure!(
- fields.len() >= 2,
- "expected at least 2 columns, got {}",
- fields.len()
- );
- let purity = fields[0]
- .trim()
- .parse::<f64>()
- .with_context(|| format!("cannot parse purity: '{}'", fields[0]))?;
- let ploidy = fields[1]
- .trim()
- .parse::<f64>()
- .with_context(|| format!("cannot parse ploidy: '{}'", fields[1]))?;
- Ok((purity, ploidy))
- }
#[cfg(test)]
mod tests {
    use super::*;
    use crate::helpers::test_init;

    /// Smoke test: queries the installed Savana version.
    /// Requires the `savana_env` conda environment to be available.
    #[test]
    fn savana_version() -> anyhow::Result<()> {
        test_init();
        let vl = Savana::version(&Config::default())?;
        info!("Savana version: {vl}");
        Ok(())
    }

    /// End-to-end run against the "DUMCO" sample.
    /// Requires the configured data paths (BAMs, reference) to exist.
    #[test]
    fn savana_run() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();
        let mut caller = Savana::initialize("DUMCO", &config)?;
        caller.run()?;
        Ok(())
    }
}
|