Thomas
/
pandora_lib_promethion


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992
							//! # Savana Haplotype-Aware Somatic Variant Caller
//!
//! This module provides wrappers for [Savana](https://github.com/cortes-ciriano-lab/savana),
//! a haplotype-aware somatic variant caller for structural variants and copy number alterations
//! from long-read sequencing data.
//!
//! ## Overview
//!
//! Savana detects:
//! - Structural variants (SVs): deletions, duplications, inversions, translocations
//! - Copy number variations (CNVs) with allele-specific information
//! - Complex genomic rearrangements
//!
//! ## Key Features
//!
//! - **Haplotype-aware calling** - Uses phased germline variants and haplotagged BAMs
//! - **Integrated phasing** - Automatically runs LongPhase if needed
//! - **CNV segmentation** - Provides detailed copy number profiles
//!
//! ## Requirements
//!
//! Before running Savana, ensure:
//! - Tumor and normal BAMs are indexed
//! - Reference genome is accessible
//! - Conda environment with Savana is configured (environment name: savana_env)
//!
//! ## Output Files
//!
//! PASS-filtered somatic variants:
//! ```text
//! {result_dir}/{id}/savana/somatic_sv_PASS.vcf.gz
//! ```
//!
//! Copy number segmentation:
//! ```text
//! {result_dir}/{id}/savana/copy_number_segmentation.txt.gz
//! ```
//!
//! Read count bins:
//! ```text
//! {result_dir}/{id}/savana/read_counts.txt.gz
//! ```
//!
//! ## Usage
//!
//! ```ignore
//! use pandora_lib_promethion::callers::savana::Savana;
//! use pandora_lib_promethion::config::Config;
//! use pandora_lib_promethion::pipes::Initialize;
//! use pandora_lib_promethion::runners::Run;
//!
//! let config = Config::default();
//! let mut caller = Savana::initialize("sample_001", &config)?;
//!
//! if caller.should_run() {
//!     caller.run()?;  // Automatically handles phasing and haplotagging
//! }
//!
//! // Load SV/CNV variants
//! let variants = caller.variants(&annotations)?;
//! println!("Found {} somatic SVs/CNVs", variants.variants.len());
//!
//! // Load copy number data
//! let cn_data = SavanaCN::parse_file("sample_001", &config)?;
//! println!("Copy number segments: {}", cn_data.segments.len());
//! # Ok::<(), anyhow::Error>(())
//! ```
//!
//! ## References
//!
//! - [Savana GitHub](https://github.com/cortes-ciriano-lab/savana)
//! - [Savana Paper](https://doi.org/10.1038/s41467-022-34590-2)
use crate::{
    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
    collection::vcf::Vcf,
    commands::{
        bcftools::BcftoolsKeepPass,
        longphase::{LongphaseHap, LongphasePhase},
        samtools::SamtoolsIndex,
        CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
    },
    config::Config,
    helpers::{is_file_older, remove_dir_if_exists},
    io::{readers::get_gz_reader, vcf::read_vcf},
    locker::SampleLock,
    pipes::{Initialize, InitializeSolo, ShouldRun, Version},
    positions::{num_to_contig, GenomeRange},
    run,
    runners::Run,
    variant::{
        variant_collection::VariantCollection,
        vcf_variant::{Label, Variants},
    },
};
use anyhow::Context;
use itertools::Itertools;
use log::{debug, info, warn};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::io::Write;
use std::{
    fs::{self, File},
    io::{BufRead, BufWriter},
    path::Path,
    str::FromStr,
};

/// Savana haplotype-aware somatic SV and CNV caller.
///
/// Orchestrates the Savana pipeline including prerequisite phasing and haplotagging steps.
/// Automatically invokes LongPhase if required phased VCFs or haplotagged BAMs are missing.
///
/// # Fields
///
/// - `id` - Sample identifier (e.g., "34528")
/// - `config` - Global pipeline configuration
/// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/savana")
/// - `job_args` - Internal command-line arguments passed to Savana (populated in `run()`)
#[derive(Debug)]
pub struct Savana {
    /// Sample identifier
    pub id: String,
    /// Global pipeline configuration
    pub config: Config,
    /// Directory for log file storage
    pub log_dir: String,

    /// Command-line arguments for Savana executable (populated during run)
    job_args: Vec<String>,
}

impl Initialize for Savana {
    /// Initializes a new `Savana` instance for a given sample ID and configuration.
    ///
    /// If `savana_force` is enabled in [`Config`] and the output VCF exists, or if the run
    /// should be re-triggered (based on file freshness), the output directory
    /// will be cleaned.
    ///
    /// # Arguments
    ///
    /// * `id` - Sample identifier
    /// * `config` - Pipeline execution configuration
    ///
    /// # Returns
    ///
    /// A new `Savana` instance, or an error if cleanup fails.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config.savana_force` is true and output directory cannot be removed
    /// - Directory deletion fails due to permissions or I/O errors
    fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
        info!("Initialize Savana for {id}.");
        let log_dir = format!("{}/{}/log/savana", config.result_dir, id);

        let savana = Self {
            id: id.to_string(),
            config: config.clone(),
            log_dir,
            job_args: Vec::new(),
        };

        // If forced re-run is enabled or a run is needed, remove old output directory
        if savana.config.savana_force {
            remove_dir_if_exists(&savana.config.savana_output_dir(id))?;
        }

        Ok(savana)
    }
}

impl ShouldRun for Savana {
    /// Determines whether Savana should be re-run based on whether
    /// the filtered PASS VCF is older than the input BAMs.
    ///
    /// If either input BAM (normal or tumor) is newer than the PASS VCF,
    /// Savana is considered out of date and should be re-executed.
    ///
    /// # Returns
    /// `true` if an update is needed, or if timestamps can't be checked (file doesn't exist)
    fn should_run(&self) -> bool {
        let passed_vcf = &self.config.savana_passed_vcf(&self.id);
        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true)
            .unwrap_or(true)
            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
        if result {
            warn!("Savana should run for id: {}.", self.id);
        }
        result
    }
}

impl JobCommand for Savana {
    fn cmd(&self) -> String {
        format!(
            "source {conda_sh} && conda activate savana_env && {bin} {args}",
            conda_sh = self.config.conda_sh,
            bin = self.config.savana_bin,
            args = self.job_args.join(" ")
        )
    }
}

impl LocalRunner for Savana {}

impl SlurmRunner for Savana {
    fn slurm_args(&self) -> Vec<String> {
        SlurmParams {
            job_name: Some(format!("savana_{}", self.id)),
            cpus_per_task: Some(self.config.savana_threads as u32),
            mem: Some(format!("{}G", self.config.savana_mem)),
            partition: Some("mediumq".into()),
            gres: None,
        }
        .to_args()
    }
}

impl SbatchRunner for Savana {
    fn slurm_params(&self) -> SlurmParams {
        SlurmParams {
            job_name: Some(format!("savana_{}", self.id)),
            cpus_per_task: Some(self.config.savana_threads as u32),
            mem: Some(format!("{}G", self.config.savana_mem)),
            partition: Some("mediumq".into()),
            gres: None,
        }
    }
}

impl Run for Savana {
    /// Executes the Savana pipeline, including prerequisite phasing and haplotagging steps.
    ///
    /// If the output VCF does not already exist, it ensures that the phased VCF and
    /// haplotagged BAMs exist (or runs the modules to produce them), then runs Savana.
    /// Finally, it filters the resulting VCF to retain only PASS variants.
    ///
    /// # Returns
    ///
    /// `Ok(())` if the run completes successfully, or an error otherwise.
    fn run(&mut self) -> anyhow::Result<()> {
        let lock_dir = format!("{}/locks", self.config.result_dir);
        let _lock = SampleLock::acquire(&lock_dir, &self.id, "savana")
            .with_context(|| format!("Cannot start SAVANA chunked for {}", self.id))?;

        if !self.should_run() {
            info!("Savana is up-to-date.");
            return Ok(());
        }

        let output_vcf = &self.config.savana_output_vcf(&self.id);
        if !Path::new(&output_vcf).exists() {
            let output_dir = self.config.savana_output_dir(&self.id);
            fs::create_dir_all(&output_dir).with_context(|| {
                format!("Failed to create output dir for Savana run: {output_dir}")
            })?;

            // Check for phased germline vcf
            // not required anymore since >= 1.3.0
            let phased_germline_vcf = self.config.germline_phased_vcf(&self.id);
            if !Path::new(&phased_germline_vcf).exists() {
                let mut phase = LongphasePhase::initialize(&self.id, &self.config.clone())?;
                run!(&self.config, &mut phase)?;
            } else {
                info!("Phased germline is already up-to-date.");
            }

            let normal_hp_bam = self.config.normal_haplotagged_bam(&self.id);
            let tumoral_hp_bam = self.config.tumoral_haplotagged_bam(&self.id);

            // Check for haplotagged bam
            if !Path::new(&normal_hp_bam).exists() {
                let mut normal_hap =
                    LongphaseHap::initialize(&self.id, &self.config.normal_name, &self.config)?;
                normal_hap.run()?;
            } else {
                info!("Haplotagged normal BAM is already up-to-date");
            }

            if !Path::new(&format!("{normal_hp_bam}.bai")).exists() {
                let mut normal_index = SamtoolsIndex::from_config(&self.config, &normal_hp_bam);
                normal_index.run()?;
            }

            if !Path::new(&tumoral_hp_bam).exists() {
                let mut tumoral_hap =
                    LongphaseHap::initialize(&self.id, &self.config.tumoral_name, &self.config)?;
                tumoral_hap.run()?;
            } else {
                info!("Haplotagged tumoral BAM is already up-to-date");
            }

            if !Path::new(&format!("{tumoral_hp_bam}.bai")).exists() {
                let mut tumoral_index = SamtoolsIndex::from_config(&self.config, &tumoral_hp_bam);
                tumoral_index.run()?;
            }

            self.job_args = vec![
                "--tumour".to_string(),
                tumoral_hp_bam.clone(),
                "--normal".to_string(),
                normal_hp_bam.clone(),
                "--outdir".to_string(),
                self.config.savana_output_dir(&self.id),
                "--ref".to_string(),
                self.config.reference.clone(),
                "--snp_vcf".to_string(),
                phased_germline_vcf.clone(),
                "--no_blacklist".to_string(),
                "--threads".to_string(),
                self.config.savana_threads.to_string(),
                "--cna_threads".to_string(),
                self.config.savana_threads.to_string(),
            ];

            let output =
                run!(&self.config, self).context("Error while running Savana (local or Slurm)")?;
            let log_file = format!("{}/savana_", self.log_dir);
            output
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        } else {
            debug!(
                "Savana output already exists for {}, skipping execution.",
                self.id
            );
        }

        // Keep PASS
        let passed_vcf = self.config.savana_passed_vcf(&self.id);
        if !Path::new(&passed_vcf).exists() && Path::new(output_vcf).exists() {
            let mut keep_pass =
                BcftoolsKeepPass::from_config(&self.config, output_vcf, &passed_vcf);
            let report = run!(&self.config, &mut keep_pass).context(format!(
                "Error while running bcftools keep PASS for {output_vcf}"
            ))?;
            let log_file = format!("{}/bcftools_pass", self.log_dir);
            report
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        }

        Ok(())
    }
}

impl Version for Savana {
    /// Retrieves the Savana version by running `savana --version` in its conda environment.
    ///
    /// # Errors
    /// Returns an error if command execution fails or "Version " not found in output.
    fn version(config: &Config) -> anyhow::Result<String> {
        let mut job = Savana {
            id: "version".into(),
            config: config.clone(),
            log_dir: config.tmp_dir.clone(),
            job_args: vec!["--version".to_string()],
        };
        let out =
            run!(&config, &mut job).context("Error while running `savana --version` (local)")?;
        parse_savana_version(&out)
    }
}

fn parse_savana_version(out: &CapturedOutput) -> anyhow::Result<String> {
    let mut log = out.stdout.clone();
    if !out.stderr.is_empty() {
        log.push_str(&out.stderr);
    }
    if let Some(epilog) = &out.slurm_epilog {
        log.push_str(&format!("\n{epilog:#?}"));
    }

    let start = log
        .find("Version ")
        .context("Failed to find 'Version ' in the log")?;
    let start_index = start + "Version ".len();
    let end = log[start_index..]
        .find(|c: char| c.is_whitespace())
        .map(|idx| start_index + idx)
        .unwrap_or_else(|| log.len());
    Ok(log[start_index..end].to_string())
}

impl CallerCat for Savana {
    /// Tags the caller identity for downstream variant classification and traceability.
    /// Identifies this tool as the Savana caller producing somatic variants.
    fn caller_cat(&self) -> Annotation {
        Annotation::Callers(Caller::Savana, Sample::Somatic)
    }
}

impl Variants for Savana {
    /// Loads variants from the PASS-filtered VCF produced by Savana and annotates them.
    ///
    /// Reads the VCF file, applies the caller tag via the `Annotations` object, and
    /// wraps the parsed data into a `VariantCollection`.
    ///
    /// # Arguments
    /// * `annotations` - Global annotation registry shared across callers.
    ///
    /// # Returns
    /// A populated `VariantCollection`, or an error if the VCF fails to parse.
    fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
        let caller = self.caller_cat();
        let add = vec![caller.clone()];
        let passed_vcf = self.config.savana_passed_vcf(&self.id);

        info!("Loading variants from {}: {}", caller, &passed_vcf);
        let variants = read_vcf(&passed_vcf)
            .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?;
        variants.par_iter().for_each(|v| {
            annotations.insert_update(v.hash(), &add);
        });

        info!("{}, {} variants loaded.", caller, variants.len());
        Ok(VariantCollection {
            variants,
            vcf: Vcf::new(passed_vcf.clone().into())?,
            caller,
        })
    }
}

impl Label for Savana {
    /// Returns the string label for this caller.
    fn label(&self) -> String {
        self.caller_cat().to_string()
    }
}

/// A row from Savana copy number segmentation output.
///
/// Represents a genomic segment with associated copy number information,
/// including optional allele-specific data.
pub struct SavanaCNRow {
    /// The genomic range (chromosome, start, end) of this segment
    pub range: GenomeRange,
    /// Unique identifier for this segment
    pub segment_id: String,
    /// Number of bins included in this segment
    pub bin_count: u32,
    /// Sum of the lengths of all bins in this segment
    pub sum_of_bin_lengths: u32,
    /// Weight assigned to this segment in the analysis
    pub weight: f32,
    /// Estimated copy number for this segment
    pub copy_number: f32,
    /// Minor allele copy number, if available (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f64>,
    /// Mean B-allele frequency for heterozygous SNPs in this segment
    pub mean_baf: Option<f64>,
    /// Number of heterozygous SNPs in this segment
    pub n_het_snps: u32,
}

impl FromStr for SavanaCNRow {
    type Err = anyhow::Error;

    /// Parses a tab-separated line from Savana copy number output.
    ///
    /// # Format
    /// Expected columns: chromosome, start, end, segment_id, bin_count,
    /// sum_of_bin_lengths, weight, copy_number, minor_allele_copy_number,
    /// mean_baf, n_het_snps
    ///
    /// # Errors
    /// Returns an error if any field is missing or cannot be parsed to the expected type.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split("\t").collect();
        let range = GenomeRange::from_1_inclusive_str(
            cells
                .first()
                .context(format!("Error while parsing contig {s}."))?,
            cells
                .get(1)
                .context(format!("Error while parsing start {s}."))?,
            cells
                .get(2)
                .context(format!("Error while parsing end {s}."))?,
        )
        .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;

        Ok(Self {
            range,
            segment_id: cells
                .get(3)
                .context(format!("Error while parsing segment_id {s}."))?
                .to_string(),
            bin_count: cells
                .get(4)
                .context(format!("Error while parsing bin_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
            sum_of_bin_lengths: cells
                .get(5)
                .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
            weight: cells
                .get(6)
                .context(format!("Error while parsing weight {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
            copy_number: cells
                .get(7)
                .context(format!("Error while parsing copy_number {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
            minor_allele_copy_number: cells
                .get(8)
                .context(format!("Error while parsing minor_allele_copy_number {s}."))?
                .parse::<f64>()
                .map(Some)
                .unwrap_or(None),
            mean_baf: cells
                .get(9)
                .context(format!("Error while parsing mean_baf {s}."))?
                .parse::<f64>()
                .map(Some)
                .unwrap_or(None),
            n_het_snps: cells
                .get(10)
                .context(format!("Error while parsing n_het_snps {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
        })
    }
}

/// Container for Savana copy number segmentation data.
///
/// Holds all copy number segments for a sample, loaded from Savana output files.
pub struct SavanaCopyNumber {
    /// Vector of copy number segments
    pub data: Vec<SavanaCNRow>,
}

impl SavanaCopyNumber {
    /// Loads copy number data for a given sample ID.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            if line.starts_with("chromosome") {
                continue;
            }
            let cn: SavanaCNRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }
}

/// A row from Savana read count output.
///
/// Contains read depth information for tumor and normal samples in genomic bins.
pub struct SavanaRCRow {
    /// The genomic range (chromosome, start, end) of this bin
    pub range: GenomeRange,
    /// Percentage of bases in the bin with known sequence (not N)
    pub perc_known_bases: f32,
    /// Whether this bin should be used in analysis
    pub use_bin: bool,
    /// Number of reads from the tumor sample in this bin
    pub tumour_read_count: u32,
    /// Number of reads from the normal sample in this bin
    pub normal_read_count: u32,
}

impl FromStr for SavanaRCRow {
    type Err = anyhow::Error;

    /// Parses a tab-separated line from Savana read count output.
    ///
    /// # Format
    /// Expected columns: bin (format: chr:start_end), chromosome, start, end,
    /// perc_known_bases, use_bin, tumour_read_count, normal_read_count
    ///
    /// # Errors
    /// Returns an error if any field is missing or cannot be parsed.
    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split("\t").collect();
        let bin = cells
            .first()
            .context(format!("Error while parsing range {s}."))?;

        let range = bin
            .split_once(':')
            .and_then(|(a, b)| {
                b.split_once('_').map(|(b, c)| {
                    GenomeRange::from_1_inclusive_str(a, b, c)
                        .context(format!("Error while parsing range {bin}"))
                })
            })
            .context(format!("Invalid range format: {bin}"))??;

        Ok(Self {
            range,
            perc_known_bases: cells
                .get(4)
                .context(format!("Error while parsing perc_known_bases {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
            use_bin: cells
                .get(5)
                .context(format!("Error while parsing use_bin {s}."))?
                .to_lowercase()
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
            tumour_read_count: cells
                .get(6)
                .context(format!("Error while parsing tumour_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
            normal_read_count: cells
                .get(7)
                .context(format!("Error while parsing normal_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
        })
    }
}

/// Container for Savana read count data.
///
/// Holds read depth information across genomic bins for tumor-normal pairs.
pub struct SavanaReadCounts {
    /// Vector of read count bins
    pub data: Vec<SavanaRCRow>,
}

impl SavanaReadCounts {
    /// Loads read count data for a given sample ID.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_read_counts(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            if line.starts_with("bin") {
                continue;
            }
            let cn: SavanaRCRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }

    /// Calculates the total number of tumor reads across all bins.
    ///
    /// # Returns
    /// Sum of tumor read counts from all bins.
    pub fn n_tumoral_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.tumour_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates the total number of normal reads across all bins.
    ///
    /// # Returns
    /// Sum of normal read counts from all bins.
    pub fn n_normal_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.normal_read_count as usize)
            .sum::<usize>()
    }

    /// Calculates normalized read counts per chromosome.
    ///
    /// Normalizes each chromosome's read count by the expected number of reads
    /// based on the number of bins in that chromosome relative to total bins.
    ///
    /// # Returns
    /// Vector of tuples containing (chromosome_name, normalized_count) pairs.
    pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
        let n_tum = self.n_tumoral_reads();
        let n_bins = self.data.len();
        self.data
            .iter()
            .chunk_by(|c| c.range.contig)
            .into_iter()
            .map(|(contig_num, chr_counts)| {
                let chr_counts: Vec<_> = chr_counts.collect();

                let chr_n_bins = chr_counts.len();
                let chr_counts = chr_counts
                    .par_iter()
                    .map(|c| c.tumour_read_count as usize)
                    .sum::<usize>();

                let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
                (num_to_contig(contig_num), r)
            })
            .collect()
    }
}

/// Savana copy number data with serialization support.
///
/// Alternative representation of copy number segments optimized for
/// serialization and deserialization.
#[derive(Debug, Serialize, Deserialize)]
pub struct SavanaCN {
    /// Vector of copy number segments
    pub segments: Vec<CNSegment>,
}

/// A copy number segment with full metadata.
///
/// Contains all information about a genomic segment including copy number,
/// allelic information, and quality metrics.
#[derive(Debug, Serialize, Deserialize)]
pub struct CNSegment {
    /// Chromosome name
    pub chromosome: String,
    /// Start position (1-based, inclusive)
    pub start: u64,
    /// End position (1-based, inclusive)
    pub end: u64,
    /// Unique segment identifier
    pub segment_id: String,
    /// Number of bins merged into this segment
    pub bin_count: u32,
    /// Total length of all bins in base pairs
    pub sum_of_bin_lengths: u64,
    /// Statistical weight of this segment
    pub weight: f64,
    /// Estimated total copy number
    pub copy_number: f64,
    /// Minor allele copy number (requires heterozygous SNPs)
    pub minor_allele_copy_number: Option<f64>,
    /// Mean B-allele frequency across heterozygous SNPs
    pub mean_baf: Option<f64>,
    /// Number of heterozygous SNPs in this segment
    pub no_het_snps: u32,
}

impl SavanaCN {
    /// Parses a Savana copy number file into structured segments.
    ///
    /// # Arguments
    /// * `id` - Sample identifier
    /// * `config` - Configuration containing path information
    ///
    /// # Returns
    /// Parsed copy number data with all segments.
    ///
    /// # Errors
    /// Returns an error if:
    /// * File cannot be opened or read
    /// * Line has incorrect number of columns (expected 11)
    /// * Any field cannot be parsed to the expected type
    pub fn parse_file(id: &str, config: &Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader =
            get_gz_reader(&path).context(anyhow::anyhow!("Error while reading: {path}"))?;

        let mut segments = Vec::new();

        for (index, line) in reader.lines().enumerate() {
            let line = line.context("Failed to read line from file")?;

            // Skip header line
            if index == 0 {
                continue;
            }

            let parts: Vec<&str> = line.split('\t').collect();
            if parts.len() != 11 {
                return Err(anyhow::anyhow!(
                    "Invalid number of columns at line {} (expected 11, got {})",
                    index + 1,
                    parts.len()
                ));
            }

            // Helper closure for parsing with context
            let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
                field
                    .parse()
                    .context(format!("Failed to parse {name} at line {line_num}"))
            };

            let parse_field_u32 =
                |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
                    field
                        .parse()
                        .context(format!("Failed to parse {name} at line {line_num}"))
                };

            let line_num = index + 1;

            let segment = CNSegment {
                chromosome: parts[0].to_string(),
                start: parse_field_u32(parts[1], "start", line_num)? as u64,
                end: parse_field_u32(parts[2], "end", line_num)? as u64,
                segment_id: parts[3].to_string(),
                bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
                sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
                    as u64,
                weight: parse_field(parts[6], "weight", line_num)?,
                copy_number: parse_field(parts[7], "copy_number", line_num)?,
                minor_allele_copy_number: match parts[8] {
                    "" | "NA" => None,
                    v => Some(parse_field(v, "minor_allele_copy_number", line_num)?),
                },
                mean_baf: match parts[9] {
                    "" | "NA" => None,
                    v => Some(parse_field(v, "mean_baf", line_num)?),
                },
                no_het_snps: if parts[10].is_empty() {
                    0
                } else {
                    parse_field_u32(parts[10], "no_het_snps", line_num)?
                },
            };

            segments.push(segment);
        }

        Ok(Self { segments })
    }

    /// Writes CoRAL `--cn_segs` BED: all segments with absolute CN.
    ///
    /// Format: `chrom\tstart\tend\tcopy_number`
    pub fn write_coral_cn_segs(&self, path: &Path) -> anyhow::Result<()> {
        let mut w = BufWriter::new(
            File::create(path)
                .with_context(|| format!("Cannot create cn_segs BED: {}", path.display()))?,
        );
        for seg in &self.segments {
            writeln!(
                w,
                "{}\t{}\t{}\t{:.4}",
                seg.chromosome, seg.start, seg.end, seg.copy_number
            )?;
        }
        Ok(())
    }

    /// Writes CoRAL `--seeds` BED: amplified segments only, centromeres excluded.
    ///
    /// Uses a purity-aware threshold so that tumour CN dilution by normal
    /// contamination is accounted for:
    ///
    /// ```text
    /// fitted_CN = purity * tumour_CN + (1 - purity) * ploidy
    /// seed_threshold = purity * min_tumour_cn + (1 - purity) * ploidy
    /// ```
    ///
    /// Centromeric segments (stain `acen` in the cytoband BED) are excluded
    /// regardless of copy number, as they produce artifactual high-depth signal
    /// due to repetitive satellite sequences.
    ///
    /// # Arguments
    /// * `path` - Output BED path
    /// * `purity` - Tumour purity (0..1) from `_fitted_purity_ploidy.tsv`
    /// * `ploidy` - Tumour ploidy from `_fitted_purity_ploidy.tsv`
    /// * `min_tumour_cn` - Minimum tumour CN to qualify as a seed (typically `4 * ploidy`)
    /// * `centromeres` - Parsed centromere intervals from `parse_centromere_intervals()`
    pub fn write_coral_seeds(
        &self,
        path: &Path,
        purity: f64,
        ploidy: f64,
        min_tumour_cn: f64,
        centromeres: &[(String, u64, u64)],
    ) -> anyhow::Result<()> {
        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
        let mut w = BufWriter::new(
            File::create(path)
                .with_context(|| format!("Cannot create seeds BED: {}", path.display()))?,
        );
        for seg in &self.segments {
            if seg.copy_number >= threshold
                && !overlaps_centromere(&seg.chromosome, seg.start, seg.end, centromeres)
            {
                writeln!(w, "{}\t{}\t{}", seg.chromosome, seg.start, seg.end)?;
            }
        }
        Ok(())
    }

    /// Returns the number of amplified non-centromeric seed segments.
    pub fn n_seeds(
        &self,
        purity: f64,
        ploidy: f64,
        min_tumour_cn: f64,
        centromeres: &[(String, u64, u64)],
    ) -> usize {
        let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
        self.segments
            .iter()
            .filter(|s| {
                s.copy_number >= threshold
                    && !overlaps_centromere(&s.chromosome, s.start, s.end, centromeres)
            })
            .count()
    }
}

/// Returns true if [seg_start, seg_end) overlaps any centromeric interval
/// on the same chromosome.
fn overlaps_centromere(
    chrom: &str,
    start: u64,
    end: u64,
    centromeres: &[(String, u64, u64)],
) -> bool {
    centromeres
        .iter()
        .any(|(c, cs, ce)| c == chrom && start < *ce && end > *cs)
}

/// Reads SAVANA fitted_purity_ploidy.tsv.
/// Expected format:
///   purity  ploidy  distance  rank
///   0.48    1.98    0.105     1
pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
    let content = std::fs::read_to_string(path)?;
    let mut lines = content.lines();

    // Skip header
    lines.next().context("purity/ploidy file is empty")?;

    let data = lines.next().context("purity/ploidy file has no data row")?;
    let fields: Vec<&str> = data.split('\t').collect();

    anyhow::ensure!(
        fields.len() >= 2,
        "expected at least 2 columns, got {}",
        fields.len()
    );

    let purity = fields[0]
        .trim()
        .parse::<f64>()
        .with_context(|| format!("cannot parse purity: '{}'", fields[0]))?;
    let ploidy = fields[1]
        .trim()
        .parse::<f64>()
        .with_context(|| format!("cannot parse ploidy: '{}'", fields[1]))?;

    Ok((purity, ploidy))
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::helpers::test_init;

    #[test]
    fn savana_version() -> anyhow::Result<()> {
        test_init();
        let vl = Savana::version(&Config::default())?;
        info!("Savana version: {vl}");
        Ok(())
    }

    #[test]
    fn savana_run() -> anyhow::Result<()> {
        test_init();
        let config = Config::default();

        let mut caller = Savana::initialize("DUMCO", &config)?;
        caller.run()?;
        Ok(())
    }
}