savana.rs 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992
  1. //! # Savana Haplotype-Aware Somatic Variant Caller
  2. //!
  3. //! This module provides wrappers for [Savana](https://github.com/cortes-ciriano-lab/savana),
  4. //! a haplotype-aware somatic variant caller for structural variants and copy number alterations
  5. //! from long-read sequencing data.
  6. //!
  7. //! ## Overview
  8. //!
  9. //! Savana detects:
  10. //! - Structural variants (SVs): deletions, duplications, inversions, translocations
  11. //! - Copy number variations (CNVs) with allele-specific information
  12. //! - Complex genomic rearrangements
  13. //!
  14. //! ## Key Features
  15. //!
  16. //! - **Haplotype-aware calling** - Uses phased germline variants and haplotagged BAMs
  17. //! - **Integrated phasing** - Automatically runs LongPhase if needed
  18. //! - **CNV segmentation** - Provides detailed copy number profiles
  19. //!
  20. //! ## Requirements
  21. //!
  22. //! Before running Savana, ensure:
  23. //! - Tumor and normal BAMs are indexed
  24. //! - Reference genome is accessible
  25. //! - Conda environment with Savana is configured (environment name: savana_env)
  26. //!
  27. //! ## Output Files
  28. //!
  29. //! PASS-filtered somatic variants:
  30. //! ```text
  31. //! {result_dir}/{id}/savana/somatic_sv_PASS.vcf.gz
  32. //! ```
  33. //!
  34. //! Copy number segmentation:
  35. //! ```text
  36. //! {result_dir}/{id}/savana/copy_number_segmentation.txt.gz
  37. //! ```
  38. //!
  39. //! Read count bins:
  40. //! ```text
  41. //! {result_dir}/{id}/savana/read_counts.txt.gz
  42. //! ```
  43. //!
  44. //! ## Usage
  45. //!
  46. //! ```ignore
  47. //! use pandora_lib_promethion::callers::savana::Savana;
  48. //! use pandora_lib_promethion::config::Config;
  49. //! use pandora_lib_promethion::pipes::Initialize;
  50. //! use pandora_lib_promethion::runners::Run;
  51. //!
  52. //! let config = Config::default();
  53. //! let mut caller = Savana::initialize("sample_001", &config)?;
  54. //!
  55. //! if caller.should_run() {
  56. //! caller.run()?; // Automatically handles phasing and haplotagging
  57. //! }
  58. //!
  59. //! // Load SV/CNV variants
  60. //! let variants = caller.variants(&annotations)?;
  61. //! println!("Found {} somatic SVs/CNVs", variants.variants.len());
  62. //!
  63. //! // Load copy number data
  64. //! let cn_data = SavanaCN::parse_file("sample_001", &config)?;
  65. //! println!("Copy number segments: {}", cn_data.segments.len());
  66. //! # Ok::<(), anyhow::Error>(())
  67. //! ```
  68. //!
  69. //! ## References
  70. //!
  71. //! - [Savana GitHub](https://github.com/cortes-ciriano-lab/savana)
  72. //! - [Savana Paper](https://doi.org/10.1038/s41467-022-34590-2)
  73. use crate::{
  74. annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
  75. collection::vcf::Vcf,
  76. commands::{
  77. bcftools::BcftoolsKeepPass,
  78. longphase::{LongphaseHap, LongphasePhase},
  79. samtools::SamtoolsIndex,
  80. CapturedOutput, Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
  81. },
  82. config::Config,
  83. helpers::{is_file_older, remove_dir_if_exists},
  84. io::{readers::get_gz_reader, vcf::read_vcf},
  85. locker::SampleLock,
  86. pipes::{Initialize, InitializeSolo, ShouldRun, Version},
  87. positions::{num_to_contig, GenomeRange},
  88. run,
  89. runners::Run,
  90. variant::{
  91. variant_collection::VariantCollection,
  92. vcf_variant::{Label, Variants},
  93. },
  94. };
  95. use anyhow::Context;
  96. use itertools::Itertools;
  97. use log::{debug, info, warn};
  98. use rayon::prelude::*;
  99. use serde::{Deserialize, Serialize};
  100. use std::io::Write;
  101. use std::{
  102. fs::{self, File},
  103. io::{BufRead, BufWriter},
  104. path::Path,
  105. str::FromStr,
  106. };
  107. /// Savana haplotype-aware somatic SV and CNV caller.
  108. ///
  109. /// Orchestrates the Savana pipeline including prerequisite phasing and haplotagging steps.
  110. /// Automatically invokes LongPhase if required phased VCFs or haplotagged BAMs are missing.
  111. ///
  112. /// # Fields
  113. ///
  114. /// - `id` - Sample identifier (e.g., "34528")
  115. /// - `config` - Global pipeline configuration
  116. /// - `log_dir` - Directory for execution logs (e.g., "{result_dir}/{id}/log/savana")
  117. /// - `job_args` - Internal command-line arguments passed to Savana (populated in `run()`)
  118. #[derive(Debug)]
  119. pub struct Savana {
  120. /// Sample identifier
  121. pub id: String,
  122. /// Global pipeline configuration
  123. pub config: Config,
  124. /// Directory for log file storage
  125. pub log_dir: String,
  126. /// Command-line arguments for Savana executable (populated during run)
  127. job_args: Vec<String>,
  128. }
  129. impl Initialize for Savana {
  130. /// Initializes a new `Savana` instance for a given sample ID and configuration.
  131. ///
  132. /// If `savana_force` is enabled in [`Config`] and the output VCF exists, or if the run
  133. /// should be re-triggered (based on file freshness), the output directory
  134. /// will be cleaned.
  135. ///
  136. /// # Arguments
  137. ///
  138. /// * `id` - Sample identifier
  139. /// * `config` - Pipeline execution configuration
  140. ///
  141. /// # Returns
  142. ///
  143. /// A new `Savana` instance, or an error if cleanup fails.
  144. ///
  145. /// # Errors
  146. ///
  147. /// Returns an error if:
  148. /// - `config.savana_force` is true and output directory cannot be removed
  149. /// - Directory deletion fails due to permissions or I/O errors
  150. fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
  151. info!("Initialize Savana for {id}.");
  152. let log_dir = format!("{}/{}/log/savana", config.result_dir, id);
  153. let savana = Self {
  154. id: id.to_string(),
  155. config: config.clone(),
  156. log_dir,
  157. job_args: Vec::new(),
  158. };
  159. // If forced re-run is enabled or a run is needed, remove old output directory
  160. if savana.config.savana_force {
  161. remove_dir_if_exists(&savana.config.savana_output_dir(id))?;
  162. }
  163. Ok(savana)
  164. }
  165. }
  166. impl ShouldRun for Savana {
  167. /// Determines whether Savana should be re-run based on whether
  168. /// the filtered PASS VCF is older than the input BAMs.
  169. ///
  170. /// If either input BAM (normal or tumor) is newer than the PASS VCF,
  171. /// Savana is considered out of date and should be re-executed.
  172. ///
  173. /// # Returns
  174. /// `true` if an update is needed, or if timestamps can't be checked (file doesn't exist)
  175. fn should_run(&self) -> bool {
  176. let passed_vcf = &self.config.savana_passed_vcf(&self.id);
  177. let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true)
  178. .unwrap_or(true)
  179. || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
  180. if result {
  181. warn!("Savana should run for id: {}.", self.id);
  182. }
  183. result
  184. }
  185. }
  186. impl JobCommand for Savana {
  187. fn cmd(&self) -> String {
  188. format!(
  189. "source {conda_sh} && conda activate savana_env && {bin} {args}",
  190. conda_sh = self.config.conda_sh,
  191. bin = self.config.savana_bin,
  192. args = self.job_args.join(" ")
  193. )
  194. }
  195. }
  196. impl LocalRunner for Savana {}
  197. impl SlurmRunner for Savana {
  198. fn slurm_args(&self) -> Vec<String> {
  199. SlurmParams {
  200. job_name: Some(format!("savana_{}", self.id)),
  201. cpus_per_task: Some(self.config.savana_threads as u32),
  202. mem: Some(format!("{}G", self.config.savana_mem)),
  203. partition: Some("mediumq".into()),
  204. gres: None,
  205. }
  206. .to_args()
  207. }
  208. }
  209. impl SbatchRunner for Savana {
  210. fn slurm_params(&self) -> SlurmParams {
  211. SlurmParams {
  212. job_name: Some(format!("savana_{}", self.id)),
  213. cpus_per_task: Some(self.config.savana_threads as u32),
  214. mem: Some(format!("{}G", self.config.savana_mem)),
  215. partition: Some("mediumq".into()),
  216. gres: None,
  217. }
  218. }
  219. }
  220. impl Run for Savana {
  221. /// Executes the Savana pipeline, including prerequisite phasing and haplotagging steps.
  222. ///
  223. /// If the output VCF does not already exist, it ensures that the phased VCF and
  224. /// haplotagged BAMs exist (or runs the modules to produce them), then runs Savana.
  225. /// Finally, it filters the resulting VCF to retain only PASS variants.
  226. ///
  227. /// # Returns
  228. ///
  229. /// `Ok(())` if the run completes successfully, or an error otherwise.
  230. fn run(&mut self) -> anyhow::Result<()> {
  231. let lock_dir = format!("{}/locks", self.config.result_dir);
  232. let _lock = SampleLock::acquire(&lock_dir, &self.id, "savana")
  233. .with_context(|| format!("Cannot start SAVANA chunked for {}", self.id))?;
  234. if !self.should_run() {
  235. info!("Savana is up-to-date.");
  236. return Ok(());
  237. }
  238. let output_vcf = &self.config.savana_output_vcf(&self.id);
  239. if !Path::new(&output_vcf).exists() {
  240. let output_dir = self.config.savana_output_dir(&self.id);
  241. fs::create_dir_all(&output_dir).with_context(|| {
  242. format!("Failed to create output dir for Savana run: {output_dir}")
  243. })?;
  244. // Check for phased germline vcf
  245. // not required anymore since >= 1.3.0
  246. let phased_germline_vcf = self.config.germline_phased_vcf(&self.id);
  247. if !Path::new(&phased_germline_vcf).exists() {
  248. let mut phase = LongphasePhase::initialize(&self.id, &self.config.clone())?;
  249. run!(&self.config, &mut phase)?;
  250. } else {
  251. info!("Phased germline is already up-to-date.");
  252. }
  253. let normal_hp_bam = self.config.normal_haplotagged_bam(&self.id);
  254. let tumoral_hp_bam = self.config.tumoral_haplotagged_bam(&self.id);
  255. // Check for haplotagged bam
  256. if !Path::new(&normal_hp_bam).exists() {
  257. let mut normal_hap =
  258. LongphaseHap::initialize(&self.id, &self.config.normal_name, &self.config)?;
  259. normal_hap.run()?;
  260. } else {
  261. info!("Haplotagged normal BAM is already up-to-date");
  262. }
  263. if !Path::new(&format!("{normal_hp_bam}.bai")).exists() {
  264. let mut normal_index = SamtoolsIndex::from_config(&self.config, &normal_hp_bam);
  265. normal_index.run()?;
  266. }
  267. if !Path::new(&tumoral_hp_bam).exists() {
  268. let mut tumoral_hap =
  269. LongphaseHap::initialize(&self.id, &self.config.tumoral_name, &self.config)?;
  270. tumoral_hap.run()?;
  271. } else {
  272. info!("Haplotagged tumoral BAM is already up-to-date");
  273. }
  274. if !Path::new(&format!("{tumoral_hp_bam}.bai")).exists() {
  275. let mut tumoral_index = SamtoolsIndex::from_config(&self.config, &tumoral_hp_bam);
  276. tumoral_index.run()?;
  277. }
  278. self.job_args = vec![
  279. "--tumour".to_string(),
  280. tumoral_hp_bam.clone(),
  281. "--normal".to_string(),
  282. normal_hp_bam.clone(),
  283. "--outdir".to_string(),
  284. self.config.savana_output_dir(&self.id),
  285. "--ref".to_string(),
  286. self.config.reference.clone(),
  287. "--snp_vcf".to_string(),
  288. phased_germline_vcf.clone(),
  289. "--no_blacklist".to_string(),
  290. "--threads".to_string(),
  291. self.config.savana_threads.to_string(),
  292. "--cna_threads".to_string(),
  293. self.config.savana_threads.to_string(),
  294. ];
  295. let output =
  296. run!(&self.config, self).context("Error while running Savana (local or Slurm)")?;
  297. let log_file = format!("{}/savana_", self.log_dir);
  298. output
  299. .save_to_file(&log_file)
  300. .context(format!("Error while writing logs into {log_file}"))?;
  301. } else {
  302. debug!(
  303. "Savana output already exists for {}, skipping execution.",
  304. self.id
  305. );
  306. }
  307. // Keep PASS
  308. let passed_vcf = self.config.savana_passed_vcf(&self.id);
  309. if !Path::new(&passed_vcf).exists() && Path::new(output_vcf).exists() {
  310. let mut keep_pass =
  311. BcftoolsKeepPass::from_config(&self.config, output_vcf, &passed_vcf);
  312. let report = run!(&self.config, &mut keep_pass).context(format!(
  313. "Error while running bcftools keep PASS for {output_vcf}"
  314. ))?;
  315. let log_file = format!("{}/bcftools_pass", self.log_dir);
  316. report
  317. .save_to_file(&log_file)
  318. .context(format!("Error while writing logs into {log_file}"))?;
  319. }
  320. Ok(())
  321. }
  322. }
  323. impl Version for Savana {
  324. /// Retrieves the Savana version by running `savana --version` in its conda environment.
  325. ///
  326. /// # Errors
  327. /// Returns an error if command execution fails or "Version " not found in output.
  328. fn version(config: &Config) -> anyhow::Result<String> {
  329. let mut job = Savana {
  330. id: "version".into(),
  331. config: config.clone(),
  332. log_dir: config.tmp_dir.clone(),
  333. job_args: vec!["--version".to_string()],
  334. };
  335. let out =
  336. run!(&config, &mut job).context("Error while running `savana --version` (local)")?;
  337. parse_savana_version(&out)
  338. }
  339. }
  340. fn parse_savana_version(out: &CapturedOutput) -> anyhow::Result<String> {
  341. let mut log = out.stdout.clone();
  342. if !out.stderr.is_empty() {
  343. log.push_str(&out.stderr);
  344. }
  345. if let Some(epilog) = &out.slurm_epilog {
  346. log.push_str(&format!("\n{epilog:#?}"));
  347. }
  348. let start = log
  349. .find("Version ")
  350. .context("Failed to find 'Version ' in the log")?;
  351. let start_index = start + "Version ".len();
  352. let end = log[start_index..]
  353. .find(|c: char| c.is_whitespace())
  354. .map(|idx| start_index + idx)
  355. .unwrap_or_else(|| log.len());
  356. Ok(log[start_index..end].to_string())
  357. }
  358. impl CallerCat for Savana {
  359. /// Tags the caller identity for downstream variant classification and traceability.
  360. /// Identifies this tool as the Savana caller producing somatic variants.
  361. fn caller_cat(&self) -> Annotation {
  362. Annotation::Callers(Caller::Savana, Sample::Somatic)
  363. }
  364. }
  365. impl Variants for Savana {
  366. /// Loads variants from the PASS-filtered VCF produced by Savana and annotates them.
  367. ///
  368. /// Reads the VCF file, applies the caller tag via the `Annotations` object, and
  369. /// wraps the parsed data into a `VariantCollection`.
  370. ///
  371. /// # Arguments
  372. /// * `annotations` - Global annotation registry shared across callers.
  373. ///
  374. /// # Returns
  375. /// A populated `VariantCollection`, or an error if the VCF fails to parse.
  376. fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
  377. let caller = self.caller_cat();
  378. let add = vec![caller.clone()];
  379. let passed_vcf = self.config.savana_passed_vcf(&self.id);
  380. info!("Loading variants from {}: {}", caller, &passed_vcf);
  381. let variants = read_vcf(&passed_vcf)
  382. .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", passed_vcf))?;
  383. variants.par_iter().for_each(|v| {
  384. annotations.insert_update(v.hash(), &add);
  385. });
  386. info!("{}, {} variants loaded.", caller, variants.len());
  387. Ok(VariantCollection {
  388. variants,
  389. vcf: Vcf::new(passed_vcf.clone().into())?,
  390. caller,
  391. })
  392. }
  393. }
  394. impl Label for Savana {
  395. /// Returns the string label for this caller.
  396. fn label(&self) -> String {
  397. self.caller_cat().to_string()
  398. }
  399. }
  400. /// A row from Savana copy number segmentation output.
  401. ///
  402. /// Represents a genomic segment with associated copy number information,
  403. /// including optional allele-specific data.
  404. pub struct SavanaCNRow {
  405. /// The genomic range (chromosome, start, end) of this segment
  406. pub range: GenomeRange,
  407. /// Unique identifier for this segment
  408. pub segment_id: String,
  409. /// Number of bins included in this segment
  410. pub bin_count: u32,
  411. /// Sum of the lengths of all bins in this segment
  412. pub sum_of_bin_lengths: u32,
  413. /// Weight assigned to this segment in the analysis
  414. pub weight: f32,
  415. /// Estimated copy number for this segment
  416. pub copy_number: f32,
  417. /// Minor allele copy number, if available (requires heterozygous SNPs)
  418. pub minor_allele_copy_number: Option<f64>,
  419. /// Mean B-allele frequency for heterozygous SNPs in this segment
  420. pub mean_baf: Option<f64>,
  421. /// Number of heterozygous SNPs in this segment
  422. pub n_het_snps: u32,
  423. }
  424. impl FromStr for SavanaCNRow {
  425. type Err = anyhow::Error;
  426. /// Parses a tab-separated line from Savana copy number output.
  427. ///
  428. /// # Format
  429. /// Expected columns: chromosome, start, end, segment_id, bin_count,
  430. /// sum_of_bin_lengths, weight, copy_number, minor_allele_copy_number,
  431. /// mean_baf, n_het_snps
  432. ///
  433. /// # Errors
  434. /// Returns an error if any field is missing or cannot be parsed to the expected type.
  435. fn from_str(s: &str) -> anyhow::Result<Self> {
  436. let cells: Vec<&str> = s.split("\t").collect();
  437. let range = GenomeRange::from_1_inclusive_str(
  438. cells
  439. .first()
  440. .context(format!("Error while parsing contig {s}."))?,
  441. cells
  442. .get(1)
  443. .context(format!("Error while parsing start {s}."))?,
  444. cells
  445. .get(2)
  446. .context(format!("Error while parsing end {s}."))?,
  447. )
  448. .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;
  449. Ok(Self {
  450. range,
  451. segment_id: cells
  452. .get(3)
  453. .context(format!("Error while parsing segment_id {s}."))?
  454. .to_string(),
  455. bin_count: cells
  456. .get(4)
  457. .context(format!("Error while parsing bin_count {s}."))?
  458. .parse()
  459. .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
  460. sum_of_bin_lengths: cells
  461. .get(5)
  462. .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
  463. .parse()
  464. .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
  465. weight: cells
  466. .get(6)
  467. .context(format!("Error while parsing weight {s}."))?
  468. .parse()
  469. .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
  470. copy_number: cells
  471. .get(7)
  472. .context(format!("Error while parsing copy_number {s}."))?
  473. .parse()
  474. .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
  475. minor_allele_copy_number: cells
  476. .get(8)
  477. .context(format!("Error while parsing minor_allele_copy_number {s}."))?
  478. .parse::<f64>()
  479. .map(Some)
  480. .unwrap_or(None),
  481. mean_baf: cells
  482. .get(9)
  483. .context(format!("Error while parsing mean_baf {s}."))?
  484. .parse::<f64>()
  485. .map(Some)
  486. .unwrap_or(None),
  487. n_het_snps: cells
  488. .get(10)
  489. .context(format!("Error while parsing n_het_snps {s}."))?
  490. .parse()
  491. .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
  492. })
  493. }
  494. }
  495. /// Container for Savana copy number segmentation data.
  496. ///
  497. /// Holds all copy number segments for a sample, loaded from Savana output files.
  498. pub struct SavanaCopyNumber {
  499. /// Vector of copy number segments
  500. pub data: Vec<SavanaCNRow>,
  501. }
  502. impl SavanaCopyNumber {
  503. /// Loads copy number data for a given sample ID.
  504. ///
  505. /// # Arguments
  506. /// * `id` - Sample identifier
  507. /// * `config` - Configuration containing path information
  508. ///
  509. /// # Errors
  510. /// Returns an error if the file cannot be read or parsed.
  511. pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
  512. let path = config.savana_copy_number(id);
  513. let reader = get_gz_reader(&path)?;
  514. let mut data = Vec::new();
  515. for line in reader.lines() {
  516. let line = line?;
  517. if line.starts_with("chromosome") {
  518. continue;
  519. }
  520. let cn: SavanaCNRow = line.parse()?;
  521. data.push(cn);
  522. }
  523. Ok(Self { data })
  524. }
  525. }
  526. /// A row from Savana read count output.
  527. ///
  528. /// Contains read depth information for tumor and normal samples in genomic bins.
  529. pub struct SavanaRCRow {
  530. /// The genomic range (chromosome, start, end) of this bin
  531. pub range: GenomeRange,
  532. /// Percentage of bases in the bin with known sequence (not N)
  533. pub perc_known_bases: f32,
  534. /// Whether this bin should be used in analysis
  535. pub use_bin: bool,
  536. /// Number of reads from the tumor sample in this bin
  537. pub tumour_read_count: u32,
  538. /// Number of reads from the normal sample in this bin
  539. pub normal_read_count: u32,
  540. }
  541. impl FromStr for SavanaRCRow {
  542. type Err = anyhow::Error;
  543. /// Parses a tab-separated line from Savana read count output.
  544. ///
  545. /// # Format
  546. /// Expected columns: bin (format: chr:start_end), chromosome, start, end,
  547. /// perc_known_bases, use_bin, tumour_read_count, normal_read_count
  548. ///
  549. /// # Errors
  550. /// Returns an error if any field is missing or cannot be parsed.
  551. fn from_str(s: &str) -> anyhow::Result<Self> {
  552. let cells: Vec<&str> = s.split("\t").collect();
  553. let bin = cells
  554. .first()
  555. .context(format!("Error while parsing range {s}."))?;
  556. let range = bin
  557. .split_once(':')
  558. .and_then(|(a, b)| {
  559. b.split_once('_').map(|(b, c)| {
  560. GenomeRange::from_1_inclusive_str(a, b, c)
  561. .context(format!("Error while parsing range {bin}"))
  562. })
  563. })
  564. .context(format!("Invalid range format: {bin}"))??;
  565. Ok(Self {
  566. range,
  567. perc_known_bases: cells
  568. .get(4)
  569. .context(format!("Error while parsing perc_known_bases {s}."))?
  570. .parse()
  571. .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
  572. use_bin: cells
  573. .get(5)
  574. .context(format!("Error while parsing use_bin {s}."))?
  575. .to_lowercase()
  576. .parse()
  577. .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
  578. tumour_read_count: cells
  579. .get(6)
  580. .context(format!("Error while parsing tumour_read_count {s}."))?
  581. .parse()
  582. .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
  583. normal_read_count: cells
  584. .get(7)
  585. .context(format!("Error while parsing normal_read_count {s}."))?
  586. .parse()
  587. .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
  588. })
  589. }
  590. }
  591. /// Container for Savana read count data.
  592. ///
  593. /// Holds read depth information across genomic bins for tumor-normal pairs.
  594. pub struct SavanaReadCounts {
  595. /// Vector of read count bins
  596. pub data: Vec<SavanaRCRow>,
  597. }
  598. impl SavanaReadCounts {
  599. /// Loads read count data for a given sample ID.
  600. ///
  601. /// # Arguments
  602. /// * `id` - Sample identifier
  603. /// * `config` - Configuration containing path information
  604. ///
  605. /// # Errors
  606. /// Returns an error if the file cannot be read or parsed.
  607. pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
  608. let path = config.savana_read_counts(id);
  609. let reader = get_gz_reader(&path)?;
  610. let mut data = Vec::new();
  611. for line in reader.lines() {
  612. let line = line?;
  613. if line.starts_with("bin") {
  614. continue;
  615. }
  616. let cn: SavanaRCRow = line.parse()?;
  617. data.push(cn);
  618. }
  619. Ok(Self { data })
  620. }
  621. /// Calculates the total number of tumor reads across all bins.
  622. ///
  623. /// # Returns
  624. /// Sum of tumor read counts from all bins.
  625. pub fn n_tumoral_reads(&self) -> usize {
  626. self.data
  627. .par_iter()
  628. .map(|c| c.tumour_read_count as usize)
  629. .sum::<usize>()
  630. }
  631. /// Calculates the total number of normal reads across all bins.
  632. ///
  633. /// # Returns
  634. /// Sum of normal read counts from all bins.
  635. pub fn n_normal_reads(&self) -> usize {
  636. self.data
  637. .par_iter()
  638. .map(|c| c.normal_read_count as usize)
  639. .sum::<usize>()
  640. }
  641. /// Calculates normalized read counts per chromosome.
  642. ///
  643. /// Normalizes each chromosome's read count by the expected number of reads
  644. /// based on the number of bins in that chromosome relative to total bins.
  645. ///
  646. /// # Returns
  647. /// Vector of tuples containing (chromosome_name, normalized_count) pairs.
  648. pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
  649. let n_tum = self.n_tumoral_reads();
  650. let n_bins = self.data.len();
  651. self.data
  652. .iter()
  653. .chunk_by(|c| c.range.contig)
  654. .into_iter()
  655. .map(|(contig_num, chr_counts)| {
  656. let chr_counts: Vec<_> = chr_counts.collect();
  657. let chr_n_bins = chr_counts.len();
  658. let chr_counts = chr_counts
  659. .par_iter()
  660. .map(|c| c.tumour_read_count as usize)
  661. .sum::<usize>();
  662. let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
  663. (num_to_contig(contig_num), r)
  664. })
  665. .collect()
  666. }
  667. }
  668. /// Savana copy number data with serialization support.
  669. ///
  670. /// Alternative representation of copy number segments optimized for
  671. /// serialization and deserialization.
  672. #[derive(Debug, Serialize, Deserialize)]
  673. pub struct SavanaCN {
  674. /// Vector of copy number segments
  675. pub segments: Vec<CNSegment>,
  676. }
  677. /// A copy number segment with full metadata.
  678. ///
  679. /// Contains all information about a genomic segment including copy number,
  680. /// allelic information, and quality metrics.
  681. #[derive(Debug, Serialize, Deserialize)]
  682. pub struct CNSegment {
  683. /// Chromosome name
  684. pub chromosome: String,
  685. /// Start position (1-based, inclusive)
  686. pub start: u64,
  687. /// End position (1-based, inclusive)
  688. pub end: u64,
  689. /// Unique segment identifier
  690. pub segment_id: String,
  691. /// Number of bins merged into this segment
  692. pub bin_count: u32,
  693. /// Total length of all bins in base pairs
  694. pub sum_of_bin_lengths: u64,
  695. /// Statistical weight of this segment
  696. pub weight: f64,
  697. /// Estimated total copy number
  698. pub copy_number: f64,
  699. /// Minor allele copy number (requires heterozygous SNPs)
  700. pub minor_allele_copy_number: Option<f64>,
  701. /// Mean B-allele frequency across heterozygous SNPs
  702. pub mean_baf: Option<f64>,
  703. /// Number of heterozygous SNPs in this segment
  704. pub no_het_snps: u32,
  705. }
  706. impl SavanaCN {
  707. /// Parses a Savana copy number file into structured segments.
  708. ///
  709. /// # Arguments
  710. /// * `id` - Sample identifier
  711. /// * `config` - Configuration containing path information
  712. ///
  713. /// # Returns
  714. /// Parsed copy number data with all segments.
  715. ///
  716. /// # Errors
  717. /// Returns an error if:
  718. /// * File cannot be opened or read
  719. /// * Line has incorrect number of columns (expected 11)
  720. /// * Any field cannot be parsed to the expected type
  721. pub fn parse_file(id: &str, config: &Config) -> anyhow::Result<Self> {
  722. let path = config.savana_copy_number(id);
  723. let reader =
  724. get_gz_reader(&path).context(anyhow::anyhow!("Error while reading: {path}"))?;
  725. let mut segments = Vec::new();
  726. for (index, line) in reader.lines().enumerate() {
  727. let line = line.context("Failed to read line from file")?;
  728. // Skip header line
  729. if index == 0 {
  730. continue;
  731. }
  732. let parts: Vec<&str> = line.split('\t').collect();
  733. if parts.len() != 11 {
  734. return Err(anyhow::anyhow!(
  735. "Invalid number of columns at line {} (expected 11, got {})",
  736. index + 1,
  737. parts.len()
  738. ));
  739. }
  740. // Helper closure for parsing with context
  741. let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
  742. field
  743. .parse()
  744. .context(format!("Failed to parse {name} at line {line_num}"))
  745. };
  746. let parse_field_u32 =
  747. |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
  748. field
  749. .parse()
  750. .context(format!("Failed to parse {name} at line {line_num}"))
  751. };
  752. let line_num = index + 1;
  753. let segment = CNSegment {
  754. chromosome: parts[0].to_string(),
  755. start: parse_field_u32(parts[1], "start", line_num)? as u64,
  756. end: parse_field_u32(parts[2], "end", line_num)? as u64,
  757. segment_id: parts[3].to_string(),
  758. bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
  759. sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
  760. as u64,
  761. weight: parse_field(parts[6], "weight", line_num)?,
  762. copy_number: parse_field(parts[7], "copy_number", line_num)?,
  763. minor_allele_copy_number: match parts[8] {
  764. "" | "NA" => None,
  765. v => Some(parse_field(v, "minor_allele_copy_number", line_num)?),
  766. },
  767. mean_baf: match parts[9] {
  768. "" | "NA" => None,
  769. v => Some(parse_field(v, "mean_baf", line_num)?),
  770. },
  771. no_het_snps: if parts[10].is_empty() {
  772. 0
  773. } else {
  774. parse_field_u32(parts[10], "no_het_snps", line_num)?
  775. },
  776. };
  777. segments.push(segment);
  778. }
  779. Ok(Self { segments })
  780. }
  781. /// Writes CoRAL `--cn_segs` BED: all segments with absolute CN.
  782. ///
  783. /// Format: `chrom\tstart\tend\tcopy_number`
  784. pub fn write_coral_cn_segs(&self, path: &Path) -> anyhow::Result<()> {
  785. let mut w = BufWriter::new(
  786. File::create(path)
  787. .with_context(|| format!("Cannot create cn_segs BED: {}", path.display()))?,
  788. );
  789. for seg in &self.segments {
  790. writeln!(
  791. w,
  792. "{}\t{}\t{}\t{:.4}",
  793. seg.chromosome, seg.start, seg.end, seg.copy_number
  794. )?;
  795. }
  796. Ok(())
  797. }
  798. /// Writes CoRAL `--seeds` BED: amplified segments only, centromeres excluded.
  799. ///
  800. /// Uses a purity-aware threshold so that tumour CN dilution by normal
  801. /// contamination is accounted for:
  802. ///
  803. /// ```text
  804. /// fitted_CN = purity * tumour_CN + (1 - purity) * ploidy
  805. /// seed_threshold = purity * min_tumour_cn + (1 - purity) * ploidy
  806. /// ```
  807. ///
  808. /// Centromeric segments (stain `acen` in the cytoband BED) are excluded
  809. /// regardless of copy number, as they produce artifactual high-depth signal
  810. /// due to repetitive satellite sequences.
  811. ///
  812. /// # Arguments
  813. /// * `path` - Output BED path
  814. /// * `purity` - Tumour purity (0..1) from `_fitted_purity_ploidy.tsv`
  815. /// * `ploidy` - Tumour ploidy from `_fitted_purity_ploidy.tsv`
  816. /// * `min_tumour_cn` - Minimum tumour CN to qualify as a seed (typically `4 * ploidy`)
  817. /// * `centromeres` - Parsed centromere intervals from `parse_centromere_intervals()`
  818. pub fn write_coral_seeds(
  819. &self,
  820. path: &Path,
  821. purity: f64,
  822. ploidy: f64,
  823. min_tumour_cn: f64,
  824. centromeres: &[(String, u64, u64)],
  825. ) -> anyhow::Result<()> {
  826. let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
  827. let mut w = BufWriter::new(
  828. File::create(path)
  829. .with_context(|| format!("Cannot create seeds BED: {}", path.display()))?,
  830. );
  831. for seg in &self.segments {
  832. if seg.copy_number >= threshold
  833. && !overlaps_centromere(&seg.chromosome, seg.start, seg.end, centromeres)
  834. {
  835. writeln!(w, "{}\t{}\t{}", seg.chromosome, seg.start, seg.end)?;
  836. }
  837. }
  838. Ok(())
  839. }
  840. /// Returns the number of amplified non-centromeric seed segments.
  841. pub fn n_seeds(
  842. &self,
  843. purity: f64,
  844. ploidy: f64,
  845. min_tumour_cn: f64,
  846. centromeres: &[(String, u64, u64)],
  847. ) -> usize {
  848. let threshold = purity * min_tumour_cn + (1.0 - purity) * ploidy;
  849. self.segments
  850. .iter()
  851. .filter(|s| {
  852. s.copy_number >= threshold
  853. && !overlaps_centromere(&s.chromosome, s.start, s.end, centromeres)
  854. })
  855. .count()
  856. }
  857. }
  858. /// Returns true if [seg_start, seg_end) overlaps any centromeric interval
  859. /// on the same chromosome.
  860. fn overlaps_centromere(
  861. chrom: &str,
  862. start: u64,
  863. end: u64,
  864. centromeres: &[(String, u64, u64)],
  865. ) -> bool {
  866. centromeres
  867. .iter()
  868. .any(|(c, cs, ce)| c == chrom && start < *ce && end > *cs)
  869. }
  870. /// Reads SAVANA fitted_purity_ploidy.tsv.
  871. /// Expected format:
  872. /// purity ploidy distance rank
  873. /// 0.48 1.98 0.105 1
  874. pub fn read_savana_purity_ploidy(path: &Path) -> anyhow::Result<(f64, f64)> {
  875. let content = std::fs::read_to_string(path)?;
  876. let mut lines = content.lines();
  877. // Skip header
  878. lines.next().context("purity/ploidy file is empty")?;
  879. let data = lines.next().context("purity/ploidy file has no data row")?;
  880. let fields: Vec<&str> = data.split('\t').collect();
  881. anyhow::ensure!(
  882. fields.len() >= 2,
  883. "expected at least 2 columns, got {}",
  884. fields.len()
  885. );
  886. let purity = fields[0]
  887. .trim()
  888. .parse::<f64>()
  889. .with_context(|| format!("cannot parse purity: '{}'", fields[0]))?;
  890. let ploidy = fields[1]
  891. .trim()
  892. .parse::<f64>()
  893. .with_context(|| format!("cannot parse ploidy: '{}'", fields[1]))?;
  894. Ok((purity, ploidy))
  895. }
  896. #[cfg(test)]
  897. mod tests {
  898. use super::*;
  899. use crate::helpers::test_init;
  900. #[test]
  901. fn savana_version() -> anyhow::Result<()> {
  902. test_init();
  903. let vl = Savana::version(&Config::default())?;
  904. info!("Savana version: {vl}");
  905. Ok(())
  906. }
  907. #[test]
  908. fn savana_run() -> anyhow::Result<()> {
  909. test_init();
  910. let config = Config::default();
  911. let mut caller = Savana::initialize("DUMCO", &config)?;
  912. caller.run()?;
  913. Ok(())
  914. }
  915. }