use crate::{
    annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
    collection::{vcf::Vcf, HasOutputs, Initialize, Version},
    commands::{
        bcftools::{bcftools_keep_pass, BcftoolsConfig},
        longphase::{LongphaseConfig, LongphaseHap, LongphasePhase},
    },
    config::Config,
    io::{readers::get_gz_reader, vcf::read_vcf},
    positions::{num_to_contig, GenomeRange},
    runners::{run_wait, CommandRun, Run},
    variant::{
        variant::{RunnerVariants, Variants},
        variant_collection::VariantCollection,
    },
};
use anyhow::Context;
use itertools::Itertools;
use log::info;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::{
    fs::{self},
    io::BufRead,
    path::Path,
    str::FromStr,
};

// Runner for SAVANA (somatic SV / copy-number calling from tumour/normal long-read BAMs).
#[derive(Debug)]
pub struct Savana {
    pub id: String,
    pub passed_vcf: String,
    pub phased_germline_vcf: String,
    pub normal_hp_bam: String,
    pub tumoral_hp_bam: String,
    pub config: Config,
    pub log_dir: String,
}

impl Initialize for Savana {
    fn initialize(id: &str, config: Config) -> anyhow::Result<Self> {
        let log_dir = format!("{}/{}/log/savana", config.result_dir, id);
        if !Path::new(&log_dir).exists() {
            fs::create_dir_all(&log_dir)
                .context(format!("Failed to create {log_dir} directory"))?;
        }

        // Check for the phased germline VCF.
        // Not required by SAVANA anymore since >= 1.3.0, but still used to haplotag the BAMs in `run()`.
        let phased_germline_vcf = config.constit_phased_vcf(id);
        if !Path::new(&phased_germline_vcf).exists() {
            LongphasePhase::initialize(id, config.clone())?.run()?;
        }

        let normal_hp_bam = config.normal_haplotagged_bam(id);
        let tumoral_hp_bam = config.tumoral_haplotagged_bam(id);
        let passed_vcf = config.savana_passed_vcf(id);

        fs::create_dir_all(config.savana_output_dir(id))?;

        Ok(Self {
            id: id.to_string(),
            passed_vcf,
            config,
            log_dir,
            phased_germline_vcf,
            normal_hp_bam,
            tumoral_hp_bam,
        })
    }
}

impl Run for Savana {
    fn run(&mut self) -> anyhow::Result<()> {
        // force_or_not(
        //     &self.config.savana_output_vcf(&self.id),
        //     self.config.savana_force,
        // )?;
        let id = &self.id;
        let output_vcf = &self.config.savana_output_vcf(id);

        if !Path::new(&output_vcf).exists() {
            info!("Running Savana v{}", Savana::version(&self.config)?);

            // Check for the haplotagged BAMs; produce them with longphase if missing.
            if !Path::new(&self.normal_hp_bam).exists() {
                LongphaseHap::new(
                    id,
                    &self.config.normal_bam(id),
                    &self.phased_germline_vcf,
                    LongphaseConfig::default(),
                )
                .run()?;
            }
            if !Path::new(&self.tumoral_hp_bam).exists() {
                LongphaseHap::new(
                    id,
                    &self.config.tumoral_bam(id),
                    &self.phased_germline_vcf,
                    LongphaseConfig::default(),
                )
                .run()?;
            }

            let savana_args = [
                // "run",
                "--tumour",
                &self.tumoral_hp_bam,
                "--normal",
                &self.normal_hp_bam,
                "--outdir",
                &self.config.savana_output_dir(id),
                "--ref",
                &self.config.reference,
                "--snp_vcf",
                &self.phased_germline_vcf,
                "--no_blacklist",
                "--threads",
                &self.config.savana_threads.to_string(),
            ];
            let args = [
                "-c",
                &format!(
                    "source {} && conda activate savana && {} {}",
                    self.config.conda_sh,
                    self.config.savana_bin,
                    savana_args.join(" ")
                ),
            ];
            let mut cmd_run = CommandRun::new("bash", &args);
            let report = run_wait(&mut cmd_run).context(format!(
                "Error while running `savana {}`",
                args.join(" ")
            ))?;

            let log_file = format!("{}/savana_", self.log_dir);
            report
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        }

        // Keep PASS variants only.
        if !Path::new(&self.passed_vcf).exists() && Path::new(output_vcf).exists() {
            let report =
                bcftools_keep_pass(output_vcf, &self.passed_vcf, BcftoolsConfig::default())
                    .context(format!(
                        "Error while running bcftools keep PASS for {output_vcf}"
                    ))?;
            let log_file = format!("{}/bcftools_pass", self.log_dir);
            report
                .save_to_file(&log_file)
                .context(format!("Error while writing logs into {log_file}"))?;
        }
        Ok(())
    }
}
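
// Hedged usage sketch (not part of the original module): how this runner is typically
// driven end to end for one sample id, assuming a `Config` has already been loaded
// elsewhere. Kept `dead_code`-silenced so it stays purely illustrative.
#[allow(dead_code)]
fn run_savana_for_id(id: &str, config: Config) -> anyhow::Result<()> {
    // `initialize` prepares the log/output directories and the haplotagging inputs;
    // `run` then executes SAVANA and filters its output VCF down to PASS records.
    let mut savana = Savana::initialize(id, config)?;
    savana.run()
}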
impl HasOutputs for Savana {
    fn has_output(&self, id: &str) -> (bool, bool) {
        let output = &self.config.savana_output_vcf(id);
        let passed = &self.config.savana_passed_vcf(id);
        (Path::new(output).exists(), Path::new(passed).exists())
    }
}

impl Version for Savana {
    fn version(config: &Config) -> anyhow::Result<String> {
        let savana_args = ["--version"];
        let args = [
            "-c",
            &format!(
                "source {} && conda activate savana && {} {}",
                config.conda_sh,
                config.savana_bin,
                savana_args.join(" ")
            ),
        ];
        let mut cmd_run = CommandRun::new("bash", &args);
        let report = run_wait(&mut cmd_run)
            .context(format!("Error while running `savana {}`", args.join(" ")))?;
        let log = report.log;

        // Extract the version string between "Version " and the next newline.
        let start = log
            .find("Version ")
            .context("Failed to find 'Version ' in the log")?;
        let start_index = start + "Version ".len();
        let end = log[start_index..]
            .find('\n')
            .context("Failed to find newline after 'Version '")?;
        Ok(log[start_index..start_index + end]
            .to_string()
            .trim()
            .to_string())
    }
}

impl CallerCat for Savana {
    fn caller_cat(&self) -> Annotation {
        Annotation::Callers(Caller::Savana, Sample::Somatic)
    }
}

impl Variants for Savana {
    fn variants(&self, annotations: &Annotations) -> anyhow::Result<VariantCollection> {
        let caller = self.caller_cat();
        let add = vec![caller.clone()];
        info!("Loading variants from {}: {}", caller, self.passed_vcf);
        let variants = read_vcf(&self.passed_vcf)
            .map_err(|e| anyhow::anyhow!("Failed to read Savana VCF {}.\n{e}", self.passed_vcf))?;
        variants.par_iter().for_each(|v| {
            annotations.insert_update(v.hash(), &add);
        });
        info!("{}, {} variants loaded.", caller, variants.len());
        Ok(VariantCollection {
            variants,
            vcf: Vcf::new(self.passed_vcf.clone().into())?,
            caller,
        })
    }
}

impl RunnerVariants for Savana {}

// One row of SAVANA's copy-number TSV:
// chromosome start end segment_id bin_count sum_of_bin_lengths weight
// copy_number minor_allele_copy_number mean_baf n_het_snps
pub struct SavanaCNRow {
    pub range: GenomeRange,
    pub segment_id: String,
    pub bin_count: u32,
    pub sum_of_bin_lengths: u32,
    pub weight: f32,
    pub copy_number: f32,
    pub minor_allele_copy_number: Option<f32>,
    pub mean_baf: Option<f32>,
    pub n_het_snps: u32,
}

impl FromStr for SavanaCNRow {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split("\t").collect();
        let range = GenomeRange::from_1_inclusive_str(
            cells
                .first()
                .context(format!("Error while parsing contig {s}."))?,
            cells
                .get(1)
                .context(format!("Error while parsing start {s}."))?,
            cells
                .get(2)
                .context(format!("Error while parsing end {s}."))?,
        )
        .map_err(|e| anyhow::anyhow!("Error while parsing range {s}.\n{e}"))?;
        Ok(Self {
            range,
            segment_id: cells
                .get(3)
                .context(format!("Error while parsing segment_id {s}."))?
                .to_string(),
            bin_count: cells
                .get(4)
                .context(format!("Error while parsing bin_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing bin_count {s}.\n{e}"))?,
            sum_of_bin_lengths: cells
                .get(5)
                .context(format!("Error while parsing sum_of_bin_lengths {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing sum_of_bin_lengths {s}.\n{e}"))?,
            weight: cells
                .get(6)
                .context(format!("Error while parsing weight {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing weight {s}.\n{e}"))?,
            copy_number: cells
                .get(7)
                .context(format!("Error while parsing copy_number {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing copy_number {s}.\n{e}"))?,
            // Optional columns: fall back to None when empty or non-numeric.
            minor_allele_copy_number: cells
                .get(8)
                .context(format!("Error while parsing minor_allele_copy_number {s}."))?
                .parse::<f32>()
                .map(Some)
                .unwrap_or(None),
            mean_baf: cells
                .get(9)
                .context(format!("Error while parsing mean_baf {s}."))?
                .parse::<f32>()
                .map(Some)
                .unwrap_or(None),
            n_het_snps: cells
                .get(10)
                .context(format!("Error while parsing n_het_snps {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing n_het_snps {s}.\n{e}"))?,
        })
    }
}
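
// Hedged helper sketch (not part of the original API): parses the body of a SAVANA
// copy-number table from an in-memory string, skipping the "chromosome ..." header
// line exactly like `SavanaCopyNumber::load_id` below does for a file on disk.
#[allow(dead_code)]
fn parse_copy_number_rows(tsv: &str) -> anyhow::Result<Vec<SavanaCNRow>> {
    tsv.lines()
        .filter(|line| !line.is_empty() && !line.starts_with("chromosome"))
        .map(|line| line.parse::<SavanaCNRow>())
        .collect()
}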
pub struct SavanaCopyNumber {
    pub data: Vec<SavanaCNRow>,
}

impl SavanaCopyNumber {
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            // Skip the header line.
            if line.starts_with("chromosome") {
                continue;
            }
            let cn: SavanaCNRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }
}

// bin chromosome start end perc_known_bases use_bin tumour_read_count normal_read_count
pub struct SavanaRCRow {
    pub range: GenomeRange,
    pub perc_known_bases: f32,
    pub use_bin: bool,
    pub tumour_read_count: u32,
    pub normal_read_count: u32,
}

impl FromStr for SavanaRCRow {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> anyhow::Result<Self> {
        let cells: Vec<&str> = s.split("\t").collect();
        // The first column is a bin label of the form "contig:start_end".
        let bin = cells
            .first()
            .context(format!("Error while parsing range {s}."))?;
        let range = bin
            .split_once(':')
            .and_then(|(a, b)| {
                b.split_once('_').map(|(b, c)| {
                    GenomeRange::from_1_inclusive_str(a, b, c)
                        .context(format!("Error while parsing range {}", bin))
                })
            })
            .context(format!("Invalid range format: {}", bin))??;
        Ok(Self {
            range,
            perc_known_bases: cells
                .get(4)
                .context(format!("Error while parsing perc_known_bases {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing perc_known_bases {s}.\n{e}"))?,
            use_bin: cells
                .get(5)
                .context(format!("Error while parsing use_bin {s}."))?
                .to_lowercase()
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing use_bin {s}.\n{e}"))?,
            tumour_read_count: cells
                .get(6)
                .context(format!("Error while parsing tumour_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing tumour_read_count {s}.\n{e}"))?,
            normal_read_count: cells
                .get(7)
                .context(format!("Error while parsing normal_read_count {s}."))?
                .parse()
                .map_err(|e| anyhow::anyhow!("Error while parsing normal_read_count {s}.\n{e}"))?,
        })
    }
}
pub struct SavanaReadCounts {
    pub data: Vec<SavanaRCRow>,
}

impl SavanaReadCounts {
    pub fn load_id(id: &str, config: Config) -> anyhow::Result<Self> {
        let path = config.savana_read_counts(id);
        let reader = get_gz_reader(&path)?;
        let mut data = Vec::new();
        for line in reader.lines() {
            let line = line?;
            // Skip the header line.
            if line.starts_with("bin") {
                continue;
            }
            let cn: SavanaRCRow = line.parse()?;
            data.push(cn);
        }
        Ok(Self { data })
    }

    pub fn n_tumoral_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.tumour_read_count as usize)
            .sum::<usize>()
    }

    pub fn n_normal_reads(&self) -> usize {
        self.data
            .par_iter()
            .map(|c| c.normal_read_count as usize)
            .sum::<usize>()
    }

    // Per-chromosome tumour read counts, normalized by the count expected if reads
    // were spread uniformly over all bins: r = chr_counts / (chr_n_bins * n_tum / n_bins).
    pub fn norm_chr_counts(&self) -> Vec<(String, f64)> {
        let n_tum = self.n_tumoral_reads();
        let n_bins = self.data.len();
        self.data
            .iter()
            .chunk_by(|c| c.range.contig)
            .into_iter()
            .map(|(contig_num, chr_counts)| {
                let chr_counts: Vec<_> = chr_counts.collect();
                let chr_n_bins = chr_counts.len();
                let chr_counts = chr_counts
                    .par_iter()
                    .map(|c| c.tumour_read_count as usize)
                    .sum::<usize>();
                let r = chr_counts as f64 / ((chr_n_bins as f64 * n_tum as f64) / n_bins as f64);
                (num_to_contig(contig_num), r)
            })
            .collect()
    }
}
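
// Hedged usage sketch (not part of the original API): load the read counts for a
// sample and log each chromosome's normalized tumour count. A value near 1.0 means
// the chromosome received roughly its bin-proportional share of tumour reads.
#[allow(dead_code)]
fn log_norm_chr_counts(id: &str, config: Config) -> anyhow::Result<()> {
    let read_counts = SavanaReadCounts::load_id(id, config)?;
    for (contig, ratio) in read_counts.norm_chr_counts() {
        info!("{contig}: normalized tumour read count {ratio:.3}");
    }
    Ok(())
}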
#[derive(Debug, Serialize, Deserialize)]
pub struct SavanaCN {
    pub segments: Vec<CNSegment>,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct CNSegment {
    pub chromosome: String,
    pub start: u64,
    pub end: u64,
    pub segment_id: String,
    pub bin_count: u32,
    pub sum_of_bin_lengths: u64,
    pub weight: f64,
    pub copy_number: f64,
    pub minor_allele_copy_number: Option<f64>,
    pub mean_baf: Option<f64>,
    pub no_het_snps: u32,
}

impl SavanaCN {
    pub fn parse_file(id: &str, config: &Config) -> anyhow::Result<Self> {
        let path = config.savana_copy_number(id);
        let reader = get_gz_reader(&path)?;
        // let file = File::open(&path).context("Failed to open the file")?;
        // let reader = io::BufReader::new(file);
        let mut segments = Vec::new();

        for (index, line) in reader.lines().enumerate() {
            let line = line.context("Failed to read line from file")?;

            // Skip header line.
            if index == 0 {
                continue;
            }

            let parts: Vec<&str> = line.split('\t').collect();
            if parts.len() != 11 {
                return Err(anyhow::anyhow!(
                    "Invalid number of columns at line {} (expected 11, got {})",
                    index + 1,
                    parts.len()
                ));
            }

            // Helper closures for parsing with context.
            let parse_field = |field: &str, name: &str, line_num: usize| -> anyhow::Result<f64> {
                field
                    .parse()
                    .context(format!("Failed to parse {name} at line {line_num}"))
            };
            let parse_field_u32 =
                |field: &str, name: &str, line_num: usize| -> anyhow::Result<u32> {
                    field
                        .parse()
                        .context(format!("Failed to parse {name} at line {line_num}"))
                };
            let line_num = index + 1;

            let segment = CNSegment {
                chromosome: parts[0].to_string(),
                start: parse_field_u32(parts[1], "start", line_num)? as u64,
                end: parse_field_u32(parts[2], "end", line_num)? as u64,
                segment_id: parts[3].to_string(),
                bin_count: parse_field_u32(parts[4], "bin_count", line_num)?,
                sum_of_bin_lengths: parse_field_u32(parts[5], "sum_of_bin_lengths", line_num)?
                    as u64,
                weight: parse_field(parts[6], "weight", line_num)?,
                copy_number: parse_field(parts[7], "copy_number", line_num)?,
                minor_allele_copy_number: if parts[8].is_empty() {
                    None
                } else {
                    Some(parse_field(parts[8], "minor_allele_copy_number", line_num)?)
                },
                mean_baf: if parts[9].is_empty() {
                    None
                } else {
                    Some(parse_field(parts[9], "mean_baf", line_num)?)
                },
                no_het_snps: if parts[10].is_empty() {
                    0
                } else {
                    parse_field_u32(parts[10], "no_het_snps", line_num)?
                },
            };
            segments.push(segment);
        }

        Ok(Self { segments })
    }
}
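
// Hedged convenience sketch (not part of the original API): length-weighted mean copy
// number over the parsed segments; returns None when there are no segments or all
// segments have zero length.
impl SavanaCN {
    #[allow(dead_code)]
    pub fn weighted_mean_copy_number(&self) -> Option<f64> {
        let total_len: u64 = self
            .segments
            .iter()
            .map(|s| s.end.saturating_sub(s.start))
            .sum();
        if total_len == 0 {
            return None;
        }
        let weighted: f64 = self
            .segments
            .iter()
            .map(|s| s.copy_number * s.end.saturating_sub(s.start) as f64)
            .sum();
        Some(weighted / total_len as f64)
    }
}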