|
@@ -1,14 +1,16 @@
|
|
|
-use std::{fmt, fs, fs::File, io::Write, sync::Mutex};
|
|
|
|
|
|
|
+use std::{fmt, fs, io::Write, sync::Mutex};
|
|
|
|
|
|
|
|
|
|
+use anyhow::Context;
|
|
|
use log::{debug, error, info};
|
|
use log::{debug, error, info};
|
|
|
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
|
|
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
|
|
|
use rayon::{
|
|
use rayon::{
|
|
|
iter::{IntoParallelIterator, ParallelIterator},
|
|
iter::{IntoParallelIterator, ParallelIterator},
|
|
|
slice::ParallelSliceMut,
|
|
slice::ParallelSliceMut,
|
|
|
};
|
|
};
|
|
|
|
|
+use rust_htslib::bam::IndexedReader;
|
|
|
|
|
|
|
|
|
|
+use crate::io::writers::get_gz_writer;
|
|
|
use crate::math::filter_outliers_modified_z_score_with_indices;
|
|
use crate::math::filter_outliers_modified_z_score_with_indices;
|
|
|
-use crate::positions::{contig_to_num, GenomeRange, GetGenomeRange};
|
|
|
|
|
use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
|
|
use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
|
|
|
|
|
|
|
|
/// Represents a count of reads in a genomic bin, including various metrics and outlier information.
|
|
/// Represents a count of reads in a genomic bin, including various metrics and outlier information.
|
|
@@ -32,6 +34,7 @@ pub struct BinCount {
|
|
|
pub ratio_end: f64,
|
|
pub ratio_end: f64,
|
|
|
/// Optional vector of outlier types for this bin.
|
|
/// Optional vector of outlier types for this bin.
|
|
|
pub outlier: Option<Vec<BinOutlier>>,
|
|
pub outlier: Option<Vec<BinOutlier>>,
|
|
|
|
|
+ pub depths: Vec<u32>,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
impl From<&Bin> for BinCount {
|
|
impl From<&Bin> for BinCount {
|
|
@@ -52,11 +55,12 @@ impl From<&Bin> for BinCount {
|
|
|
start: bin.start,
|
|
start: bin.start,
|
|
|
end: bin.end,
|
|
end: bin.end,
|
|
|
n_reads: n_reads as u32,
|
|
n_reads: n_reads as u32,
|
|
|
- coverage: bin.mean_coverage(),
|
|
|
|
|
|
|
+ coverage: bin.mean_coverage_from_depths(),
|
|
|
ratio_sa: n_sa as f64 / n_reads_float,
|
|
ratio_sa: n_sa as f64 / n_reads_float,
|
|
|
ratio_start: n_start as f64 / n_reads_float,
|
|
ratio_start: n_start as f64 / n_reads_float,
|
|
|
ratio_end: n_end as f64 / n_reads_float,
|
|
ratio_end: n_end as f64 / n_reads_float,
|
|
|
outlier: None,
|
|
outlier: None,
|
|
|
|
|
+ depths: bin.depths.clone(),
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -68,7 +72,7 @@ impl BinCount {
|
|
|
/// A `String` representing the `BinCount` as a TSV row.
|
|
/// A `String` representing the `BinCount` as a TSV row.
|
|
|
pub fn to_tsv_row(&self) -> String {
|
|
pub fn to_tsv_row(&self) -> String {
|
|
|
format!(
|
|
format!(
|
|
|
- "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}",
|
|
|
|
|
|
|
+ "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}\t{}",
|
|
|
self.contig,
|
|
self.contig,
|
|
|
self.start,
|
|
self.start,
|
|
|
self.end,
|
|
self.end,
|
|
@@ -84,7 +88,12 @@ impl BinCount {
|
|
|
.map(|e| e.to_string())
|
|
.map(|e| e.to_string())
|
|
|
.collect::<Vec<String>>()
|
|
.collect::<Vec<String>>()
|
|
|
.join(", "))
|
|
.join(", "))
|
|
|
- .unwrap_or_default()
|
|
|
|
|
|
|
+ .unwrap_or_default(),
|
|
|
|
|
+ self.depths
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .map(|e| e.to_string())
|
|
|
|
|
+ .collect::<Vec<String>>()
|
|
|
|
|
+ .join(",")
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -104,7 +113,7 @@ impl BinCount {
|
|
|
/// - The outlier field is not in the expected format.
|
|
/// - The outlier field is not in the expected format.
|
|
|
pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
|
|
pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
|
|
|
let fields: Vec<&str> = row.split('\t').collect();
|
|
let fields: Vec<&str> = row.split('\t').collect();
|
|
|
- if fields.len() != 9 {
|
|
|
|
|
|
|
+ if fields.len() != 10 {
|
|
|
anyhow::bail!("Invalid number of fields in TSV row");
|
|
anyhow::bail!("Invalid number of fields in TSV row");
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -124,6 +133,15 @@ impl BinCount {
|
|
|
None
|
|
None
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
+ let depths = fields[9]
|
|
|
|
|
+ .split(',')
|
|
|
|
|
+ .map(|s| {
|
|
|
|
|
+ s.trim() // Remove any whitespace around numbers
|
|
|
|
|
+ .parse::<u32>()
|
|
|
|
|
+ .with_context(|| format!("Failed to parse '{}' as u32", s))
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect::<anyhow::Result<Vec<u32>>>()?;
|
|
|
|
|
+
|
|
|
Ok(BinCount {
|
|
Ok(BinCount {
|
|
|
contig: fields[0].to_string(),
|
|
contig: fields[0].to_string(),
|
|
|
start: fields[1].parse()?,
|
|
start: fields[1].parse()?,
|
|
@@ -134,6 +152,7 @@ impl BinCount {
|
|
|
ratio_start: fields[6].parse()?,
|
|
ratio_start: fields[6].parse()?,
|
|
|
ratio_end: fields[7].parse()?,
|
|
ratio_end: fields[7].parse()?,
|
|
|
outlier,
|
|
outlier,
|
|
|
|
|
+ depths,
|
|
|
})
|
|
})
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -144,7 +163,6 @@ impl BinCount {
|
|
|
// }
|
|
// }
|
|
|
// }
|
|
// }
|
|
|
|
|
|
|
|
-
|
|
|
|
|
/// Represents types of outliers that can be detected in a genomic bin.
|
|
/// Represents types of outliers that can be detected in a genomic bin.
|
|
|
#[derive(Debug, Clone)]
|
|
#[derive(Debug, Clone)]
|
|
|
pub enum BinOutlier {
|
|
pub enum BinOutlier {
|
|
@@ -231,13 +249,9 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
|
|
|
let bin_size = config.count_bin_size;
|
|
let bin_size = config.count_bin_size;
|
|
|
let chunk_n_bin = config.count_n_chunks;
|
|
let chunk_n_bin = config.count_n_chunks;
|
|
|
info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
|
|
info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
|
|
|
- fs::create_dir(out_dir)?;
|
|
|
|
|
|
|
+ fs::create_dir_all(out_dir)?;
|
|
|
|
|
|
|
|
for (contig, length) in read_dict(&config.dict_file)? {
|
|
for (contig, length) in read_dict(&config.dict_file)? {
|
|
|
- if contig.as_str() == "chrM" {
|
|
|
|
|
- continue;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
let n_bin = length / bin_size;
|
|
let n_bin = length / bin_size;
|
|
|
// Calculate number of chunks using ceiling division
|
|
// Calculate number of chunks using ceiling division
|
|
|
let n_chunks = n_bin.div_ceil(chunk_n_bin);
|
|
let n_chunks = n_bin.div_ceil(chunk_n_bin);
|
|
@@ -259,6 +273,9 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
|
|
|
// Calculate number of bins in this chunk with ceiling division
|
|
// Calculate number of bins in this chunk with ceiling division
|
|
|
let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
|
|
let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
|
|
|
|
|
|
|
|
|
|
+ let mut bam_reader = IndexedReader::from_path(bam_path)
|
|
|
|
|
+ .with_context(|| format!("Can't open BAM file: {}", bam_path)).unwrap();
|
|
|
|
|
+
|
|
|
// Process each bin in the chunk
|
|
// Process each bin in the chunk
|
|
|
(0..n_bins_in_chunk)
|
|
(0..n_bins_in_chunk)
|
|
|
// .into_iter()
|
|
// .into_iter()
|
|
@@ -267,7 +284,13 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
|
|
|
// Ensure we don't exceed remaining length
|
|
// Ensure we don't exceed remaining length
|
|
|
let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
|
|
let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
|
|
|
// debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
|
|
// debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
|
|
|
- match Bin::new(bam_path, &contig, bin_start, bin_length) {
|
|
|
|
|
|
|
+ match Bin::new(
|
|
|
|
|
+ &mut bam_reader,
|
|
|
|
|
+ &contig,
|
|
|
|
|
+ bin_start,
|
|
|
|
|
+ bin_length,
|
|
|
|
|
+ config.bam_min_mapq,
|
|
|
|
|
+ ) {
|
|
|
Ok(bin) => Some(BinCount::from(&bin)),
|
|
Ok(bin) => Some(BinCount::from(&bin)),
|
|
|
Err(e) => {
|
|
Err(e) => {
|
|
|
error!("Failed to get Bin: {e}");
|
|
error!("Failed to get Bin: {e}");
|
|
@@ -285,9 +308,11 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
|
|
|
debug!("Scan {contig}, computing outliers");
|
|
debug!("Scan {contig}, computing outliers");
|
|
|
fill_outliers(&mut bins);
|
|
fill_outliers(&mut bins);
|
|
|
|
|
|
|
|
- let out_file = format!("{out_dir}/{contig}_count.tsv");
|
|
|
|
|
|
|
+ let out_file = format!("{out_dir}/{contig}_count.tsv.gz");
|
|
|
debug!("Scan {contig}, writing file");
|
|
debug!("Scan {contig}, writing file");
|
|
|
- let mut file = File::create(out_file)?;
|
|
|
|
|
|
|
+
|
|
|
|
|
+ let mut file = get_gz_writer(&out_file, true)
|
|
|
|
|
+ .with_context(|| anyhow::anyhow!("failed to open the file: {out_file}"))?;
|
|
|
for bin in bins {
|
|
for bin in bins {
|
|
|
writeln!(file, "{}", bin.to_tsv_row())?;
|
|
writeln!(file, "{}", bin.to_tsv_row())?;
|
|
|
}
|
|
}
|
|
@@ -342,51 +367,38 @@ pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow:
|
|
|
/// // After this call, some BinCount objects may have their outlier field populated
|
|
/// // After this call, some BinCount objects may have their outlier field populated
|
|
|
/// ```
|
|
/// ```
|
|
|
pub fn fill_outliers(bin_counts: &mut [BinCount]) {
|
|
pub fn fill_outliers(bin_counts: &mut [BinCount]) {
|
|
|
- let bin_counts = Mutex::new(bin_counts);
|
|
|
|
|
-
|
|
|
|
|
- fn get_ratio_sa(c: &BinCount) -> f64 {
|
|
|
|
|
- c.ratio_sa
|
|
|
|
|
- }
|
|
|
|
|
- fn get_ratio_start(c: &BinCount) -> f64 {
|
|
|
|
|
- c.ratio_start
|
|
|
|
|
- }
|
|
|
|
|
- fn get_ratio_end(c: &BinCount) -> f64 {
|
|
|
|
|
- c.ratio_end
|
|
|
|
|
- }
|
|
|
|
|
- type OutlierTypeInfo = (fn(&BinCount) -> f64, BinOutlier);
|
|
|
|
|
|
|
+ let intermediate: Vec<_> = bin_counts
|
|
|
|
|
+ .par_iter()
|
|
|
|
|
+ .enumerate()
|
|
|
|
|
+ .map(|(i, c)| (i, [c.ratio_sa, c.ratio_start, c.ratio_end]))
|
|
|
|
|
+ .collect();
|
|
|
|
|
|
|
|
- let outlier_types: [OutlierTypeInfo; 3] = [
|
|
|
|
|
- (get_ratio_sa, BinOutlier::SA),
|
|
|
|
|
- (get_ratio_start, BinOutlier::Start),
|
|
|
|
|
- (get_ratio_end, BinOutlier::End),
|
|
|
|
|
- ];
|
|
|
|
|
|
|
+ let outlier_types = [BinOutlier::SA, BinOutlier::Start, BinOutlier::End];
|
|
|
|
|
|
|
|
- for (get_ratio, outlier_type) in outlier_types.iter() {
|
|
|
|
|
- let (indices, ratios): (Vec<usize>, Vec<f64>) = bin_counts
|
|
|
|
|
- .lock()
|
|
|
|
|
- .unwrap()
|
|
|
|
|
- .par_iter()
|
|
|
|
|
- .enumerate()
|
|
|
|
|
- .filter_map(|(i, c)| {
|
|
|
|
|
- let ratio = get_ratio(c);
|
|
|
|
|
- if !ratio.is_nan() {
|
|
|
|
|
- Some((i, ratio))
|
|
|
|
|
- } else {
|
|
|
|
|
- None
|
|
|
|
|
- }
|
|
|
|
|
- })
|
|
|
|
|
- .unzip();
|
|
|
|
|
|
|
+ let outliers: Vec<(usize, BinOutlier)> = outlier_types
|
|
|
|
|
+ .iter()
|
|
|
|
|
+ .enumerate()
|
|
|
|
|
+ .flat_map(|(ratio_idx, outlier_type)| {
|
|
|
|
|
+ let (indices, ratios): (Vec<_>, Vec<_>) = intermediate
|
|
|
|
|
+ .par_iter()
|
|
|
|
|
+ .filter_map(|(i, ratio_array)| {
|
|
|
|
|
+ let ratio = ratio_array[ratio_idx];
|
|
|
|
|
+ (!ratio.is_nan()).then_some((*i, ratio))
|
|
|
|
|
+ })
|
|
|
|
|
+ .unzip();
|
|
|
|
|
|
|
|
- let outlier_indices = filter_outliers_modified_z_score_with_indices(&ratios, indices);
|
|
|
|
|
|
|
+ filter_outliers_modified_z_score_with_indices(&ratios, indices)
|
|
|
|
|
+ .into_iter()
|
|
|
|
|
+ .map(move |i| (i, outlier_type.clone()))
|
|
|
|
|
+ })
|
|
|
|
|
+ .collect();
|
|
|
|
|
|
|
|
- outlier_indices.par_iter().for_each(|&i| {
|
|
|
|
|
- let mut bin_counts = bin_counts.lock().unwrap();
|
|
|
|
|
- bin_counts[i]
|
|
|
|
|
- .outlier
|
|
|
|
|
- .get_or_insert_with(Vec::new)
|
|
|
|
|
- .push(outlier_type.clone());
|
|
|
|
|
- });
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ outliers.iter().for_each(|(i, outlier_type)| {
|
|
|
|
|
+ bin_counts[*i]
|
|
|
|
|
+ .outlier
|
|
|
|
|
+ .get_or_insert_with(Vec::new)
|
|
|
|
|
+ .push(outlier_type.clone());
|
|
|
|
|
+ });
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
|
|
/// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
|
|
@@ -427,3 +439,4 @@ pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
config,
|
|
config,
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
+
|