|
|
@@ -0,0 +1,429 @@
|
|
|
+use std::{fmt, fs, fs::File, io::Write, sync::Mutex};
|
|
|
+
|
|
|
+use log::{debug, error, info};
|
|
|
+use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
|
|
|
+use rayon::{
|
|
|
+ iter::{IntoParallelIterator, ParallelIterator},
|
|
|
+ slice::ParallelSliceMut,
|
|
|
+};
|
|
|
+
|
|
|
+use crate::math::filter_outliers_modified_z_score_with_indices;
|
|
|
+use crate::positions::{contig_to_num, GenomeRange, GetGenomeRange};
|
|
|
+use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
|
|
|
+
|
|
|
+/// Represents a count of reads in a genomic bin, including various metrics and outlier information.
|
|
|
+#[derive(Debug)]
|
|
|
+pub struct BinCount {
|
|
|
+ /// The name of the contig (chromosome) this bin belongs to.
|
|
|
+ pub contig: String,
|
|
|
+ /// The start position of the bin in the contig.
|
|
|
+ pub start: u32,
|
|
|
+ /// The end position of the bin in the contig.
|
|
|
+ pub end: u32,
|
|
|
+ /// The total number of reads in this bin.
|
|
|
+ pub n_reads: u32,
|
|
|
+ /// The average coverage of reads in this bin.
|
|
|
+ pub coverage: f64,
|
|
|
+ /// The ratio of supplementary alignments to total reads.
|
|
|
+ pub ratio_sa: f64,
|
|
|
+ /// The ratio of reads starting in this bin to total reads.
|
|
|
+ pub ratio_start: f64,
|
|
|
+ /// The ratio of reads ending in this bin to total reads.
|
|
|
+ pub ratio_end: f64,
|
|
|
+ /// Optional vector of outlier types for this bin.
|
|
|
+ pub outlier: Option<Vec<BinOutlier>>,
|
|
|
+}
|
|
|
+
|
|
|
+impl From<&Bin> for BinCount {
|
|
|
+ /// Converts a `Bin` reference to a `BinCount`.
|
|
|
+ ///
|
|
|
+ /// # Parameters
|
|
|
+ /// - `bin: &Bin`: A reference to the `Bin` object to convert.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// A new `BinCount` instance populated with data from the `Bin`.
|
|
|
+ fn from(bin: &Bin) -> Self {
|
|
|
+ let n_reads = bin.n_reads();
|
|
|
+ let n_reads_float = n_reads as f64;
|
|
|
+ let (n_sa, n_start, n_end) = bin.count_reads_sa_start_end();
|
|
|
+
|
|
|
+ Self {
|
|
|
+ contig: bin.contig.clone(),
|
|
|
+ start: bin.start,
|
|
|
+ end: bin.end,
|
|
|
+ n_reads: n_reads as u32,
|
|
|
+ coverage: bin.mean_coverage(),
|
|
|
+ ratio_sa: n_sa as f64 / n_reads_float,
|
|
|
+ ratio_start: n_start as f64 / n_reads_float,
|
|
|
+ ratio_end: n_end as f64 / n_reads_float,
|
|
|
+ outlier: None,
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl BinCount {
|
|
|
+ /// Converts the `BinCount` instance to a TSV (Tab-Separated Values) row string.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// A `String` representing the `BinCount` as a TSV row.
|
|
|
+ pub fn to_tsv_row(&self) -> String {
|
|
|
+ format!(
|
|
|
+ "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}",
|
|
|
+ self.contig,
|
|
|
+ self.start,
|
|
|
+ self.end,
|
|
|
+ self.n_reads,
|
|
|
+ self.coverage,
|
|
|
+ self.ratio_sa,
|
|
|
+ self.ratio_start,
|
|
|
+ self.ratio_end,
|
|
|
+ self.outlier
|
|
|
+ .clone()
|
|
|
+ .map(|v| v
|
|
|
+ .iter()
|
|
|
+ .map(|e| e.to_string())
|
|
|
+ .collect::<Vec<String>>()
|
|
|
+ .join(", "))
|
|
|
+ .unwrap_or_default()
|
|
|
+ )
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Parses a TSV row and creates a BinCount object.
|
|
|
+ ///
|
|
|
+ /// # Parameters
|
|
|
+ /// - `row: &str`: A string slice representing a TSV row.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// A `Result` containing either a `BinCount` object if parsing is successful,
|
|
|
+ /// or an `anyhow::Error` if parsing fails.
|
|
|
+ ///
|
|
|
+ /// # Errors
|
|
|
+ /// This function will return an error if:
|
|
|
+ /// - The row does not contain the expected number of fields.
|
|
|
+ /// - Any of the numeric fields fail to parse.
|
|
|
+ /// - The outlier field is not in the expected format.
|
|
|
+ pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
|
|
|
+ let fields: Vec<&str> = row.split('\t').collect();
|
|
|
+ if fields.len() != 9 {
|
|
|
+ anyhow::bail!("Invalid number of fields in TSV row");
|
|
|
+ }
|
|
|
+
|
|
|
+ let outlier = if !fields[8].is_empty() {
|
|
|
+ Some(
|
|
|
+ fields[8]
|
|
|
+ .split(", ")
|
|
|
+ .map(|s| match s {
|
|
|
+ "SA" => Ok(BinOutlier::SA),
|
|
|
+ "Start" => Ok(BinOutlier::Start),
|
|
|
+ "End" => Ok(BinOutlier::End),
|
|
|
+ _ => Err(anyhow::anyhow!("Invalid outlier type: {}", s)),
|
|
|
+ })
|
|
|
+ .collect::<Result<Vec<BinOutlier>, _>>()?,
|
|
|
+ )
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ };
|
|
|
+
|
|
|
+ Ok(BinCount {
|
|
|
+ contig: fields[0].to_string(),
|
|
|
+ start: fields[1].parse()?,
|
|
|
+ end: fields[2].parse()?,
|
|
|
+ n_reads: fields[3].parse()?,
|
|
|
+ coverage: fields[4].parse()?,
|
|
|
+ ratio_sa: fields[5].parse()?,
|
|
|
+ ratio_start: fields[6].parse()?,
|
|
|
+ ratio_end: fields[7].parse()?,
|
|
|
+ outlier,
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// impl GetGenomeRange for BinCount {
|
|
|
+// fn range(&self) -> GenomeRange {
|
|
|
+// GenomeRange { contig: contig_to_num(&self.contig), range: self.start..(self.end + 1) }
|
|
|
+// }
|
|
|
+// }
|
|
|
+
|
|
|
+
|
|
|
/// Kinds of outlier that can be flagged on a genomic bin.
///
/// The enum carries no data, so it is `Copy` and supports equality
/// comparison and hashing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BinOutlier {
    /// Outlier in the supplementary-alignment ratio.
    SA,
    /// Outlier in the ratio of reads starting in the bin.
    Start,
    /// Outlier in the ratio of reads ending in the bin.
    End,
}
|
|
|
+
|
|
|
+impl fmt::Display for BinOutlier {
|
|
|
+ /// Implements the `Display` trait for `BinOutlier`.
|
|
|
+ ///
|
|
|
+ /// # Parameters
|
|
|
+ /// - `f: &mut fmt::Formatter<'_>`: The formatter to write the string representation.
|
|
|
+ ///
|
|
|
+ /// # Returns
|
|
|
+ /// A `fmt::Result` indicating success or failure of the formatting operation.
|
|
|
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
+ match self {
|
|
|
+ BinOutlier::SA => write!(f, "SA"),
|
|
|
+ BinOutlier::Start => write!(f, "Start"),
|
|
|
+ BinOutlier::End => write!(f, "End"),
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Performs a parallel whole genome scan on a BAM file, counting reads in bins across the genome
|
|
|
+/// and identifying outliers. The results are written to TSV files, one per contig.
|
|
|
+///
|
|
|
+/// # Parameters
|
|
|
+/// - `out_dir: &str`: The output directory where results will be saved.
|
|
|
+/// - `bam_path: &str`: The path to the input BAM file.
|
|
|
+/// - `config: &Config`: A configuration object containing the following fields:
|
|
|
+/// - `count_bin_size`: The size of each bin in base pairs.
|
|
|
+/// - `count_n_chunks`: The number of bins per chunk for parallel processing.
|
|
|
+/// - `dict_file`: Path to the dictionary file containing contig names and lengths.
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+/// - `anyhow::Result<()>`: Returns `Ok(())` if successful, or an error if any operation fails.
|
|
|
+///
|
|
|
+/// # Description
|
|
|
+/// This function processes the genome in parallel by dividing each contig into chunks of bins.
|
|
|
+/// Each bin represents a region of the genome, and the function counts reads in these bins.
|
|
|
+/// After processing, bins are sorted, outliers are identified, and results are written to TSV files.
|
|
|
+///
|
|
|
+/// ## Workflow:
|
|
|
+/// 1. **Initialization**:
|
|
|
+/// - Logs the start of the scan with bin size and chunk information.
|
|
|
+/// - Creates the output directory if it does not exist.
|
|
|
+///
|
|
|
+/// 2. **Contig Processing**:
|
|
|
+/// - Reads contigs and their lengths from the dictionary file.
|
|
|
+/// - Skips the mitochondrial chromosome ("chrM").
|
|
|
+///
|
|
|
+/// 3. **Parallel Scanning**:
|
|
|
+/// - For each contig:
|
|
|
+/// - Calculates the number of bins and chunks using ceiling division.
|
|
|
+/// - Processes chunks in parallel using `into_par_iter()`.
|
|
|
+/// - For each chunk:
|
|
|
+/// - Calculates chunk start position and length.
|
|
|
+/// - Processes individual bins within the chunk by creating `Bin` objects and converting them to `BinCount`.
|
|
|
+///
|
|
|
+/// 4. **Post-Processing**:
|
|
|
+/// - Sorts bins by their start positions using parallel sorting.
|
|
|
+/// - Identifies outlier bins using a custom function (`fill_outliers`).
|
|
|
+///
|
|
|
+/// 5. **Output**:
|
|
|
+/// - Writes results for each contig to a TSV file in the specified output directory.
|
|
|
+///
|
|
|
+/// ## Notes:
|
|
|
+/// - The function uses ceiling division (`div_ceil`) to handle edge cases in bin and chunk calculations.
|
|
|
+/// - It includes debug logging for various stages of processing.
|
|
|
+/// - Handles edge cases for the last chunk and bin to ensure proper processing of all data.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+/// This function will return an error if:
|
|
|
+/// - The output directory cannot be created.
|
|
|
+/// - The dictionary file cannot be read.
|
|
|
+/// - A `Bin` object cannot be created for a specific region.
|
|
|
+/// - Any I/O operation (e.g., writing results) fails.
|
|
|
+pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
+ let bin_size = config.count_bin_size;
|
|
|
+ let chunk_n_bin = config.count_n_chunks;
|
|
|
+ info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
|
|
|
+ fs::create_dir(out_dir)?;
|
|
|
+
|
|
|
+ for (contig, length) in read_dict(&config.dict_file)? {
|
|
|
+ if contig.as_str() == "chrM" {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ let n_bin = length / bin_size;
|
|
|
+ // Calculate number of chunks using ceiling division
|
|
|
+ let n_chunks = n_bin.div_ceil(chunk_n_bin);
|
|
|
+ info!("Scan of contig: {contig}");
|
|
|
+
|
|
|
+ let mut bins: Vec<BinCount> = (0..n_chunks)
|
|
|
+ .into_par_iter()
|
|
|
+ .flat_map(|i| {
|
|
|
+ // Calculate chunk start position
|
|
|
+ let chunk_start = i * chunk_n_bin * bin_size;
|
|
|
+
|
|
|
+ // Calculate chunk length
|
|
|
+ let chunk_length = if i == n_chunks - 1 {
|
|
|
+ length - chunk_start // Handle last chunk
|
|
|
+ } else {
|
|
|
+ chunk_n_bin * bin_size // Standard chunk size
|
|
|
+ };
|
|
|
+
|
|
|
+ // Calculate number of bins in this chunk with ceiling division
|
|
|
+ let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
|
|
|
+
|
|
|
+ // Process each bin in the chunk
|
|
|
+ (0..n_bins_in_chunk)
|
|
|
+ // .into_iter()
|
|
|
+ .filter_map(|j| {
|
|
|
+ let bin_start = chunk_start + j * bin_size;
|
|
|
+ // Ensure we don't exceed remaining length
|
|
|
+ let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
|
|
|
+ // debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
|
|
|
+ match Bin::new(bam_path, &contig, bin_start, bin_length) {
|
|
|
+ Ok(bin) => Some(BinCount::from(&bin)),
|
|
|
+ Err(e) => {
|
|
|
+ error!("Failed to get Bin: {e}");
|
|
|
+ None
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect::<Vec<BinCount>>()
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ debug!("Scan {contig}, sorting bins");
|
|
|
+ bins.par_sort_unstable_by(|a, b| a.start.cmp(&b.start));
|
|
|
+
|
|
|
+ debug!("Scan {contig}, computing outliers");
|
|
|
+ fill_outliers(&mut bins);
|
|
|
+
|
|
|
+ let out_file = format!("{out_dir}/{contig}_count.tsv");
|
|
|
+ debug!("Scan {contig}, writing file");
|
|
|
+ let mut file = File::create(out_file)?;
|
|
|
+ for bin in bins {
|
|
|
+ writeln!(file, "{}", bin.to_tsv_row())?;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Ok(())
|
|
|
+}
|
|
|
+
|
|
|
+/// Identifies and marks outliers in a slice of `BinCount` objects based on various ratio metrics.
|
|
|
+///
|
|
|
+/// # Parameters
|
|
|
+/// - `bin_counts: &mut [BinCount]`: A mutable slice of `BinCount` objects to process.
|
|
|
+///
|
|
|
+/// # Description
|
|
|
+/// This function analyzes the `BinCount` objects for outliers in three different ratios:
|
|
|
+/// supplementary alignments (SA), read starts, and read ends. It uses a modified Z-score
|
|
|
+/// method to identify outliers and marks them in the `outlier` field of each `BinCount`.
|
|
|
+///
|
|
|
+/// ## Workflow:
|
|
|
+/// 1. Wraps the input slice in a `Mutex` for thread-safe access.
|
|
|
+/// 2. Defines helper functions to extract specific ratios from `BinCount` objects.
|
|
|
+/// 3. Iterates over three outlier types: SA, Start, and End.
|
|
|
+/// 4. For each type:
|
|
|
+/// a. Filters and collects non-NaN ratios along with their indices.
|
|
|
+/// b. Identifies outliers using the `filter_outliers_modified_z_score_with_indices` function.
|
|
|
+/// c. Marks identified outliers in the original `BinCount` objects.
|
|
|
+///
|
|
|
+/// ## Parallelization:
|
|
|
+/// - Uses `par_iter()` for parallel processing of `BinCount` objects.
|
|
|
+/// - Applies outlier marking in parallel using `par_iter().for_each()`.
|
|
|
+///
|
|
|
+/// ## Thread Safety:
|
|
|
+/// - Uses `Mutex` to ensure thread-safe access to the shared `bin_counts` slice.
|
|
|
+///
|
|
|
+/// ## Outlier Types:
|
|
|
+/// - `BinOutlier::SA`: Outliers in supplementary alignment ratio.
|
|
|
+/// - `BinOutlier::Start`: Outliers in read start ratio.
|
|
|
+/// - `BinOutlier::End`: Outliers in read end ratio.
|
|
|
+///
|
|
|
+/// # Notes
|
|
|
+/// - This function modifies the input slice in-place.
|
|
|
+/// - It skips any `BinCount` objects with NaN ratios.
|
|
|
+/// - The `filter_outliers_modified_z_score_with_indices` function is assumed to be defined elsewhere
|
|
|
+/// and is responsible for the actual outlier detection algorithm.
|
|
|
+///
|
|
|
+/// # Example
|
|
|
+/// ```
|
|
|
+/// let mut bin_counts = vec![
|
|
|
+/// BinCount { ratio_sa: 0.1, ratio_start: 0.2, ratio_end: 0.3, outlier: None, /* other fields */ },
|
|
|
+/// // ... more BinCount objects ...
|
|
|
+/// ];
|
|
|
+/// fill_outliers(&mut bin_counts);
|
|
|
+/// // After this call, some BinCount objects may have their outlier field populated
|
|
|
+/// ```
|
|
|
+pub fn fill_outliers(bin_counts: &mut [BinCount]) {
|
|
|
+ let bin_counts = Mutex::new(bin_counts);
|
|
|
+
|
|
|
+ fn get_ratio_sa(c: &BinCount) -> f64 {
|
|
|
+ c.ratio_sa
|
|
|
+ }
|
|
|
+ fn get_ratio_start(c: &BinCount) -> f64 {
|
|
|
+ c.ratio_start
|
|
|
+ }
|
|
|
+ fn get_ratio_end(c: &BinCount) -> f64 {
|
|
|
+ c.ratio_end
|
|
|
+ }
|
|
|
+ type OutlierTypeInfo = (fn(&BinCount) -> f64, BinOutlier);
|
|
|
+
|
|
|
+ let outlier_types: [OutlierTypeInfo; 3] = [
|
|
|
+ (get_ratio_sa, BinOutlier::SA),
|
|
|
+ (get_ratio_start, BinOutlier::Start),
|
|
|
+ (get_ratio_end, BinOutlier::End),
|
|
|
+ ];
|
|
|
+
|
|
|
+ for (get_ratio, outlier_type) in outlier_types.iter() {
|
|
|
+ let (indices, ratios): (Vec<usize>, Vec<f64>) = bin_counts
|
|
|
+ .lock()
|
|
|
+ .unwrap()
|
|
|
+ .par_iter()
|
|
|
+ .enumerate()
|
|
|
+ .filter_map(|(i, c)| {
|
|
|
+ let ratio = get_ratio(c);
|
|
|
+ if !ratio.is_nan() {
|
|
|
+ Some((i, ratio))
|
|
|
+ } else {
|
|
|
+ None
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .unzip();
|
|
|
+
|
|
|
+ let outlier_indices = filter_outliers_modified_z_score_with_indices(&ratios, indices);
|
|
|
+
|
|
|
+ outlier_indices.par_iter().for_each(|&i| {
|
|
|
+ let mut bin_counts = bin_counts.lock().unwrap();
|
|
|
+ bin_counts[i]
|
|
|
+ .outlier
|
|
|
+ .get_or_insert_with(Vec::new)
|
|
|
+ .push(outlier_type.clone());
|
|
|
+ });
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
|
|
|
+///
|
|
|
+/// # Parameters
|
|
|
+/// - `id: &str`: The unique identifier for the sample being scanned.
|
|
|
+/// - `config: &Config`: A configuration object containing paths and settings for the scan.
|
|
|
+///
|
|
|
+/// # Returns
|
|
|
+/// - `anyhow::Result<()>`: Returns `Ok(())` if both scans succeed, or an error if any operation fails.
|
|
|
+///
|
|
|
+/// # Description
|
|
|
+/// This function performs a somatic scan by invoking the `par_whole_scan` function twice:
|
|
|
+/// once for the normal BAM file and once for the tumoral BAM file. The results are saved
|
|
|
+/// in separate directories specified by the configuration object.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+/// This function will return an error if:
|
|
|
+/// - Either call to `par_whole_scan` fails (e.g., due to invalid paths or processing errors).
|
|
|
+///
|
|
|
+/// ## Example Usage
|
|
|
+/// ```
|
|
|
+/// let config = Config::new(); // Assume Config is properly initialized
|
|
|
+/// let sample_id = "sample123";
|
|
|
+///
|
|
|
+/// match somatic_scan(sample_id, &config) {
|
|
|
+/// Ok(_) => println!("Somatic scan completed successfully."),
|
|
|
+/// Err(e) => eprintln!("Error during somatic scan: {}", e),
|
|
|
+/// }
|
|
|
+/// ```
|
|
|
+pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
|
|
|
+ info!("Starting scan for {id} normal.");
|
|
|
+ par_whole_scan(&config.normal_dir_count(id), &config.normal_bam(id), config)?;
|
|
|
+ info!("Starting scan for {id} tumoral.");
|
|
|
+ par_whole_scan(
|
|
|
+ &config.tumoral_dir_count(id),
|
|
|
+ &config.tumoral_bam(id),
|
|
|
+ config,
|
|
|
+ )
|
|
|
+}
|