
gff + scan

Thomas, 9 months ago · commit 1cb6ad7442
14 changed files with 1450 additions and 14 deletions
  1. src/annotation/vep.rs (+6 −0)
  2. src/callers/savana.rs (+2 −2)
  3. src/config.rs (+17 −0)
  4. src/helpers.rs (+0 −3)
  5. src/io/bam.rs (+246 −0)
  6. src/io/gff.rs (+29 −0)
  7. src/io/mod.rs (+2 −0)
  8. src/lib.rs (+73 −3)
  9. src/math.rs (+161 −0)
  10. src/positions.rs (+73 −3)
  11. src/scan/bin.rs (+343 −0)
  12. src/scan/mod.rs (+4 −0)
  13. src/scan/scan.rs (+429 −0)
  14. src/variant/variant_collection.rs (+65 −3)

+ 6 - 0
src/annotation/vep.rs

@@ -125,6 +125,12 @@ pub struct VEP {
     pub extra: VEPExtra,
 }
 
+impl VEP {
+    pub fn impact(&self) -> Option<VepImpact> {
+        self.extra.impact.clone()
+    }
+}
+
 /// Represents the predicted consequences of a genetic variant as determined by
 /// the Ensembl Variant Effect Predictor (VEP).
 ///
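A minimal sketch of how the new accessor is meant to compose downstream, mirroring its use in `somatic_rates` later in this commit (`variants` is a hypothetical `&[Variant]`):

```
// Count variants whose best VEP annotation is coding (impact <= MODERATE)
let n_coding = variants
    .iter()
    .filter_map(|v| v.best_vep().ok())
    .filter_map(|vep| vep.impact())
    .filter(|impact| *impact <= VepImpact::MODERATE)
    .count();
```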

+ 2 - 2
src/callers/savana.rs

@@ -237,7 +237,7 @@ impl FromStr for SavanaCNRow {
 
     fn from_str(s: &str) -> anyhow::Result<Self> {
         let cells: Vec<&str> = s.split("\t").collect();
-        let range = GenomeRange::from_1_inclusive(
+        let range = GenomeRange::from_1_inclusive_str(
             cells
                 .first()
                 .context(format!("Error while parsing contig {s}."))?,
@@ -340,7 +340,7 @@ impl FromStr for SavanaRCRow {
             .split_once(':')
             .and_then(|(a, b)| {
                 b.split_once('_').map(|(b, c)| {
-                    GenomeRange::from_1_inclusive(a, b, c)
+                    GenomeRange::from_1_inclusive_str(a, b, c)
                         .context(format!("Error while parsing range {}", bin))
                 })
             })

+ 17 - 0
src/config.rs

@@ -6,12 +6,16 @@ pub struct Config {
     pub reference: String,
     pub reference_name: String,
     pub dict_file: String,
+    pub refseq_gff: String,
     pub docker_max_memory_go: u16,
     pub savana_bin: String,
     pub savana_threads: u8,
     pub tumoral_name: String,
     pub normal_name: String,
     pub haplotagged_bam_tag_name: String,
+    pub count_dir_name: String,
+    pub count_bin_size: u32,
+    pub count_n_chunks: u32,
     pub savana_output_dir: String,
     pub savana_copy_number: String,
     pub savana_read_counts: String,
@@ -84,6 +88,7 @@ impl Default for Config {
             reference: "/data/ref/hs1/chm13v2.0.fa".to_string(),
             reference_name: "hs1".to_string(),
             dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
+            refseq_gff: "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz".to_string(),
 
             docker_max_memory_go: 400,
 
@@ -94,6 +99,10 @@ impl Default for Config {
             normal_name: "mrd".to_string(),
             haplotagged_bam_tag_name: "HP".to_string(),
 
+            count_dir_name: "counts".to_string(),
+            count_bin_size: 1_000,
+            count_n_chunks: 1_000,
+
             bam_min_mapq: 40,
             bam_n_threads: 150,
 
@@ -271,6 +280,14 @@ impl Config {
         )
     }
 
+    pub fn normal_dir_count(&self, id: &str) -> String {
+        format!("{}/{}", self.normal_dir(id), self.count_dir_name)
+    }
+
+    pub fn tumoral_dir_count(&self, id: &str) -> String {
+        format!("{}/{}", self.tumoral_dir(id), self.count_dir_name)
+    }
+
     pub fn mask_bed(&self, id: &str) -> String {
         self.mask_bed
             .replace("{result_dir}", &self.result_dir)

+ 0 - 3
src/helpers.rs

@@ -286,10 +286,7 @@ where
         return 0.0;
     }
 
-    // Calculate the sum of the values
     let sum: T = values.iter().copied().sum();
-
-    // Calculate and return the average as f64
     sum.into() / count as f64
 }
 

+ 246 - 0
src/io/bam.rs

@@ -0,0 +1,246 @@
+use std::collections::{HashMap, HashSet};
+
+use log::warn;
+use rust_htslib::bam::{record::Aux, IndexedReader, Read, Record};
+
+
+/// Parses an SA tag string and extracts chromosome and position information.
+///
+/// # Arguments
+///
+/// * `sa` - The SA tag string to parse
+///
+/// # Returns
+///
+/// A vector of tuples containing chromosome names and positions
+pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
+    sa.split(';')
+        .filter(|s| !s.is_empty())
+        .filter_map(|s| {
+            let parts: Vec<&str> = s.split(',').take(2).collect();
+            if parts.len() < 2 {
+                return None;
+            }
+
+            let chr = parts[0];
+            match parts[1].parse::<i32>() {
+                Ok(position) => Some((chr, position)),
+                Err(_) => None,
+            }
+        })
+        .collect()
+}
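+
+// A sketch of the expected behavior on a hypothetical SA tag (the SAM spec
+// format is `rname,pos,strand,CIGAR,mapQ,NM;` repeated): only the first two
+// fields of each entry are kept, and malformed entries are skipped silently.
+//
+//     let sa = "chr1,100,+,60M40S,60,2;chr5,5000,-,40S60M,55,1;";
+//     assert_eq!(parse_sa_tag(sa), vec![("chr1", 100), ("chr5", 5000)]);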
+
+/// Resolves the primary record for supplementary alignments using BAM SA tag
+///
+/// For supplementary alignments, searches linked primary records using genomic positions
+/// listed in the SA tag. Returns the first non-supplementary record with matching query name.
+///
+/// # Arguments
+/// * `bam` - Mutable reference to indexed BAM reader for random access
+/// * `record` - Input record to evaluate (typically a supplementary alignment)
+///
+/// # Returns
+/// - Original record if it's not supplementary
+/// - First matching primary record found via SA tag positions
+/// - Original record if no primary matches found
+///
+/// # Panics
+/// - If a BAM fetch or record read fails (`expect` is used internally).
+///   Malformed SA entries are skipped by `parse_sa_tag` rather than panicking.
+///
+/// # Example
+/// ```
+/// use rust_htslib::bam::{IndexedReader, Read};
+///
+/// let mut bam = IndexedReader::from_path("input.bam").unwrap();
+/// bam.fetch(("chr1", 1_000_000, 1_001_000)).unwrap();
+/// let record = bam.records().next().unwrap().unwrap();
+/// let primary = primary_record(&mut bam, record);
+/// ```
+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
+    // Return immediately if not a supplementary alignment
+    if !record.is_supplementary() {
+        return record;
+    }
+
+    let qname = record.qname();
+
+    // Process SA tag if present
+    if let Ok(Aux::String(sa)) = record.aux(b"SA") {
+        // Search potential primary alignments at each SA position
+        for (chr, start) in parse_sa_tag(sa) {
+            bam.fetch((chr, start, start + 1))
+                .expect("BAM fetch failed");
+
+            for result in bam.records() {
+                let candidate = result.expect("Invalid BAM record");
+                if candidate.qname() == qname && !candidate.is_supplementary() {
+                    return candidate.clone();
+                }
+            }
+        }
+    }
+
+    // Fallback to original record if no primary found
+    record
+}
+
+/// Creates optimized position bins for fetching records.
+///
+/// Groups positions by chromosome, binning together positions that lie within
+/// 1000 bp of the first position in the current bin.
+/// Sorts bins by size (descending) to prioritize regions with more alignment hits.
+///
+/// # Arguments
+///
+/// * `positions` - Vector of chromosome/position tuples
+///
+/// # Returns
+///
+/// A vector of position bins, sorted by bin size
+fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
+    // Sort positions by chromosome and position
+    let mut sorted_positions = positions.to_vec();
+    sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
+    sorted_positions.dedup();
+    
+    // Group by chromosome and create bins of ±1000 bp
+    let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
+    
+    for (chr, pos) in sorted_positions {
+        let bins = grouped.entry(chr).or_default();
+        
+        if let Some(last_bin) = bins.last_mut() {
+            if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
+                last_bin.push((chr, pos));
+            } else {
+                bins.push(vec![(chr, pos)]);
+            }
+        } else {
+            bins.push(vec![(chr, pos)]);
+        }
+    }
+    
+    // Flatten the per-chromosome bins, then sort by bin size (descending)
+    // so that regions with more alignment hits are fetched first
+    let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values()
+        .flatten()
+        .cloned()
+        .collect();
+
+    flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
+    
+    flattened
+}
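+
+// A sketch of the binning on hypothetical input: positions within 1000 bp of a
+// bin's first element share a bin, and larger bins sort first.
+//
+//     let positions = vec![("chr1", 100), ("chr1", 900), ("chr1", 5000), ("chr2", 100)];
+//     let bins = create_position_bins(&positions);
+//     assert_eq!(bins[0].len(), 2); // [("chr1", 100), ("chr1", 900)]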
+
+/// Retrieves primary alignment records based on a set of input records.
+///
+/// This function processes a collection of BAM records and retrieves their primary alignments.
+/// When supplementary alignments are found (with SA tags), it fetches the corresponding
+/// primary alignments from the provided BAM file.
+///
+/// # Arguments
+///
+/// * `bam` - A mutable reference to an IndexedReader for the BAM file
+/// * `records` - A vector of input records to process
+///
+/// # Returns
+///
+/// A vector of records containing both:
+/// - Original primary alignments from the input
+/// - Primary alignments fetched for any supplementary records in the input
+///
+/// # Examples
+///
+/// ```
+/// use rust_htslib::bam::{IndexedReader, Record};
+/// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
+/// let records = vec![/* some records */];
+/// let primary_alignments = primary_records(&mut bam, records);
+/// ```
+pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
+    let mut res = Vec::with_capacity(records.len());
+    let mut all_positions = Vec::new();
+    let mut all_qnames_to_fetch = Vec::new();
+    
+    // First pass: collect primary records and positions to fetch
+    for record in records.iter() {
+        if record.is_supplementary() {
+            let qname = record.qname();
+            // Safely extract and process the SA tag
+            if let Ok(Aux::String(sa)) = record.aux(b"SA") {
+                let positions = parse_sa_tag(sa);
+                all_positions.extend(positions);
+                
+                match String::from_utf8(qname.to_vec()) {
+                    Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
+                    Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
+                }
+            }
+        } else {
+            res.push(record.clone());
+        }
+    }
+    
+    // Track which query names we've already found
+    let mut found_qnames: HashSet<String> = res
+        .iter()
+        .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
+        .collect();
+    
+    // Create position bins for efficient fetching
+    let position_bins = create_position_bins(&all_positions);
+    
+    // Fetch records for each bin until we find all primaries
+    for bin in position_bins {
+        if bin.is_empty() {
+            continue;
+        }
+        
+        let (chr, start) = bin.first().unwrap();
+        let end = bin.last().unwrap().1 + 1;
+        
+        // Fetch and process records in this region
+        if let Err(e) = bam.fetch((*chr, *start, end)) {
+            warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
+            continue;
+        }
+        
+        for record_result in bam.records() {
+            match record_result {
+                Ok(record) => {
+                    // Skip secondary and supplementary alignments; only
+                    // primary records are wanted here
+                    if record.is_secondary() || record.is_supplementary() {
+                        continue;
+                    }
+                    
+                    // Check if this is a primary we're looking for
+                    match String::from_utf8(record.qname().to_vec()) {
+                        Ok(qname) => {
+                            if found_qnames.contains(&qname) {
+                                continue;
+                            }
+                            
+                            if all_qnames_to_fetch.contains(&qname) {
+                                res.push(record);
+                                found_qnames.insert(qname);
+                            }
+                        },
+                        Err(_) => continue,
+                    }
+                },
+                Err(e) => warn!("Error reading record: {}", e),
+            }
+        }
+        
+        // Early exit if we found all the records we were looking for
+        let remaining = all_qnames_to_fetch.iter()
+            .filter(|q| !found_qnames.contains(*q))
+            .count();
+        
+        if remaining == 0 {
+            break;
+        }
+    }
+    
+    res
+}

+ 29 - 0
src/io/gff.rs

@@ -0,0 +1,29 @@
+use anyhow::Context;
+use noodles_gff as gff;
+
+use crate::{config::Config, positions::GenomeRange};
+
+use super::readers::get_gz_reader;
+
+pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
+    let path = &config.refseq_gff;
+    let mut reader = get_gz_reader(path)
+        .map(gff::io::Reader::new)
+        .with_context(|| format!("Error while creating the reader of {path}"))?;
+
+    let mut res = Vec::new();
+    for result in reader.record_bufs() {
+        let record = result?;
+        let ty = record.ty();
+        if ty != feature_type {
+            continue;
+        }
+
+        res.push(GenomeRange::from_1_inclusive(
+            record.reference_sequence_name(),
+            record.start().get() as u32,
+            record.end().get() as u32,
+        ));
+    }
+    Ok(res)
+}
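Hedged usage, mirroring the `parse_gff` test added in `src/lib.rs` below: extract exon ranges once, then merge overlaps before any per-megabase arithmetic:

```
let config = Config::default();
let exons = features_ranges("exon", &config)?;
let merged = merge_overlapping_genome_ranges(&exons);
```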

+ 2 - 0
src/io/mod.rs

@@ -5,3 +5,5 @@ pub mod bed;
 pub mod dict;
 pub mod fasta;
 pub mod pod5_footer_generated;
+pub mod gff;
+pub mod bam;

+ 73 - 3
src/lib.rs

@@ -14,6 +14,8 @@ pub mod pipes;
 pub mod positions;
 pub mod annotation;
 pub mod cases;
+pub mod scan;
+pub mod math;
 
 #[macro_use]
 extern crate lazy_static;
@@ -25,7 +27,7 @@ lazy_static! {
 
 #[cfg(test)]
 mod tests {
-    use std::{collections::HashMap, fs, path::Path};
+    use std::{collections::HashMap, fs, ops::Index, path::Path};
 
     use annotation::{vep::{VepLine, VEP}, Annotations};
     use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
@@ -43,7 +45,7 @@ mod tests {
 
     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
     use super::*;
-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, pipes::somatic::const_stats, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
+    use crate::{annotation::{vep::VepImpact, Annotation}, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
 
     // export RUST_LOG="debug"
     fn init() {
@@ -753,7 +755,6 @@ mod tests {
             )
         });
         Ok(())
-
     }
 
     #[test]
@@ -801,4 +802,73 @@ Ok(())
         println!("{} lines", r.len());
         println!("{:#?}", r.first().unwrap());
     }
+
+    #[test]
+    fn whole_scan() -> anyhow::Result<()> {
+        init();
+        let id = "ADJAGBA";
+        let config = Config::default();
+        somatic_scan(id, &config)?;
+        Ok(())
+    }
+
+    #[test]
+    fn parse_gff() -> anyhow::Result<()> {
+        init();
+        let id = "ADJAGBA";
+        let config = Config::default();
+        let path = format!("{}/{id}/diag/somatic_variants.bit", config.result_dir);
+        let r = features_ranges("exon", &config)?;
+        let merged = merge_overlapping_genome_ranges(&r);
+        
+        let variants = variant_collection::Variants::load_from_file(&path)?;
+        let full = variant_collection::somatic_rates(&variants.data, &merged, &config);
+        info!("{full:#?}");
+
+        let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
+            .cloned().collect();
+        let min_2 = variant_collection::somatic_rates(&restrained, &merged, &config);
+        info!("{min_2:#?}");
+
+        let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
+            .cloned().collect();
+        let min_3 = variant_collection::somatic_rates(&restrained, &merged, &config);
+        info!("{min_3:#?}");
+
+
+        // info!("n variants loaded: {}", variants.data.len());
+        // 
+        // let r = features_ranges("exon", &config::Config::default())?;
+        // info!("n exon: {}", r.len());
+        //
+        // let merged = merge_overlapping_genome_ranges(&r);
+        // info!("n merged exon: {}", merged.len());
+        //
+        // let ol = par_overlaps(&variants.data, &r);
+        // info!("variants in exon {}", ol.len());
+        //
+        // let n_coding = ol.iter().filter_map(|i| variants.data[*i].best_vep().ok() ).filter_map(|bv| bv.impact()).filter(|impact| *impact <= VepImpact::MODERATE).count();
+        // info!("coding variants {n_coding}");
+        // 
+        // let n_bases_m = merged.par_iter().map(|gr| gr.length()).sum::<u32>();
+        // info!("{n_bases_m}nt");
+        // 
+        // let mega_base_m = n_bases_m as f64 / 10.0e6;
+        //
+        // let wgs_len = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum::<u32>();
+        // info!("wgs len {wgs_len}");
+        // let rate_wgs = variants.data.len() as f64 / (wgs_len as f64 / 10.0e6);
+        // info!("somatic mutation rate {rate_wgs:.2}/mb");
+        //
+        // let n_exons_mb = ol.len() as f64 / mega_base_m;
+        // info!("somatic mutation rate in the coding region {n_exons_mb:.2}/mb");
+        // 
+        // let n_exons_mb = n_coding as f64 / mega_base_m;
+        // info!("somatic non synonymous mutation rate in the coding region {n_exons_mb:.2}/mb");
+
+        Ok(())
+    }
 }

+ 161 - 0
src/math.rs

@@ -0,0 +1,161 @@
+use rayon::prelude::*;
+
+
+/// Filters outliers from a dataset using the Modified Z-Score method and returns the indices of the outliers.
+///
+/// The Modified Z-Score is a robust statistical method for detecting outliers. It is calculated as:
+/// \[
+/// \text{Modified Z-Score} = \frac{0.6745 \times |x - \text{median}|}{\text{MAD}}
+/// \]
+/// where:
+/// - \( x \) is a data point,
+/// - \( \text{median} \) is the median of the dataset,
+/// - \( \text{MAD} \) is the Median Absolute Deviation.
+///
+/// Data points with a Modified Z-Score greater than the threshold (default: 3.5) are considered outliers.
+///
+/// # Arguments
+///
+/// * `data` - A slice of `f64` values representing the dataset.
+/// * `indices` - A vector of `usize` values representing the indices of the corresponding data points.
+///
+/// # Returns
+///
+/// A vector of `usize` values containing the indices of the outliers.
+///
+/// # Examples
+///
+/// ```
+/// let data = vec![1.0, 2.0, 3.0, 100.0]; // 100.0 is an outlier
+/// let indices = vec![0, 1, 2, 3];
+/// let outlier_indices = filter_outliers_modified_z_score_with_indices(&data, indices);
+/// assert_eq!(outlier_indices, vec![3]);
+/// ```
+///
+/// # Notes
+///
+/// - This function uses parallel processing via the `rayon` crate for improved performance on large datasets.
+/// - The time complexity is O(n log n) due to the sorting step required for median calculation.
+///
+pub fn filter_outliers_modified_z_score_with_indices(
+    data: &[f64],
+    indices: Vec<usize>,
+) -> Vec<usize> {
+    // Early return if the dataset is empty
+    if data.is_empty() {
+        return Vec::new();
+    }
+
+    // Create a sorted copy of the data for median and MAD calculations
+    let mut sorted_data = data.to_vec();
+    sorted_data.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+
+    // Compute the median of the dataset
+    let median = compute_median(&sorted_data);
+
+    // Compute the Median Absolute Deviation (MAD) of the dataset
+    let mad = compute_mad(&sorted_data, median);
+
+    // Threshold for Modified Z-Score (commonly 3.5 for outlier detection)
+    const THRESHOLD: f64 = 3.5;
+
+    // Filter indices based on the Modified Z-Score
+    indices
+        .into_iter()
+        .zip(data.iter())
+        .filter_map(|(index, &x)| {
+            // Calculate the Modified Z-Score
+            let modified_z_score = 0.6745 * (x - median).abs() / mad;
+
+            // Keep the index if the Modified Z-Score exceeds the threshold
+            if modified_z_score > THRESHOLD {
+                Some(index)
+            } else {
+                None
+            }
+        })
+        .collect()
+}
+
+/// Computes the median of a **sorted** slice of `f64` values.
+///
+/// The median is the middle value in a sorted list of numbers. If the list has an even number
+/// of elements, the median is the average of the two middle values. This function assumes
+/// that the input data is already sorted.
+///
+/// # Arguments
+///
+/// * `data` - A **sorted** slice of `f64` values for which the median is to be computed.
+///
+/// # Returns
+///
+/// The median value as an `f64`. If the input slice is empty, returns `NaN`.
+///
+/// # Examples
+///
+/// ```
+/// let data = vec![1.0, 2.0, 3.0]; // Sorted input
+/// let median = compute_median(&data);
+/// assert_eq!(median, 2.0);
+///
+/// let data = vec![1.0, 2.0, 3.0, 4.0]; // Sorted input
+/// let median = compute_median(&data);
+/// assert_eq!(median, 2.5);
+/// ```
+pub fn compute_median(data: &[f64]) -> f64 {
+    // Early return if the dataset is empty
+    if data.is_empty() {
+        return f64::NAN; // Return NaN for an empty dataset
+    }
+
+    // Determine the length of the sorted data
+    let len = data.len();
+
+    // Calculate the median based on whether the length is even or odd
+    if len % 2 == 0 {
+        // If even, average the two middle values
+        (data[len / 2 - 1] + data[len / 2]) / 2.0
+    } else {
+        // If odd, return the middle value
+        data[len / 2]
+    }
+}
+
+/// Computes the Median Absolute Deviation (MAD) of a **sorted** slice of `f64` values.
+///
+/// The Median Absolute Deviation is a measure of statistical dispersion. It is calculated as the
+/// median of the absolute deviations of each data point from the median of the dataset. This function
+/// assumes that the input data is already sorted.
+///
+/// # Arguments
+///
+/// * `data` - A **sorted** slice of `f64` values for which the MAD is to be computed.
+/// * `median` - The median of the dataset, which is used as the central point for computing deviations.
+///
+/// # Returns
+///
+/// The Median Absolute Deviation as an `f64`. If the input slice is empty, returns `NaN`.
+///
+/// # Examples
+///
+/// ```
+/// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0]; // Sorted input
+/// let median = 3.0;
+/// let mad = compute_mad(&data, median);
+/// assert_eq!(mad, 1.0);
+/// ```
+pub fn compute_mad(data: &[f64], median: f64) -> f64 {
+    // Early return if the dataset is empty
+    if data.is_empty() {
+        return f64::NAN; // Return NaN for an empty dataset
+    }
+
+    // Compute the absolute deviations of each data point from the median
+    let mut deviations: Vec<f64> = data.par_iter().map(|&x| (x - median).abs()).collect();
+
+    // Sort the deviations to compute their median
+    deviations.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+
+    // Compute the median of the absolute deviations
+    compute_median(&deviations)
+}
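A small end-to-end sketch on made-up data (sorted values 9.5, 10, 11, 12, 95 give median 11 and MAD 1, so only 95.0 clears the 3.5 threshold):

```
let data = vec![10.0, 12.0, 11.0, 95.0, 9.5];
let indices: Vec<usize> = (0..data.len()).collect();
let outliers = filter_outliers_modified_z_score_with_indices(&data, indices);
assert_eq!(outliers, vec![3]); // index of 95.0
```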

+ 73 - 3
src/positions.rs

@@ -1,4 +1,10 @@
-use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr};
+use std::{
+    cmp::{max, Ordering},
+    fmt::Display,
+    iter::Sum,
+    ops::Range,
+    str::FromStr,
+};
 
 use anyhow::Context;
 use bitcode::{Decode, Encode};
@@ -469,20 +475,84 @@ impl GenomeRange {
     /// assert_eq!(range.range.start, 999);  // Converted to 0-based
     /// assert_eq!(range.range.end, 2000);   // End is exclusive in 0-based
     /// ```
-    pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
+    pub fn from_1_inclusive_str(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
         Ok(Self {
             contig: contig_to_num(contig),
             range: Range {
                 start: start
                     .parse::<u32>()
                     .context(format!("Can't parse {start} into u32."))?
-                    - 1,
+                    .saturating_sub(1),
                 end: end
                     .parse()
                     .context(format!("Can't parse {end} into u32."))?,
             },
         })
     }
+
+    pub fn from_1_inclusive(contig: &str, start: u32, end: u32) -> Self {
+        Self {
+            contig: contig_to_num(contig),
+            range: Range {
+                start: start.saturating_sub(1),
+                end,
+            },
+        }
+    }
+
+    pub fn length(&self) -> u32 {
+        self.range.end - self.range.start
+    }
+}
+
+impl GetGenomePosition for GenomePosition {
+    fn position(&self) -> &GenomePosition {
+        self
+    }
+}
+
+impl GetGenomeRange for GenomeRange {
+    fn range(&self) -> &GenomeRange {
+        self
+    }
+}
+
+/// Merges overlapping genome ranges in parallel.
+///
+/// Assumes `ranges` is already sorted by `(contig, start)`, as is the case for
+/// features read from a coordinate-sorted GFF.
+pub fn merge_overlapping_genome_ranges(ranges: &[GenomeRange]) -> Vec<GenomeRange> {
+    if ranges.is_empty() {
+        return vec![];
+    }
+
+    let chunk_size = (ranges.len() / rayon::current_num_threads()).max(1);
+
+    ranges
+        .par_chunks(chunk_size)
+        .map(|chunk| {
+            let mut merged = Vec::new();
+            let mut current = chunk[0].clone();
+
+            for range in chunk.iter().skip(1) {
+                if current.contig == range.contig && current.range.end >= range.range.start {
+                    current.range.end = max(current.range.end, range.range.end);
+                } else {
+                    merged.push(current);
+                    current = range.clone();
+                }
+            }
+            merged.push(current);
+            merged
+        })
+        .reduce(Vec::new, |mut acc, mut chunk| {
+            // One range at the tail of `acc` can span several already-merged
+            // ranges at the head of `chunk`, so keep merging until the
+            // boundary is clear
+            while let (Some(last), Some(first)) = (acc.last_mut(), chunk.first()) {
+                if last.contig == first.contig && last.range.end >= first.range.start {
+                    last.range.end = max(last.range.end, first.range.end);
+                    chunk.remove(0);
+                } else {
+                    break;
+                }
+            }
+            acc.extend(chunk);
+            acc
+        })
 }
 
 /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
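A sketch of the merge on hypothetical 1-based inclusive ranges (already sorted, as the function assumes):

```
let ranges = vec![
    GenomeRange::from_1_inclusive("chr1", 1, 100),
    GenomeRange::from_1_inclusive("chr1", 50, 200), // overlaps the first
    GenomeRange::from_1_inclusive("chr1", 300, 400),
];
let merged = merge_overlapping_genome_ranges(&ranges);
assert_eq!(merged.len(), 2); // chr1:1-200 and chr1:300-400
```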

+ 343 - 0
src/scan/bin.rs

@@ -0,0 +1,343 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
+
+use crate::io::bam::{primary_record, primary_records};
+
+/// A genomic bin containing reads from a specific region.
+///
+/// This struct represents a segment of a BAM file, storing reads that overlap
+/// a specific genomic region. It provides methods to analyze read distributions
+/// and extract primary alignments for supplementary alignments.
+#[derive(Debug)]
+pub struct Bin {
+    /// The contig/chromosome name
+    pub contig: String,
+    /// The 0-based inclusive start position
+    pub start: u32,
+    /// The 0-based inclusive end position
+    pub end: u32,
+    /// Map of read query names to their BAM records
+    pub reads_store: HashMap<Vec<u8>, Record>,
+    /// Indexed BAM reader for fetching additional records
+    pub bam_reader: IndexedReader,
+    /// Average read length within this bin
+    pub reads_mean_len: u32,
+    /// Number of reads filtered due to low mapping quality
+    pub n_low_mapq: u32,
+}
+
+impl Bin {
+    /// Creates a new genomic bin from a BAM file.
+    ///
+    /// # Arguments
+    ///
+    /// * `bam_path` - Path to the BAM file
+    /// * `contig` - Chromosome/contig name
+    /// * `start` - 0-based inclusive start position
+    /// * `length` - Length of the region to extract
+    ///
+    /// # Returns
+    ///
+    /// A Result containing the new Bin if successful, or an error if the BAM file
+    /// couldn't be read or the region couldn't be fetched.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
+    /// println!("Loaded {} reads", bin.n_reads());
+    /// ```
+    pub fn new(bam_path: &str, contig: &str, start: u32, length: u32) -> anyhow::Result<Self> {
+        const MIN_MAPQ: u8 = 50;
+        let mut bam_reader = IndexedReader::from_path(bam_path)
+            .with_context(|| format!("Can't open BAM file: {}", bam_path))?;
+
+        // Calculate inclusive end position
+        let end = start + length - 1;
+
+        // Fetch reads in the required region (htslib regions are 0-based
+        // half-open, so pass end + 1 to include the last base)
+        bam_reader
+            .fetch((contig, start as i64, (end + 1) as i64))
+            .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
+
+        let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
+        let mut n_low_mapq = 0;
+        let mut lengths = Vec::new();
+
+        // Process all records in the region
+        for read_result in bam_reader.records() {
+            let record = read_result.context("Error while parsing BAM record")?;
+
+            // Skip reads with low mapping quality
+            if record.mapq() < MIN_MAPQ {
+                n_low_mapq += 1;
+                continue;
+            }
+
+            // Store the read length
+            lengths.push(
+                record
+                    .reference_end()
+                    .saturating_sub(record.reference_start()),
+            );
+
+            // Store the record by query name
+            reads_store.insert(record.qname().to_vec(), record);
+        }
+
+        // Calculate mean read length, handling the empty case
+        let reads_mean_len = if lengths.is_empty() {
+            0
+        } else {
+            (lengths.iter().sum::<i64>() as f64 / lengths.len() as f64) as u32
+        };
+
+        Ok(Bin {
+            contig: contig.to_string(),
+            start,
+            end,
+            reads_store,
+            bam_reader,
+            n_low_mapq,
+            reads_mean_len,
+        })
+    }
+
+    /// Returns the total number of reads in this bin.
+    pub fn n_reads(&self) -> usize {
+        self.reads_store.len()
+    }
+
+    /// Returns the number of reads with SA (supplementary alignment) tags.
+    pub fn n_sa(&self) -> usize {
+        self.reads_store
+            .values()
+            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
+            .count()
+    }
+
+    /// Retrieves primary alignments for all reads with SA tags in this bin.
+    ///
+    /// # Returns
+    ///
+    /// A vector of primary alignment records for reads with supplementary alignments.
+    pub fn sa_primary(&mut self) -> Vec<Record> {
+        self.reads_store
+            .values()
+            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .collect()
+    }
+
+    /// Finds the position with the highest concentration of read starts or ends.
+    ///
+    /// # Returns
+    ///
+    /// A tuple containing (position, count) of the position with the most read starts or ends.
+    pub fn max_start_or_end(&self) -> (u32, usize) {
+        let mut position_counts: HashMap<u32, usize> = HashMap::new();
+
+        for record in self.reads_store.values() {
+            let reference_start = record.reference_start() as u32;
+            let reference_end = record.reference_end() as u32;
+
+            // Count start positions within the bin
+            if reference_start >= self.start && reference_start <= self.end {
+                *position_counts.entry(reference_start).or_default() += 1;
+            }
+
+            // Count end positions within the bin
+            if reference_end >= self.start && reference_end <= self.end {
+                *position_counts.entry(reference_end).or_default() += 1;
+            }
+        }
+
+        position_counts
+            .into_iter()
+            .max_by_key(|(_, count)| *count)
+            .unwrap_or((0, 0))
+    }
+
+    /// Computes the number of reads with SA tags, and finds the highest
+    /// counts of read starts and ends at the same position.
+    ///
+    /// # Returns
+    ///
+    /// A tuple containing:
+    /// - Number of reads with SA tags
+    /// - Highest count of reads starting at any single position
+    /// - Highest count of reads ending at any single position
+    pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
+        let mut n_sa = 0;
+        let mut start_counts: HashMap<u32, usize> = HashMap::new();
+        let mut end_counts: HashMap<u32, usize> = HashMap::new();
+
+        for record in self.reads_store.values() {
+            // Count reads with SA tags
+            if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
+                n_sa += 1;
+            }
+
+            let reference_start = record.reference_start() as u32;
+            let reference_end = record.reference_end() as u32;
+
+            // Count start positions within the bin
+            if reference_start >= self.start && reference_start <= self.end {
+                *start_counts.entry(reference_start).or_default() += 1;
+            }
+
+            // Count end positions within the bin
+            if reference_end >= self.start && reference_end <= self.end {
+                *end_counts.entry(reference_end).or_default() += 1;
+            }
+        }
+
+        let max_start_count = start_counts.values().max().copied().unwrap_or(0);
+        let max_end_count = end_counts.values().max().copied().unwrap_or(0);
+
+        (n_sa, max_start_count, max_end_count)
+    }
+
+    /// Finds the position with the highest concentration of read starts.
+    ///
+    /// # Returns
+    ///
+    /// A tuple containing (position, count) of the position with the most read starts.
+    pub fn max_start(&self) -> (u32, usize) {
+        let mut start_counts: HashMap<u32, usize> = HashMap::new();
+
+        for record in self.reads_store.values() {
+            let reference_start = record.reference_start() as u32;
+
+            if reference_start >= self.start && reference_start <= self.end {
+                *start_counts.entry(reference_start).or_default() += 1;
+            }
+        }
+
+        start_counts
+            .into_iter()
+            .max_by_key(|(_, count)| *count)
+            .unwrap_or((0, 0))
+    }
+
+    /// Finds the position with the highest concentration of read ends.
+    ///
+    /// # Returns
+    ///
+    /// A tuple containing (position, count) of the position with the most read ends.
+    pub fn max_end(&self) -> (u32, usize) {
+        let mut end_counts: HashMap<u32, usize> = HashMap::new();
+
+        for record in self.reads_store.values() {
+            let reference_end = record.reference_end() as u32;
+
+            if reference_end >= self.start && reference_end <= self.end {
+                *end_counts.entry(reference_end).or_default() += 1;
+            }
+        }
+
+        end_counts
+            .into_iter()
+            .max_by_key(|(_, count)| *count)
+            .unwrap_or((0, 0))
+    }
+
+    /// Gets primary alignments for reads that start or end at a specific position.
+    ///
+    /// # Arguments
+    ///
+    /// * `pos` - The position to check for read starts or ends
+    ///
+    /// # Returns
+    ///
+    /// A vector of primary alignment records.
+    pub fn se_primary(&mut self, pos: u32) -> Vec<Record> {
+        let records: Vec<Record> = self
+            .reads_store
+            .values()
+            .filter(|record| {
+                record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
+            })
+            .cloned()
+            .collect();
+
+        primary_records(&mut self.bam_reader, records)
+    }
+
+    /// Gets primary alignments for reads that start at a specific position.
+    ///
+    /// # Arguments
+    ///
+    /// * `pos` - The position to check for read starts
+    ///
+    /// # Returns
+    ///
+    /// A vector of primary alignment records.
+    pub fn s_primary(&mut self, pos: u32) -> Vec<Record> {
+        self.reads_store
+            .values()
+            .filter(|record| record.reference_start() as u32 == pos)
+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .collect()
+    }
+
+    /// Gets primary alignments for reads that end at a specific position.
+    ///
+    /// # Arguments
+    ///
+    /// * `pos` - The position to check for read ends
+    ///
+    /// # Returns
+    ///
+    /// A vector of primary alignment records.
+    pub fn e_primary(&mut self, pos: u32) -> Vec<Record> {
+        self.reads_store
+            .values()
+            .filter(|record| record.reference_end() as u32 == pos)
+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
+            .collect()
+    }
+
+    /// Calculates the mean coverage by considering the actual
+    /// span of each read in the bin.
+    ///
+    /// # Returns
+    ///
+    /// The mean bin coverage as a floating-point value.
+    pub fn mean_coverage(&self) -> f64 {
+        // If bin has no length or no reads, return 0
+        if self.end <= self.start || self.reads_store.is_empty() {
+            return 0.0;
+        }
+
+        // Calculate the total length of the bin
+        let bin_length = (self.end - self.start + 1) as f64;
+
+        // Calculate the total bases covered by all reads within the bin
+        let mut total_bases_covered = 0.0;
+        for record in self.reads_store.values() {
+            let read_start = record.reference_start() as u32;
+            let read_end = record.reference_end() as u32;
+
+            // Skip reads that don't overlap with our bin
+            if read_end < self.start || read_start > self.end {
+                continue;
+            }
+
+            // Calculate the overlapping region
+            let overlap_start = read_start.max(self.start);
+            let overlap_end = read_end.min(self.end);
+
+            // Add the number of bases this read covers within our bin
+            total_bases_covered += (overlap_end - overlap_start + 1) as f64;
+        }
+
+        // Divide by the bin length to get average coverage
+        total_bases_covered / bin_length
+    }
+}
+
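A hedged sketch of driving a `Bin` by hand, inside any function returning `anyhow::Result` ("tumor.bam" is a hypothetical indexed BAM):

```
let mut bin = Bin::new("tumor.bam", "chr1", 1_000_000, 1_000)?;
let (n_sa, max_start, max_end) = bin.count_reads_sa_start_end();
println!(
    "{} reads ({} low-MAPQ skipped), {:.1}x coverage, {} SA reads",
    bin.n_reads(), bin.n_low_mapq, bin.mean_coverage(), n_sa
);
println!("max co-starting {max_start}, max co-ending {max_end}");
```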

+ 4 - 0
src/scan/mod.rs

@@ -0,0 +1,4 @@
+pub mod bin;
+pub mod scan;

+ 429 - 0
src/scan/scan.rs

@@ -0,0 +1,429 @@
+use std::{fmt, fs, fs::File, io::Write};
+
+use log::{debug, error, info};
+use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
+use rayon::{
+    iter::{IntoParallelIterator, ParallelIterator},
+    slice::ParallelSliceMut,
+};
+
+use crate::math::filter_outliers_modified_z_score_with_indices;
+use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
+
+/// Represents a count of reads in a genomic bin, including various metrics and outlier information.
+#[derive(Debug)]
+pub struct BinCount {
+    /// The name of the contig (chromosome) this bin belongs to.
+    pub contig: String,
+    /// The start position of the bin in the contig.
+    pub start: u32,
+    /// The end position of the bin in the contig.
+    pub end: u32,
+    /// The total number of reads in this bin.
+    pub n_reads: u32,
+    /// The average coverage of reads in this bin.
+    pub coverage: f64,
+    /// The ratio of reads carrying SA (supplementary alignment) tags to total reads.
+    pub ratio_sa: f64,
+    /// The highest count of reads starting at a single position, as a ratio of total reads.
+    pub ratio_start: f64,
+    /// The highest count of reads ending at a single position, as a ratio of total reads.
+    pub ratio_end: f64,
+    /// Optional vector of outlier types for this bin.
+    pub outlier: Option<Vec<BinOutlier>>,
+}
+
+impl From<&Bin> for BinCount {
+    /// Converts a `Bin` reference to a `BinCount`.
+    ///
+    /// # Parameters
+    /// - `bin: &Bin`: A reference to the `Bin` object to convert.
+    ///
+    /// # Returns
+    /// A new `BinCount` instance populated with data from the `Bin`.
+    fn from(bin: &Bin) -> Self {
+        let n_reads = bin.n_reads();
+        // An empty bin yields NaN ratios (0/0); `fill_outliers` skips these
+        let n_reads_float = n_reads as f64;
+        let (n_sa, n_start, n_end) = bin.count_reads_sa_start_end();
+
+        Self {
+            contig: bin.contig.clone(),
+            start: bin.start,
+            end: bin.end,
+            n_reads: n_reads as u32,
+            coverage: bin.mean_coverage(),
+            ratio_sa: n_sa as f64 / n_reads_float,
+            ratio_start: n_start as f64 / n_reads_float,
+            ratio_end: n_end as f64 / n_reads_float,
+            outlier: None,
+        }
+    }
+}
+
+impl BinCount {
+    /// Converts the `BinCount` instance to a TSV (Tab-Separated Values) row string.
+    ///
+    /// # Returns
+    /// A `String` representing the `BinCount` as a TSV row.
+    pub fn to_tsv_row(&self) -> String {
+        format!(
+            "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}",
+            self.contig,
+            self.start,
+            self.end,
+            self.n_reads,
+            self.coverage,
+            self.ratio_sa,
+            self.ratio_start,
+            self.ratio_end,
+            self.outlier
+                .clone()
+                .map(|v| v
+                    .iter()
+                    .map(|e| e.to_string())
+                    .collect::<Vec<String>>()
+                    .join(", "))
+                .unwrap_or_default()
+        )
+    }
+
+    /// Parses a TSV row and creates a BinCount object.
+    ///
+    /// # Parameters
+    /// - `row: &str`: A string slice representing a TSV row.
+    ///
+    /// # Returns
+    /// A `Result` containing either a `BinCount` object if parsing is successful,
+    /// or an `anyhow::Error` if parsing fails.
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// - The row does not contain the expected number of fields.
+    /// - Any of the numeric fields fail to parse.
+    /// - The outlier field is not in the expected format.
+    pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
+        let fields: Vec<&str> = row.split('\t').collect();
+        if fields.len() != 9 {
+            anyhow::bail!("Invalid number of fields in TSV row");
+        }
+
+        let outlier = if !fields[8].is_empty() {
+            Some(
+                fields[8]
+                    .split(", ")
+                    .map(|s| match s {
+                        "SA" => Ok(BinOutlier::SA),
+                        "Start" => Ok(BinOutlier::Start),
+                        "End" => Ok(BinOutlier::End),
+                        _ => Err(anyhow::anyhow!("Invalid outlier type: {}", s)),
+                    })
+                    .collect::<Result<Vec<BinOutlier>, _>>()?,
+            )
+        } else {
+            None
+        };
+
+        Ok(BinCount {
+            contig: fields[0].to_string(),
+            start: fields[1].parse()?,
+            end: fields[2].parse()?,
+            n_reads: fields[3].parse()?,
+            coverage: fields[4].parse()?,
+            ratio_sa: fields[5].parse()?,
+            ratio_start: fields[6].parse()?,
+            ratio_end: fields[7].parse()?,
+            outlier,
+        })
+    }
+}
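+
+// The two methods are intended to round-trip; a sketch, where `bin_count` is a
+// hypothetical `BinCount` and the caller returns `anyhow::Result`:
+//
+//     let row = bin_count.to_tsv_row();
+//     let parsed = BinCount::from_tsv_row(&row)?;
+//     assert_eq!(parsed.start, bin_count.start);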
+
+// impl GetGenomeRange for BinCount {
+//     fn range(&self) -> GenomeRange {
+//         GenomeRange { contig: contig_to_num(&self.contig), range: self.start..(self.end + 1) }
+//     }
+// }
+
+
+/// Represents types of outliers that can be detected in a genomic bin.
+#[derive(Debug, Clone)]
+pub enum BinOutlier {
+    /// Indicates an outlier in supplementary alignments.
+    SA,
+    /// Indicates an outlier in reads starting in this bin.
+    Start,
+    /// Indicates an outlier in reads ending in this bin.
+    End,
+}
+
+impl fmt::Display for BinOutlier {
+    /// Implements the `Display` trait for `BinOutlier`.
+    ///
+    /// # Parameters
+    /// - `f: &mut fmt::Formatter<'_>`: The formatter to write the string representation.
+    ///
+    /// # Returns
+    /// A `fmt::Result` indicating success or failure of the formatting operation.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            BinOutlier::SA => write!(f, "SA"),
+            BinOutlier::Start => write!(f, "Start"),
+            BinOutlier::End => write!(f, "End"),
+        }
+    }
+}
+
+/// Performs a parallel whole genome scan on a BAM file, counting reads in bins across the genome
+/// and identifying outliers. The results are written to TSV files, one per contig.
+///
+/// # Parameters
+/// - `out_dir: &str`: The output directory where results will be saved.
+/// - `bam_path: &str`: The path to the input BAM file.
+/// - `config: &Config`: A configuration object containing the following fields:
+///   - `count_bin_size`: The size of each bin in base pairs.
+///   - `count_n_chunks`: The number of bins per chunk for parallel processing.
+///   - `dict_file`: Path to the dictionary file containing contig names and lengths.
+///
+/// # Returns
+/// - `anyhow::Result<()>`: Returns `Ok(())` if successful, or an error if any operation fails.
+///
+/// # Description
+/// This function processes the genome in parallel by dividing each contig into chunks of bins.
+/// Each bin represents a region of the genome, and the function counts reads in these bins.
+/// After processing, bins are sorted, outliers are identified, and results are written to TSV files.
+///
+/// ## Workflow:
+/// 1. **Initialization**:
+///    - Logs the start of the scan with bin size and chunk information.
+///    - Creates the output directory if it does not exist.
+///
+/// 2. **Contig Processing**:
+///    - Reads contigs and their lengths from the dictionary file.
+///    - Skips the mitochondrial chromosome ("chrM").
+///
+/// 3. **Parallel Scanning**:
+///    - For each contig:
+///      - Calculates the number of bins and chunks using ceiling division.
+///      - Processes chunks in parallel using `into_par_iter()`.
+///      - For each chunk:
+///        - Calculates chunk start position and length.
+///        - Processes individual bins within the chunk by creating `Bin` objects and converting them to `BinCount`.
+///
+/// 4. **Post-Processing**:
+///    - Sorts bins by their start positions using parallel sorting.
+///    - Identifies outlier bins using a custom function (`fill_outliers`).
+///
+/// 5. **Output**:
+///    - Writes results for each contig to a TSV file in the specified output directory.
+///
+/// ## Notes:
+/// - The function uses ceiling division (`div_ceil`) to handle edge cases in bin and chunk calculations.
+/// - It includes debug logging for various stages of processing.
+/// - Handles edge cases for the last chunk and bin to ensure proper processing of all data.
+///
+/// # Errors
+/// This function will return an error if:
+/// - The output directory cannot be created.
+/// - The dictionary file cannot be read.
+/// - A `Bin` object cannot be created for a specific region.
+/// - Any I/O operation (e.g., writing results) fails.
+pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow::Result<()> {
+    let bin_size = config.count_bin_size;
+    let chunk_n_bin = config.count_n_chunks;
+    info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
+    fs::create_dir_all(out_dir)?;
+
+    for (contig, length) in read_dict(&config.dict_file)? {
+        if contig.as_str() == "chrM" {
+            continue;
+        }
+
+        let n_bin = length / bin_size;
+        // Calculate number of chunks using ceiling division
+        let n_chunks = n_bin.div_ceil(chunk_n_bin);
+        info!("Scan of contig: {contig}");
+
+        let mut bins: Vec<BinCount> = (0..n_chunks)
+            .into_par_iter()
+            .flat_map(|i| {
+                // Calculate chunk start position
+                let chunk_start = i * chunk_n_bin * bin_size;
+
+                // Calculate chunk length
+                let chunk_length = if i == n_chunks - 1 {
+                    length - chunk_start // Handle last chunk
+                } else {
+                    chunk_n_bin * bin_size // Standard chunk size
+                };
+
+                // Calculate number of bins in this chunk with ceiling division
+                let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
+
+                // Process each bin in the chunk
+                // Process each bin in the chunk
+                (0..n_bins_in_chunk)
+                    .filter_map(|j| {
+                        let bin_start = chunk_start + j * bin_size;
+                        // Ensure we don't exceed remaining length
+                        let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
+                        // debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
+                        match Bin::new(bam_path, &contig, bin_start, bin_length) {
+                            Ok(bin) => Some(BinCount::from(&bin)),
+                            Err(e) => {
+                                error!("Failed to get Bin: {e}");
+                                None
+                            }
+                        }
+                    })
+                    .collect::<Vec<BinCount>>()
+            })
+            .collect();
+
+        debug!("Scan {contig}, sorting bins");
+        bins.par_sort_unstable_by(|a, b| a.start.cmp(&b.start));
+
+        debug!("Scan {contig}, computing outliers");
+        fill_outliers(&mut bins);
+
+        let out_file = format!("{out_dir}/{contig}_count.tsv");
+        debug!("Scan {contig}, writing file");
+        let mut file = File::create(out_file)?;
+        for bin in bins {
+            writeln!(file, "{}", bin.to_tsv_row())?;
+        }
+    }
+    Ok(())
+}
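+
+// Worked example of the chunk arithmetic, with the defaults added in config.rs
+// (count_bin_size = 1_000, count_n_chunks = 1_000): a contig of 2_345_678 nt
+// gives n_bin = 2_345 and n_chunks = ceil(2_345 / 1_000) = 3. The last chunk
+// starts at 2_000_000, covers the remaining 345_678 nt, and holds
+// ceil(345_678 / 1_000) = 346 bins, the final bin being 678 nt long.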
+
+/// Identifies and marks outliers in a slice of `BinCount` objects based on various ratio metrics.
+///
+/// # Parameters
+/// - `bin_counts: &mut [BinCount]`: A mutable slice of `BinCount` objects to process.
+///
+/// # Description
+/// This function analyzes the `BinCount` objects for outliers in three different ratios:
+/// supplementary alignments (SA), read starts, and read ends. It uses a modified Z-score
+/// method to identify outliers and marks them in the `outlier` field of each `BinCount`.
+///
+/// ## Workflow:
+/// 1. Defines helper functions to extract specific ratios from `BinCount` objects.
+/// 2. Iterates over three outlier types: SA, Start, and End.
+/// 3. For each type:
+///    a. Filters and collects non-NaN ratios along with their indices in parallel.
+///    b. Identifies outliers using the `filter_outliers_modified_z_score_with_indices` function.
+///    c. Marks identified outliers in the original `BinCount` objects.
+///
+/// ## Parallelization:
+/// - Uses `par_iter()` for the ratio-collection pass; marking the outliers is
+///   a cheap sequential pass over the mutable slice.
+///
+/// ## Outlier Types:
+/// - `BinOutlier::SA`: Outliers in supplementary alignment ratio.
+/// - `BinOutlier::Start`: Outliers in read start ratio.
+/// - `BinOutlier::End`: Outliers in read end ratio.
+///
+/// # Notes
+/// - This function modifies the input slice in-place.
+/// - It skips any `BinCount` objects with NaN ratios.
+/// - The `filter_outliers_modified_z_score_with_indices` function is assumed to be defined elsewhere
+///   and is responsible for the actual outlier detection algorithm.
+///
+/// # Example
+/// ```
+/// let mut bin_counts = vec![
+///     BinCount { ratio_sa: 0.1, ratio_start: 0.2, ratio_end: 0.3, outlier: None, /* other fields */ },
+///     // ... more BinCount objects ...
+/// ];
+/// fill_outliers(&mut bin_counts);
+/// // After this call, some BinCount objects may have their outlier field populated
+/// ```
+pub fn fill_outliers(bin_counts: &mut [BinCount]) {
+    fn get_ratio_sa(c: &BinCount) -> f64 {
+        c.ratio_sa
+    }
+    fn get_ratio_start(c: &BinCount) -> f64 {
+        c.ratio_start
+    }
+    fn get_ratio_end(c: &BinCount) -> f64 {
+        c.ratio_end
+    }
+    type OutlierTypeInfo = (fn(&BinCount) -> f64, BinOutlier);
+
+    let outlier_types: [OutlierTypeInfo; 3] = [
+        (get_ratio_sa, BinOutlier::SA),
+        (get_ratio_start, BinOutlier::Start),
+        (get_ratio_end, BinOutlier::End),
+    ];
+
+    for (get_ratio, outlier_type) in outlier_types.iter() {
+        // Collect non-NaN ratios and their original indices in parallel
+        let (indices, ratios): (Vec<usize>, Vec<f64>) = bin_counts
+            .par_iter()
+            .enumerate()
+            .filter_map(|(i, c)| {
+                let ratio = get_ratio(c);
+                if !ratio.is_nan() {
+                    Some((i, ratio))
+                } else {
+                    None
+                }
+            })
+            .unzip();
+
+        let outlier_indices = filter_outliers_modified_z_score_with_indices(&ratios, indices);
+
+        // Mark the outliers; a sequential pass over the mutable slice avoids
+        // the per-element locking that a shared Mutex would require
+        for &i in &outlier_indices {
+            bin_counts[i]
+                .outlier
+                .get_or_insert_with(Vec::new)
+                .push(outlier_type.clone());
+        }
+    }
+}
+
+/// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
+///
+/// # Parameters
+/// - `id: &str`: The unique identifier for the sample being scanned.
+/// - `config: &Config`: A configuration object containing paths and settings for the scan.
+///
+/// # Returns
+/// - `anyhow::Result<()>`: Returns `Ok(())` if both scans succeed, or an error if any operation fails.
+///
+/// # Description
+/// This function performs a somatic scan by invoking the `par_whole_scan` function twice:
+/// once for the normal BAM file and once for the tumoral BAM file. The results are saved
+/// in separate directories specified by the configuration object.
+///
+/// # Errors
+/// This function will return an error if:
+/// - Either call to `par_whole_scan` fails (e.g., due to invalid paths or processing errors).
+///
+/// ## Example Usage
+/// ```
+/// let config = Config::default();
+/// let sample_id = "sample123";
+///
+/// match somatic_scan(sample_id, &config) {
+///     Ok(_) => println!("Somatic scan completed successfully."),
+///     Err(e) => eprintln!("Error during somatic scan: {}", e),
+/// }
+/// ```
+pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
+    info!("Starting scan for {id} normal.");
+    par_whole_scan(&config.normal_dir_count(id), &config.normal_bam(id), config)?;
+    info!("Starting scan for {id} tumoral.");
+    par_whole_scan(
+        &config.tumoral_dir_count(id),
+        &config.tumoral_bam(id),
+        config,
+    )
+}

+ 65 - 3
src/variant/variant_collection.rs

@@ -22,16 +22,23 @@ use crate::{
         cosmic::Cosmic,
         echtvar::{parse_echtvar_val, run_echtvar},
         gnomad::GnomAD,
-        vep::{get_best_vep, run_vep, VepLine, VEP},
+        vep::{get_best_vep, run_vep, VepImpact, VepLine, VEP},
         Annotation, Annotations,
     },
     collection::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
+    config::Config,
     helpers::{app_storage_dir, bin_data, estimate_shannon_entropy, mean, temp_file_path, Hash128},
-    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
-    positions::GenomePosition,
+    io::{
+        dict::read_dict, fasta::sequence_at, gff::features_ranges, readers::get_reader,
+        vcf::vcf_header,
+    },
+    positions::{
+        merge_overlapping_genome_ranges, par_overlaps, GenomePosition, GenomeRange,
+        GetGenomePosition,
+    },
 };
 
 /// A collection of VCF variants along with associated metadata.
@@ -424,6 +431,12 @@ impl PartialEq for Variant {
     }
 }
 
+impl GetGenomePosition for Variant {
+    fn position(&self) -> &GenomePosition {
+        &self.position
+    }
+}
+
 impl Variant {
     pub fn id(&self) -> String {
         format!(
@@ -941,6 +954,55 @@ impl VariantsStats {
     }
 }
 
+#[derive(Debug)]
+pub struct SomaticVariantRates {
+    pub wgs_length: u32,
+    pub total_variants: usize,
+    pub somatic_mutation_rate_wgs: f64,
+    pub exon_count: usize,
+    pub variants_in_coding: usize,
+    pub coding_variants: usize,
+    pub total_exon_bases: u32,
+    pub somatic_mutation_rate_coding: f64,
+    pub somatic_nonsynonymous_rate_coding: f64,
+}
+
+pub fn somatic_rates(
+    variants: &[Variant],
+    feature_ranges: &[GenomeRange],
+    config: &Config,
+) -> anyhow::Result<SomaticVariantRates> {
+    let ol = par_overlaps(variants, feature_ranges);
+
+    let n_coding = ol
+        .iter()
+        .filter_map(|i| variants[*i].best_vep().ok())
+        .filter_map(|bv| bv.impact())
+        .filter(|impact| *impact <= VepImpact::MODERATE)
+        .count();
+
+    let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
+    let mega_base_m = n_bases_m as f64 / 1.0e6; // nt per megabase
+
+    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
+    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 1.0e6);
+
+    let n_exons_mb = ol.len() as f64 / mega_base_m;
+    let n_coding_mb = n_coding as f64 / mega_base_m;
+
+    Ok(SomaticVariantRates {
+        total_variants: variants.len(),
+        exon_count: feature_ranges.len(),
+        variants_in_coding: ol.len(),
+        coding_variants: n_coding,
+        total_exon_bases: n_bases_m,
+        wgs_length: wgs_len,
+        somatic_mutation_rate_wgs: rate_wgs,
+        somatic_mutation_rate_coding: n_exons_mb,
+        somatic_nonsynonymous_rate_coding: n_coding_mb,
+    })
+}
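+
+// Hedged usage, as exercised by the `parse_gff` test in src/lib.rs above:
+//
+//     let exons = features_ranges("exon", &config)?;
+//     let merged = merge_overlapping_genome_ranges(&exons);
+//     let rates = somatic_rates(&variants.data, &merged, &config)?;
+//     info!("{rates:#?}");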
+
 /// Creates a new Variant instance from a collection of VcfVariants and annotations.
 ///
 /// This function consolidates information from one or more VcfVariants into a single Variant,