9 months ago · 1cb6ad7442
--- a/src/annotation/vep.rs
+++ b/src/annotation/vep.rs
@@ -125,6 +125,12 @@ pub struct VEP {
 
				     pub extra: VEPExtra,
			
 
				 }
			
 
				 
			
 
				+impl VEP {
			
 
				+    pub fn impact(&self) -> Option<VepImpact> {
			
 
				+        self.extra.impact.clone()
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 /// Represents the predicted consequences of a genetic variant as determined by
			
 
				 /// the Ensembl Variant Effect Predictor (VEP).
			
 
				 ///
			
--- a/src/callers/savana.rs
+++ b/src/callers/savana.rs
@@ -237,7 +237,7 @@ impl FromStr for SavanaCNRow {
 
				 
			
 
				     fn from_str(s: &str) -> anyhow::Result<Self> {
			
 
				         let cells: Vec<&str> = s.split("\t").collect();
			
 
				-        let range = GenomeRange::from_1_inclusive(
			
 
				+        let range = GenomeRange::from_1_inclusive_str(
			
 
				             cells
			
 
				                 .first()
			
 
				                 .context(format!("Error while parsing contig {s}."))?,
			
@@ -340,7 +340,7 @@ impl FromStr for SavanaRCRow {
 
				             .split_once(':')
			
 
				             .and_then(|(a, b)| {
			
 
				                 b.split_once('_').map(|(b, c)| {
			
 
				-                    GenomeRange::from_1_inclusive(a, b, c)
			
 
				+                    GenomeRange::from_1_inclusive_str(a, b, c)
			
 
				                         .context(format!("Error while parsing range {}", bin))
			
 
				                 })
			
 
				             })
			
--- a/src/config.rs
+++ b/src/config.rs
@@ -6,12 +6,16 @@ pub struct Config {
 
				     pub reference: String,
			
 
				     pub reference_name: String,
			
 
				     pub dict_file: String,
			
 
				+    pub refseq_gff: String,
			
 
				     pub docker_max_memory_go: u16,
			
 
				     pub savana_bin: String,
			
 
				     pub savana_threads: u8,
			
 
				     pub tumoral_name: String,
			
 
				     pub normal_name: String,
			
 
				     pub haplotagged_bam_tag_name: String,
			
 
				+    pub count_dir_name: String,
			
 
				+    pub count_bin_size: u32,
			
 
				+    pub count_n_chunks: u32,
			
 
				     pub savana_output_dir: String,
			
 
				     pub savana_copy_number: String,
			
 
				     pub savana_read_counts: String,
			
@@ -84,6 +88,7 @@ impl Default for Config {
 
				             reference: "/data/ref/hs1/chm13v2.0.fa".to_string(),
			
 
				             reference_name: "hs1".to_string(),
			
 
				             dict_file: "/data/ref/hs1/chm13v2.0.dict".to_string(),
			
 
				+            refseq_gff: "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz".to_string(),
			
 
				 
			
 
				             docker_max_memory_go: 400,
			
 
				 
			
@@ -94,6 +99,10 @@ impl Default for Config {
 
				             normal_name: "mrd".to_string(),
			
 
				             haplotagged_bam_tag_name: "HP".to_string(),
			
 
				 
			
 
				+            count_dir_name: "counts".to_string(),
			
 
				+            count_bin_size: 1_000,
			
 
				+            count_n_chunks: 1_000,
			
 
				+
			
 
				             bam_min_mapq: 40,
			
 
				             bam_n_threads: 150,
			
 
				 
			
@@ -271,6 +280,14 @@ impl Config {
 
				         )
			
 
				     }
			
 
				 
			
 
				+    pub fn normal_dir_count(&self, id: &str) -> String {
			
 
				+        format!("{}/{}", self.normal_dir(id), self.count_dir_name)
			
 
				+    }
			
 
				+
			
 
				+    pub fn tumoral_dir_count(&self, id: &str) -> String {
			
 
				+        format!("{}/{}", self.tumoral_dir(id), self.count_dir_name)
			
 
				+    }
			
 
				+
			
 
				     pub fn mask_bed(&self, id: &str) -> String {
			
 
				         self.mask_bed
			
 
				             .replace("{result_dir}", &self.result_dir)
			
--- a/src/helpers.rs
+++ b/src/helpers.rs
@@ -286,10 +286,7 @@ where
 
				         return 0.0;
			
 
				     }
			
 
				 
			
 
				-    // Calculate the sum of the values
			
 
				     let sum: T = values.iter().copied().sum();
			
 
				-
			
 
				-    // Calculate and return the average as f64
			
 
				     sum.into() / count as f64
			
 
				 }
			
 
				 
			
--- a/src/io/bam.rs
+++ b/src/io/bam.rs
@@ -0,0 +1,246 @@
 
				+use std::collections::{HashMap, HashSet};
			
 
				+
			
 
				+use log::warn;
			
 
				+use rust_htslib::bam::{record::Aux, IndexedReader, Read, Record};
			
 
				+
			
 
				+
			
 
				+/// Parses an SA tag string and extracts chromosome and position information.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `sa` - The SA tag string to parse
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of tuples containing chromosome names and positions
			
 
				+pub fn parse_sa_tag(sa: &str) -> Vec<(&str, i32)> {
			
 
				+    sa.split(';')
			
 
				+        .filter(|s| !s.is_empty())
			
 
				+        .filter_map(|s| {
			
 
				+            let parts: Vec<&str> = s.split(',').take(2).collect();
			
 
				+            if parts.len() < 2 {
			
 
				+                return None;
			
 
				+            }
			
 
				+
			
 
				+            let chr = parts[0];
			
 
				+            match parts[1].parse::<i32>() {
			
 
				+                Ok(position) => Some((chr, position)),
			
 
				+                Err(_) => None,
			
 
				+            }
			
 
				+        })
			
 
				+        .collect()
			
 
				+}
			
 
				+
			
 
				+/// Resolves the primary record for supplementary alignments using BAM SA tag
			
 
				+///
			
 
				+/// For supplementary alignments, searches linked primary records using genomic positions
			
 
				+/// listed in the SA tag. Returns the first non-supplementary record with matching query name.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+/// * `bam` - Mutable reference to indexed BAM reader for random access
			
 
				+/// * `record` - Input record to evaluate (typically a supplementary alignment)
			
 
				+///
			
 
				+/// # Returns
			
 
				+/// - Original record if it's not supplementary
			
 
				+/// - First matching primary record found via SA tag positions
			
 
				+/// - Original record if no primary matches found
			
 
				+///
			
 
				+/// # Panics
			
 
				+/// - If SA tag format is invalid (missing fields or non-integer positions)
			
 
				+/// - If BAM fetch operation fails
			
 
				+///
			
 
				+/// # Example
			
 
				+/// ```
			
 
				+/// use rust_htslib::{bam::{IndexedReader, Read}};
			
 
				+///
			
 
				+/// let mut bam = IndexedReader::from_path("input.bam").unwrap();
			
 
				+/// let record = bam.records().next().unwrap().unwrap();
			
 
				+/// let primary = primary_record(&mut bam, record);
			
 
				+/// ```
			
 
				+pub fn primary_record(bam: &mut IndexedReader, record: Record) -> Record {
			
 
				+    // Return immediately if not a supplementary alignment
			
 
				+    if !record.is_supplementary() {
			
 
				+        return record;
			
 
				+    }
			
 
				+
			
 
				+    let qname = record.qname();
			
 
				+
			
 
				+    // Process SA tag if present
			
 
				+    if let Ok(Aux::String(sa)) = record.aux(b"SA") {
			
 
				+        // Search potential primary alignments at each SA position
			
 
				+        for (chr, start) in parse_sa_tag(sa) {
			
 
				+            bam.fetch((chr, start, start + 1))
			
 
				+                .expect("BAM fetch failed");
			
 
				+
			
 
				+            for result in bam.records() {
			
 
				+                let candidate = result.expect("Invalid BAM record");
			
 
				+                if candidate.qname() == qname && !candidate.is_supplementary() {
			
 
				+                    return candidate.clone();
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // Fallback to original record if no primary found
			
 
				+    record
			
 
				+}
			
 
				+
			
 
				+/// Creates optimized position bins for fetching records.
			
 
				+///
			
 
				+/// Groups positions by chromosome and creates bins of ±1000 bp.
			
 
				+/// Sorts bins by size (descending) to prioritize regions with more alignment hits.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `positions` - Vector of chromosome/position tuples
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of position bins, sorted by bin size
			
 
				+fn create_position_bins<'a>(positions: &[(&'a str, i32)]) -> Vec<Vec<(&'a str, i32)>> {
			
 
				+    // Sort positions by chromosome and position
			
 
				+    let mut sorted_positions = positions.to_vec();
			
 
				+    sorted_positions.sort_by_key(|(chr, pos)| (*chr, *pos));
			
 
				+    sorted_positions.dedup();
			
 
				+    
			
 
				+    // Group by chromosome and create bins of ±1000 bp
			
 
				+    let mut grouped: HashMap<&str, Vec<Vec<(&str, i32)>>> = HashMap::new();
			
 
				+    
			
 
				+    for (chr, pos) in sorted_positions {
			
 
				+        let bins = grouped.entry(chr).or_default();
			
 
				+        
			
 
				+        if let Some(last_bin) = bins.last_mut() {
			
 
				+            if last_bin.is_empty() || (pos - last_bin[0].1).abs() <= 1000 {
			
 
				+                last_bin.push((chr, pos));
			
 
				+            } else {
			
 
				+                bins.push(vec![(chr, pos)]);
			
 
				+            }
			
 
				+        } else {
			
 
				+            bins.push(vec![(chr, pos)]);
			
 
				+        }
			
 
				+    }
			
 
				+    
			
 
				+    // Flatten and sort by bin size (descending)
			
 
				+    let mut flattened: Vec<Vec<(&str, i32)>> = grouped.values()
			
 
				+        .flatten()
			
 
				+        .cloned()
			
 
				+        .collect();
			
 
				+    
			
 
				+    // Sort bins by size (descending) to prioritize regions with more hits
			
 
				+    flattened.sort_by_key(|bin| std::cmp::Reverse(bin.len()));
			
 
				+    
			
 
				+    flattened
			
 
				+}
			
 
				+
			
 
				+/// Retrieves primary alignment records based on a set of input records.
			
 
				+///
			
 
				+/// This function processes a collection of BAM records and retrieves their primary alignments.
			
 
				+/// When supplementary alignments are found (with SA tags), it fetches the corresponding
			
 
				+/// primary alignments from the provided BAM file.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `bam` - A mutable reference to an IndexedReader for the BAM file
			
 
				+/// * `records` - A vector of input records to process
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of records containing both:
			
 
				+/// - Original primary alignments from the input
			
 
				+/// - Primary alignments fetched for any supplementary records in the input
			
 
				+///
			
 
				+/// # Examples
			
 
				+///
			
 
				+/// ```
			
 
				+/// use rust_htslib::bam::{IndexedReader, Record};
			
 
				+/// let mut bam = IndexedReader::from_path("sample.bam").unwrap();
			
 
				+/// let records = vec![/* some records */];
			
 
				+/// let primary_alignments = primary_records(&mut bam, records);
			
 
				+/// ```
			
 
				+pub fn primary_records(bam: &mut IndexedReader, records: Vec<Record>) -> Vec<Record> {
			
 
				+    let mut res = Vec::with_capacity(records.len());
			
 
				+    let mut all_positions = Vec::new();
			
 
				+    let mut all_qnames_to_fetch = Vec::new();
			
 
				+    
			
 
				+    // First pass: collect primary records and positions to fetch
			
 
				+    for record in records.iter() {
			
 
				+        if record.is_supplementary() {
			
 
				+            let qname = record.qname();
			
 
				+            // Safely extract and process the SA tag
			
 
				+            if let Ok(Aux::String(sa)) = record.aux(b"SA") {
			
 
				+                let positions = parse_sa_tag(sa);
			
 
				+                all_positions.extend(positions);
			
 
				+                
			
 
				+                match String::from_utf8(qname.to_vec()) {
			
 
				+                    Ok(qname_str) => all_qnames_to_fetch.push(qname_str),
			
 
				+                    Err(e) => warn!("Invalid UTF-8 in qname: {}", e),
			
 
				+                }
			
 
				+            }
			
 
				+        } else {
			
 
				+            res.push(record.clone());
			
 
				+        }
			
 
				+    }
			
 
				+    
			
 
				+    // Track which query names we've already found
			
 
				+    let mut primary_records: HashSet<String> = res
			
 
				+        .iter()
			
 
				+        .filter_map(|r| String::from_utf8(r.qname().to_vec()).ok())
			
 
				+        .collect();
			
 
				+    
			
 
				+    // Create position bins for efficient fetching
			
 
				+    let position_bins = create_position_bins(&all_positions);
			
 
				+    
			
 
				+    // Fetch records for each bin until we find all primaries
			
 
				+    for bin in position_bins {
			
 
				+        if bin.is_empty() {
			
 
				+            continue;
			
 
				+        }
			
 
				+        
			
 
				+        let (chr, start) = bin.first().unwrap();
			
 
				+        let end = bin.last().unwrap().1 + 1;
			
 
				+        
			
 
				+        // Fetch and process records in this region
			
 
				+        if let Err(e) = bam.fetch((*chr, *start, end)) {
			
 
				+            warn!("Failed to fetch region {}:{}-{}: {}", chr, start, end, e);
			
 
				+            continue;
			
 
				+        }
			
 
				+        
			
 
				+        for record_result in bam.records() {
			
 
				+            match record_result {
			
 
				+                Ok(record) => {
			
 
				+                    // Skip secondary alignments
			
 
				+                    if record.is_secondary() {
			
 
				+                        continue;
			
 
				+                    }
			
 
				+                    
			
 
				+                    // Check if this is a primary we're looking for
			
 
				+                    match String::from_utf8(record.qname().to_vec()) {
			
 
				+                        Ok(qname) => {
			
 
				+                            if primary_records.contains(&qname) {
			
 
				+                                continue;
			
 
				+                            }
			
 
				+                            
			
 
				+                            if all_qnames_to_fetch.contains(&qname) {
			
 
				+                                res.push(record);
			
 
				+                                primary_records.insert(qname);
			
 
				+                            }
			
 
				+                        },
			
 
				+                        Err(_) => continue,
			
 
				+                    }
			
 
				+                },
			
 
				+                Err(e) => warn!("Error reading record: {}", e),
			
 
				+            }
			
 
				+        }
			
 
				+        
			
 
				+        // Early exit if we found all the records we were looking for
			
 
				+        let remaining = all_qnames_to_fetch.iter()
			
 
				+            .filter(|q| !primary_records.contains(*q))
			
 
				+            .count();
			
 
				+        
			
 
				+        if remaining == 0 {
			
 
				+            break;
			
 
				+        }
			
 
				+    }
			
 
				+    
			
 
				+    res
			
 
				+}
			
--- a/src/io/gff.rs
+++ b/src/io/gff.rs
@@ -0,0 +1,29 @@
 
				+use anyhow::Context;
			
 
				+use noodles_gff as gff;
			
 
				+
			
 
				+use crate::{config::Config, positions::GenomeRange};
			
 
				+
			
 
				+use super::readers::get_gz_reader;
			
 
				+
			
 
				+pub fn features_ranges(feature_type: &str, config: &Config) -> anyhow::Result<Vec<GenomeRange>> {
			
 
				+    let path = &config.refseq_gff;
			
 
				+    let mut reader = get_gz_reader(path)
			
 
				+        .map(gff::io::Reader::new)
			
 
				+        .with_context(|| format!("Error while creating the reader of {path}"))?;
			
 
				+
			
 
				+    let mut res = Vec::new();
			
 
				+    for result in reader.record_bufs() {
			
 
				+        let record = result?;
			
 
				+        let ty = record.ty();
			
 
				+        if ty != feature_type {
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        res.push(GenomeRange::from_1_inclusive(
			
 
				+            record.reference_sequence_name(),
			
 
				+            record.start().get() as u32,
			
 
				+            record.end().get() as u32,
			
 
				+        ));
			
 
				+    }
			
 
				+    Ok(res)
			
 
				+}
			
--- a/src/io/mod.rs
+++ b/src/io/mod.rs
@@ -5,3 +5,5 @@ pub mod bed;
 
				 pub mod dict;
			
 
				 pub mod fasta;
			
 
				 pub mod pod5_footer_generated;
			
 
				+pub mod gff;
			
 
				+pub mod bam;
			
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,6 +14,8 @@ pub mod pipes;
 
				 pub mod positions;
			
 
				 pub mod annotation;
			
 
				 pub mod cases;
			
 
				+pub mod scan;
			
 
				+pub mod math;
			
 
				 
			
 
				 #[macro_use]
			
 
				 extern crate lazy_static;
			
@@ -25,7 +27,7 @@ lazy_static! {
 
				 
			
 
				 #[cfg(test)]
			
 
				 mod tests {
			
 
				-    use std::{collections::HashMap, fs, path::Path};
			
 
				+    use std::{collections::HashMap, fs, ops::Index, path::Path};
			
 
				 
			
 
				     use annotation::{vep::{VepLine, VEP}, Annotations};
			
 
				     use callers::{nanomonsv::nanomonsv_create_pon, savana::{Savana, SavanaReadCounts}, severus::{Severus, SeverusSolo}};
			
@@ -43,7 +45,7 @@ mod tests {
 
				 
			
 
				     use self::{collection::pod5::{FlowCellCase, Pod5Collection}, commands::dorado, config::Config};
			
 
				     use super::*;
			
 
				-    use crate::{annotation::Annotation, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, pipes::somatic::const_stats, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
			
 
				+    use crate::{annotation::{vep::VepImpact, Annotation}, callers::{clairs::ClairS, deep_variant::DeepVariant, nanomonsv::{NanomonSV, NanomonSVSolo}, savana::{CNSegment, SavanaCN}, Callers}, collection::{bam, pod5::{scan_archive, FlowCells}, run_tasks, vcf::VcfCollection, Collections, CollectionsConfig}, commands::dorado::Dorado, io::{dict::read_dict, gff::features_ranges}, pipes::somatic::const_stats, positions::{merge_overlapping_genome_ranges, par_overlaps}, scan::scan::{par_whole_scan, somatic_scan}, variant::{variant::AlterationCategory, variant_collection::VariantsStats}};
			
 
				 
			
 
				     // export RUST_LOG="debug"
			
 
				     fn init() {
			
@@ -753,7 +755,6 @@ mod tests {
 
				             )
			
 
				         });
			
 
				         Ok(())
			
 
				-
			
 
				     }
			
 
				 
			
 
				     #[test]
			
@@ -801,4 +802,73 @@ Ok(())
 
				         println!("{} lines", r.len());
			
 
				         println!("{:#?}", r.first().unwrap());
			
 
				     }
			
 
				+
			
 
				+    #[test]
			
 
				+    fn whole_scan() -> anyhow::Result<()> {
			
 
				+        init();
			
 
				+        let id = "ADJAGBA";
			
 
				+        let config = Config::default();
			
 
				+        somatic_scan(id, &config)?;
			
 
				+        Ok(())
			
 
				+    }
			
 
				+
			
 
				+    #[test]
			
 
				+    fn parse_gff() -> anyhow::Result<()> {
			
 
				+        init();
			
 
				+        let id = "ADJAGBA";
			
 
				+        let config = Config::default();
			
 
				+        let path = format!("{}/{id}/diag/somatic_variants.bit", config.result_dir);
			
 
				+        let r = features_ranges("exon", &config)?;
			
 
				+        let merged = merge_overlapping_genome_ranges(&r);
			
 
				+        
			
 
				+        let variants = variant_collection::Variants::load_from_file(&path)?;
			
 
				+        let full = variant_collection::somatic_rates(&variants.data, &merged, &config);
			
 
				+        info!("{full:#?}");
			
 
				+
			
 
				+        let restrained: Vec<variant_collection::Variant> = variants.data.iter().filter(|v| v.vcf_variants.len() >= 2)
			
 
				+            .cloned().collect();
			
 
				+        let min_2 = variant_collection::somatic_rates(&restrained, &merged, &config);
			
 
				+        info!("{min_2:#?}");
			
 
				+
			
 
				+        let restrained: Vec<variant_collection::Variant> = restrained.iter().filter(|v| v.vcf_variants.len() >= 3)
			
 
				+            .cloned().collect();
			
 
				+        let min_3 = variant_collection::somatic_rates(&restrained, &merged, &config);
			
 
				+        info!("{min_3:#?}");
			
 
				+
			
 
				+
			
 
				+        // info!("n variants loaded: {}", variants.data.len());
			
 
				+        // 
			
 
				+        // let r = features_ranges("exon", &config::Config::default())?;
			
 
				+        // info!("n exon: {}", r.len());
			
 
				+        //
			
 
				+        // let merged = merge_overlapping_genome_ranges(&r);
			
 
				+        // info!("n merged exon: {}", merged.len());
			
 
				+        //
			
 
				+        // let ol = par_overlaps(&variants.data, &r);
			
 
				+        // info!("variants in exon {}", ol.len());
			
 
				+        //
			
 
				+        // let n_coding = ol.iter().filter_map(|i| variants.data[*i].best_vep().ok() ).filter_map(|bv| bv.impact()).filter(|impact| *impact <= VepImpact::MODERATE).count();
			
 
				+        // info!("coding variants {n_coding}");
			
 
				+        // 
			
 
				+        // let n_bases_m = merged.par_iter().map(|gr| gr.length()).sum::<u32>();
			
 
				+        // info!("{n_bases_m}nt");
			
 
				+        // 
			
 
				+        // let mega_base_m = n_bases_m as f64 / 10.0e6;
			
 
				+        //
			
 
				+        // let wgs_len = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum::<u32>();
			
 
				+        // info!("wgs len {wgs_len}");
			
 
				+        // let rate_wgs = variants.data.len() as f64 / (wgs_len as f64 / 10.0e6);
			
 
				+        // info!("somatic mutation rate {rate_wgs:.2}/mb");
			
 
				+        //
			
 
				+        // let n_exons_mb = ol.len() as f64 / mega_base_m;
			
 
				+        // info!("somatic mutation rate in the coding region {n_exons_mb:.2}/mb");
			
 
				+        // 
			
 
				+        // let n_exons_mb = n_coding as f64 / mega_base_m;
			
 
				+        // info!("somatic non synonymous mutation rate in the coding region {n_exons_mb:.2}/mb");
			
 
				+
			
 
				+        Ok(())
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+
			
 
				 }
			
--- a/src/math.rs
+++ b/src/math.rs
@@ -0,0 +1,161 @@
 
				+use rayon::prelude::*;
			
 
				+
			
 
				+
			
 
				+/// Filters outliers from a dataset using the Modified Z-Score method and returns the indices of the outliers.
			
 
				+///
			
 
				+/// The Modified Z-Score is a robust statistical method for detecting outliers. It is calculated as:
			
 
				+/// \[
			
 
				+/// \text{Modified Z-Score} = \frac{0.6745 \times |x - \text{median}|}{\text{MAD}}
			
 
				+/// \]
			
 
				+/// where:
			
 
				+/// - \( x \) is a data point,
			
 
				+/// - \( \text{median} \) is the median of the dataset,
			
 
				+/// - \( \text{MAD} \) is the Median Absolute Deviation.
			
 
				+///
			
 
				+/// Data points with a Modified Z-Score greater than the threshold (default: 3.5) are considered outliers.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `data` - A slice of `f64` values representing the dataset.
			
 
				+/// * `indices` - A vector of `u32` values representing the indices of the corresponding data points.
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// A vector of `u32` values containing the indices of the outliers.
			
 
				+///
			
 
				+/// # Examples
			
 
				+///
			
 
				+/// ```
			
 
				+/// let data = vec![1.0, 2.0, 3.0, 100.0]; // 100.0 is an outlier
			
 
				+/// let indices = vec![0, 1, 2, 3];
			
 
				+/// let outlier_indices = filter_outliers_modified_z_score_with_indices(&data, indices);
			
 
				+/// assert_eq!(outlier_indices, vec![3]);
			
 
				+/// ```
			
 
				+///
			
 
				+/// # Notes
			
 
				+///
			
 
				+/// - This function uses parallel processing via the `rayon` crate for improved performance on large datasets.
			
 
				+/// - The time complexity is O(n log n) due to the sorting step required for median calculation.
			
 
				+///
			
 
				+pub fn filter_outliers_modified_z_score_with_indices(
			
 
				+    data: &[f64],
			
 
				+    indices: Vec<usize>,
			
 
				+) -> Vec<usize> {
			
 
				+    // Early return if the dataset is empty
			
 
				+    if data.is_empty() {
			
 
				+        return Vec::new();
			
 
				+    }
			
 
				+
			
 
				+    // Create a sorted copy of the data for median and MAD calculations
			
 
				+    let mut sorted_data = data.to_vec();
			
 
				+    sorted_data.par_sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
			
 
				+
			
 
				+    // Compute the median of the dataset
			
 
				+    let median = compute_median(&sorted_data);
			
 
				+
			
 
				+    // Compute the Median Absolute Deviation (MAD) of the dataset
			
 
				+    let mad = compute_mad(&sorted_data, median);
			
 
				+
			
 
				+    // Threshold for Modified Z-Score (commonly 3.5 for outlier detection)
			
 
				+    const THRESHOLD: f64 = 3.5;
			
 
				+
			
 
				+    // Filter indices based on the Modified Z-Score
			
 
				+    indices
			
 
				+        .into_iter()
			
 
				+        .zip(data.iter())
			
 
				+        .filter_map(|(index, &x)| {
			
 
				+            // Calculate the Modified Z-Score
			
 
				+            let modified_z_score = 0.6745 * (x - median).abs() / mad;
			
 
				+
			
 
				+            // Keep the index if the Modified Z-Score exceeds the threshold
			
 
				+            if modified_z_score > THRESHOLD {
			
 
				+                Some(index)
			
 
				+            } else {
			
 
				+                None
			
 
				+            }
			
 
				+        })
			
 
				+        .collect()
			
 
				+}
			
 
				+
			
 
				+/// Computes the median of a **sorted** slice of `f64` values.
			
 
				+///
			
 
				+/// The median is the middle value in a sorted list of numbers. If the list has an even number
			
 
				+/// of elements, the median is the average of the two middle values. This function assumes
			
 
				+/// that the input data is already sorted.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `data` - A **sorted** slice of `f64` values for which the median is to be computed.
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// The median value as an `f64`. If the input slice is empty, returns `NaN`.
			
 
				+///
			
 
				+/// # Examples
			
 
				+///
			
 
				+/// ```
			
 
				+/// let data = vec![1.0, 2.0, 3.0]; // Sorted input
			
 
				+/// let median = compute_median(&data);
			
 
				+/// assert_eq!(median, 2.0);
			
 
				+///
			
 
				+/// let data = vec![1.0, 2.0, 3.0, 4.0]; // Sorted input
			
 
				+/// let median = compute_median(&data);
			
 
				+/// assert_eq!(median, 2.5);
			
 
				+/// ```
			
 
				+pub fn compute_median(data: &[f64]) -> f64 {
			
 
				+    // Early return if the dataset is empty
			
 
				+    if data.is_empty() {
			
 
				+        return f64::NAN; // Return NaN for an empty dataset
			
 
				+    }
			
 
				+
			
 
				+    // Determine the length of the sorted data
			
 
				+    let len = data.len();
			
 
				+
			
 
				+    // Calculate the median based on whether the length is even or odd
			
 
				+    if len % 2 == 0 {
			
 
				+        // If even, average the two middle values
			
 
				+        (data[len / 2 - 1] + data[len / 2]) / 2.0
			
 
				+    } else {
			
 
				+        // If odd, return the middle value
			
 
				+        data[len / 2]
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/// Computes the Median Absolute Deviation (MAD) of a **sorted** slice of `f64` values.
			
 
				+///
			
 
				+/// The Median Absolute Deviation is a measure of statistical dispersion. It is calculated as the
			
 
				+/// median of the absolute deviations of each data point from the median of the dataset. This function
			
 
				+/// assumes that the input data is already sorted.
			
 
				+///
			
 
				+/// # Arguments
			
 
				+///
			
 
				+/// * `data` - A **sorted** slice of `f64` values for which the MAD is to be computed.
			
 
				+/// * `median` - The median of the dataset, which is used as the central point for computing deviations.
			
 
				+///
			
 
				+/// # Returns
			
 
				+///
			
 
				+/// The Median Absolute Deviation as an `f64`. If the input slice is empty, returns `NaN`.
			
 
				+///
			
 
				+/// # Examples
			
 
				+///
			
 
				+/// ```
			
 
				+/// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0]; // Sorted input
			
 
				+/// let median = 3.0;
			
 
				+/// let mad = compute_mad(&data, median);
			
 
				+/// assert_eq!(mad, 1.0);
			
 
				+/// ```
			
 
				+pub fn compute_mad(data: &[f64], median: f64) -> f64 {
			
 
				+    // Early return if the dataset is empty
			
 
				+    if data.is_empty() {
			
 
				+        return f64::NAN; // Return NaN for an empty dataset
			
 
				+    }
			
 
				+
			
 
				+    // Compute the absolute deviations of each data point from the median
			
 
				+    let mut deviations: Vec<f64> = data.par_iter().map(|&x| (x - median).abs()).collect();
			
 
				+
			
 
				+    // Sort the deviations to compute their median
			
 
				+    deviations.par_sort_by(|a, b| a.partial_cmp(b).unwrap());
			
 
				+
			
 
				+    // Compute the median of the absolute deviations
			
 
				+    compute_median(&deviations)
			
 
				+}
			
--- a/src/positions.rs
+++ b/src/positions.rs
@@ -1,4 +1,10 @@
 
				-use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr};
			
 
				+use std::{
			
 
				+    cmp::{max, Ordering},
			
 
				+    fmt::Display,
			
 
				+    iter::Sum,
			
 
				+    ops::Range,
			
 
				+    str::FromStr,
			
 
				+};
			
 
				 
			
 
				 use anyhow::Context;
			
 
				 use bitcode::{Decode, Encode};
			
@@ -469,20 +475,84 @@ impl GenomeRange {
 
				     /// assert_eq!(range.range.start, 999);  // Converted to 0-based
			
 
				     /// assert_eq!(range.range.end, 2000);   // End is exclusive in 0-based
			
 
				     /// ```
			
 
				-    pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
			
 
				+    pub fn from_1_inclusive_str(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
			
 
				         Ok(Self {
			
 
				             contig: contig_to_num(contig),
			
 
				             range: Range {
			
 
				                 start: start
			
 
				                     .parse::<u32>()
			
 
				                     .context(format!("Can't parse {start} into u32."))?
			
 
				-                    - 1,
			
 
				+                    .saturating_sub(1),
			
 
				                 end: end
			
 
				                     .parse()
			
 
				                     .context(format!("Can't parse {end} into u32."))?,
			
 
				             },
			
 
				         })
			
 
				     }
			
 
				+
			
 
				+    pub fn from_1_inclusive(contig: &str, start: u32, end: u32) -> Self {
			
 
				+        Self {
			
 
				+            contig: contig_to_num(contig),
			
 
				+            range: Range {
			
 
				+                start: start.saturating_sub(1),
			
 
				+                end,
			
 
				+            },
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    pub fn length(&self) -> u32 {
			
 
				+        self.range.end - self.range.start
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+impl GetGenomePosition for GenomePosition {
			
 
				+    fn position(&self) -> &GenomePosition {
			
 
				+        self
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+impl GetGenomeRange for GenomeRange {
			
 
				+    fn range(&self) -> &GenomeRange {
			
 
				+        self
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+pub fn merge_overlapping_genome_ranges(ranges: &[GenomeRange]) -> Vec<GenomeRange> {
			
 
				+    if ranges.is_empty() {
			
 
				+        return vec![];
			
 
				+    }
			
 
				+
			
 
				+    // ranges.sort_by_key(|r| (r.contig, r.range.start));
			
 
				+
			
 
				+    let chunk_size = (ranges.len() / rayon::current_num_threads()).max(1);
			
 
				+
			
 
				+    ranges
			
 
				+        .par_chunks(chunk_size)
			
 
				+        .map(|chunk| {
			
 
				+            let mut merged = Vec::new();
			
 
				+            let mut current = chunk[0].clone();
			
 
				+
			
 
				+            for range in chunk.iter().skip(1) {
			
 
				+                if current.contig == range.contig && current.range.end >= range.range.start {
			
 
				+                    current.range.end = max(current.range.end, range.range.end);
			
 
				+                } else {
			
 
				+                    merged.push(current);
			
 
				+                    current = range.clone();
			
 
				+                }
			
 
				+            }
			
 
				+            merged.push(current);
			
 
				+            merged
			
 
				+        })
			
 
				+        .reduce(Vec::new, |mut acc, mut chunk| {
			
 
				+            if let (Some(last), Some(first)) = (acc.last_mut(), chunk.first()) {
			
 
				+                if last.contig == first.contig && last.range.end >= first.range.start {
			
 
				+                    last.range.end = max(last.range.end, first.range.end);
			
 
				+                    chunk.remove(0);
			
 
				+                }
			
 
				+            }
			
 
				+            acc.extend(chunk);
			
 
				+            acc
			
 
				+        })
			
 
				 }
			
 
				 
			
 
				 /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
			
--- a/src/scan/bin.rs
+++ b/src/scan/bin.rs
@@ -0,0 +1,343 @@
 
				+use std::collections::HashMap;
			
 
				+
			
 
				+use anyhow::Context;
			
 
				+use rust_htslib::bam::{ext::BamRecordExtensions, record::Aux, IndexedReader, Read, Record};
			
 
				+
			
 
				+use crate::io::bam::{primary_record, primary_records};
			
 
				+
			
 
				+/// A genomic bin containing reads from a specific region.
			
 
				+///
			
 
				+/// This struct represents a segment of a BAM file, storing reads that overlap
			
 
				+/// a specific genomic region. It provides methods to analyze read distributions
			
 
				+/// and extract primary alignments for supplementary alignments.
			
 
				+#[derive(Debug)]
			
 
				+pub struct Bin {
			
 
				+    /// The contig/chromosome name
			
 
				+    pub contig: String,
			
 
				+    /// The 0-based inclusive start position
			
 
				+    pub start: u32,
			
 
				+    /// The 0-based inclusive end position
			
 
				+    pub end: u32,
			
 
				+    /// Map of read query names to their BAM records
			
 
				+    pub reads_store: HashMap<Vec<u8>, Record>,
			
 
				+    /// Indexed BAM reader for fetching additional records
			
 
				+    pub bam_reader: IndexedReader,
			
 
				+    /// Average read length within this bin
			
 
				+    pub reads_mean_len: u32,
			
 
				+    /// Number of reads filtered due to low mapping quality
			
 
				+    pub n_low_mapq: u32,
			
 
				+}
			
 
				+
			
 
				+impl Bin {
			
 
				+    /// Creates a new genomic bin from a BAM file.
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `bam_path` - Path to the BAM file
			
 
				+    /// * `contig` - Chromosome/contig name
			
 
				+    /// * `start` - 0-based inclusive start position
			
 
				+    /// * `length` - Length of the region to extract
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A Result containing the new Bin if successful, or an error if the BAM file
			
 
				+    /// couldn't be read or the region couldn't be fetched.
			
 
				+    ///
			
 
				+    /// # Examples
			
 
				+    ///
			
 
				+    /// ```
			
 
				+    /// let bin = Bin::new("sample.bam", "chr1", 1000000, 1000)?;
			
 
				+    /// println!("Loaded {} reads", bin.n_reads());
			
 
				+    /// ```
			
 
				+    pub fn new(bam_path: &str, contig: &str, start: u32, length: u32) -> anyhow::Result<Self> {
			
 
				+        const DEFAULT_MAPQ: u8 = 50;
			
 
				+        let mut bam_reader = IndexedReader::from_path(bam_path)
			
 
				+            .with_context(|| format!("Can't open BAM file: {}", bam_path))?;
			
 
				+
			
 
				+        // Calculate inclusive end position
			
 
				+        let end = start + length - 1;
			
 
				+
			
 
				+        // debug!("Creating bin {}:{}-{}", contig, start, end);
			
 
				+
			
 
				+        // Fetch reads in the required region
			
 
				+        bam_reader
			
 
				+            .fetch((contig, start as i64, end as i64))
			
 
				+            .with_context(|| format!("Failed to fetch region {}:{}-{}", contig, start, end))?;
			
 
				+
			
 
				+        let mut reads_store: HashMap<Vec<u8>, Record> = HashMap::new();
			
 
				+        let mut n_low_mapq = 0;
			
 
				+        let mut lengths = Vec::new();
			
 
				+
			
 
				+        // Process all records in the region
			
 
				+        for read_result in bam_reader.records() {
			
 
				+            let record = read_result.context("Error while parsing BAM record")?;
			
 
				+
			
 
				+            // Skip reads with low mapping quality
			
 
				+            if record.mapq() < DEFAULT_MAPQ {
			
 
				+                n_low_mapq += 1;
			
 
				+                continue;
			
 
				+            }
			
 
				+
			
 
				+            // Store the read length
			
 
				+            lengths.push(
			
 
				+                record
			
 
				+                    .reference_end()
			
 
				+                    .saturating_sub(record.reference_start()),
			
 
				+            );
			
 
				+
			
 
				+            // Store the record by query name
			
 
				+            reads_store.insert(record.qname().to_vec(), record);
			
 
				+        }
			
 
				+
			
 
				+        // Calculate mean read length, handling the empty case
			
 
				+        let reads_mean_len = if lengths.is_empty() {
			
 
				+            0
			
 
				+        } else {
			
 
				+            (lengths.iter().sum::<i64>() as f64 / lengths.len() as f64) as u32
			
 
				+        };
			
 
				+
			
 
				+        Ok(Bin {
			
 
				+            contig: contig.to_string(),
			
 
				+            start,
			
 
				+            end,
			
 
				+            reads_store,
			
 
				+            bam_reader,
			
 
				+            n_low_mapq,
			
 
				+            reads_mean_len,
			
 
				+        })
			
 
				+    }
			
 
				+
			
 
				+    /// Returns the total number of reads in this bin.
			
 
				+    pub fn n_reads(&self) -> usize {
			
 
				+        self.reads_store.len()
			
 
				+    }
			
 
				+
			
 
				+    /// Returns the number of reads with SA (supplementary alignment) tags.
			
 
				+    pub fn n_sa(&self) -> usize {
			
 
				+        self.reads_store
			
 
				+            .values()
			
 
				+            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
			
 
				+            .count()
			
 
				+    }
			
 
				+
			
 
				+    /// Retrieves primary alignments for all reads with SA tags in this bin.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A vector of primary alignment records for reads with supplementary alignments.
			
 
				+    pub fn sa_primary(&mut self) -> Vec<Record> {
			
 
				+        self.reads_store
			
 
				+            .values()
			
 
				+            .filter(|record| matches!(record.aux(b"SA"), Ok(Aux::String(_))))
			
 
				+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
			
 
				+            .collect()
			
 
				+    }
			
 
				+
			
 
				+    /// Finds the position with the highest concentration of read starts or ends.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A tuple containing (position, count) of the position with the most read starts or ends.
			
 
				+    pub fn max_start_or_end(&self) -> (u32, usize) {
			
 
				+        let mut position_counts: HashMap<u32, usize> = HashMap::new();
			
 
				+
			
 
				+        for record in self.reads_store.values() {
			
 
				+            let reference_start = record.reference_start() as u32;
			
 
				+            let reference_end = record.reference_end() as u32;
			
 
				+
			
 
				+            // Count start positions within the bin
			
 
				+            if reference_start >= self.start && reference_start <= self.end {
			
 
				+                *position_counts.entry(reference_start).or_default() += 1;
			
 
				+            }
			
 
				+
			
 
				+            // Count end positions within the bin
			
 
				+            if reference_end >= self.start && reference_end <= self.end {
			
 
				+                *position_counts.entry(reference_end).or_default() += 1;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        position_counts
			
 
				+            .into_iter()
			
 
				+            .max_by_key(|(_, count)| *count)
			
 
				+            .unwrap_or((0, 0))
			
 
				+    }
			
 
				+
			
 
				+    /// Computes the number of reads with SA tags, and finds the highest
			
 
				+    /// counts of read starts and ends at the same position.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A tuple containing:
			
 
				+    /// - Number of reads with SA tags
			
 
				+    /// - Highest count of reads starting at any single position
			
 
				+    /// - Highest count of reads ending at any single position
			
 
				+    pub fn count_reads_sa_start_end(&self) -> (usize, usize, usize) {
			
 
				+        let mut n_sa = 0;
			
 
				+        let mut start_counts: HashMap<u32, usize> = HashMap::new();
			
 
				+        let mut end_counts: HashMap<u32, usize> = HashMap::new();
			
 
				+
			
 
				+        for record in self.reads_store.values() {
			
 
				+            // Count reads with SA tags
			
 
				+            if matches!(record.aux(b"SA"), Ok(Aux::String(_))) {
			
 
				+                n_sa += 1;
			
 
				+            }
			
 
				+
			
 
				+            let reference_start = record.reference_start() as u32;
			
 
				+            let reference_end = record.reference_end() as u32;
			
 
				+
			
 
				+            // Count start positions within the bin
			
 
				+            if reference_start >= self.start && reference_start <= self.end {
			
 
				+                *start_counts.entry(reference_start).or_default() += 1;
			
 
				+            }
			
 
				+
			
 
				+            // Count end positions within the bin
			
 
				+            if reference_end >= self.start && reference_end <= self.end {
			
 
				+                *end_counts.entry(reference_end).or_default() += 1;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        let max_start_count = start_counts.values().max().copied().unwrap_or(0);
			
 
				+        let max_end_count = end_counts.values().max().copied().unwrap_or(0);
			
 
				+
			
 
				+        (n_sa, max_start_count, max_end_count)
			
 
				+    }
			
 
				+
			
 
				+    /// Finds the position with the highest concentration of read starts.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A tuple containing (position, count) of the position with the most read starts.
			
 
				+    pub fn max_start(&self) -> (u32, usize) {
			
 
				+        let mut start_counts: HashMap<u32, usize> = HashMap::new();
			
 
				+
			
 
				+        for record in self.reads_store.values() {
			
 
				+            let reference_start = record.reference_start() as u32;
			
 
				+
			
 
				+            if reference_start >= self.start && reference_start <= self.end {
			
 
				+                *start_counts.entry(reference_start).or_default() += 1;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        start_counts
			
 
				+            .into_iter()
			
 
				+            .max_by_key(|(_, count)| *count)
			
 
				+            .unwrap_or((0, 0))
			
 
				+    }
			
 
				+
			
 
				+    /// Finds the position with the highest concentration of read ends.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A tuple containing (position, count) of the position with the most read ends.
			
 
				+    pub fn max_end(&self) -> (u32, usize) {
			
 
				+        let mut end_counts: HashMap<u32, usize> = HashMap::new();
			
 
				+
			
 
				+        for record in self.reads_store.values() {
			
 
				+            let reference_end = record.reference_end() as u32;
			
 
				+
			
 
				+            if reference_end >= self.start && reference_end <= self.end {
			
 
				+                *end_counts.entry(reference_end).or_default() += 1;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        end_counts
			
 
				+            .into_iter()
			
 
				+            .max_by_key(|(_, count)| *count)
			
 
				+            .unwrap_or((0, 0))
			
 
				+    }
			
 
				+
			
 
				+    /// Gets primary alignments for reads that start or end at a specific position.
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `pos` - The position to check for read starts or ends
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A vector of primary alignment records.
			
 
				+    pub fn se_primary(&mut self, pos: u32) -> Vec<Record> {
			
 
				+        let records: Vec<Record> = self
			
 
				+            .reads_store
			
 
				+            .values()
			
 
				+            .filter(|record| {
			
 
				+                record.reference_start() as u32 == pos || record.reference_end() as u32 == pos
			
 
				+            })
			
 
				+            .cloned()
			
 
				+            .collect();
			
 
				+
			
 
				+        primary_records(&mut self.bam_reader, records)
			
 
				+    }
			
 
				+
			
 
				+    /// Gets primary alignments for reads that start at a specific position.
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `pos` - The position to check for read starts
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A vector of primary alignment records.
			
 
				+    pub fn s_primary(&mut self, pos: u32) -> Vec<Record> {
			
 
				+        self.reads_store
			
 
				+            .values()
			
 
				+            .filter(|record| record.reference_start() as u32 == pos)
			
 
				+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
			
 
				+            .collect()
			
 
				+    }
			
 
				+
			
 
				+    /// Gets primary alignments for reads that end at a specific position.
			
 
				+    ///
			
 
				+    /// # Arguments
			
 
				+    ///
			
 
				+    /// * `pos` - The position to check for read ends
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// A vector of primary alignment records.
			
 
				+    pub fn e_primary(&mut self, pos: u32) -> Vec<Record> {
			
 
				+        self.reads_store
			
 
				+            .values()
			
 
				+            .filter(|record| record.reference_end() as u32 == pos)
			
 
				+            .map(|record| primary_record(&mut self.bam_reader, record.clone()))
			
 
				+            .collect()
			
 
				+    }
			
 
				+
			
 
				+    /// Calculates the mean coverage by considering the actual
			
 
				+    /// span of each read in the bin.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    ///
			
 
				+    /// The mean bin coverage as a floating-point value.
			
 
				+    pub fn mean_coverage(&self) -> f64 {
			
 
				+        // If bin has no length or no reads, return 0
			
 
				+        if self.end <= self.start || self.reads_store.is_empty() {
			
 
				+            return 0.0;
			
 
				+        }
			
 
				+
			
 
				+        // Calculate the total length of the bin
			
 
				+        let bin_length = (self.end - self.start + 1) as f64;
			
 
				+
			
 
				+        // Calculate the total bases covered by all reads within the bin
			
 
				+        let mut total_bases_covered = 0.0;
			
 
				+        for record in self.reads_store.values() {
			
 
				+            let read_start = record.reference_start() as u32;
			
 
				+            let read_end = record.reference_end() as u32;
			
 
				+
			
 
				+            // Skip reads that don't overlap with our bin
			
 
				+            if read_end < self.start || read_start > self.end {
			
 
				+                continue;
			
 
				+            }
			
 
				+
			
 
				+            // Calculate the overlapping region
			
 
				+            let overlap_start = read_start.max(self.start);
			
 
				+            let overlap_end = read_end.min(self.end);
			
 
				+
			
 
				+            // Add the number of bases this read covers within our bin
			
 
				+            total_bases_covered += (overlap_end - overlap_start + 1) as f64;
			
 
				+        }
			
 
				+
			
 
				+        // Divide by the bin length to get average coverage
			
 
				+        total_bases_covered / bin_length
			
 
				+    }
			
 
				+}
			
 
				+
			
--- a/src/scan/mod.rs
+++ b/src/scan/mod.rs
@@ -0,0 +1,4 @@
 
				+
			
 
				+pub mod bin;
			
 
				+pub mod scan;
			
 
				+
			
--- a/src/scan/scan.rs
+++ b/src/scan/scan.rs
@@ -0,0 +1,429 @@
 
				+use std::{fmt, fs, fs::File, io::Write, sync::Mutex};
			
 
				+
			
 
				+use log::{debug, error, info};
			
 
				+use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator};
			
 
				+use rayon::{
			
 
				+    iter::{IntoParallelIterator, ParallelIterator},
			
 
				+    slice::ParallelSliceMut,
			
 
				+};
			
 
				+
			
 
				+use crate::math::filter_outliers_modified_z_score_with_indices;
			
 
				+use crate::positions::{contig_to_num, GenomeRange, GetGenomeRange};
			
 
				+use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
			
 
				+
			
 
				+/// Represents a count of reads in a genomic bin, including various metrics and outlier information.
			
 
				+#[derive(Debug)]
			
 
				+pub struct BinCount {
			
 
				+    /// The name of the contig (chromosome) this bin belongs to.
			
 
				+    pub contig: String,
			
 
				+    /// The start position of the bin in the contig.
			
 
				+    pub start: u32,
			
 
				+    /// The end position of the bin in the contig.
			
 
				+    pub end: u32,
			
 
				+    /// The total number of reads in this bin.
			
 
				+    pub n_reads: u32,
			
 
				+    /// The average coverage of reads in this bin.
			
 
				+    pub coverage: f64,
			
 
				+    /// The ratio of supplementary alignments to total reads.
			
 
				+    pub ratio_sa: f64,
			
 
				+    /// The ratio of reads starting in this bin to total reads.
			
 
				+    pub ratio_start: f64,
			
 
				+    /// The ratio of reads ending in this bin to total reads.
			
 
				+    pub ratio_end: f64,
			
 
				+    /// Optional vector of outlier types for this bin.
			
 
				+    pub outlier: Option<Vec<BinOutlier>>,
			
 
				+}
			
 
				+
			
 
				+impl From<&Bin> for BinCount {
			
 
				+    /// Converts a `Bin` reference to a `BinCount`.
			
 
				+    ///
			
 
				+    /// # Parameters
			
 
				+    /// - `bin: &Bin`: A reference to the `Bin` object to convert.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    /// A new `BinCount` instance populated with data from the `Bin`.
			
 
				+    fn from(bin: &Bin) -> Self {
			
 
				+        let n_reads = bin.n_reads();
			
 
				+        let n_reads_float = n_reads as f64;
			
 
				+        let (n_sa, n_start, n_end) = bin.count_reads_sa_start_end();
			
 
				+
			
 
				+        Self {
			
 
				+            contig: bin.contig.clone(),
			
 
				+            start: bin.start,
			
 
				+            end: bin.end,
			
 
				+            n_reads: n_reads as u32,
			
 
				+            coverage: bin.mean_coverage(),
			
 
				+            ratio_sa: n_sa as f64 / n_reads_float,
			
 
				+            ratio_start: n_start as f64 / n_reads_float,
			
 
				+            ratio_end: n_end as f64 / n_reads_float,
			
 
				+            outlier: None,
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+impl BinCount {
			
 
				+    /// Converts the `BinCount` instance to a TSV (Tab-Separated Values) row string.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    /// A `String` representing the `BinCount` as a TSV row.
			
 
				+    pub fn to_tsv_row(&self) -> String {
			
 
				+        format!(
			
 
				+            "{}\t{}\t{}\t{}\t{:.6}\t{:.6}\t{:.6}\t{:.6}\t{}",
			
 
				+            self.contig,
			
 
				+            self.start,
			
 
				+            self.end,
			
 
				+            self.n_reads,
			
 
				+            self.coverage,
			
 
				+            self.ratio_sa,
			
 
				+            self.ratio_start,
			
 
				+            self.ratio_end,
			
 
				+            self.outlier
			
 
				+                .clone()
			
 
				+                .map(|v| v
			
 
				+                    .iter()
			
 
				+                    .map(|e| e.to_string())
			
 
				+                    .collect::<Vec<String>>()
			
 
				+                    .join(", "))
			
 
				+                .unwrap_or_default()
			
 
				+        )
			
 
				+    }
			
 
				+
			
 
				+    /// Parses a TSV row and creates a BinCount object.
			
 
				+    ///
			
 
				+    /// # Parameters
			
 
				+    /// - `row: &str`: A string slice representing a TSV row.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    /// A `Result` containing either a `BinCount` object if parsing is successful,
			
 
				+    /// or an `anyhow::Error` if parsing fails.
			
 
				+    ///
			
 
				+    /// # Errors
			
 
				+    /// This function will return an error if:
			
 
				+    /// - The row does not contain the expected number of fields.
			
 
				+    /// - Any of the numeric fields fail to parse.
			
 
				+    /// - The outlier field is not in the expected format.
			
 
				+    pub fn from_tsv_row(row: &str) -> anyhow::Result<Self> {
			
 
				+        let fields: Vec<&str> = row.split('\t').collect();
			
 
				+        if fields.len() != 9 {
			
 
				+            anyhow::bail!("Invalid number of fields in TSV row");
			
 
				+        }
			
 
				+
			
 
				+        let outlier = if !fields[8].is_empty() {
			
 
				+            Some(
			
 
				+                fields[8]
			
 
				+                    .split(", ")
			
 
				+                    .map(|s| match s {
			
 
				+                        "SA" => Ok(BinOutlier::SA),
			
 
				+                        "Start" => Ok(BinOutlier::Start),
			
 
				+                        "End" => Ok(BinOutlier::End),
			
 
				+                        _ => Err(anyhow::anyhow!("Invalid outlier type: {}", s)),
			
 
				+                    })
			
 
				+                    .collect::<Result<Vec<BinOutlier>, _>>()?,
			
 
				+            )
			
 
				+        } else {
			
 
				+            None
			
 
				+        };
			
 
				+
			
 
				+        Ok(BinCount {
			
 
				+            contig: fields[0].to_string(),
			
 
				+            start: fields[1].parse()?,
			
 
				+            end: fields[2].parse()?,
			
 
				+            n_reads: fields[3].parse()?,
			
 
				+            coverage: fields[4].parse()?,
			
 
				+            ratio_sa: fields[5].parse()?,
			
 
				+            ratio_start: fields[6].parse()?,
			
 
				+            ratio_end: fields[7].parse()?,
			
 
				+            outlier,
			
 
				+        })
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// impl GetGenomeRange for BinCount {
			
 
				+//     fn range(&self) -> GenomeRange {
			
 
				+//         GenomeRange { contig: contig_to_num(&self.contig), range: self.start..(self.end + 1) }
			
 
				+//     }
			
 
				+// }
			
 
				+
			
 
				+
			
 
				+/// Represents types of outliers that can be detected in a genomic bin.
			
 
				+#[derive(Debug, Clone)]
			
 
				+pub enum BinOutlier {
			
 
				+    /// Indicates an outlier in supplementary alignments.
			
 
				+    SA,
			
 
				+    /// Indicates an outlier in reads starting in this bin.
			
 
				+    Start,
			
 
				+    /// Indicates an outlier in reads ending in this bin.
			
 
				+    End,
			
 
				+}
			
 
				+
			
 
				+impl fmt::Display for BinOutlier {
			
 
				+    /// Implements the `Display` trait for `BinOutlier`.
			
 
				+    ///
			
 
				+    /// # Parameters
			
 
				+    /// - `f: &mut fmt::Formatter<'_>`: The formatter to write the string representation.
			
 
				+    ///
			
 
				+    /// # Returns
			
 
				+    /// A `fmt::Result` indicating success or failure of the formatting operation.
			
 
				+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
			
 
				+        match self {
			
 
				+            BinOutlier::SA => write!(f, "SA"),
			
 
				+            BinOutlier::Start => write!(f, "Start"),
			
 
				+            BinOutlier::End => write!(f, "End"),
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/// Performs a parallel whole genome scan on a BAM file, counting reads in bins across the genome
			
 
				+/// and identifying outliers. The results are written to TSV files, one per contig.
			
 
				+///
			
 
				+/// # Parameters
			
 
				+/// - `out_dir: &str`: The output directory where results will be saved.
			
 
				+/// - `bam_path: &str`: The path to the input BAM file.
			
 
				+/// - `config: &Config`: A configuration object containing the following fields:
			
 
				+///   - `count_bin_size`: The size of each bin in base pairs.
			
 
				+///   - `count_n_chunks`: The number of bins per chunk for parallel processing.
			
 
				+///   - `dict_file`: Path to the dictionary file containing contig names and lengths.
			
 
				+///
			
 
				+/// # Returns
			
 
				+/// - `anyhow::Result<()>`: Returns `Ok(())` if successful, or an error if any operation fails.
			
 
				+///
			
 
				+/// # Description
			
 
				+/// This function processes the genome in parallel by dividing each contig into chunks of bins.
			
 
				+/// Each bin represents a region of the genome, and the function counts reads in these bins.
			
 
				+/// After processing, bins are sorted, outliers are identified, and results are written to TSV files.
			
 
				+///
			
 
				+/// ## Workflow:
			
 
				+/// 1. **Initialization**:
			
 
				+///    - Logs the start of the scan with bin size and chunk information.
			
 
				+///    - Creates the output directory if it does not exist.
			
 
				+///
			
 
				+/// 2. **Contig Processing**:
			
 
				+///    - Reads contigs and their lengths from the dictionary file.
			
 
				+///    - Skips the mitochondrial chromosome ("chrM").
			
 
				+///
			
 
				+/// 3. **Parallel Scanning**:
			
 
				+///    - For each contig:
			
 
				+///      - Calculates the number of bins and chunks using ceiling division.
			
 
				+///      - Processes chunks in parallel using `into_par_iter()`.
			
 
				+///      - For each chunk:
			
 
				+///        - Calculates chunk start position and length.
			
 
				+///        - Processes individual bins within the chunk by creating `Bin` objects and converting them to `BinCount`.
			
 
				+///
			
 
				+/// 4. **Post-Processing**:
			
 
				+///    - Sorts bins by their start positions using parallel sorting.
			
 
				+///    - Identifies outlier bins using a custom function (`fill_outliers`).
			
 
				+///
			
 
				+/// 5. **Output**:
			
 
				+///    - Writes results for each contig to a TSV file in the specified output directory.
			
 
				+///
			
 
				+/// ## Notes:
			
 
				+/// - The function uses ceiling division (`div_ceil`) to handle edge cases in bin and chunk calculations.
			
 
				+/// - It includes debug logging for various stages of processing.
			
 
				+/// - Handles edge cases for the last chunk and bin to ensure proper processing of all data.
			
 
				+///
			
 
				+/// # Errors
			
 
				+/// This function will return an error if:
			
 
				+/// - The output directory cannot be created.
			
 
				+/// - The dictionary file cannot be read.
			
 
				+/// - A `Bin` object cannot be created for a specific region.
			
 
				+/// - Any I/O operation (e.g., writing results) fails.
			
 
				+pub fn par_whole_scan(out_dir: &str, bam_path: &str, config: &Config) -> anyhow::Result<()> {
			
 
				+    let bin_size = config.count_bin_size;
			
 
				+    let chunk_n_bin = config.count_n_chunks;
			
 
				+    info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
			
 
				+    fs::create_dir(out_dir)?;
			
 
				+
			
 
				+    for (contig, length) in read_dict(&config.dict_file)? {
			
 
				+        if contig.as_str() == "chrM" {
			
 
				+            continue;
			
 
				+        }
			
 
				+
			
 
				+        let n_bin = length / bin_size;
			
 
				+        // Calculate number of chunks using ceiling division
			
 
				+        let n_chunks = n_bin.div_ceil(chunk_n_bin);
			
 
				+        info!("Scan of contig: {contig}");
			
 
				+
			
 
				+        let mut bins: Vec<BinCount> = (0..n_chunks)
			
 
				+            .into_par_iter()
			
 
				+            .flat_map(|i| {
			
 
				+                // Calculate chunk start position
			
 
				+                let chunk_start = i * chunk_n_bin * bin_size;
			
 
				+
			
 
				+                // Calculate chunk length
			
 
				+                let chunk_length = if i == n_chunks - 1 {
			
 
				+                    length - chunk_start // Handle last chunk
			
 
				+                } else {
			
 
				+                    chunk_n_bin * bin_size // Standard chunk size
			
 
				+                };
			
 
				+
			
 
				+                // Calculate number of bins in this chunk with ceiling division
			
 
				+                let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
			
 
				+
			
 
				+                // Process each bin in the chunk
			
 
				+                (0..n_bins_in_chunk)
			
 
				+                    // .into_iter()
			
 
				+                    .filter_map(|j| {
			
 
				+                        let bin_start = chunk_start + j * bin_size;
			
 
				+                        // Ensure we don't exceed remaining length
			
 
				+                        let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
			
 
				+                        // debug!("chunk start:{chunk_start}, length: {chunk_length}, n_bins: {n_bins_in_chunk}, first bin start: {bin_start} bin length: {bin_length}");
			
 
				+                        match Bin::new(bam_path, &contig, bin_start, bin_length) {
			
 
				+                            Ok(bin) => Some(BinCount::from(&bin)),
			
 
				+                            Err(e) => {
			
 
				+                                error!("Failed to get Bin: {e}");
			
 
				+                                None
			
 
				+                            }
			
 
				+                        }
			
 
				+                    })
			
 
				+                    .collect::<Vec<BinCount>>()
			
 
				+            })
			
 
				+            .collect();
			
 
				+
			
 
				+        debug!("Scan {contig}, sorting bins");
			
 
				+        bins.par_sort_unstable_by(|a, b| a.start.cmp(&b.start));
			
 
				+
			
 
				+        debug!("Scan {contig}, computing outliers");
			
 
				+        fill_outliers(&mut bins);
			
 
				+
			
 
				+        let out_file = format!("{out_dir}/{contig}_count.tsv");
			
 
				+        debug!("Scan {contig}, writing file");
			
 
				+        let mut file = File::create(out_file)?;
			
 
				+        for bin in bins {
			
 
				+            writeln!(file, "{}", bin.to_tsv_row())?;
			
 
				+        }
			
 
				+    }
			
 
				+    Ok(())
			
 
				+}
			
 
				+
			
 
				+/// Identifies and marks outliers in a slice of `BinCount` objects based on various ratio metrics.
			
 
				+///
			
 
				+/// # Parameters
			
 
				+/// - `bin_counts: &mut [BinCount]`: A mutable slice of `BinCount` objects to process.
			
 
				+///
			
 
				+/// # Description
			
 
				+/// This function analyzes the `BinCount` objects for outliers in three different ratios:
			
 
				+/// supplementary alignments (SA), read starts, and read ends. It uses a modified Z-score
			
 
				+/// method to identify outliers and marks them in the `outlier` field of each `BinCount`.
			
 
				+///
			
 
				+/// ## Workflow:
			
 
				+/// 1. Wraps the input slice in a `Mutex` for thread-safe access.
			
 
				+/// 2. Defines helper functions to extract specific ratios from `BinCount` objects.
			
 
				+/// 3. Iterates over three outlier types: SA, Start, and End.
			
 
				+/// 4. For each type:
			
 
				+///    a. Filters and collects non-NaN ratios along with their indices.
			
 
				+///    b. Identifies outliers using the `filter_outliers_modified_z_score_with_indices` function.
			
 
				+///    c. Marks identified outliers in the original `BinCount` objects.
			
 
				+///
			
 
				+/// ## Parallelization:
			
 
				+/// - Uses `par_iter()` for parallel processing of `BinCount` objects.
			
 
				+/// - Applies outlier marking in parallel using `par_iter().for_each()`.
			
 
				+///
			
 
				+/// ## Thread Safety:
			
 
				+/// - Uses `Mutex` to ensure thread-safe access to the shared `bin_counts` slice.
			
 
				+///
			
 
				+/// ## Outlier Types:
			
 
				+/// - `BinOutlier::SA`: Outliers in supplementary alignment ratio.
			
 
				+/// - `BinOutlier::Start`: Outliers in read start ratio.
			
 
				+/// - `BinOutlier::End`: Outliers in read end ratio.
			
 
				+///
			
 
				+/// # Notes
			
 
				+/// - This function modifies the input slice in-place.
			
 
				+/// - It skips any `BinCount` objects with NaN ratios.
			
 
				+/// - The `filter_outliers_modified_z_score_with_indices` function is assumed to be defined elsewhere
			
 
				+///   and is responsible for the actual outlier detection algorithm.
			
 
				+///
			
 
				+/// # Example
			
 
				+/// ```
			
 
				+/// let mut bin_counts = vec![
			
 
				+///     BinCount { ratio_sa: 0.1, ratio_start: 0.2, ratio_end: 0.3, outlier: None, /* other fields */ },
			
 
				+///     // ... more BinCount objects ...
			
 
				+/// ];
			
 
				+/// fill_outliers(&mut bin_counts);
			
 
				+/// // After this call, some BinCount objects may have their outlier field populated
			
 
				+/// ```
			
 
				+pub fn fill_outliers(bin_counts: &mut [BinCount]) {
			
 
				+    let bin_counts = Mutex::new(bin_counts);
			
 
				+
			
 
				+    fn get_ratio_sa(c: &BinCount) -> f64 {
			
 
				+        c.ratio_sa
			
 
				+    }
			
 
				+    fn get_ratio_start(c: &BinCount) -> f64 {
			
 
				+        c.ratio_start
			
 
				+    }
			
 
				+    fn get_ratio_end(c: &BinCount) -> f64 {
			
 
				+        c.ratio_end
			
 
				+    }
			
 
				+    type OutlierTypeInfo = (fn(&BinCount) -> f64, BinOutlier);
			
 
				+
			
 
				+    let outlier_types: [OutlierTypeInfo; 3] = [
			
 
				+        (get_ratio_sa, BinOutlier::SA),
			
 
				+        (get_ratio_start, BinOutlier::Start),
			
 
				+        (get_ratio_end, BinOutlier::End),
			
 
				+    ];
			
 
				+
			
 
				+    for (get_ratio, outlier_type) in outlier_types.iter() {
			
 
				+        let (indices, ratios): (Vec<usize>, Vec<f64>) = bin_counts
			
 
				+            .lock()
			
 
				+            .unwrap()
			
 
				+            .par_iter()
			
 
				+            .enumerate()
			
 
				+            .filter_map(|(i, c)| {
			
 
				+                let ratio = get_ratio(c);
			
 
				+                if !ratio.is_nan() {
			
 
				+                    Some((i, ratio))
			
 
				+                } else {
			
 
				+                    None
			
 
				+                }
			
 
				+            })
			
 
				+            .unzip();
			
 
				+
			
 
				+        let outlier_indices = filter_outliers_modified_z_score_with_indices(&ratios, indices);
			
 
				+
			
 
				+        outlier_indices.par_iter().for_each(|&i| {
			
 
				+            let mut bin_counts = bin_counts.lock().unwrap();
			
 
				+            bin_counts[i]
			
 
				+                .outlier
			
 
				+                .get_or_insert_with(Vec::new)
			
 
				+                .push(outlier_type.clone());
			
 
				+        });
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/// Performs a somatic scan for a given sample ID by analyzing both normal and tumoral BAM files.
			
 
				+///
			
 
				+/// # Parameters
			
 
				+/// - `id: &str`: The unique identifier for the sample being scanned.
			
 
				+/// - `config: &Config`: A configuration object containing paths and settings for the scan.
			
 
				+///
			
 
				+/// # Returns
			
 
				+/// - `anyhow::Result<()>`: Returns `Ok(())` if both scans succeed, or an error if any operation fails.
			
 
				+///
			
 
				+/// # Description
			
 
				+/// This function performs a somatic scan by invoking the `par_whole_scan` function twice:
			
 
				+/// once for the normal BAM file and once for the tumoral BAM file. The results are saved
			
 
				+/// in separate directories specified by the configuration object.
			
 
				+///
			
 
				+/// # Errors
			
 
				+/// This function will return an error if:
			
 
				+/// - Either call to `par_whole_scan` fails (e.g., due to invalid paths or processing errors).
			
 
				+///
			
 
				+/// ## Example Usage
			
 
				+/// ```
			
 
				+/// let config = Config::new(); // Assume Config is properly initialized
			
 
				+/// let sample_id = "sample123";
			
 
				+///
			
 
				+/// match somatic_scan(sample_id, &config) {
			
 
				+///     Ok(_) => println!("Somatic scan completed successfully."),
			
 
				+///     Err(e) => eprintln!("Error during somatic scan: {}", e),
			
 
				+/// }
			
 
				+/// ```
			
 
				+pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
			
 
				+    info!("Starting scan for {id} normal.");
			
 
				+    par_whole_scan(&config.normal_dir_count(id), &config.normal_bam(id), config)?;
			
 
				+    info!("Starting scan for {id} tumoral.");
			
 
				+    par_whole_scan(
			
 
				+        &config.tumoral_dir_count(id),
			
 
				+        &config.tumoral_bam(id),
			
 
				+        config,
			
 
				+    )
			
 
				+}
			
--- a/src/variant/variant_collection.rs
+++ b/src/variant/variant_collection.rs
@@ -22,16 +22,23 @@ use crate::{
 
				         cosmic::Cosmic,
			
 
				         echtvar::{parse_echtvar_val, run_echtvar},
			
 
				         gnomad::GnomAD,
			
 
				-        vep::{get_best_vep, run_vep, VepLine, VEP},
			
 
				+        vep::{get_best_vep, run_vep, VepImpact, VepLine, VEP},
			
 
				         Annotation, Annotations,
			
 
				     },
			
 
				     collection::{
			
 
				         bam::{counts_at, counts_ins_at},
			
 
				         vcf::Vcf,
			
 
				     },
			
 
				+    config::Config,
			
 
				     helpers::{app_storage_dir, bin_data, estimate_shannon_entropy, mean, temp_file_path, Hash128},
			
 
				-    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
			
 
				-    positions::GenomePosition,
			
 
				+    io::{
			
 
				+        dict::read_dict, fasta::sequence_at, gff::features_ranges, readers::get_reader,
			
 
				+        vcf::vcf_header,
			
 
				+    },
			
 
				+    positions::{
			
 
				+        merge_overlapping_genome_ranges, par_overlaps, GenomePosition, GenomeRange,
			
 
				+        GetGenomePosition,
			
 
				+    },
			
 
				 };
			
 
				 
			
 
				 /// A collection of VCF variants along with associated metadata.
			
@@ -424,6 +431,12 @@ impl PartialEq for Variant {
 
				     }
			
 
				 }
			
 
				 
			
 
				+impl GetGenomePosition for Variant {
			
 
				+    fn position(&self) -> &GenomePosition {
			
 
				+        &self.position
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 impl Variant {
			
 
				     pub fn id(&self) -> String {
			
 
				         format!(
			
@@ -941,6 +954,55 @@ impl VariantsStats {
 
				     }
			
 
				 }
			
 
				 
			
 
				+#[derive(Debug)]
			
 
				+pub struct SomaticVariantRates {
			
 
				+    pub wgs_length: u32,
			
 
				+    pub total_variants: usize,
			
 
				+    pub somatic_mutation_rate_wgs: f64,
			
 
				+    pub exon_count: usize,
			
 
				+    pub variants_in_coding: usize,
			
 
				+    pub coding_variants: usize,
			
 
				+    pub total_exon_bases: u32,
			
 
				+    pub somatic_mutation_rate_coding: f64,
			
 
				+    pub somatic_nonsynonymous_rate_coding: f64,
			
 
				+}
			
 
				+
			
 
				+pub fn somatic_rates(
			
 
				+    variants: &[Variant],
			
 
				+    feature_ranges: &Vec<GenomeRange>,
			
 
				+    config: &Config,
			
 
				+) -> anyhow::Result<SomaticVariantRates> {
			
 
				+    let ol = par_overlaps(variants, feature_ranges);
			
 
				+
			
 
				+    let n_coding = ol
			
 
				+        .iter()
			
 
				+        .filter_map(|i| variants[*i].best_vep().ok())
			
 
				+        .filter_map(|bv| bv.impact())
			
 
				+        .filter(|impact| *impact <= VepImpact::MODERATE)
			
 
				+        .count();
			
 
				+
			
 
				+    let n_bases_m: u32 = feature_ranges.par_iter().map(|gr| gr.length()).sum();
			
 
				+    let mega_base_m = n_bases_m as f64 / 10.0e6;
			
 
				+
			
 
				+    let wgs_len: u32 = read_dict(&config.dict_file)?.iter().map(|(_, l)| *l).sum();
			
 
				+    let rate_wgs = variants.len() as f64 / (wgs_len as f64 / 10.0e6);
			
 
				+
			
 
				+    let n_exons_mb = ol.len() as f64 / mega_base_m;
			
 
				+    let n_coding_mb = n_coding as f64 / mega_base_m;
			
 
				+
			
 
				+    Ok(SomaticVariantRates {
			
 
				+        total_variants: variants.len(),
			
 
				+        exon_count: feature_ranges.len(),
			
 
				+        variants_in_coding: ol.len(),
			
 
				+        coding_variants: n_coding,
			
 
				+        total_exon_bases: n_bases_m,
			
 
				+        wgs_length: wgs_len,
			
 
				+        somatic_mutation_rate_wgs: rate_wgs,
			
 
				+        somatic_mutation_rate_coding: n_exons_mb,
			
 
				+        somatic_nonsynonymous_rate_coding: n_coding_mb,
			
 
				+    })
			
 
				+}
			
 
				+
			
 
				 /// Creates a new Variant instance from a collection of VcfVariants and annotations.
			
 
				 ///
			
 
				 /// This function consolidates information from one or more VcfVariants into a single Variant,