
Trinucleotides and Early/Late annotations

Thomas 8 months ago
parent
commit
23d0e0ea17

+ 129 - 20
src/annotation/mod.rs

@@ -29,29 +29,124 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use vep::{get_best_vep, VEP};
 
+/// Represents various types of annotations that can be associated with a variant.
+///
+/// These annotations cover caller-specific metadata, biological properties, database hits,
+/// and computed statistics like entropy or trinucleotide context.
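+///
+/// # Example
+///
+/// ```ignore
+/// // Illustrative only: construction and `Display` output of the variants added in this commit.
+/// let context = Annotation::TriNucleotides([Base::A, Base::C, Base::G]);
+/// let timing = Annotation::ReplicationTiming(ReplicationClass::Early);
+/// assert_eq!(context.to_string(), "Trinucleotides(ACG)");
+/// assert_eq!(timing.to_string(), "ReplicationEarly");
+/// ```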
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub enum Annotation {
+    /// Annotation from a specific variant caller and associated sample type.
     Callers(Caller, Sample),
+
+    /// Category of the alteration (e.g., SNV or indel).
     AlterationCategory(AlterationCategory),
+
+    /// Shannon entropy of the reference sequence context surrounding the variant.
     ShannonEntropy(f64),
+
+    /// Depth of coverage in the constitutional (normal) sample.
     ConstitDepth(u16),
+
+    /// Alternate allele count in the constitutional (normal) sample.
     ConstitAlt(u16),
+
+    /// Flag indicating low depth in the constitutional sample.
     LowConstitDepth,
+
+    /// Flag indicating high alternate allele count in the constitutional sample.
     HighConstitAlt,
+
+    /// COSMIC database hit (cancer-associated mutation).
     Cosmic(Cosmic),
+
+    /// GnomAD population frequency database annotation.
     GnomAD(GnomAD),
+
+    /// Flag indicating low Shannon entropy (possibly less reliable region).
     LowEntropy,
+
+    /// Trinucleotide context surrounding the variant.
+    TriNucleotides([Base; 3]),
+
+    /// Variant Effect Predictor (VEP) annotations.
     VEP(Vec<VEP>),
+
+    /// Replication timing class (early or late) of the variant's genomic position.
+    ReplicationTiming(ReplicationClass),
 }
 
+/// Denotes the biological sample type associated with a variant call.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
 pub enum Sample {
+    /// Tumor-only sample without matched normal.
     SoloTumor,
+
+    /// Constitutional (normal) sample without matched tumor.
     SoloConstit,
+
+    /// Variant observed in germline context.
     Germline,
+
+    /// Variant identified as somatic (tumor-specific).
     Somatic,
 }
 
+/// A nucleotide base used for representing DNA sequence.
+///
+/// Includes the four standard bases (A, T, C, G) and `N` for ambiguous or unknown bases.
+#[derive(Copy, Debug, Clone, PartialEq, Serialize, Deserialize, Encode, Decode)]
+pub enum Base {
+    /// Adenine
+    A,
+    /// Thymine
+    T,
+    /// Cytosine
+    C,
+    /// Guanine
+    G,
+    /// Unknown or ambiguous nucleotide
+    N,
+}
+
+impl fmt::Display for Base {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}",
+            match self {
+                Base::A => "A",
+                Base::T => "T",
+                Base::C => "C",
+                Base::G => "G",
+                Base::N => "N",
+            }
+        )
+    }
+}
+
+/// Helper to convert a 3-base string into a `[Base; 3]` array.
+/// Unrecognized characters are mapped to `Base::N`.
+///
+/// # Panics
+/// Panics if `s` contains fewer than 3 characters.
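+///
+/// # Examples
+///
+/// ```ignore
+/// // Illustrative only: lowercase input is accepted; unknown characters become `Base::N`.
+/// assert_eq!(parse_trinuc("acg"), [Base::A, Base::C, Base::G]);
+/// assert_eq!(parse_trinuc("AXG"), [Base::A, Base::N, Base::G]);
+/// ```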
+pub fn parse_trinuc(s: &str) -> [Base; 3] {
+    fn char_to_base(c: char) -> Base {
+        match c.to_ascii_uppercase() {
+            'A' => Base::A,
+            'T' => Base::T,
+            'C' => Base::C,
+            'G' => Base::G,
+            _ => Base::N,
+        }
+    }
+
+    let chars: Vec<Base> = s.chars().map(char_to_base).collect();
+    [chars[0], chars[1], chars[2]]
+}
+
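+/// Replication timing class of a genomic interval, as defined by the early/late replication
+/// BED files used for annotation (see `annotate_with_bed` in `io::bed`).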
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Encode, Decode)]
+pub enum ReplicationClass {
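+    /// Early-replicating regions.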
+    Early,
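+    /// Late-replicating regions.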
+    Late,
+}
+
 impl fmt::Display for Sample {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
@@ -81,22 +176,35 @@ impl FromStr for Sample {
     }
 }
 
+/// Implements string formatting for `Annotation`, providing a human-readable label for each annotation.
+///
+/// This is primarily used for debugging, logging, or text-based output of annotated variants.
+/// Most annotations are rendered as a short descriptive label; some include additional detail,
+/// such as the trinucleotide bases or the caller and sample type.
 impl fmt::Display for Annotation {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let str = match self {
-            Annotation::Callers(caller, sample) => &format!("{caller} {sample}"),
-            Annotation::ShannonEntropy(_) => "ShannonEntropy",
-            Annotation::ConstitDepth(_) => "ConstitDepth",
-            Annotation::ConstitAlt(_) => "ConstitAlt",
-            Annotation::LowConstitDepth => "LowConstitDepth",
-            Annotation::HighConstitAlt => "HighConstitAlt",
-            Annotation::Cosmic(_) => "Cosmic",
-            Annotation::GnomAD(_) => "GnomAD",
-            Annotation::LowEntropy => "LowEntropy",
-            Annotation::VEP(_) => "VEP",
-            Annotation::AlterationCategory(alt_cat) => &alt_cat.to_string(),
+        let s = match self {
+            Annotation::Callers(caller, sample) => format!("{caller} {sample}"),
+            Annotation::AlterationCategory(alt_cat) => alt_cat.to_string(),
+            Annotation::ShannonEntropy(_) => "ShannonEntropy".into(),
+            Annotation::ConstitDepth(_) => "ConstitDepth".into(),
+            Annotation::ConstitAlt(_) => "ConstitAlt".into(),
+            Annotation::LowConstitDepth => "LowConstitDepth".into(),
+            Annotation::HighConstitAlt => "HighConstitAlt".into(),
+            Annotation::Cosmic(_) => "Cosmic".into(),
+            Annotation::GnomAD(_) => "GnomAD".into(),
+            Annotation::LowEntropy => "LowEntropy".into(),
+            Annotation::VEP(_) => "VEP".into(),
+            Annotation::TriNucleotides(bases) => format!(
+                "Trinucleotides({})",
+                bases.iter().map(|b| b.to_string()).collect::<String>(),
+            ),
+            Annotation::ReplicationTiming(rt) => match rt {
+                ReplicationClass::Early => "ReplicationEarly".into(),
+                ReplicationClass::Late => "ReplicationLate".into(),
+            },
         };
-        write!(f, "{}", str)
+        write!(f, "{}", s)
     }
 }
 
@@ -234,6 +342,8 @@ impl Annotations {
                     | Annotation::LowEntropy
                     | Annotation::GnomAD(_)
                     | Annotation::VEP(_)
+                    | Annotation::TriNucleotides(_)
+                    | Annotation::ReplicationTiming(_)
                     | Annotation::HighConstitAlt => categorical.push(ann.to_string()),
                     Annotation::Callers(caller, sample) => {
                         categorical.push(format!("{caller} {sample}"))
@@ -533,14 +643,13 @@ pub trait CallerCat {
 /// - a GnomAD entry with AF > 0
 /// - and a ConstitAlt entry with n_alt > 0
 pub fn is_gnomad_and_constit_alt(anns: &[Annotation]) -> bool {
-    let gnomad = anns.iter().any(|a| {
-        matches!(a, Annotation::GnomAD(g) if g.gnomad_af > 0.0)
-    });
+    let gnomad = anns
+        .iter()
+        .any(|a| matches!(a, Annotation::GnomAD(g) if g.gnomad_af > 0.0));
 
-    let constit_alt = anns.iter().any(|a| {
-        matches!(a, Annotation::ConstitAlt(n) if *n > 0)
-    });
+    let constit_alt = anns
+        .iter()
+        .any(|a| matches!(a, Annotation::ConstitAlt(n) if *n > 0));
 
     gnomad && constit_alt
 }
-

+ 4 - 4
src/callers/clairs.rs

@@ -26,7 +26,7 @@ use std::{fs, path::Path};
 /// - Integration with variant annotation workflows
 ///
 /// # References
-/// - ClairS: https://github.com/HKU-BAL/ClairS
+/// - ClairS: <https://github.com/HKU-BAL/ClairS>
 #[derive(Debug, Clone)]
 pub struct ClairS {
     pub id: String,
@@ -64,7 +64,7 @@ impl Initialize for ClairS {
             config,
         };
 
-        if clairs.config.clairs_force || clairs.should_run() {
+        if clairs.config.clairs_force {
             remove_dir_if_exists(&clairs.config.clairs_output_dir(&clairs.id))?;
         }
 
@@ -76,8 +76,8 @@ impl ShouldRun for ClairS {
     /// Determines whether ClairS should be re-run based on BAM modification timestamps.
     fn should_run(&self) -> bool {
         let passed_vcf = &self.config.clairs_passed_vcf(&self.id);
-        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id)).unwrap_or(true)
-            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id)).unwrap_or(true);
+        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             warn!("ClairS should run for id: {}.", self.id);
         }

+ 3 - 3
src/callers/deep_somatic.rs

@@ -53,7 +53,7 @@ impl Initialize for DeepSomatic {
             log_dir,
         };
 
-        if deep_somatic.config.deepsomatic_force || deep_somatic.should_run() {
+        if deep_somatic.config.deepsomatic_force {
             remove_dir_if_exists(&deep_somatic.config.deepsomatic_output_dir(&deep_somatic.id))?;
         }
 
@@ -72,8 +72,8 @@ impl Initialize for DeepSomatic {
 impl ShouldRun for DeepSomatic {
     fn should_run(&self) -> bool {
         let passed_vcf = &self.config.deepsomatic_passed_vcf(&self.id);
-        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id)).unwrap_or(true)
-            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id)).unwrap_or(true);
+        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             info!("DeepSomatic should run for id: {}.", self.id);
         }

+ 2 - 2
src/callers/deep_variant.rs

@@ -63,7 +63,7 @@ impl InitializeSolo for DeepVariant {
             config,
         };
 
-        if deepvariant.config.deepvariant_force || deepvariant.should_run() {
+        if deepvariant.config.deepvariant_force {
             remove_dir_if_exists(
                 &deepvariant
                     .config
@@ -87,7 +87,7 @@ impl ShouldRun for DeepVariant {
             .config
             .deepvariant_solo_passed_vcf(&self.id, &self.time_point);
         let bam = self.config.solo_bam(&self.id, &self.time_point);
-        let result = is_file_older(&passed_vcf, &bam).unwrap_or(true);
+        let result = is_file_older(&passed_vcf, &bam, true).unwrap_or(true);
         if result {
             info!(
                 "DeepVariant should run for: {} {}.",

+ 7 - 7
src/callers/nanomonsv.rs

@@ -56,7 +56,7 @@ impl Initialize for NanomonSV {
             config,
         };
 
-        if nanomonsv.config.nanomonsv_force || nanomonsv.should_run() {
+        if nanomonsv.config.nanomonsv_force {
             remove_dir_if_exists(&nanomonsv.config.nanomonsv_output_dir(&nanomonsv.id, "diag"))?;
             remove_dir_if_exists(&nanomonsv.config.nanomonsv_output_dir(&nanomonsv.id, "mrd"))?;
         }
@@ -73,8 +73,8 @@ impl ShouldRun for NanomonSV {
     /// `true` if the passed VCF does not exist or is older than any input BAM.
     fn should_run(&self) -> bool {
         let passed_vcf = self.config.nanomonsv_passed_vcf(&self.id);
-        let result = is_file_older(&passed_vcf, &self.config.normal_bam(&self.id)).unwrap_or(true)
-            || is_file_older(&passed_vcf, &self.config.tumoral_bam(&self.id)).unwrap_or(true);
+        let result = is_file_older(&passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+            || is_file_older(&passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             warn!("NanomonSV should run for id: {}.", self.id);
         }
@@ -317,9 +317,9 @@ impl CallerCat for NanomonSVSolo {
             ..
         } = &self.config;
         if *normal_name == self.time_point {
-            Annotation::Callers(Caller::DeepSomatic, Sample::SoloConstit)
+            Annotation::Callers(Caller::NanomonSVSolo, Sample::SoloConstit)
         } else if *tumoral_name == self.time_point {
-            Annotation::Callers(Caller::DeepSomatic, Sample::SoloTumor)
+            Annotation::Callers(Caller::NanomonSVSolo, Sample::SoloTumor)
         } else {
             panic!("Error in time_point name: {}", self.time_point);
         }
@@ -337,7 +337,7 @@ impl Variants for NanomonSVSolo {
         let caller = self.caller_cat();
         let add = vec![caller.clone()];
         info!(
-            "Loading variant from ClairS {} with annotations: {:?}",
+            "Loading variant from NanomonSVSolo {} with annotations: {:?}",
             self.id, add
         );
         let variants = read_vcf(&self.vcf_passed)?;
@@ -404,7 +404,7 @@ fn somatic_parse(
     for handle in threads_handles {
         handle
             .join()
-            .map_err(|_| anyhow::anyhow!("Nanomonsv Thread panicked"))??;
+            .map_err(|_| anyhow::anyhow!("Nanomonsv thread panicked"))??;
     }
 
     info!("Nanomonsv parsing completed successfully.");

+ 3 - 3
src/callers/savana.rs

@@ -63,7 +63,7 @@ impl Initialize for Savana {
         };
 
         // If forced re-run is enabled or a run is needed, remove old output directory
-        if savana.config.savana_force || savana.should_run() {
+        if savana.config.savana_force {
             remove_dir_if_exists(&savana.config.savana_output_dir(id))?;
         }
 
@@ -82,8 +82,8 @@ impl ShouldRun for Savana {
     /// `true` if an update is needed, or if timestamps can't be checked (file doesn't exist)
     fn should_run(&self) -> bool {
         let passed_vcf = &self.config.savana_passed_vcf(&self.id);
-        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id)).unwrap_or(true)
-            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id)).unwrap_or(true);
+        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             warn!("Savana should run for id: {}.", self.id);
         }

+ 3 - 3
src/callers/severus.rs

@@ -53,7 +53,7 @@ impl Initialize for Severus {
             log_dir,
         };
 
-        if severus.config.severus_force || severus.should_run() {
+        if severus.config.severus_force {
             remove_dir_if_exists(&severus.config.severus_output_dir(id))?;
         }
 
@@ -70,8 +70,8 @@ impl ShouldRun for Severus {
     /// `true` if Severus needs to be re-run, otherwise `false`
     fn should_run(&self) -> bool {
         let passed_vcf = &self.config.severus_passed_vcf(&self.id);
-        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id)).unwrap_or(true)
-            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id)).unwrap_or(true);
+        let result = is_file_older(passed_vcf, &self.config.normal_bam(&self.id), true).unwrap_or(true)
+            || is_file_older(passed_vcf, &self.config.tumoral_bam(&self.id), true).unwrap_or(true);
         if result {
             info!("Severus should run for: {}.", self.id);
         }

+ 25 - 25
src/collection/pod5.rs

@@ -741,9 +741,7 @@ impl FlowCellExperiment {
 }
 
 type ExperimentData = (MinKnowSampleSheet, Vec<(String, u64, DateTime<Utc>)>);
-pub fn scan_archive(
-    tar_path: &str,
-) -> anyhow::Result<ExperimentData> {
+pub fn scan_archive(tar_path: &str) -> anyhow::Result<ExperimentData> {
     info!("Scanning archive: {tar_path}");
 
     let file = File::open(tar_path)
@@ -793,9 +791,7 @@ pub fn scan_archive(
     Ok((sample_sheet, result))
 }
 
-pub fn scan_local(
-    dir: &str,
-) -> anyhow::Result<ExperimentData> {
+pub fn scan_local(dir: &str) -> anyhow::Result<ExperimentData> {
     let mut result = Vec::new();
     let mut sample_sheet: Option<String> = None;
 
@@ -924,7 +920,7 @@ impl FlowCells {
             Vec::new()
         };
 
-        let sample_sheets = find_files(&format!("{local_run_dir}/**/sample_sheet*"));
+        let sample_sheets = find_files(&format!("{local_run_dir}/**/sample_sheet*"))?;
 
         for sample_sheet in sample_sheets {
             let dir = sample_sheet.parent().ok_or_else(|| {
@@ -963,24 +959,28 @@ impl FlowCells {
 
         let pattern = format!("{archive_path}/*.tar");
 
-        let res: Vec<FlowCel> = glob(&pattern)
-            .map_err(|e| anyhow::anyhow!("Failed to read: {pattern}.\n{e}"))?
-            .filter_map(|entry| {
-                match entry {
-                    Ok(path) => match scan_archive(path.to_str().unwrap()) {
-                        Ok((sample_sheet, files)) => match FlowCel::new(
-                            sample_sheet,
-                            FlowCellLocation::Archived(archive_path.to_string()),
-                            files,
-                        ) {
-                            Ok(r) => return Some(r),
-                            Err(e) => warn!("{e}"),
-                        },
-                        Err(e) => warn!("{e}"),
-                    },
-                    Err(e) => warn!("Error: {:?}", e),
+        let res: Vec<FlowCel> = glob(&pattern)?
+            .filter_map(Result::ok)
+            .filter_map(|path| {
+                let archive_str = path.to_string_lossy();
+                let (sample_sheet, files) = match scan_archive(&archive_str) {
+                    Ok(r) => r,
+                    Err(e) => {
+                        warn!("Failed to scan archive {}: {e}", archive_str);
+                        return None;
+                    }
+                };
+                match FlowCel::new(
+                    sample_sheet,
+                    FlowCellLocation::Archived(archive_path.to_string()),
+                    files,
+                ) {
+                    Ok(fc) => Some(fc),
+                    Err(e) => {
+                        warn!("Failed to create FlowCel from {}: {e}", archive_str);
+                        None
+                    }
                 }
-                None
             })
             .collect();
 
@@ -989,7 +989,7 @@ impl FlowCells {
         all.dedup_by_key(|v| v.flowcell_id.clone());
 
         let n_final = all.len();
-        info!("{} new archive discovered.", n_final - n_before);
+        info!("{} new archive(s) discovered.", n_final - n_before);
 
         let json = serde_json::to_string_pretty(&all)
             .map_err(|e| anyhow::anyhow!("Can't convert into json.\n{e}"))?;

+ 2 - 1
src/config.rs

@@ -145,7 +145,8 @@ impl Default for Config {
             clairs_force: false,
 
             // Savana
-            savana_bin: "/home/prom/.local/bin/savana".to_string(),
+            savana_bin: "savana".to_string(),
+            // savana_bin: "/home/prom/.local/bin/savana".to_string(),
             savana_threads: 150,
             savana_output_dir: "{result_dir}/{id}/diag/savana".to_string(),
             savana_passed_vcf: "{output_dir}/{id}_diag_savana_PASSED.vcf.gz".to_string(),

+ 62 - 33
src/helpers.rs

@@ -1,6 +1,7 @@
 use anyhow::Context;
 use bitcode::{Decode, Encode};
 use glob::glob;
+use log::{debug, warn};
 use serde::{Deserialize, Serialize};
 use std::{
     cmp::Ordering,
@@ -422,19 +423,43 @@ pub fn get_dir_size(path: &Path) -> std::io::Result<u64> {
     Ok(total_size)
 }
 
-// "**/*.pod5"
-pub fn find_files(pattern: &str) -> Vec<PathBuf> {
+/// Finds all files matching the given glob pattern.
+///
+/// # Arguments
+///
+/// * `pattern` - A glob pattern string (e.g., `"data/**/*.txt"`).
+///
+/// # Returns
+///
+/// * `Ok(Vec<PathBuf>)` with all successfully matched file paths.
+/// * `Err` if the glob pattern is invalid or any matched path fails to resolve.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - The glob pattern is invalid.
+/// - A file path matching the pattern cannot be resolved.
+///
+/// # Examples
+///
+/// ```rust
+/// let files = find_files("src/**/*.rs")?;
+/// for file in files {
+///     println!("{:?}", file);
+/// }
+/// ```
+pub fn find_files(pattern: &str) -> anyhow::Result<Vec<PathBuf>> {
     let mut result = Vec::new();
 
-    glob(pattern)
-        .expect("Failed to read glob pattern")
-        .for_each(|entry| {
-            if let Ok(path) = entry {
-                result.push(path);
-            }
-        });
+    let entries = glob(pattern)
+        .with_context(|| format!("Invalid glob pattern: '{}'", pattern))?;
 
-    result
+    for entry in entries {
+        let path = entry.with_context(|| format!("Failed to resolve path for pattern '{}'", pattern))?;
+        result.push(path);
+    }
+
+    Ok(result)
 }
 
 // fn system_time_to_utc(system_time: SystemTime) -> Option<DateTime<Utc>> {
@@ -462,34 +487,30 @@ pub fn list_directories(dir_path: &str) -> std::io::Result<Vec<String>> {
     Ok(directories)
 }
 
-/// Returns `true` if the first file is older than the second file,
-/// based on their last modification times.
+/// Checks whether the modification time of `file1` is older than that of `file2`.
+///
+/// If `rm` is `true` and `file1` is older, the parent directory of `file1` is flagged for removal
+/// (currently only a warning is logged; the removal call itself is commented out).
 ///
 /// # Arguments
 ///
 /// * `file1` - Path to the first file.
 /// * `file2` - Path to the second file.
+/// * `rm` - If `true` and `file1` is older, flag its parent directory for removal.
 ///
 /// # Returns
 ///
-/// * `Ok(true)` if `file1` is older than `file2`
-/// * `Ok(false)` if `file1` is not older than `file2`
-/// * `Err(e)` if one of the files cannot be read or metadata cannot be retrieved
+/// * `Ok(true)` if `file1` is older than `file2`.
+/// * `Ok(false)` otherwise.
 ///
 /// # Errors
 ///
-/// Returns an error if either file does not exist, cannot be accessed,
-/// or the modification time is unavailable.
-///
-/// # Example
+/// Returns an [`anyhow::Error`] if:
+/// - Either `file1` or `file2` does not exist.
+/// - File metadata cannot be read.
+/// - File modification times cannot be retrieved.
+/// - (if `rm == true`) removing the parent directory fails (not currently reachable while the removal call is commented out).
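+///
+/// # Example
+///
+/// ```ignore
+/// // Illustrative only: check staleness without flagging the parent directory for removal.
+/// if is_file_older("results/sample_PASSED.vcf.gz", "input/sample.bam", false)? {
+///     println!("the VCF is older than the BAM; the caller should re-run");
+/// }
+/// ```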
 ///
-/// ```
-/// let result = is_file_older("a.txt", "b.txt")?;
-/// if result {
-///     println!("a.txt is older than b.txt");
-/// }
-/// ```
-pub fn is_file_older(file1: &str, file2: &str) -> anyhow::Result<bool> {
+pub fn is_file_older(file1: &str, file2: &str, rm: bool) -> anyhow::Result<bool> {
     let mtime1 = fs::metadata(file1)
         .with_context(|| format!("Failed to read metadata for '{}'", file1))?
         .modified()
@@ -500,16 +521,24 @@ pub fn is_file_older(file1: &str, file2: &str) -> anyhow::Result<bool> {
         .modified()
         .with_context(|| format!("Failed to get modified time for '{}'", file2))?;
 
+    if mtime1 < mtime2 && rm {
+        if let Some(file1_dir) = Path::new(file1).parent() {
+            warn!("Removing old directory: {}", file1_dir.display());
+            // fs::remove_dir(file1_dir)?;
+        }
+    }
+
     Ok(mtime1 < mtime2)
 }
 
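+/// Removes the given directory if it exists.
+///
+/// NOTE: the actual removal is currently commented out below, so this function only logs the
+/// directory that would be removed and always returns `Ok(())`.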
 pub fn remove_dir_if_exists(dir: &str) -> anyhow::Result<()> {
-    match fs::remove_dir_all(dir) {
-        Ok(_) => {}
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-        Err(e) => {
-            anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
-        }
-    };
+    debug!("Trying to remove: {dir}");
+    // match fs::remove_dir_all(dir) {
+    //     Ok(_) => {}
+    //     Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+    //     Err(e) => {
+    //         anyhow::bail!("Failed to remove directory '{}': {}", dir, e);
+    //     }
+    // };
     Ok(())
 }

+ 50 - 1
src/io/bed.rs

@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use log::warn;
 
-use crate::positions::{GenomeRange, GetGenomeRange};
+use crate::{
+    annotation::Annotation,
+    positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomeRange},
+    variant::variant_collection::Variants,
+};
 
 use super::readers::get_reader;
 
@@ -71,3 +71,52 @@ pub fn read_bed(path: &str) -> anyhow::Result<Vec<BedRow>> {
 
     Ok(res)
 }
+
+/// Annotates variants with a given annotation based on overlap with a BED file.
+///
+/// This function reads a BED file, extracts genomic ranges, and finds overlaps between those
+/// ranges and the positions of the variants in the provided `Variants` object. For each
+/// overlapping variant, the specified annotation is added to its `annotations` list.
+/// Returns the total genomic length (in base pairs) covered by the BED regions, which is useful
+/// for computing mutation rates.
+///
+/// # Arguments
+/// * `variants` - A mutable reference to the `Variants` object to be annotated.
+/// * `bed_path` - Path to the BED file containing genomic intervals (e.g., early or late replication).
+/// * `annotation` - The `Annotation` to apply to overlapping variants.
+///
+/// # Returns
+/// * `Ok(total_bases)` — total number of base pairs in the BED intervals, regardless of overlap.
+///
+/// # Errors
+/// This function can return errors related to file I/O or malformed BED lines.
+///
+/// # Example
+/// ```
+/// annotate_with_bed(&mut variants, "early_25.bed", Annotation::Early)?;
+/// annotate_with_bed(&mut variants, "late_25.bed", Annotation::Late)?;
+/// ```
+pub fn annotate_with_bed(
+    variants: &mut Variants,
+    bed_path: &str,
+    annotation: Annotation,
+) -> anyhow::Result<u32> {
+    let bed_rows = read_bed(bed_path)?;
+    let ranges: Vec<&GenomeRange> = bed_rows.iter().map(|b| &b.range).collect();
+
+    let total_bases: u32 = ranges.iter().map(|r| r.length()).sum();
+
+    let positions: Vec<&GenomePosition> = variants.data.iter().map(|v| &v.position).collect();
+
+    let overlaps = overlaps_par(&positions, &ranges);
+
+    for &idx in &overlaps {
+        variants.data[idx].annotations.push(annotation.clone());
+    }
+
+    Ok(total_bases)
+}
+

+ 1 - 1
src/io/fasta.rs

@@ -1,6 +1,6 @@
 use std::fs::File;
 
-// 0-based position
+// 0-based position in input
 pub fn sequence_at(
     fasta_reader: &mut noodles_fasta::IndexedReader<noodles_fasta::io::BufReader<File>>,
     contig: &str,

+ 23 - 52
src/lib.rs

@@ -1,21 +1,23 @@
-//! # Long-read Somatic Variant Calling and Analysis Framework
+//! # 🧬 Long-read Somatic Variant Calling and Analysis Framework
 //!
 //! This Rust library provides a modular, parallelizable framework for somatic variant calling, annotation, and interpretation from long-read sequencing data. It is designed to support full pipelines for research and clinical workflows across multiple variant callers and analysis stages.
 //!
-//! ## Key Features
+//! The library also serves as an extensible platform that developers can leverage to add custom features, integrate new tools, and tailor workflows to specific use cases.
+//!
+//! ## 🧩 Key Features
 //!
-//! - **Pipeline Management**: Full orchestration of Dockerized execution pipelines for tools such as ClairS, Nanomonsv, DeepVariant, Savana, Modkit, and Severus.
 //! - **POD5 Demultiplexing and Alignment**: End-to-end support for processing ONT POD5 files:
-//!     - Demux using barcode metadata and custom CSV input
-//!     - POD5 subsetting and organization by flowcell case
+//!     - Barcode-aware demultiplexing using metadata CSVs
+//!     - POD5 subsetting and organization by case
 //!     - Integration with basecallers (e.g., Dorado) for read alignment
+//! - **Pipeline Management**: Full orchestration of Dockerized execution pipelines for tools such as ClairS, Nanomonsv, DeepVariant, Savana, Modkit, and Severus.
 //! - **Flexible Configuration**: Centralized configuration system (`Config`, `CollectionsConfig`) for all modules and pipelines.
 //! - **Input Abstraction**: Unified handling of BAM, POD5, and VCF file collections across cohorts and directories.
 //! - **Variant Processing**: Modular loading, filtering, statistical analysis, and annotation of somatic and germline variants.
 //! - **Haplotype Phasing and Methylation**: Support for LongPhase-based phasing and Modkit methylation pileups with support for multi-threaded pileup and aggregation.
 //! - **Parallel Execution**: Uses `rayon` for efficient multicore parallelization over large cohorts and tasks.
 //!
-//! ## Module Highlights
+//! ## 📚 Module Highlights
 //!
 //! - `callers`: Interfaces to variant calling tools (ClairS, DeepVariant, Nanomonsv, Savana, etc...)
 //! - `runners`: Pipeline runners (e.g. `Somatic`, `SeverusSolo`, `LongphasePhase`) that manage end-to-end execution.
@@ -25,26 +27,22 @@
 //! - `functions`: Custom logic for genome assembly, entropy estimation, and internal tooling.
 //! - `positions`, `variant`, `helpers`: Utilities for SV modeling, variant filtering, position overlap logic, and helper methods.
 //!
-//! ---
-//!
-//! ## 🧬 Workflow Overview
+//! ## ⚡ Workflow Overview
 //!
 //! ### 1. 📦 From POD5 to BAM Alignment
 //!
 //! - **Demultiplexing**: POD5 files are subset and demuxed using barcodes (via CSV metadata).
-//! - **Flowcell Case Management**: Each sample is identified by a `FlowCellCase` containing its ID, time point, and POD5 directory.
-//! - **Alignment**: The `Dorado` module handles alignment of POD5 reads to reference genome, producing BAMs.
+//! - **Flowcell Case Management**: Each sample is identified by a [`collection::pod5::FlowCellCase`] containing its ID, time point, and POD5 directory.
+//! - **Alignment**: The [`commands::dorado::Dorado`] module handles alignment of POD5 reads to reference genome, producing BAMs.
 //!
 //! ```rust
 //! let case = FlowCellCase { id: "PATIENT1", time_point: "diag", barcode: "01", pod_dir: "...".into() };
 //! Dorado::init(case, Config::default())?.run_pipe()?;
 //! ```
 //!
-//! ---
-//!
 //! ### 2. 🧬 Variant Calling (BAM ➝ VCF)
 //!
-//! Using the aligned BAMs, multiple variant callers can be run in parallel. The `callers` and `runners` modules support:
+//! Using the aligned BAMs, multiple variant callers can be run in parallel. The [`callers`] and [`runners`] modules support:
 //!
 //! - **ClairS** – somatic small variant calling with LongPhase haplotagging  
 //! - **Nanomonsv** – structural variants (SV)  
@@ -60,14 +58,12 @@
 //! NanomonSV::initialize("PATIENT1", Config::default())?.run()?;
 //! ```
 //!
-//! ---
-//!
 //! ### 3. 📈 Aggregation & Statistics (VCF ➝ JSON / Stats)
 //!
 //! After variant calling:
 //!
-//! - Annotate with VEP (`annotation` module)
-//! - Load and filter with `variant_collection`
+//! - Annotate with VEP ([`annotation`] module)
+//! - Load and filter with [`variant::variant_collection`]
 //! - Compute variant and region-level stats (e.g., mutation rates, alteration categories, coding overlaps)
 //!
 //! ```rust
@@ -76,8 +72,6 @@
 //! stats.save_to_json("/output/path/stats.json.gz")?;
 //! ```
 //!
-//! ---
-//!
 //! ### 4. 🧠 Intelligent Task Management (`collection` module)
 //!
 //! - Auto-discovers available samples, POD5s, BAMs, and VCFs
@@ -90,20 +84,6 @@
 //! collections.run()?;       // Run them automatically
 //! ```
 //!
-//! ---
-//!
-//! ## 📁 Module Highlights
-//!
-//! - `callers`: Interfaces to ClairS, DeepVariant, Savana, Nanomonsv, etc.
-//! - `runners`: Pipeline runners like `Somatic` and `LongphasePhase`
-//! - `collection`: Auto-discovery of BAM/VCF/POD5s, task orchestration
-//! - `annotation`: VEP line parsing and transcript-level annotations
-//! - `pipes`: High-level pipelines (e.g., `run_somatic`, `todo_deepvariants`)
-//! - `variant`: Variant structs, filtering, alteration categories
-//! - `positions`, `helpers`, `functions`, `math`: Utility layers
-//!
-//! ---
-//!
 //! ## 🔬 Testing
 //!
 //! Integration tests demonstrate the entire pipeline. Run with logging enabled:
@@ -113,8 +93,7 @@
 //! cargo test -- --nocapture
 //! ```
 //!
-//! ---
-//! ## Example Use Cases
+//! ## 🧪 Example Use Cases
 //!
 //! - Full somatic variant calling pipeline on matched tumor/normal samples
 //! - POD5-based pipeline from raw signal to variants
@@ -122,30 +101,22 @@
 //! - Methylation analysis using nanopore-specific tools
 //! - Variant calling and analysis in large-scale longitudinal studies
 //!
-//! ## Getting Started
+//! ## 🚀 Getting Started
 //!
 //! All workflows are initialized from `Config` and driven by the `Collections` structure:
 //!
 //! ```rust
-//! let config = Config::default();
 //! let collections = Collections::new(CollectionsConfig::default())?;
 //! collections.todo()?;
 //! collections.run()?;
 //! ```
 //!
-//! ## Running Tests
-//!
-//! Run the full suite with logging enabled:
-//!
-//! ```bash
-//! export RUST_LOG=debug
-//! cargo test -- --nocapture
-//! ```
-//!
 //! ## 🔗 References
-//! ### Basecalling and alignment
+//! 
+//! **Basecalling and alignment**
 //! - Dorado: <https://github.com/nanoporetech/dorado>
-//! ### Variants Callers
+//!
+//! **Variant Callers**
 //! - ClairS: <https://github.com/HKU-BAL/ClairS>
 //! - Nanomonsv: <https://github.com/friend1ws/nanomonsv>
 //! - Savana: <https://github.com/cortes-ciriano-lab/savana>
@@ -153,7 +124,8 @@
 //! - DeepSomatic: <https://github.com/google/deepsomatic>
 //! - LongPhase: <https://github.com/PorubskyResearch/LongPhase>
 //! - Modkit: <https://github.com/nanoporetech/modkit>
-//! ### Variants annotation
+//!
+//! **Variant annotation**
 //! - VEP: <https://www.ensembl.org/info/docs/tools/vep/index.html>
 //!
 //! ---
@@ -693,11 +665,10 @@ mod tests {
     #[test]
     fn pipe_somatic() -> anyhow::Result<()> {   
         init();
-        let id = "ACHITE";
+        let id = "AOUF";
         SomaticPipe::initialize(id, Config::default())?.run()
     }
 
-   
     #[test]
     fn overlaps() {
         init();

+ 46 - 2
src/pipes/somatic.rs

@@ -343,7 +343,7 @@ impl Run for SomaticPipe {
             self.config.reference
         );
         variants_collections.iter().for_each(|c| {
-            c.annotate_with_sequence_entropy(
+            c.annotate_with_sequence_context(
                 &annotations,
                 &self.config.reference,
                 self.config.entropy_seq_len,
@@ -445,7 +445,7 @@ impl Run for SomaticPipe {
         vep_stats.save_to_json(&format!("{stats_dir}/{id}_annotations_10_vep.json"))?;
 
         VariantsStats::new(&variants, &id, &config)?
-            .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json"))?;
+            .save_to_json(&format!("{stats_dir}/{id}_variants_stats_final.json.gz"))?;
 
         info!("Final unique variants: {}", variants.data.len());
         variants.save_to_json(&result_json)?;
@@ -731,15 +731,59 @@ impl SomaticPipeStats {
     }
 }
 
+/// Aggregated statistics on variant collections, grouped by sample type.
+///
+/// The `InputStats` struct stores the number of variants observed for each
+/// caller, categorized by biological sample type:
+///
+/// - `solo_tumor`: Variants from tumor-only samples.
+/// - `solo_constit`: Variants from normal-only (constitutional) samples.
+/// - `germline`: Variants classified as germline (shared between normal and tumor).
+/// - `somatic`: Variants classified as somatic (tumor-specific).
+///
+/// Each vector contains tuples of the form `(Annotation, usize)`, where:
+/// - `Annotation` identifies the caller and sample type.
+/// - `usize` represents the number of variants for that annotation.
 #[derive(Debug, Default, Clone)]
 pub struct InputStats {
+    /// Variants from tumor-only samples.
     pub solo_tumor: Vec<(Annotation, usize)>,
+    
+    /// Variants from normal-only (constitutional) samples.
     pub solo_constit: Vec<(Annotation, usize)>,
+    
+    /// Germline variants (present in both normal and tumor).
     pub germline: Vec<(Annotation, usize)>,
+    
+    /// Somatic variants (specific to tumor).
     pub somatic: Vec<(Annotation, usize)>,
 }
 
 impl InputStats {
+    /// Constructs an `InputStats` instance by aggregating variant counts
+    /// from a list of [`VariantCollection`]s.
+    ///
+    /// Each `VariantCollection` is inspected to determine its sample type,
+    /// as indicated by the `Annotation::Callers(_, Sample)` variant. The
+    /// number of variants (`collection.variants.len()`) is recorded in the
+    /// appropriate field of `InputStats`, grouped by `Sample`.
+    ///
+    /// # Parameters
+    ///
+    /// - `collections`: A slice of [`VariantCollection`] objects containing
+    ///   variants annotated with a caller and sample type.
+    ///
+    /// # Returns
+    ///
+    /// A populated `InputStats` instance with counts per (caller, sample type).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let collections: Vec<VariantCollection> = load_collections();
+    /// let stats = InputStats::from_collections(&collections);
+    /// println!("Somatic calls: {:?}", stats.somatic);
+    /// ```
     pub fn from_collections(collections: &[VariantCollection]) -> Self {
         let mut stats = Self::default();
         for collection in collections.iter() {

+ 4 - 92
src/scan/scan.rs

@@ -279,7 +279,8 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
 
         // Skip this file if it already exists and is up-to-date compared to the input BAM,
         // unless forced by the `somatic_scan_force` flag.
-        if !is_file_older(&out_file, bam_path).unwrap_or(true) && !config.somatic_scan_force {
+        if !is_file_older(&out_file, bam_path, false).unwrap_or(true) && !config.somatic_scan_force
+        {
             continue;
         }
 
@@ -356,93 +357,6 @@ pub fn par_whole_scan(id: &str, time_point: &str, config: &Config) -> anyhow::Re
     Ok(())
 }
 
-// thread_local! {
-//     static BAM_READER: RefCell<Option<IndexedReader>> = const { RefCell::new(None) };
-// }
-//
-// pub fn par_whole_scan_local(out_dir: &str, bam_path: &str, config: &Config) -> anyhow::Result<()> {
-//     let bin_size = config.count_bin_size;
-//     let chunk_n_bin = config.count_n_chunks;
-//     info!("Starting whole genome scan for {bam_path}, with bin size of {bin_size} nt and by chunks of {chunk_n_bin} bins.");
-//     fs::create_dir_all(out_dir)?;
-//
-//     for (contig, length) in read_dict(&config.dict_file)? {
-//         let out_file = format!("{out_dir}/{contig}_count.tsv.gz");
-//
-//         // Skip this file if it already exists and is up-to-date compared to the input BAM,
-//         // unless forced by the `somatic_scan_force` flag.
-//         if !is_file_older(&out_file, bam_path).unwrap_or(true) && !config.somatic_scan_force {
-//             continue;
-//         }
-//
-//         let n_bin = length / bin_size;
-//         let n_chunks = n_bin.div_ceil(chunk_n_bin);
-//         info!("Scan of contig: {contig}");
-//
-//         let bins: Vec<BinCount> = (0..n_chunks)
-//             .into_par_iter()
-//             .flat_map(|i| {
-//                 let chunk_start = i * chunk_n_bin * bin_size;
-//                 let chunk_length = if i == n_chunks - 1 {
-//                     length - chunk_start
-//                 } else {
-//                     chunk_n_bin * bin_size
-//                 };
-//                 let n_bins_in_chunk = chunk_length.div_ceil(bin_size);
-//
-//                 // Use thread-local BAM reader
-//                 let result = BAM_READER.with(|reader_cell| {
-//                     let mut reader_ref = reader_cell.borrow_mut();
-//
-//                     // Initialize if not already set
-//                     if reader_ref.is_none() {
-//                         let reader = IndexedReader::from_path(bam_path)
-//                             .with_context(|| format!("Failed to open BAM file: {}", bam_path))
-//                             .ok()?; // handle error as Option
-//                         *reader_ref = Some(reader);
-//                     }
-//
-//                     let reader = reader_ref.as_mut().unwrap();
-//
-//                     // Seek to contig start for this chunk
-//                     let mut bins_in_chunk = Vec::new();
-//                     for j in 0..n_bins_in_chunk {
-//                         let bin_start = chunk_start + j * bin_size;
-//                         let bin_length = std::cmp::min(bin_size, chunk_length - j * bin_size);
-//                         match Bin::new(reader, &contig, bin_start, bin_length, config.bam_min_mapq)
-//                         {
-//                             Ok(bin) => bins_in_chunk.push(BinCount::from(&bin)),
-//                             Err(e) => {
-//                                 error!("Failed to get Bin at chunk {i} bin {j}: {e}");
-//                             }
-//                         }
-//                     }
-//                     Some(bins_in_chunk)
-//                 });
-//
-//                 result.into_iter().flatten().collect::<Vec<_>>()
-//             })
-//             .collect();
-//
-//         debug!("Scan {contig}, sorting bins");
-//         let mut bins = bins;
-//         bins.par_sort_unstable_by(|a, b| a.start.cmp(&b.start));
-//
-//         debug!("Scan {contig}, computing outliers");
-//         fill_outliers(&mut bins);
-//
-//         debug!("Scan {contig}, writing file");
-//
-//         let mut file = get_gz_writer(&out_file, true)
-//             .with_context(|| anyhow::anyhow!("failed to open the file: {out_file}"))?;
-//         for bin in bins {
-//             writeln!(file, "{}", bin.to_tsv_row())?;
-//         }
-//     }
-//
-//     Ok(())
-// }
-
 /// Identifies and marks outliers in a slice of `BinCount` objects based on various ratio metrics.
 ///
 /// # Parameters
@@ -563,7 +477,6 @@ pub fn somatic_scan(id: &str, config: &Config) -> anyhow::Result<()> {
     )
 }
 
-
 /// A pipeline runner for executing SomaticScan on matched tumor and normal samples.
 ///
 /// This struct encapsulates:
@@ -622,13 +535,13 @@ impl ShouldRun for SomaticScan {
                     let diag_count_file = self
                         .config
                         .somatic_scan_tumoral_count_file(&self.id, &contig);
-                    if is_file_older(&diag_count_file, diag_bam_path).unwrap_or(true) {
+                    if is_file_older(&diag_count_file, diag_bam_path, true).unwrap_or(true) {
                         return true;
                     }
                     let mrd_count_file = self
                         .config
                         .somatic_scan_normal_count_file(&self.id, &contig);
-                    if is_file_older(&mrd_count_file, mrd_bam_path).unwrap_or(true) {
+                    if is_file_older(&mrd_count_file, mrd_bam_path, true).unwrap_or(true) {
                         return true;
                     }
                 }
@@ -661,4 +574,3 @@ impl Label for SomaticScan {
         "Somatic Scan".to_string()
     }
 }
-

+ 79 - 27
src/variant/variant_collection.rs

@@ -20,15 +20,16 @@ use crate::{
         cosmic::Cosmic,
         echtvar::{parse_echtvar_val, run_echtvar},
         gnomad::GnomAD,
+        parse_trinuc,
         vep::{get_best_vep, run_vep, VepLine, VEP},
-        Annotation, Annotations,
+        Annotation, Annotations, ReplicationClass,
     },
     collection::{
         bam::{counts_at, counts_ins_at},
         vcf::Vcf,
     },
     helpers::{app_storage_dir, estimate_shannon_entropy, mean, temp_file_path, Hash128},
-    io::{fasta::sequence_at, readers::get_reader, vcf::vcf_header},
+    io::{bed::annotate_with_bed, fasta::sequence_at, readers::get_reader, vcf::vcf_header},
     positions::{GenomePosition, GetGenomePosition},
 };
 
@@ -215,28 +216,30 @@ impl VariantCollection {
         optimal_chunk_size.max(min_chunk_size)
     }
 
-    /// Annotates variants with sequence entropy information.
+    /// Annotates variants with sequence entropy and trinucleotide context information.
     ///
-    /// This function calculates and adds Shannon entropy annotations to the variants
-    /// based on the surrounding sequence context. It processes variants in parallel
-    /// chunks for improved performance.
+    /// This function calculates and adds Shannon entropy and the central trinucleotide
+    /// context from the surrounding sequence. It processes variants in parallel chunks
+    /// for improved performance.
     ///
     /// # Arguments
-    /// * `annotations` - A reference to the Annotations structure to store the results
-    /// * `reference` - Path to the reference FASTA file
+    /// * `annotations` - A reference to the Annotations structure to store the results.
+    /// * `reference` - Path to the reference FASTA file.
     /// * `seq_len` - Length of the sequence context to consider for entropy calculation
-    /// * `max_threads` - Maximum number of threads to use for parallel processing
+    ///               (must be >= 3 to extract trinucleotide context).
+    /// * `max_threads` - Maximum number of threads to use for parallel processing.
     ///
     /// # Behavior
-    /// - Processes variants in parallel chunks
-    /// - For each variant, retrieves the surrounding sequence from the reference
-    /// - Calculates Shannon entropy for the sequence
-    /// - Adds the entropy value as an annotation to the variant
-    /// - Skips variants that already have a Shannon entropy annotation
+    /// - For each variant:
+    ///   - Retrieves the surrounding sequence from the reference FASTA.
+    ///   - Calculates Shannon entropy and adds it as `Annotation::ShannonEntropy(f64)`
+    ///     if not already annotated.
+    ///   - Extracts and adds `Annotation::TriNucleotides([Base; 3])` centered on the variant
+    ///     if not already annotated.
     ///
     /// # Panics
     /// This function will panic if it fails to build the FASTA reader from the provided reference path.
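+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // Illustrative only; argument names mirror the `# Arguments` list above.
+    /// collection.annotate_with_sequence_context(&annotations, reference, seq_len, max_threads);
+    /// ```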
-    pub fn annotate_with_sequence_entropy(
+    pub fn annotate_with_sequence_context(
         &self,
         annotations: &Annotations,
         reference: &str,
@@ -254,18 +257,33 @@ impl VariantCollection {
                     let key = c.hash();
                     let mut anns = annotations.store.entry(key).or_default();
 
-                    if !anns
-                        .iter()
-                        .any(|e| matches!(e, Annotation::ShannonEntropy(_)))
-                    {
-                        if let Ok(r) = sequence_at(
-                            &mut fasta_reader,
-                            &c.position.contig(),
-                            c.position.position as usize,
-                            seq_len,
-                        ) {
-                            let ent = estimate_shannon_entropy(r.as_str());
-                            anns.push(Annotation::ShannonEntropy(ent));
+                    if let Ok(seq) = sequence_at(
+                        &mut fasta_reader,
+                        &c.position.contig(),
+                        c.position.position as usize,
+                        seq_len,
+                    ) {
+                        // Shannon Entropy
+                        if !anns
+                            .iter()
+                            .any(|e| matches!(e, Annotation::ShannonEntropy(_)))
+                        {
+                            let entropy = estimate_shannon_entropy(seq.as_str());
+                            anns.push(Annotation::ShannonEntropy(entropy));
+                        }
+
+                        // Trinucleotide context
+                        if !anns
+                            .iter()
+                            .any(|e| matches!(e, Annotation::TriNucleotides(_)))
+                        {
+                            let center = seq_len / 2;
+                            if center >= 1 && seq.len() >= center + 2 {
+                                let trinuc = &seq[center - 1..=center + 1];
+                                let bases = parse_trinuc(trinuc);
+                                debug!("{bases:?}");
+                                anns.push(Annotation::TriNucleotides(bases));
+                            }
                         }
                     }
                 }
@@ -638,6 +656,40 @@ impl Variants {
         self.data = result;
     }
 
+    /// Annotates variants based on overlap with early and late replication timing regions.
+    ///
+    /// This function applies `Annotation::ReplicationTiming(ReplicationClass::Early)` to variants
+    /// that overlap regions in the provided early BED file, and
+    /// `Annotation::ReplicationTiming(ReplicationClass::Late)` to variants overlapping the late BED file.
+    ///
+    /// # Arguments
+    /// * `early_bed` - Path to a BED file representing early-replicating regions.
+    /// * `late_bed` - Path to a BED file representing late-replicating regions.
+    ///
+    /// # Returns
+    /// * `Ok(())` on success, or an error if BED files cannot be read or parsed.
+    ///
+    /// # Example
+    /// ```
+    /// variants.annotate_replication_timing("early_25.bed", "late_25.bed")?;
+    /// ```
+    pub fn annotate_replication_timing(
+        &mut self,
+        early_bed: &str,
+        late_bed: &str,
+    ) -> anyhow::Result<()> {
+        annotate_with_bed(
+            self,
+            early_bed,
+            Annotation::ReplicationTiming(ReplicationClass::Early),
+        )?;
+        annotate_with_bed(
+            self,
+            late_bed,
+            Annotation::ReplicationTiming(ReplicationClass::Late),
+        )?;
+        Ok(())
+    }
+
     /// Filters and returns variants of a specific alteration category.
     ///
     /// This method creates a new vector containing clones of all variants