Thomas 9 months ago
parent
commit
ad0958ddb2
8 changed files with 729 additions and 73 deletions
  1. 3 1
      src/annotation/mod.rs
  2. 213 58
      src/annotation/vep.rs
  3. 1 1
      src/io/vcf.rs
  4. 16 0
      src/lib.rs
  5. 11 1
      src/pipes/somatic.rs
  6. 32 1
      src/positions.rs
  7. 15 1
      src/variant/variant.rs
  8. 438 10
      src/variant/variant_collection.rs

+ 3 - 1
src/annotation/mod.rs

@@ -26,7 +26,7 @@ use gnomad::GnomAD;
 use log::info;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
-use vep::{get_best_vep, VepConsequence, VepImpact, VEP};
+use vep::{get_best_vep, VEP};
 
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum Annotation {
@@ -50,6 +50,7 @@ pub enum Sample {
     Germline,
     Somatic,
 }
+
 impl fmt::Display for Sample {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
@@ -78,6 +79,7 @@ impl FromStr for Sample {
         }
     }
 }
+
 impl fmt::Display for Annotation {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let str = match self {

+ 213 - 58
src/annotation/vep.rs

@@ -12,21 +12,39 @@ use std::{
 
 use super::ncbi::NCBIAcc;
 
+/// Represents a single line of output from the Variant Effect Predictor (VEP).
+///
+/// This struct encapsulates all fields typically found in a VEP output line,
+/// providing a structured way to work with VEP results.
 #[derive(Debug, PartialEq, Serialize, Deserialize)]
 pub struct VepLine {
+    /// The identifier of the input variant
     pub uploaded_variation: String,
+    /// The genomic location of the variant
     pub location: String,
+    /// The variant allele
     pub allele: String,
+    /// The gene symbol or identifier
     pub gene: String,
+    /// The affected feature (e.g., transcript ID)
     pub feature: String,
+    /// The type of feature affected (e.g., Transcript, RegulatoryFeature)
     pub feature_type: String,
+    /// The consequence of the variant
     pub consequence: String,
+    /// The position of the variant in the cDNA
     pub cdna_position: String,
+    /// The position of the variant in the CDS
     pub cds_position: String,
+    /// The position of the variant in the protein
     pub protein_position: String,
+    /// The amino acid change caused by the variant
     pub amino_acids: String,
+    /// The codon change caused by the variant
     pub codons: String,
+    /// Known identifiers for this variant
     pub existing_variation: String,
+    /// Additional information in key=value format
     pub extra: String,
 }
 
@@ -58,92 +76,174 @@ impl FromStr for VepLine {
     }
 }
 
+/// Represents a Variant Effect Predictor (VEP) annotation for a genetic variant.
+///
+/// This struct encapsulates various fields that describe the predicted effects
+/// of a variant on genes, transcripts, and proteins, as determined by VEP.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct VEP {
+    /// The gene affected by the variant
     pub gene: Option<String>,
+    /// The specific feature (e.g., transcript) affected by the variant
     pub feature: Option<String>,
+    /// The type of feature (e.g., Transcript, RegulatoryFeature)
     pub feature_type: Option<String>,
+    /// The predicted consequence(s) of the variant
     pub consequence: Option<Vec<VepConsequence>>,
+    /// The position of the variant in the cDNA sequence
     pub cdna_position: Option<String>,
+    /// The position of the variant in the coding sequence
     pub cds_position: Option<String>,
+    /// The position of the variant in the protein sequence
     pub protein_position: Option<String>,
+    /// The amino acid change caused by the variant
     pub amino_acids: Option<String>,
+    /// The codon change caused by the variant
     pub codons: Option<String>,
+    /// Known identifiers for this variant
     pub existing_variation: Option<String>,
+    /// Additional VEP information
     pub extra: VEPExtra,
 }
 
-// ensembl.org/info/genome/variation/prediction/predicted_data.html
+/// Represents the predicted consequences of a genetic variant as determined by
+/// the Ensembl Variant Effect Predictor (VEP).
+///
+/// This enum aligns with the Sequence Ontology (SO) terms used by VEP to describe
+/// the effects of variants on transcripts, proteins, and regulatory regions.
+///
+/// The variants are generally ordered from most to least severe in terms of their
+/// potential impact on gene function.
+///
+/// For more information, see:
+/// https://ensembl.org/info/genome/variation/prediction/predicted_data.html
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub enum VepConsequence {
+    /// Complete destruction of a transcript
     TranscriptAblation,
+    /// Variant at the splice acceptor site
     SpliceAcceptorVariant,
+    /// Variant at the splice donor site
     SpliceDonorVariant,
+    /// Variant causing a premature stop codon
     StopGained,
+    /// Insertion or deletion causing a frameshift
     FrameshiftVariant,
+    /// Variant causing loss of stop codon
     StopLost,
+    /// Variant causing loss of start codon
     StartLost,
+    /// Amplification of a transcript
     TranscriptAmplification,
+    /// Insertion not causing a frameshift
     InframeInsertion,
+    /// Deletion not causing a frameshift
     InframeDeletion,
+    /// Variant causing an amino acid change
     MissenseVariant,
+    /// Variant that changes protein structure
     ProteinAlteringVariant,
+    /// Variant at the 5th base of a splice donor site
     SpliceDonor5thBaseVariant,
+    /// Variant in the splice region
     SpliceRegionVariant,
+    /// Variant in the splice donor region
     SpliceDonorRegionVariant,
+    /// Variant in the polypyrimidine tract near splice sites
     SplicePolyrimidineTractVariant,
+    /// Variant in the final, incomplete codon of a transcript
     IncompleteTerminalCodonVariant,
+    /// Variant in start codon resulting in no change
     StartRetainedVariant,
+    /// Variant in stop codon resulting in no change
     StopRetainedVariant,
+    /// Variant not changing the encoded amino acid
     SynonymousVariant,
+    /// Variant in coding sequence with indeterminate effect
     CodingSequenceVariant,
+    /// Variant in mature miRNA
     MatureMiRnaVariant,
+    /// Variant in 5' UTR
     FivePrimeUtrVariant,
+    /// Variant in 3' UTR
     ThreePrimeUtrVariant,
+    /// Variant in non-coding exon
     NonCodingTranscriptExonVariant,
+    /// Variant in intron
     IntronVariant,
+    /// Variant in transcript subject to nonsense-mediated decay
     NmdTranscriptVariant,
+    /// Variant in non-coding transcript
     NonCodingTranscriptVariant,
+    /// Variant upstream of a gene
     UpstreamGeneVariant,
+    /// Variant downstream of a gene
     DownstreamGeneVariant,
+    /// Deletion of a transcription factor binding site
     TfbsAblation,
+    /// Amplification of a transcription factor binding site
     TfbsAmplification,
+    /// Variant in a transcription factor binding site
     TfBindingSiteVariant,
+    /// Deletion of a regulatory region
     RegulatoryRegionAblation,
+    /// Amplification of a regulatory region
     RegulatoryRegionAmplification,
+    /// Variant causing a feature to be extended
     FeatureElongation,
+    /// Variant in a regulatory region
     RegulatoryRegionVariant,
+    /// Variant causing a feature to be shortened
     FeatureTruncation,
+    /// Variant in intergenic region
     IntergenicVariant,
+    /// General sequence variant
     SequenceVariant,
 }
 
+/// Represents the severity of a variant's impact as predicted by the
+/// Variant Effect Predictor (VEP).
+///
+/// The impact categories are ordered from most severe (HIGH) to least severe (MODIFIER).
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub enum VepImpact {
+    /// High impact variants are expected to have high (disruptive) impact in the protein,
+    /// probably causing protein truncation, loss of function or triggering nonsense mediated decay.
     HIGH,
+    /// Moderate impact variants are non-disruptive variants that might change protein effectiveness.
     MODERATE,
+    /// Low impact variants are mostly harmless or unlikely to change protein behavior.
     LOW,
+    /// Modifier variants are usually non-coding variants or variants affecting non-coding genes,
+    /// where predictions are difficult or there is no evidence of impact.
     MODIFIER,
 }
 
 impl From<VepImpact> for String {
+    /// Converts a VepImpact enum variant to its string representation.
     fn from(value: VepImpact) -> Self {
         match value {
             VepImpact::HIGH => "High",
             VepImpact::MODERATE => "Moderate",
             VepImpact::LOW => "Low",
             VepImpact::MODIFIER => "Modifier",
-        }.to_string()
+        }
+        .to_string()
     }
 }
 
 impl PartialOrd for VepImpact {
+    /// Implements partial ordering for VepImpact.
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
 impl Ord for VepImpact {
+    /// Implements total ordering for VepImpact.
+    ///
+    /// The ordering is inverse to severity: HIGH < MODERATE < LOW < MODIFIER
+    /// This allows for easy sorting where the most severe impacts come first.
     fn cmp(&self, other: &Self) -> Ordering {
         match (self, other) {
             (VepImpact::HIGH, VepImpact::HIGH) => Ordering::Equal,
@@ -224,56 +324,6 @@ impl From<&VepConsequence> for VepImpact {
     }
 }
 
-// impl VepImpact {
-//     pub fn from_consequence(consequence: &VepConsequence) -> VepImpact {
-//         match consequence {
-//             VepConsequence::TranscriptAblation
-//             | VepConsequence::SpliceAcceptorVariant
-//             | VepConsequence::SpliceDonorVariant
-//             | VepConsequence::StopGained
-//             | VepConsequence::FrameshiftVariant
-//             | VepConsequence::StopLost
-//             | VepConsequence::StartLost
-//             | VepConsequence::TranscriptAmplification
-//             | VepConsequence::FeatureElongation
-//             | VepConsequence::FeatureTruncation => VepImpact::HIGH,
-//
-//             VepConsequence::InframeInsertion
-//             | VepConsequence::InframeDeletion
-//             | VepConsequence::MissenseVariant
-//             | VepConsequence::ProteinAlteringVariant => VepImpact::MODERATE,
-//
-//             VepConsequence::SpliceDonor5thBaseVariant
-//             | VepConsequence::SpliceRegionVariant
-//             | VepConsequence::SpliceDonorRegionVariant
-//             | VepConsequence::SplicePolyrimidineTractVariant
-//             | VepConsequence::IncompleteTerminalCodonVariant
-//             | VepConsequence::StartRetainedVariant
-//             | VepConsequence::StopRetainedVariant
-//             | VepConsequence::SynonymousVariant => VepImpact::LOW,
-//
-//             VepConsequence::CodingSequenceVariant
-//             | VepConsequence::MatureMiRnaVariant
-//             | VepConsequence::FivePrimeUtrVariant
-//             | VepConsequence::ThreePrimeUtrVariant
-//             | VepConsequence::NonCodingTranscriptExonVariant
-//             | VepConsequence::IntronVariant
-//             | VepConsequence::NmdTranscriptVariant
-//             | VepConsequence::NonCodingTranscriptVariant
-//             | VepConsequence::UpstreamGeneVariant
-//             | VepConsequence::DownstreamGeneVariant
-//             | VepConsequence::TfbsAblation
-//             | VepConsequence::TfbsAmplification
-//             | VepConsequence::TfBindingSiteVariant
-//             | VepConsequence::RegulatoryRegionAblation
-//             | VepConsequence::RegulatoryRegionAmplification
-//             | VepConsequence::RegulatoryRegionVariant
-//             | VepConsequence::SequenceVariant
-//             | VepConsequence::IntergenicVariant => VepImpact::MODIFIER,
-//         }
-//     }
-// }
-
 impl From<VepConsequence> for String {
     fn from(consequence: VepConsequence) -> Self {
         match consequence {
@@ -424,18 +474,48 @@ impl TryFrom<&VepLine> for VEP {
     }
 }
 
+/// Represents additional information from Variant Effect Predictor (VEP) annotations.
+///
+/// This struct encapsulates various optional fields that provide extra context
+/// about a variant's predicted effect.
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct VEPExtra {
+    /// The impact severity of the variant (e.g., HIGH, MODERATE, LOW, MODIFIER)
     pub impact: Option<VepImpact>,
+    /// The gene symbol associated with the variant
     pub symbol: Option<String>,
+    /// The distance to the feature (e.g., for upstream/downstream variants)
     pub distance: Option<u32>,
+    /// HGVS notation describing the variant at the coding sequence level
     pub hgvs_c: Option<String>,
+    /// HGVS notation describing the variant at the protein level
     pub hgvs_p: Option<String>,
 }
 
 impl FromStr for VEPExtra {
     type Err = anyhow::Error;
 
+    /// Parses a VEPExtra instance from a string representation.
+    ///
+    /// This method interprets a semicolon-separated key-value string typically
+    /// found in VEP output and constructs a VEPExtra struct from it.
+    ///
+    /// # Arguments
+    /// * `s` - A string slice containing the VEP extra information
+    ///
+    /// # Returns
+    /// * `Ok(VEPExtra)` if parsing is successful
+    /// * `Err(anyhow::Error)` if parsing fails
+    ///
+    /// # Example
+    /// ```
+    /// let vep_extra = VEPExtra::from_str("IMPACT=MODERATE;SYMBOL=BRCA1;DISTANCE=500;HGVSc=c.1A>T;HGVSp=p.Met1?");
+    /// assert!(vep_extra.is_ok());
+    /// ```
+    ///
+    /// # Note
+    /// This method is flexible and will not fail if some fields are missing.
+    /// It will simply set those fields to None in the resulting struct.
     fn from_str(s: &str) -> anyhow::Result<Self> {
         let kv: HashMap<_, _> = s.split(';').filter_map(|e| e.split_once('=')).collect();
 
@@ -455,17 +535,56 @@ impl FromStr for VEPExtra {
     }
 }
 
-// VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
+/// Runs the Variant Effect Predictor (VEP) on a given input file and outputs the results.
+///
+/// This function executes the VEP tool with specific parameters to annotate genetic variants.
+/// It uses a predefined set of reference data and plugins to enhance the annotation process.
+///
+/// # Arguments
+/// * `in_path` - A string slice that holds the path to the input file
+/// * `out_path` - A string slice that holds the path where the output should be written
+///
+/// # Returns
+/// * `Ok(())` if VEP runs successfully
+/// * `Err(anyhow::Error)` if there's an error during execution
+///
+/// # Configuration
+/// The function uses hardcoded paths for:
+/// - VEP binary directory
+/// - VEP cache directory
+/// - Reference FASTA file
+/// - GFF annotation file
+///
+/// # VEP Parameters
+/// - Uses offline cache
+/// - Includes symbol information
+/// - Applies SpliceRegion and Downstream plugins
+/// - Generates HGVS notations
+///
+/// # Error Handling
+/// - Logs any lines containing "error" from VEP's stderr output as warnings
+/// - Returns an error if the VEP process fails to complete
+///
+/// # Example
+/// ```
+/// match run_vep("input.vcf", "output.vcf") {
+///     Ok(()) => println!("VEP annotation completed successfully"),
+///     Err(e) => eprintln!("VEP annotation failed: {}", e),
+/// }
+/// ```
+///
+/// # Note
+/// Ensure that the VEP tool and all necessary reference data are correctly installed
+/// and accessible at the specified paths before running this function.
 pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
+    // VEP need plugin Downstream and SpliceRegion /home/prom/.vep/Plugins
     debug!("Run VEP for {in_path} and ouput {out_path}");
 
     let bin_dir = "/data/tools/ensembl-vep";
     let dir_cache = "/data/ref/hs1/vepcache/";
     let fasta = "/data/ref/hs1/chm13v2.0.fa";
     let gff = "/data/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz";
-    // let gff = "/data/ref/hs1/ncbi_dataset/data/GCF_009914755.1/genomic_chr_sorted.gff.gz";
 
-    // info!("Running VEP for {}", in_path);
     let mut cmd = Command::new(format!("{}/vep", bin_dir))
         .arg("--dir_cache")
         .arg(dir_cache)
@@ -486,11 +605,8 @@ pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
         .arg("-o")
         .arg(out_path)
         .stderr(Stdio::piped())
-        // .stderr(Stdio::null())
         .spawn()
         .expect("VEP failed to start");
-    // .stderr
-    // .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "Could not capture standard output.")).unwrap();
 
     let stderr = cmd.stderr.take().unwrap();
     let reader = BufReader::new(stderr);
@@ -505,6 +621,45 @@ pub fn run_vep(in_path: &str, out_path: &str) -> anyhow::Result<()> {
     Ok(())
 }
 
+/// Selects the best Variant Effect Predictor (VEP) annotation from a slice of VEP annotations.
+///
+/// This function implements a prioritization algorithm to choose the most relevant VEP annotation
+/// based on impact severity and transcript preference.
+///
+/// # Arguments
+/// * `d` - A slice of VEP annotations
+///
+/// # Returns
+/// * `Ok(VEP)` containing the best VEP annotation if successful
+/// * `Err(anyhow::Error)` if no valid VEP annotation can be selected
+///
+/// # Algorithm
+/// 1. Groups VEPs by their impact (HIGH > MODERATE > LOW > MODIFIER)
+/// 2. Selects the group with the highest impact
+/// 3. If there's only one VEP in this group, it's returned
+/// 4. Otherwise, further filtering is done based on transcript identifiers:
+///    - Parses NCBI accessions from the feature field
+///    - Prioritizes 'NM' prefixed accessions over others
+///    - Sorts by accession number (higher numbers are preferred)
+///
+/// # Errors
+/// This function will return an error if:
+/// * The input slice is empty
+/// * No valid NCBI accession can be parsed from the selected VEPs
+///
+/// # Example
+/// ```
+/// let veps = vec![VEP { /* ... */ }, VEP { /* ... */ }];
+/// match get_best_vep(&veps) {
+///     Ok(best_vep) => println!("Best VEP: {:?}", best_vep),
+///     Err(e) => eprintln!("Failed to get best VEP: {}", e),
+/// }
+/// ```
+///
+/// # Note
+/// This function assumes that the VEP annotations are well-formed and contain
+/// the necessary fields for comparison (consequence, feature). It will log warnings
+/// for any parsing errors encountered during the selection process.
 pub fn get_best_vep(d: &[VEP]) -> anyhow::Result<VEP> {
     let best_impact_veps = d
         .iter()

+ 1 - 1
src/io/vcf.rs

@@ -77,7 +77,7 @@ pub fn vcf_header(dict: &str) -> anyhow::Result<Vec<String>> {
     header.push("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">".to_string());
 
     // version
-    header.push(format!("##Pandora_version={}", env!("CARGO_PKG_VERSION")));
+    header.push(format!("##Pandora_lib_version={}", env!("CARGO_PKG_VERSION")));
 
     // contig
     read_dict(dict)?

+ 16 - 0
src/lib.rs

@@ -445,6 +445,10 @@ mod tests {
         let var_string = variant.into_vcf_row();
         assert_eq!(row, &var_string);
 
+        let row = "chr1\t161417408\tr_10_0\tT\t[chr1:161417447[TTGGCAGGTTCC\t.\tPASS\tSVTYPE=BND;MATEID=r_10_1;SVINSLEN=11;SVINSSEQ=TTGGCAGGTTC\tTR:VR\t22:3\t12:0";
+        let variant: VcfVariant = row.parse()?;
+        println!("{variant:#?}");
+
         Ok(())
     }
 
@@ -471,6 +475,18 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn variant_load_nanomonsv() -> anyhow::Result<()> {   
+        init();
+        let id = "ADJAGBA";
+        let nanomonsv = NanomonSV::initialize(id, Config::default())?;
+        let annotations = Annotations::default();
+        let variant_collection = nanomonsv.variants(&annotations)?;
+        println!("NanomonSV for {id}: variants {} {}", variant_collection.variants.len(), variant_collection.vcf.n_variants);
+        println!("{:?}", variant_collection.variants.first());
+        Ok(())
+    }
+
     #[test]
     fn variant_load_clairs_germline() -> anyhow::Result<()> {   
         init();

+ 11 - 1
src/pipes/somatic.rs

@@ -1,5 +1,5 @@
 use itertools::Itertools;
-use log::{debug, info};
+use log::info;
 use std::{
     collections::HashMap,
     fs::{self, File},
@@ -496,6 +496,16 @@ impl Run for Somatic {
 
         annotations.vep_stats()?;
 
+        // Annotate echtvar
+        info!("GnomAD and Cosmic annotation.");
+        variants_collections
+            .iter()
+            .try_for_each(|c| -> anyhow::Result<()> {
+                let ext_annot = ExternalAnnotation::init()?;
+                ext_annot.annotate(&c.variants, &annotations)?;
+                Ok(())
+            })?;
+
         let variants = variants_collections.into_iter().fold(
             Variants::default(),
             |mut acc, variants_collection| {

+ 32 - 1
src/positions.rs

@@ -5,7 +5,38 @@ use anyhow::Context;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
-// 0-based
+/// Represents a position in the genome using 0-based coordinates.
+///
+/// This struct encapsulates a genomic position, consisting of a contig (chromosome)
+/// identifier and a position within that contig.
+///
+/// # Fields
+/// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
+/// * `position` - A 32-bit unsigned integer representing the 0-based position within the contig
+///
+/// # Derives
+/// This struct derives the following traits:
+/// * `Debug` for easier debugging and logging
+/// * `PartialEq` and `Eq` for equality comparisons
+/// * `Clone` for creating deep copies
+/// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
+/// * `Hash` for use in hash-based collections
+/// * `Default` for creating a default instance
+///
+/// # Note
+/// The position is 0-based, meaning the first position in a contig is represented as 0.
+///
+/// # Usage
+/// This struct is typically used to precisely locate genomic features or variants
+/// within a reference genome.
+///
+/// # Example
+/// ```
+/// let position = GenomePosition {
+///     contig: 1,  // Representing chromosome 1
+///     position: 1000,  // 1001st base pair in the chromosome (remember it's 0-based)
+/// };
+/// ```
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default)]
 pub struct GenomePosition {
     pub contig: u8,

+ 15 - 1
src/variant/variant.rs

@@ -46,7 +46,7 @@ impl FromStr for VcfVariant {
             .try_into()
             .context(format!("Can't parse position from: {s}"))?;
 
-        let formats = if v.len() == 10 {
+        let formats = if v.len() >= 10 {
             (
                 *v.get(8).ok_or(anyhow!("Can't parse formats from: {s}"))?,
                 *v.get(9).ok_or(anyhow!("Can't parse formats from: {s}"))?,
@@ -197,6 +197,20 @@ impl VcfVariant {
         }
     }
 
+    /// Parses and constructs a BND (breakend) description from the alternative string.
+    ///
+    /// This function interprets the BND notation in the alternative string and creates
+    /// a `BNDDesc` struct containing detailed information about the breakend.
+    ///
+    /// # Returns
+    /// - `Ok(BNDDesc)` if parsing is successful
+    /// - `Err` if parsing fails or if the alteration is not a BND
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// - The alteration category is not BND
+    /// - The alternative string cannot be parsed into exactly 3 parts
+    /// - The b_position cannot be parsed as a number
     pub fn bnd_desc(&self) -> anyhow::Result<BNDDesc> {
         let alt = self.alternative.to_string();
         if self.alteration_category() == AlterationCategory::BND {

+ 438 - 10
src/variant/variant_collection.rs

@@ -32,6 +32,24 @@ use crate::{
     positions::GenomePosition,
 };
 
+/// A collection of VCF variants along with associated metadata.
+///
+/// This struct represents a set of variants from a VCF file, including
+/// the original VCF metadata and the caller annotation.
+///
+/// # Fields
+/// * `variants` - A vector of VcfVariant instances representing individual variants
+/// * `vcf` - The Vcf struct containing metadata from the original VCF file
+/// * `caller` - An Annotation indicating the variant caller used
+///
+/// # Derives
+/// This struct derives Debug and Clone traits, allowing for easy debugging
+/// and cloning of VariantCollection instances.
+///
+/// # Usage
+/// This struct is typically used to group related variants from a single VCF file,
+/// preserving the context in which they were called and allowing for collective
+/// operations on the set of variants.
 #[derive(Debug, Clone)]
 pub struct VariantCollection {
     pub variants: Vec<VcfVariant>,
@@ -40,19 +58,130 @@ pub struct VariantCollection {
 }
 
 impl VariantCollection {
+    /// Returns a vector of hash keys for all variants in the collection.
+    ///
+    /// This method generates a unique hash for each variant in the collection
+    /// and returns these hashes as a vector.
+    ///
+    /// # Returns
+    /// A `Vec<Hash128>` containing the hash of each variant in the collection.
+    ///
+    /// # Performance
+    /// This method iterates over all variants in the collection, so its time complexity
+    /// is O(n) where n is the number of variants.
+    ///
+    /// # Usage
+    /// This method is useful for obtaining unique identifiers for each variant,
+    /// which can be used for indexing, comparison, or lookup operations.
+    ///
+    /// # Example
+    /// ```
+    /// let variant_keys = variant_collection.keys();
+    /// for key in variant_keys {
+    ///     println!("Variant hash: {:?}", key);
+    /// }
+    /// ```
     pub fn keys(&self) -> Vec<Hash128> {
         self.variants.iter().map(|v| v.hash()).collect()
     }
 
+    /// Retains only the variants whose hash keys are present in the provided set.
+    ///
+    /// This method filters the variants in the collection, keeping only those
+    /// whose hash keys are found in the `keys_to_keep` set.
+    ///
+    /// # Arguments
+    /// * `keys_to_keep` - A `HashSet<Hash128>` containing the hash keys of variants to retain
+    ///
+    /// # Effects
+    /// This method modifies the `VariantCollection` in place, potentially reducing
+    /// the number of variants it contains.
+    ///
+    /// # Performance
+    /// The time complexity is O(n), where n is the number of variants in the collection.
+    /// The space complexity is O(1) as it operates in place.
+    ///
+    /// # Usage
+    /// This method is useful for filtering variants based on a pre-computed set of keys,
+    /// which can be helpful in various scenarios such as:
+    /// - Removing duplicates
+    /// - Keeping only variants that meet certain criteria
+    /// - Synchronizing variants across different collections
+    ///
+    /// # Example
+    /// ```
+    /// let mut variant_collection = VariantCollection::new();
+    /// // ... populate variant_collection ...
+    ///
+    /// let keys_to_keep: HashSet<Hash128> = some_filtering_function();
+    /// variant_collection.retain_keys(&keys_to_keep);
+    /// ```
     pub fn retain_keys(&mut self, keys_to_keep: &HashSet<Hash128>) {
         self.variants.retain(|v| keys_to_keep.contains(&v.hash()));
     }
 
+    /// Removes variants whose hash keys are present in the provided set.
+    ///
+    /// This method filters the variants in the collection, removing those
+    /// whose hash keys are found in the `keys_to_remove` set.
+    ///
+    /// # Arguments
+    /// * `keys_to_remove` - A `HashSet<Hash128>` containing the hash keys of variants to remove
+    ///
+    /// # Effects
+    /// This method modifies the `VariantCollection` in place, potentially reducing
+    /// the number of variants it contains.
+    ///
+    /// # Performance
+    /// The time complexity is O(n), where n is the number of variants in the collection.
+    /// The space complexity is O(1) as it operates in place.
+    ///
+    /// # Usage
+    /// This method is useful for filtering out unwanted variants based on a pre-computed set of keys,
+    /// which can be helpful in various scenarios such as:
+    /// - Removing variants that fail certain criteria
+    /// - Eliminating duplicates identified by their hash keys
+    /// - Excluding variants present in another dataset
+    ///
+    /// # Example
+    /// ```
+    /// let mut variant_collection = VariantCollection::new();
+    /// // ... populate variant_collection ...
+    ///
+    /// let keys_to_remove: HashSet<Hash128> = some_filtering_function();
+    /// variant_collection.remove_keys(&keys_to_remove);
+    /// ```
     pub fn remove_keys(&mut self, keys_to_remove: &HashSet<Hash128>) {
         self.variants
             .retain(|v| !keys_to_remove.contains(&v.hash()));
     }
 
+    /// Partitions the VcfVariants into two sets based on a given predicate.
+    ///
+    /// This function splits the current VcfVariants instance into two new instances,
+    /// where the variants are divided based on whether they satisfy the given predicate.
+    ///
+    /// # Arguments
+    /// * `predicate` - A closure that takes a reference to a VcfVariant and returns a boolean
+    ///
+    /// # Returns
+    /// A tuple of two VcfVariants instances:
+    /// * The first contains all variants for which the predicate returned true
+    /// * The second contains all variants for which the predicate returned false
+    ///
+    /// # Type Parameters
+    /// * `F` - The type of the predicate function, which must implement Fn(&VcfVariant) -> bool
+    ///
+    /// # Behavior
+    /// - Consumes the original VcfVariants instance
+    /// - Creates two new VcfVariants instances with the partitioned data
+    /// - The VCF and caller information are cloned for the first returned instance
+    ///   and moved into the second returned instance
+    ///
+    /// # Examples
+    /// ```
+    /// let (passing, failing) = vcf_variants.partition(|variant| variant.quality.map_or(false, |q| q > 30));
+    /// ```
     pub fn partition<F>(self, predicate: F) -> (Self, Self)
     where
         F: Fn(&VcfVariant) -> bool,
@@ -77,7 +206,7 @@ impl VariantCollection {
         )
     }
 
-    pub fn chunk_size(&self, max_threads: u8) -> usize {
+    fn chunk_size(&self, max_threads: u8) -> usize {
         let total_items = self.variants.len();
         let min_chunk_size = 1000;
         let max_chunks = max_threads;
@@ -86,6 +215,27 @@ impl VariantCollection {
         optimal_chunk_size.max(min_chunk_size)
     }
 
+    /// Annotates variants with sequence entropy information.
+    ///
+    /// This function calculates and adds Shannon entropy annotations to the variants
+    /// based on the surrounding sequence context. It processes variants in parallel
+    /// chunks for improved performance.
+    ///
+    /// # Arguments
+    /// * `annotations` - A reference to the Annotations structure to store the results
+    /// * `reference` - Path to the reference FASTA file
+    /// * `seq_len` - Length of the sequence context to consider for entropy calculation
+    /// * `max_threads` - Maximum number of threads to use for parallel processing
+    ///
+    /// # Behavior
+    /// - Processes variants in parallel chunks
+    /// - For each variant, retrieves the surrounding sequence from the reference
+    /// - Calculates Shannon entropy for the sequence
+    /// - Adds the entropy value as an annotation to the variant
+    /// - Skips variants that already have a Shannon entropy annotation
+    ///
+    /// # Panics
+    /// This function will panic if it fails to build the FASTA reader from the provided reference path.
     pub fn annotate_with_sequence_entropy(
         &self,
         annotations: &Annotations,
@@ -122,6 +272,34 @@ impl VariantCollection {
             });
     }
 
+    /// Annotates variants with information from a constitutional BAM file.
+    ///
+    /// This function processes variants in parallel chunks and adds annotations
+    /// based on the read counts from a provided BAM file. It calculates and adds
+    /// annotations for the number of reads supporting the alternative allele and
+    /// the total read depth at each variant position.
+    ///
+    /// # Arguments
+    /// * `annotations` - A reference to the Annotations structure to store the results
+    /// * `constit_bam_path` - Path to the constituent BAM file
+    /// * `max_threads` - Maximum number of threads to use for parallel processing
+    ///
+    /// # Returns
+    /// * `Ok(())` if the annotation process completes successfully
+    /// * `Err` if there's an error opening the BAM file or processing the variants
+    ///
+    /// # Behavior
+    /// - Processes variants in parallel chunks
+    /// - For each variant, retrieves read counts from the BAM file
+    /// - Calculates the number of reads supporting the alternative allele and total depth
+    /// - Adds ConstitAlt and ConstitDepth annotations to each variant
+    /// - Skips variants that already have both ConstitAlt and ConstitDepth annotations
+    /// - Handles insertions, deletions, and other alteration types differently
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// - It fails to open the BAM file
+    /// - There's an error while processing the variants or reading from the BAM file
     pub fn annotate_with_constit_bam(
         &self,
         annotations: &Annotations,
@@ -190,6 +368,42 @@ impl VariantCollection {
     }
 }
 
+/// Represents a consolidated genomic variant with associated information and annotations.
+///
+/// This struct encapsulates comprehensive data for a single genomic variant,
+/// including its unique identifier, genomic position, alleles, related VCF entries,
+/// and various annotations.
+///
+/// # Fields
+/// * `hash` - A unique `Hash128` identifier for the variant
+/// * `position` - The `GenomePosition` of the variant (0-based coordinates)
+/// * `reference` - The reference allele as a `ReferenceAlternative`
+/// * `alternative` - The alternative allele as a `ReferenceAlternative`
+/// * `vcf_variants` - A vector of `VcfVariant` instances associated with this variant
+/// * `annotations` - A vector of `Annotation` instances providing additional variant information
+///
+/// # Derives
+/// * `Debug` - For easier debugging and logging
+/// * `Serialize`, `Deserialize` - For serialization and deserialization (e.g., JSON)
+/// * `Clone` - For creating deep copies of the variant
+///
+/// # Usage
+/// This struct is central to variant analysis, providing a unified representation
+/// of a variant that may combine data from multiple VCF entries and include
+/// various annotations. It's useful for advanced variant processing, filtering,
+/// and reporting in genomic analysis pipelines.
+///
+/// # Example
+/// ```
+/// let variant = Variant {
+///     hash: Hash128::new(/* ... */),
+///     position: GenomePosition { contig: 1, position: 1000 },
+///     reference: ReferenceAlternative::new("A"),
+///     alternative: ReferenceAlternative::new("T"),
+///     vcf_variants: vec![/* VcfVariant instances */],
+///     annotations: vec![/* Annotation instances */],
+/// };
+/// ```
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct Variant {
     pub hash: Hash128,
@@ -232,17 +446,108 @@ impl Variant {
     }
 }
 
+/// A collection of genomic variants.
+///
+/// This struct represents a set of `Variant` instances, providing a container
+/// for multiple genomic variants.
+///
+/// # Fields
+/// * `data` - A vector of `Variant` instances
+///
+/// # Derives
+/// * `Debug` - For easier debugging and logging
+/// * `Default` - Allows creation of an empty `Variants` instance
+/// * `Serialize`, `Deserialize` - For serialization and deserialization (e.g., JSON)
+/// * `Clone` - For creating deep copies of the collection
+///
+/// # Usage
+/// This struct is typically used to group related variants, such as those from
+/// a single sample or analysis run. It provides a convenient way to handle and
+/// process multiple variants as a single unit.
+///
+/// # Example
+/// ```
+/// let mut variants = Variants::default();
+/// variants.data.push(Variant {
+///     // ... variant details ...
+/// });
+/// // Process or analyze the collection of variants
+/// ```
+///
+/// # Note
+/// The `Default` implementation creates an empty vector of variants.
 #[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct Variants {
     pub data: Vec<Variant>,
 }
 
 impl Variants {
+    /// Sorts the variants in-place based on their genomic positions.
+    ///
+    /// This method uses an unstable sort to order the variants in the collection
+    /// according to their genomic positions. The sorting is done in ascending order,
+    /// first by contig and then by position within the contig.
+    ///
+    /// # Effects
+    /// Modifies the order of variants in the `data` vector in-place.
+    ///
+    /// # Performance
+    /// Uses `sort_unstable_by` for better performance, with a time complexity of O(n log n),
+    /// where n is the number of variants.
+    ///
+    /// # Example
+    /// ```
+    /// let mut variants = Variants {
+    ///     data: vec![
+    ///         Variant { position: GenomePosition { contig: 0, position: 100 }, .. },
+    ///         Variant { position: GenomePosition { contig: 0, position: 50 }, .. },
+    ///         Variant { position: GenomePosition { contig: 1, position: 75 }, .. },
+    ///     ]
+    /// };
+    /// variants.sort();
+    /// // Variants are now sorted by position
+    /// ```
     pub fn sort(&mut self) {
         self.data
             .sort_unstable_by(|a, b| a.position.cmp(&b.position));
     }
 
+    /// Merges this Variants collection with another VariantCollection, combining overlapping variants.
+    ///
+    /// This method performs a merge operation, combining the current Variants with a VariantCollection.
+    /// It uses a two-pointer technique to efficiently merge the sorted collections, handling overlapping
+    /// variants and creating new variants as necessary.
+    ///
+    /// # Arguments
+    /// * `others` - A VariantCollection to merge with the current Variants
+    /// * `annotations` - A reference to Annotations used when creating new variants
+    ///
+    /// # Effects
+    /// * Modifies the current Variants in-place, replacing its data with the merged result
+    /// * Drains the data from the current Variants during the merge process
+    ///
+    /// # Behavior
+    /// * Variants are compared based on their genomic positions
+    /// * When positions are equal, variants with matching reference and alternative alleles are merged
+    /// * Non-matching variants at the same position are kept separate
+    /// * The method logs the number of merged variants
+    ///
+    /// # Performance
+    /// * Time complexity: O(n + m), where n and m are the sizes of the two collections
+    /// * Space complexity: O(n + m) for the merged result
+    ///
+    /// # Note
+    /// This method assumes that both the current Variants and the input VariantCollection are sorted
+    /// by genomic position. Use the `sort` method before merging if necessary.
+    ///
+    /// # Example
+    /// ```
+    /// let mut variants1 = Variants { /* ... */ };
+    /// let variants2 = VariantCollection { /* ... */ };
+    /// let annotations = Annotations::new();
+    /// variants1.merge(variants2, &annotations);
+    /// // variants1 now contains the merged data
+    /// ```
     pub fn merge(&mut self, others: VariantCollection, annotations: &Annotations) {
         let mut result = Vec::new();
         let mut n_merged = 0;
@@ -290,6 +595,73 @@ impl Variants {
         self.data = result;
     }
 
+    /// Filters and returns variants of a specific alteration category.
+    ///
+    /// This method creates a new vector containing clones of all variants
+    /// in the collection that match the specified alteration category.
+    ///
+    /// # Arguments
+    /// * `cat` - An `AlterationCategory` enum value specifying the category to filter by
+    ///
+    /// # Returns
+    /// A `Vec<Variant>` containing clones of all matching variants
+    ///
+    /// # Performance
+    /// * Time complexity: O(n), where n is the number of variants in the collection
+    /// * Space complexity: O(m), where m is the number of matching variants
+    ///
+    /// # Example
+    /// ```
+    /// let variants = Variants { /* ... */ };
+    /// let snvs = variants.get_alteration_cat(AlterationCategory::SNV);
+    /// println!("Found {} SNVs", snvs.len());
+    /// ```
+    ///
+    /// # Note
+    /// This method clones matching variants. If you need to process a large number of variants
+    /// and don't require ownership, consider implementing an iterator-based approach instead.
+    pub fn get_alteration_cat(&self, cat: AlterationCategory) -> Vec<Variant> {
+        self.data
+            .iter()
+            .filter(|v| v.alteration_category().contains(&cat))
+            .cloned()
+            .collect()
+    }
+
+    /// Saves the Variants collection to a BGZF-compressed JSON file.
+    ///
+    /// This method serializes the Variants struct to JSON format and writes it to a file
+    /// using BGZF (Blocked GNU Zip Format) compression.
+    ///
+    /// # Arguments
+    /// * `filename` - A string slice that holds the name of the file to write to
+    ///
+    /// # Returns
+    /// * `Ok(())` if the operation is successful
+    /// * `Err(anyhow::Error)` if any error occurs during file creation, serialization, or writing
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// * The file cannot be created
+    /// * The Variants struct cannot be serialized to JSON
+    /// * The BGZF writer cannot be closed properly
+    ///
+    /// # Performance
+    /// The time and space complexity depends on the size of the Variants collection
+    /// and the efficiency of the JSON serialization and BGZF compression.
+    ///
+    /// # Example
+    /// ```
+    /// let variants = Variants { /* ... */ };
+    /// match variants.save_to_json("output.json.gz") {
+    ///     Ok(()) => println!("Successfully saved variants"),
+    ///     Err(e) => eprintln!("Failed to save variants: {}", e),
+    /// }
+    /// ```
+    ///
+    /// # Note
+    /// This method uses BGZF compression, which is compatible with standard gzip decompression.
+    /// The resulting file can be read using standard gzip-aware tools.
     pub fn save_to_json(&self, filename: &str) -> anyhow::Result<()> {
         let file = File::create(filename)
             .with_context(|| format!("Failed to create file: {}", filename))?;
@@ -306,6 +678,39 @@ impl Variants {
         Ok(())
     }
 
+    /// Loads a Variants collection from a BGZF-compressed JSON file.
+    ///
+    /// This method reads a BGZF-compressed file, deserializes its JSON content,
+    /// and constructs a new Variants instance from it.
+    ///
+    /// # Arguments
+    /// * `filename` - A string slice that holds the name of the file to read from
+    ///
+    /// # Returns
+    /// * `Ok(Self)` containing the deserialized Variants if successful
+    /// * `Err(anyhow::Error)` if any error occurs during file opening, reading, or deserialization
+    ///
+    /// # Errors
+    /// This function will return an error if:
+    /// * The file cannot be opened
+    /// * A BGZF reader cannot be created for the file
+    /// * The file content cannot be deserialized into a Variants struct
+    ///
+    /// # Performance
+    /// The time and space complexity depends on the size of the input file
+    /// and the efficiency of the JSON deserialization and BGZF decompression.
+    ///
+    /// # Example
+    /// ```
+    /// match Variants::load_from_json("input.json.gz") {
+    ///     Ok(variants) => println!("Successfully loaded {} variants", variants.data.len()),
+    ///     Err(e) => eprintln!("Failed to load variants: {}", e),
+    /// }
+    /// ```
+    ///
+    /// # Note
+    /// This method expects the input file to be in BGZF-compressed JSON format,
+    /// typically created by the `save_to_json` method of this struct.
     pub fn load_from_json(filename: &str) -> anyhow::Result<Self> {
         let file =
             File::open(filename).with_context(|| format!("Failed to open file: {}", filename))?;
@@ -318,17 +723,40 @@ impl Variants {
         debug!("Successfully loaded variants from {}", filename);
         Ok(variants)
     }
-
-    pub fn get_alteration_cat(&self, cat: AlterationCategory) -> Vec<Variant> {
-        self
-            .data
-            .iter()
-            .filter(|v| v.alteration_category().contains(&cat))
-            .cloned()
-            .collect()
-    }
 }
 
+/// Creates a new Variant instance from a collection of VcfVariants and annotations.
+///
+/// This function consolidates information from one or more VcfVariants into a single Variant,
+/// using the first VcfVariant as the primary source for most fields. It also retrieves
+/// and includes relevant annotations.
+///
+/// # Arguments
+/// * `vcf_variants` - A vector of VcfVariant instances to be consolidated
+/// * `annotations` - A reference to an Annotations structure containing annotation data
+///
+/// # Returns
+/// A new Variant instance
+///
+/// # Behavior
+/// - Uses the first VcfVariant in the vector as the source for hash, position, reference, and alternative
+/// - Includes all provided VcfVariants in the new Variant
+/// - Retrieves annotations for the variant based on its hash
+/// - If no annotations are found, an empty vector is used
+///
+/// # Panics
+/// This function will panic if `vcf_variants` is empty.
+///
+/// # Example
+/// ```
+/// let vcf_variants = vec![VcfVariant { /* ... */ }];
+/// let annotations = Annotations::new();
+/// let variant = create_variant(vcf_variants, &annotations);
+/// ```
+///
+/// # Note
+/// This function assumes that all VcfVariants in the input vector represent the same genomic variant
+/// and should be consolidated. It's the caller's responsibility to ensure this is the case.
 fn create_variant(vcf_variants: Vec<VcfVariant>, annotations: &Annotations) -> Variant {
     let first = &vcf_variants[0];
     let annotations = annotations