Browse Source

vcf_variants

Thomas 1 day ago
parent
commit
116b2d7111

+ 1 - 1
src/annotation/mod.rs

@@ -18,7 +18,7 @@ use std::{
 
 use crate::{
     helpers::{mean, Blake3BuildHasher, Hash128},
-    variant::{variant::AlterationCategory, variant_collection::VariantCollection},
+    variant::{vcf_variant::AlterationCategory, variant_collection::VariantCollection},
 };
 use bitcode::{Decode, Encode};
 use cosmic::Cosmic;

+ 4 - 4
src/callers/clairs.rs

@@ -120,7 +120,7 @@
 //!
 //! ```ignore
 //! use crate::annotation::Annotations;
-//! use crate::variant::variant::Variants;
+//! use crate::variant::vcf_variant::Variants;
 //!
 //! let annotations = Annotations::new();
 //! let somatic = clairs.variants(&annotations)?;
@@ -150,9 +150,9 @@
 //! - [`LocalRunner`](crate::commands::LocalRunner) — Local execution support
 //! - [`SbatchRunner`](crate::commands::SbatchRunner) — Slurm job submission
 //! - [`Run`](crate::runners::Run) — Unified execution interface
-//! - [`Variants`](crate::variant::variant::Variants) — Somatic variant loading
+//! - [`Variants`](crate::variant::vcf_variant::Variants) — Somatic variant loading
 //! - [`CallerCat`](crate::annotation::CallerCat) — Caller annotation category
-//! - [`Label`](crate::variant::variant::Label) — Human-readable identifier
+//! - [`Label`](crate::variant::vcf_variant::Label) — Human-readable identifier
 //! - [`Version`](crate::pipes::Version) — Tool version extraction
 //!
 //! ## Dependencies
@@ -185,7 +185,7 @@ use crate::{
     run, run_many,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/callers/deep_somatic.rs

@@ -111,7 +111,7 @@ use crate::{
     run, run_many,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/callers/deep_variant.rs

@@ -113,7 +113,7 @@ use crate::{
     run, run_many,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/callers/nanomonsv.rs

@@ -91,7 +91,7 @@ use crate::{
     run,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/callers/savana.rs

@@ -84,7 +84,7 @@ use crate::{
     run,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/callers/severus.rs

@@ -90,7 +90,7 @@ use crate::{
     run,
     runners::Run,
     variant::{
-        variant::{Label, Variants},
+        vcf_variant::{Label, Variants},
         variant_collection::VariantCollection,
     },
 };

+ 1 - 1
src/io/vcf.rs

@@ -7,7 +7,7 @@ use anyhow::Context;
 use bgzip::{write::BGZFMultiThreadWriter, Compression};
 use log::{info, warn};
 
-use crate::variant::variant::VcfVariant;
+use crate::variant::vcf_variant::VcfVariant;
 
 use super::{dict::read_dict, readers::get_reader};
 

+ 2 - 2
src/lib.rs

@@ -173,7 +173,7 @@ mod tests {
     use log::{error, info};
     use positions::{overlaps_par, GenomePosition, GenomeRange};
     use rayon::prelude::*;
-    use variant::{variant::VcfVariant, variant_collection};
+    use variant::{vcf_variant::VcfVariant, variant_collection};
 
     use self::{/* collection::pod5::{FlowCellCase, Pod5Collection}, */ config::Config};
     use super::*;
@@ -189,7 +189,7 @@ mod tests {
         positions::{merge_overlapping_genome_ranges, range_intersection_par, sort_ranges},
         scan::scan::somatic_scan,
         variant::{
-            variant::{AlterationCategory, BNDDesc, BNDGraph, ToBNDGraph},
+            vcf_variant::{AlterationCategory, BNDDesc, BNDGraph, ToBNDGraph},
             variant_collection::{
                 group_variants_by_bnd_desc, group_variants_by_bnd_rc, VariantCollection,
             },

+ 1 - 1
src/scan/scan.rs

@@ -16,7 +16,7 @@ use crate::math::filter_outliers_modified_z_score_with_indices;
 
 use crate::pipes::{Initialize, ShouldRun};
 use crate::runners::Run;
-use crate::variant::variant::Label;
+use crate::variant::vcf_variant::Label;
 use crate::{config::Config, io::dict::read_dict, scan::bin::Bin};
 
 /// Represents a count of reads in a genomic bin, including various metrics and outlier information.

+ 42 - 1
src/variant/mod.rs

@@ -1,4 +1,45 @@
-pub mod variant;
+//! # Variant Representation and Analysis
+//!
+//! This module provides types for representing genomic variants parsed from VCF
+//! (Variant Call Format) files, collecting them, and computing statistics on them.
+//!
+//! ## Module Organization
+//!
+//! - **[vcf_variant]** - Core VCF variant types, parsing, and structural variant handling
+//! - **[variant_collection]** - Collections and aggregations of variants with annotation support
+//! - **[variants_stats]** - Statistical analysis and filtering of variant collections
+//!
+//! ## Primary Types
+//!
+//! - [`VcfVariant`](vcf_variant::VcfVariant) - Represents a single VCF variant (SNV, indel, or SV)
+//! - [`VariantCollection`](variant_collection::VariantCollection) - Aggregated variant data with annotations
+//! - [`Infos`](vcf_variant::Infos) - Parsed VCF INFO field data
+//! - [`Formats`](vcf_variant::Formats) - Parsed VCF FORMAT field data
+//!
+//! ## Key Traits
+//!
+//! - [`Variants`](vcf_variant::Variants) - Load variants from caller outputs
+//! - [`Label`](vcf_variant::Label) - Human-readable caller identifiers
+//! - [`VariantId`](vcf_variant::VariantId) - Unique variant identification
+//!
+//! ## Usage
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::{
+//!     vcf_variant::VcfVariant,
+//!     variant_collection::VariantCollection,
+//! };
+//! use pandora_lib_promethion::io::vcf::read_vcf;
+//!
+//! // Read VCF file
+//! let variants: Vec<VcfVariant> = read_vcf("sample.vcf.gz")?;
+//!
+//! // Create variant collection (`annotations` must already be in scope)
+//! let collection = VariantCollection::from_variants(variants, &annotations)?;
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+
+pub mod vcf_variant;
 pub mod variant_collection;
 pub mod variants_stats;
 

+ 1 - 1
src/variant/variant_collection.rs

@@ -15,7 +15,7 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 
-use super::variant::{
+use super::vcf_variant::{
     AlterationCategory, BNDDesc, Formats, Info, Infos, ReferenceAlternative, VcfVariant,
 };
 use crate::{

+ 108 - 0
src/variant/variant.rs → src/variant/vcf_variant.rs

@@ -1,3 +1,111 @@
+//! # VCF Variant Types and Parsing
+//!
+//! This module provides core types for representing and parsing VCF (Variant Call Format)
+//! variants, including SNVs, indels, and structural variants (SVs).
+//!
+//! ## Core Types
+//!
+//! ### Variant Representation
+//!
+//! - [`VcfVariant`] - Main VCF variant struct with genomic position, alleles, quality, filters, and annotations
+//! - [`ReferenceAlternative`] - Reference or alternative allele sequences
+//! - [`Filter`] - VCF FILTER field values (PASS, LowQual, etc.)
+//! - [`AlterationCategory`] - High-level variant classification (SNV, Insertion, Deletion, SV, etc.)
+//! - [`SVType`] - Structural variant types (DEL, INS, DUP, INV, BND, CNV)
+//!
+//! ### Structural Variants
+//!
+//! - [`BNDDesc`] - Breakend (BND) description for translocations and complex rearrangements
+//! - [`BNDGraph`] - Graph structure for analyzing breakend connections
+//! - [`DeletionDesc`] - Deletion-specific metadata
+//!
+//! ### VCF Fields
+//!
+//! - [`Infos`] / [`Info`] - Parsed INFO field key-value pairs with typed values
+//! - [`Formats`] / [`Format`] - Parsed FORMAT and sample genotype fields
+//!
+//! ## Key Traits
+//!
+//! - [`Variants`] - Load variants from variant caller outputs
+//! - [`Label`] - Provide human-readable caller labels
+//! - [`VariantId`] - Generate unique variant identifiers
+//! - [`GroupByThreshold`] - Group breakends by genomic proximity
+//! - [`ToBNDGraph`] - Convert breakend lists to graph representations
+//!
+//! ## Parsing
+//!
+//! VCF variants can be parsed from tab-separated strings using `FromStr`:
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::vcf_variant::VcfVariant;
+//! use std::str::FromStr;
+//!
+//! let vcf_line = "chr1\t1000\t.\tA\tT\t30.0\tPASS\tDP=50\tGT:AD:DP\t0/1:25,25:50";
+//! let variant = VcfVariant::from_str(vcf_line)?;
+//!
+//! assert_eq!(variant.position.contig, "chr1");
+//! assert_eq!(variant.position.position, 1000);
+//! assert_eq!(variant.reference.seq, "A");
+//! assert_eq!(variant.alternative.seq, "T");
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ## Structural Variant Analysis
+//!
+//! Breakends (BND) can be grouped by proximity and analyzed as graphs:
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::vcf_variant::{BNDDesc, GroupByThreshold, ToBNDGraph};
+//!
+//! let breakends: Vec<BNDDesc> = variants
+//!     .into_iter()
+//!     .filter_map(|v| v.to_bnd_desc().ok())
+//!     .collect();
+//!
+//! // Group nearby breakends (within 1000 bp)
+//! let groups = breakends.group_by_threshold(1000);
+//!
+//! // Convert to graph for complex SV analysis
+//! let graph = breakends.to_bnd_graph();
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ## Variant Classification
+//!
+//! Each variant can be classified into an [`AlterationCategory`] with `alteration_category()`:
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::vcf_variant::AlterationCategory;
+//!
+//! match variant.alteration_category() {
+//!     AlterationCategory::SNV => println!("Single nucleotide variant"),
+//!     AlterationCategory::Insertion => println!("Insertion"),
+//!     AlterationCategory::Deletion => println!("Deletion"),
+//!     AlterationCategory::SV(sv_type) => println!("Structural variant: {}", sv_type),
+//!     _ => {}
+//! }
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+//!
+//! ## INFO and FORMAT Field Access
+//!
+//! INFO and FORMAT fields are parsed into typed enums for safe access:
+//!
+//! ```ignore
+//! use pandora_lib_promethion::variant::vcf_variant::Info;
+//!
+//! // Access INFO fields
+//! if let Some(Info::DP(depth)) = variant.infos.get_key("DP") {
+//!     println!("Read depth: {}", depth);
+//! }
+//!
+//! // Access FORMAT fields
+//! if let Some(genotype) = variant.formats.get_genotype() {
+//!     println!("Genotype: {}", genotype);
+//! }
+//! # Ok::<(), anyhow::Error>(())
+//! ```
+
 use crate::{
     annotation::Annotations,
     helpers::{estimate_shannon_entropy, mean, Hash128},

+ 1 - 1
src/vcf_reader.rs

@@ -3,7 +3,7 @@ use std::{fs::File, io::BufReader};
 use csv::ReaderBuilder;
 use pandora_lib_variants::variants::Variant;
 
-use crate::{callers::Caller, variant::variant::VariantType};
+use crate::{callers::Caller, variant::vcf_variant::VariantType};
 
 #[derive(Debug, serde::Deserialize, Eq, PartialEq, Clone)]
 pub struct VCFRow {