Browse Source

doc positions

Thomas 9 months ago
parent
commit
a8717442ce
1 changed files with 364 additions and 10 deletions
  1. 364 10
      src/positions.rs

+ 364 - 10
src/positions.rs

@@ -1,4 +1,3 @@
-use core::panic;
 use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr};
 
 use anyhow::Context;
@@ -44,17 +43,66 @@ pub struct GenomePosition {
 }
 
 impl GenomePosition {
+    /// Returns the contig (chromosome) identifier as a human-readable string.
+    ///
+    /// This method converts the internal numeric representation of the contig
+    /// to a string format commonly used in genomics, such as "chr1", "chrX", etc...
+    ///
+    /// # Returns
+    /// A `String` representing the contig identifier.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let pos = GenomePosition { contig: 1, position: 1000 };
+    /// assert_eq!(pos.contig(), "chr1");
+    ///
+    /// let pos_x = GenomePosition { contig: 23, position: 500 };
+    /// assert_eq!(pos_x.contig(), "chrX");
+    /// ```
+    ///
+    /// # Note
+    /// This method uses the `num_to_contig` function to perform the conversion.
+    /// Special cases like X, Y chromosomes and mitochondrial DNA are handled appropriately.
     pub fn contig(&self) -> String {
         num_to_contig(self.contig)
     }
 }
 
+/// Implements the `Display` trait for `GenomePosition`.
+///
+/// This implementation allows a `GenomePosition` to be formatted as a string
+/// in the form "contig:position", where contig is the string representation
+/// of the chromosome (e.g., "chr1", "chrX") and position is the 0-based position.
+///
+/// # Example
+/// ```
+/// let pos = GenomePosition { contig: 1, position: 1000 };
+/// assert_eq!(format!("{}", pos), "chr1:1000");
+/// ```
 impl Display for GenomePosition {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}:{}", self.contig(), self.position)
     }
 }
 
+/// Implements the `FromStr` trait for `GenomePosition`.
+///
+/// This allows creating a `GenomePosition` from a string in the format "contig:position".
+///
+/// # Errors
+/// Returns an `anyhow::Error` if:
+/// - The input string doesn't contain a ':' character
+/// - The contig part can't be converted to a numeric representation
+/// - The position part can't be parsed as a `u32`
+///
+/// # Example
+/// ```
+/// use std::str::FromStr;
+/// let pos = GenomePosition::from_str("chr1:1000").unwrap();
+/// assert_eq!(pos.contig, 1);
+/// assert_eq!(pos.position, 1000);
+/// ```
 impl FromStr for GenomePosition {
     type Err = anyhow::Error;
 
@@ -69,6 +117,23 @@ impl FromStr for GenomePosition {
     }
 }
 
+/// Implements `TryFrom<(&str, &str)>` for `GenomePosition`.
+///
+/// This allows creating a `GenomePosition` from a tuple of two string slices,
+/// where the first represents the contig and the second represents the position.
+///
+/// # Errors
+/// Returns an `anyhow::Error` if:
+/// - The contig part can't be converted to a numeric representation
+/// - The position part can't be parsed as a `u32`
+///
+/// # Example
+/// ```
+/// use std::convert::TryFrom;
+/// let pos = GenomePosition::try_from(("chr1", "1000")).unwrap();
+/// assert_eq!(pos.contig, 1);
+/// assert_eq!(pos.position, 1000);
+/// ```
 impl TryFrom<(&str, &str)> for GenomePosition {
     type Error = anyhow::Error;
 
@@ -80,6 +145,22 @@ impl TryFrom<(&str, &str)> for GenomePosition {
     }
 }
 
+/// Implements conversion from `VcfPosition` to `GenomePosition`.
+///
+/// This conversion adjusts for the difference between 1-based VCF coordinates
+/// and 0-based genomic coordinates. It also handles the special case of
+/// position 0 in VCF, which represents a variant inside telomeres.
+///
+/// # Note
+/// This implementation assumes that `VcfPosition` uses 1-based coordinates.
+///
+/// # Example
+/// ```
+/// let vcf_pos = VcfPosition { contig: 1, position: 1001 };
+/// let genome_pos: GenomePosition = vcf_pos.into();
+/// assert_eq!(genome_pos.contig, 1);
+/// assert_eq!(genome_pos.position, 1000);
+/// ```
 impl From<VcfPosition> for GenomePosition {
     fn from(value: VcfPosition) -> Self {
         let position = if value.position == 0 {
@@ -95,18 +176,49 @@ impl From<VcfPosition> for GenomePosition {
     }
 }
 
-impl PartialOrd for GenomePosition {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
+/// Implements conversion from `GenomePosition` to `(String, String)`.
+///
+/// This conversion creates a tuple where the first element is the string
+/// representation of the contig (e.g., "chr1", "chrX") and the second
+/// element is the string representation of the position.
+///
+/// # Example
+/// ```
+/// let pos = GenomePosition { contig: 1, position: 1000 };
+/// let (contig, position): (String, String) = pos.into();
+/// assert_eq!(contig, "chr1");
+/// assert_eq!(position, "1000");
+/// ```
 impl From<GenomePosition> for (String, String) {
     fn from(val: GenomePosition) -> Self {
         (num_to_contig(val.contig), val.position.to_string())
     }
 }
 
+/// Implements `PartialOrd` for `GenomePosition`.
+///
+/// This implementation allows `GenomePosition` instances to be compared
+/// and ordered based first on their contig, then on their position.
+impl PartialOrd for GenomePosition {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Implements `Ord` for `GenomePosition`.
+///
+/// This implementation defines a total ordering for `GenomePosition` instances.
+/// Positions are first compared by their contig, and if the contigs are equal,
+/// then by their position.
+///
+/// # Example
+/// ```
+/// let pos1 = GenomePosition { contig: 1, position: 1000 };
+/// let pos2 = GenomePosition { contig: 1, position: 2000 };
+/// let pos3 = GenomePosition { contig: 2, position: 500 };
+/// assert!(pos1 < pos2);
+/// assert!(pos2 < pos3);
+/// ```
 impl Ord for GenomePosition {
     fn cmp(&self, other: &Self) -> Ordering {
         match self.contig.cmp(&other.contig) {
@@ -116,6 +228,30 @@ impl Ord for GenomePosition {
     }
 }
 
+/// Converts a contig (chromosome) identifier string to its numeric representation.
+///
+/// # Arguments
+/// * `contig` - A string slice representing the contig identifier (e.g., "chr1", "chrX")
+///
+/// # Returns
+/// A `u8` representing the numeric value of the contig:
+/// * 1-22 for autosomes
+/// * 23 for X chromosome
+/// * 24 for Y chromosome
+/// * 25 for mitochondrial DNA
+/// * `u8::MAX` (255) for any unrecognized input
+///
+/// # Examples
+/// ```
+/// assert_eq!(contig_to_num("chr1"), 1);
+/// assert_eq!(contig_to_num("chrX"), 23);
+/// assert_eq!(contig_to_num("chrM"), 25);
+/// assert_eq!(contig_to_num("chr_unknown"), u8::MAX);
+/// ```
+///
+/// # Note
+/// This function assumes the input string starts with "chr" for standard chromosomes.
+/// The "chr" prefix is trimmed before parsing numeric values.
 pub fn contig_to_num(contig: &str) -> u8 {
     match contig {
         "chrX" => 23,
@@ -125,6 +261,28 @@ pub fn contig_to_num(contig: &str) -> u8 {
     }
 }
 
+/// Converts a numeric contig (chromosome) representation to its string identifier.
+///
+/// # Arguments
+/// * `num` - A `u8` representing the numeric value of the contig
+///
+/// # Returns
+/// A `String` representing the contig identifier:
+/// * "chr1" to "chr22" for autosomes
+/// * "chrX" for 23
+/// * "chrY" for 24
+/// * "chrM" for 25
+/// * "chr{num}" for any other input
+///
+/// # Examples
+/// ```
+/// assert_eq!(num_to_contig(1), "chr1");
+/// assert_eq!(num_to_contig(23), "chrX");
+/// assert_eq!(num_to_contig(25), "chrM");
+/// ```
+///
+/// # Note
+/// This function always prepends "chr" to the output string.
 pub fn num_to_contig(num: u8) -> String {
     match num {
         23 => "chrX".to_string(),
@@ -134,13 +292,47 @@ pub fn num_to_contig(num: u8) -> String {
     }
 }
 
-// 1-based
+/// Represents a position in a VCF (Variant Call Format) file using 1-based coordinates.
+///
+/// This struct encapsulates a genomic position as represented in VCF files, consisting of a contig (chromosome)
+/// identifier and a position within that contig.
+///
+/// # Fields
+/// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
+/// * `position` - A 32-bit unsigned integer representing the 1-based position within the contig
+///
+/// # Derives
+/// This struct derives the following traits:
+/// * `Debug` for easier debugging and logging
+/// * `PartialEq` and `Eq` for equality comparisons
+/// * `Clone` for creating deep copies
+/// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
+///
+/// # Note
+/// The position is 1-based, as per VCF file format specifications.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct VcfPosition {
     pub contig: u8,
     pub position: u32,
 }
 
+/// Implements `TryFrom<(&str, &str)>` for `VcfPosition`.
+///
+/// This allows creating a `VcfPosition` from a tuple of two string slices,
+/// where the first represents the contig and the second represents the position.
+///
+/// # Errors
+/// Returns an `anyhow::Error` if:
+/// - The contig part can't be converted to a numeric representation
+/// - The position part can't be parsed as a `u32`
+///
+/// # Example
+/// ```
+/// use std::convert::TryFrom;
+/// let pos = VcfPosition::try_from(("chr1", "1000")).unwrap();
+/// assert_eq!(pos.contig, 1);
+/// assert_eq!(pos.position, 1000);
+/// ```
 impl TryFrom<(&str, &str)> for VcfPosition {
     type Error = anyhow::Error;
 
@@ -152,6 +344,18 @@ impl TryFrom<(&str, &str)> for VcfPosition {
     }
 }
 
+/// Implements conversion from `GenomePosition` to `VcfPosition`.
+///
+/// This conversion adjusts for the difference between 0-based genomic coordinates
+/// and 1-based VCF coordinates.
+///
+/// # Example
+/// ```
+/// let genome_pos = GenomePosition { contig: 1, position: 999 };
+/// let vcf_pos: VcfPosition = genome_pos.into();
+/// assert_eq!(vcf_pos.contig, 1);
+/// assert_eq!(vcf_pos.position, 1000);
+/// ```
 impl From<GenomePosition> for VcfPosition {
     fn from(value: GenomePosition) -> Self {
         Self {
@@ -161,19 +365,69 @@ impl From<GenomePosition> for VcfPosition {
     }
 }
 
+/// Implements conversion from `VcfPosition` to `(String, String)`.
+///
+/// This conversion creates a tuple where the first element is the string
+/// representation of the contig (e.g., "chr1", "chrX") and the second
+/// element is the string representation of the position.
+///
+/// # Example
+/// ```
+/// let pos = VcfPosition { contig: 1, position: 1000 };
+/// let (contig, position): (String, String) = pos.into();
+/// assert_eq!(contig, "chr1");
+/// assert_eq!(position, "1000");
+/// ```
 impl From<VcfPosition> for (String, String) {
     fn from(val: VcfPosition) -> Self {
         (num_to_contig(val.contig), val.position.to_string())
     }
 }
 
-// [0;n[
+/// Represents a range in the genome using 0-based, half-open coordinates [start, end).
+///
+/// This struct encapsulates a genomic range, consisting of a contig (chromosome)
+/// identifier and a range of positions within that contig.
+///
+/// # Fields
+/// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
+/// * `range` - A `Range<u32>` representing the start (inclusive) and end (exclusive) positions
+///
+/// # Derives
+/// This struct derives the following traits:
+/// * `Debug` for easier debugging and logging
+/// * `PartialEq` and `Eq` for equality comparisons
+/// * `Clone` for creating deep copies
+/// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
+///
+/// # Note
+/// The range is 0-based and half-open, meaning the start position is included but the end position is not.
+/// This is consistent with many programming languages and bioinformatics tools.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct GenomeRange {
     pub contig: u8,
     pub range: Range<u32>,
 }
 
+/// Implements `TryFrom<(&str, &str, &str)>` for `GenomeRange`.
+///
+/// This allows creating a `GenomeRange` from a tuple of three string slices,
+/// where the first represents the contig, the second the start position,
+/// and the third the end position.
+///
+/// # Errors
+/// Returns an `anyhow::Error` if:
+/// - The contig part can't be converted to a numeric representation
+/// - Either the start or end position can't be parsed as a `u32`
+///
+/// # Example
+/// ```
+/// use std::convert::TryFrom;
+/// let range = GenomeRange::try_from(("chr1", "1000", "2000")).unwrap();
+/// assert_eq!(range.contig, 1);
+/// assert_eq!(range.range.start, 1000);
+/// assert_eq!(range.range.end, 2000);
+/// ```
 impl TryFrom<(&str, &str, &str)> for GenomeRange {
     type Error = anyhow::Error;
 
@@ -189,6 +443,31 @@ impl TryFrom<(&str, &str, &str)> for GenomeRange {
 }
 
 impl GenomeRange {
+    /// Creates a `GenomeRange` from 1-based inclusive start and end positions.
+    ///
+    /// This method is useful when working with data sources that use 1-based coordinates
+    /// and inclusive ranges. It converts the input to the 0-based, half-open format used by `GenomeRange`.
+    ///
+    /// # Arguments
+    /// * `contig` - A string slice representing the contig/chromosome
+    /// * `start` - A string slice representing the 1-based inclusive start position
+    /// * `end` - A string slice representing the 1-based inclusive end position
+    ///
+    /// # Returns
+    /// A `Result` containing the `GenomeRange` if successful, or an `anyhow::Error` if parsing fails.
+    ///
+    /// # Errors
+    /// Returns an `anyhow::Error` if:
+    /// - The contig can't be converted to a numeric representation
+    /// - Either the start or end position can't be parsed as a `u32`
+    ///
+    /// # Example
+    /// ```
+    /// let range = GenomeRange::from_1_inclusive("chr1", "1000", "2000").unwrap();
+    /// assert_eq!(range.contig, 1);
+    /// assert_eq!(range.range.start, 999);  // Converted to 0-based
+    /// assert_eq!(range.range.end, 2000);   // End is exclusive in 0-based
+    /// ```
     pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
         Ok(Self {
             contig: contig_to_num(contig),
@@ -197,12 +476,44 @@ impl GenomeRange {
                     .parse::<u32>()
                     .context(format!("Can't parse {start} into u32."))?
                     - 1,
-                end: end.parse().context(format!("Can't parse {end} into u32."))?,
+                end: end
+                    .parse()
+                    .context(format!("Can't parse {end} into u32."))?,
             },
         })
     }
 }
 
+/// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
+///
+/// This function uses Rayon for parallel processing, dividing the positions into chunks
+/// and processing them concurrently.
+///
+/// # Arguments
+/// * `positions` - A slice of references to `GenomePosition`s
+/// * `ranges` - A slice of references to `GenomeRange`s
+///
+/// # Returns
+/// A `Vec<usize>` containing the indices of positions that overlap with any range.
+///
+/// # Performance
+/// This function is optimized for scenarios where both `positions` and `ranges` are sorted
+/// by contig and position/range start. It uses this ordering to avoid unnecessary comparisons.
+///
+/// # Example
+/// ```
+/// let positions = vec![
+///     &GenomePosition { contig: 1, position: 100 },
+///     &GenomePosition { contig: 1, position: 200 },
+///     &GenomePosition { contig: 2, position: 150 },
+/// ];
+/// let ranges = vec![
+///     &GenomeRange { contig: 1, range: 50..150 },
+///     &GenomeRange { contig: 2, range: 100..200 },
+/// ];
+/// let overlaps = overlaps_par(&positions, &ranges);
+/// assert_eq!(overlaps, vec!);
+/// ```
 pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec<usize> {
     let chunk_size = (positions.len() / rayon::current_num_threads()).max(1);
 
@@ -239,16 +550,59 @@ pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> V
         .collect()
 }
 
+/// A parallel version of the overlap finding function that works with any types
+/// implementing `GetGenomePosition` and `GetGenomeRange` traits.
+///
+/// This function first converts the input slices to vectors of `GenomePosition` and `GenomeRange`,
+/// then calls `overlaps_par`.
+///
+/// # Arguments
+/// * `a` - A slice of items implementing `GetGenomePosition`
+/// * `b` - A slice of items implementing `GetGenomeRange`
+///
+/// # Returns
+/// A `Vec<usize>` containing the indices of positions that overlap with any range.
+///
+/// # Example
+/// ```
+/// struct MyPosition(GenomePosition);
+/// impl GetGenomePosition for MyPosition {
+///     fn position(&self) -> &GenomePosition { &self.0 }
+/// }
+///
+/// struct MyRange(GenomeRange);
+/// impl GetGenomeRange for MyRange {
+///     fn range(&self) -> &GenomeRange { &self.0 }
+/// }
+///
+/// let positions = vec![
+///     MyPosition(GenomePosition { contig: 1, position: 100 }),
+///     MyPosition(GenomePosition { contig: 1, position: 200 }),
+/// ];
+/// let ranges = vec![
+///     MyRange(GenomeRange { contig: 1, range: 50..150 }),
+/// ];
+/// let overlaps = par_overlaps(&positions, &ranges);
+/// assert_eq!(overlaps, vec!);
+/// ```
 pub fn par_overlaps(a: &[impl GetGenomePosition], b: &[impl GetGenomeRange]) -> Vec<usize> {
     let a: Vec<_> = a.iter().map(|e| e.position()).collect();
     let b: Vec<_> = b.iter().map(|e| e.range()).collect();
     overlaps_par(&a, &b)
 }
 
+/// A trait for types that can provide a reference to a `GenomePosition`.
+///
+/// Implement this trait for types that contain or can compute a `GenomePosition`.
 pub trait GetGenomePosition {
+    // Returns a reference to the `GenomePosition` for this item.
     fn position(&self) -> &GenomePosition;
 }
 
+/// A trait for types that can provide a reference to a `GenomeRange`.
+///
+/// Implement this trait for types that contain or can compute a `GenomeRange`.
 pub trait GetGenomeRange {
+    /// Returns a reference to the `GenomeRange` for this item.
     fn range(&self) -> &GenomeRange;
 }