use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr}; use anyhow::Context; use rayon::prelude::*; use serde::{Deserialize, Serialize}; /// Represents a position in the genome using 0-based coordinates. /// /// This struct encapsulates a genomic position, consisting of a contig (chromosome) /// identifier and a position within that contig. /// /// # Fields /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome /// * `position` - A 32-bit unsigned integer representing the 0-based position within the contig /// /// # Derives /// This struct derives the following traits: /// * `Debug` for easier debugging and logging /// * `PartialEq` and `Eq` for equality comparisons /// * `Clone` for creating deep copies /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats) /// * `Hash` for use in hash-based collections /// * `Default` for creating a default instance /// /// # Note /// The position is 0-based, meaning the first position in a contig is represented as 0. /// /// # Usage /// This struct is typically used to precisely locate genomic features or variants /// within a reference genome. /// /// # Example /// ``` /// let position = GenomePosition { /// contig: 1, // Representing chromosome 1 /// position: 1000, // 1001st base pair in the chromosome (remember it's 0-based) /// }; /// ``` #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default)] pub struct GenomePosition { pub contig: u8, pub position: u32, } impl GenomePosition { /// Returns the contig (chromosome) identifier as a human-readable string. /// /// This method converts the internal numeric representation of the contig /// to a string format commonly used in genomics, such as "chr1", "chrX", etc... /// /// # Returns /// A `String` representing the contig identifier. /// /// # Examples /// /// ``` /// let pos = GenomePosition { contig: 1, position: 1000 }; /// assert_eq!(pos.contig(), "chr1"); /// /// let pos_x = GenomePosition { contig: 23, position: 500 }; /// assert_eq!(pos_x.contig(), "chrX"); /// ``` /// /// # Note /// This method uses the `num_to_contig` function to perform the conversion. /// Special cases like X, Y chromosomes and mitochondrial DNA are handled appropriately. pub fn contig(&self) -> String { num_to_contig(self.contig) } } /// Implements the `Display` trait for `GenomePosition`. /// /// This implementation allows a `GenomePosition` to be formatted as a string /// in the form "contig:position", where contig is the string representation /// of the chromosome (e.g., "chr1", "chrX") and position is the 0-based position. /// /// # Example /// ``` /// let pos = GenomePosition { contig: 1, position: 1000 }; /// assert_eq!(format!("{}", pos), "chr1:1000"); /// ``` impl Display for GenomePosition { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}:{}", self.contig(), self.position) } } /// Implements the `FromStr` trait for `GenomePosition`. /// /// This allows creating a `GenomePosition` from a string in the format "contig:position". /// /// # Errors /// Returns an `anyhow::Error` if: /// - The input string doesn't contain a ':' character /// - The contig part can't be converted to a numeric representation /// - The position part can't be parsed as a `u32` /// /// # Example /// ``` /// use std::str::FromStr; /// let pos = GenomePosition::from_str("chr1:1000").unwrap(); /// assert_eq!(pos.contig, 1); /// assert_eq!(pos.position, 1000); /// ``` impl FromStr for GenomePosition { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { let (c, p) = s .split_once(":") .ok_or(anyhow::anyhow!("Can't split {s}"))?; Ok(Self { contig: contig_to_num(c), position: p.parse()?, }) } } /// Implements `TryFrom<(&str, &str)>` for `GenomePosition`. /// /// This allows creating a `GenomePosition` from a tuple of two string slices, /// where the first represents the contig and the second represents the position. /// /// # Errors /// Returns an `anyhow::Error` if: /// - The contig part can't be converted to a numeric representation /// - The position part can't be parsed as a `u32` /// /// # Example /// ``` /// use std::convert::TryFrom; /// let pos = GenomePosition::try_from(("chr1", "1000")).unwrap(); /// assert_eq!(pos.contig, 1); /// assert_eq!(pos.position, 1000); /// ``` impl TryFrom<(&str, &str)> for GenomePosition { type Error = anyhow::Error; fn try_from((c, p): (&str, &str)) -> anyhow::Result { Ok(GenomePosition { contig: contig_to_num(c), position: p.parse().context(format!("Can't parse {p} into u32."))?, }) } } /// Implements conversion from `VcfPosition` to `GenomePosition`. /// /// This conversion adjusts for the difference between 1-based VCF coordinates /// and 0-based genomic coordinates. It also handles the special case of /// position 0 in VCF, which represents a variant inside telomeres. /// /// # Note /// This implementation assumes that `VcfPosition` uses 1-based coordinates. /// /// # Example /// ``` /// let vcf_pos = VcfPosition { contig: 1, position: 1001 }; /// let genome_pos: GenomePosition = vcf_pos.into(); /// assert_eq!(genome_pos.contig, 1); /// assert_eq!(genome_pos.position, 1000); /// ``` impl From for GenomePosition { fn from(value: VcfPosition) -> Self { let position = if value.position == 0 { // position 0 is a variant inside telomeres 0 } else { value.position - 1 }; Self { contig: value.contig, position, } } } /// Implements conversion from `GenomePosition` to `(String, String)`. /// /// This conversion creates a tuple where the first element is the string /// representation of the contig (e.g., "chr1", "chrX") and the second /// element is the string representation of the position. /// /// # Example /// ``` /// let pos = GenomePosition { contig: 1, position: 1000 }; /// let (contig, position): (String, String) = pos.into(); /// assert_eq!(contig, "chr1"); /// assert_eq!(position, "1000"); /// ``` impl From for (String, String) { fn from(val: GenomePosition) -> Self { (num_to_contig(val.contig), val.position.to_string()) } } /// Implements `PartialOrd` for `GenomePosition`. /// /// This implementation allows `GenomePosition` instances to be compared /// and ordered based first on their contig, then on their position. impl PartialOrd for GenomePosition { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } /// Implements `Ord` for `GenomePosition`. /// /// This implementation defines a total ordering for `GenomePosition` instances. /// Positions are first compared by their contig, and if the contigs are equal, /// then by their position. /// /// # Example /// ``` /// let pos1 = GenomePosition { contig: 1, position: 1000 }; /// let pos2 = GenomePosition { contig: 1, position: 2000 }; /// let pos3 = GenomePosition { contig: 2, position: 500 }; /// assert!(pos1 < pos2); /// assert!(pos2 < pos3); /// ``` impl Ord for GenomePosition { fn cmp(&self, other: &Self) -> Ordering { match self.contig.cmp(&other.contig) { Ordering::Equal => self.position.cmp(&other.position), other => other, } } } /// Converts a contig (chromosome) identifier string to its numeric representation. /// /// # Arguments /// * `contig` - A string slice representing the contig identifier (e.g., "chr1", "chrX") /// /// # Returns /// A `u8` representing the numeric value of the contig: /// * 1-22 for autosomes /// * 23 for X chromosome /// * 24 for Y chromosome /// * 25 for mitochondrial DNA /// * `u8::MAX` (255) for any unrecognized input /// /// # Examples /// ``` /// assert_eq!(contig_to_num("chr1"), 1); /// assert_eq!(contig_to_num("chrX"), 23); /// assert_eq!(contig_to_num("chrM"), 25); /// assert_eq!(contig_to_num("chr_unknown"), u8::MAX); /// ``` /// /// # Note /// This function assumes the input string starts with "chr" for standard chromosomes. /// The "chr" prefix is trimmed before parsing numeric values. pub fn contig_to_num(contig: &str) -> u8 { match contig { "chrX" => 23, "chrY" => 24, "chrM" => 25, c => c.trim_start_matches("chr").parse().unwrap_or(u8::MAX), } } /// Converts a numeric contig (chromosome) representation to its string identifier. /// /// # Arguments /// * `num` - A `u8` representing the numeric value of the contig /// /// # Returns /// A `String` representing the contig identifier: /// * "chr1" to "chr22" for autosomes /// * "chrX" for 23 /// * "chrY" for 24 /// * "chrM" for 25 /// * "chr{num}" for any other input /// /// # Examples /// ``` /// assert_eq!(num_to_contig(1), "chr1"); /// assert_eq!(num_to_contig(23), "chrX"); /// assert_eq!(num_to_contig(25), "chrM"); /// ``` /// /// # Note /// This function always prepends "chr" to the output string. pub fn num_to_contig(num: u8) -> String { match num { 23 => "chrX".to_string(), 24 => "chrY".to_string(), 25 => "chrM".to_string(), c => format!("chr{c}"), } } /// Represents a position in a VCF (Variant Call Format) file using 1-based coordinates. /// /// This struct encapsulates a genomic position as represented in VCF files, consisting of a contig (chromosome) /// identifier and a position within that contig. /// /// # Fields /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome /// * `position` - A 32-bit unsigned integer representing the 1-based position within the contig /// /// # Derives /// This struct derives the following traits: /// * `Debug` for easier debugging and logging /// * `PartialEq` and `Eq` for equality comparisons /// * `Clone` for creating deep copies /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats) /// /// # Note /// The position is 1-based, as per VCF file format specifications. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct VcfPosition { pub contig: u8, pub position: u32, } /// Implements `TryFrom<(&str, &str)>` for `VcfPosition`. /// /// This allows creating a `VcfPosition` from a tuple of two string slices, /// where the first represents the contig and the second represents the position. /// /// # Errors /// Returns an `anyhow::Error` if: /// - The contig part can't be converted to a numeric representation /// - The position part can't be parsed as a `u32` /// /// # Example /// ``` /// use std::convert::TryFrom; /// let pos = VcfPosition::try_from(("chr1", "1000")).unwrap(); /// assert_eq!(pos.contig, 1); /// assert_eq!(pos.position, 1000); /// ``` impl TryFrom<(&str, &str)> for VcfPosition { type Error = anyhow::Error; fn try_from((c, p): (&str, &str)) -> anyhow::Result { Ok(VcfPosition { contig: contig_to_num(c), position: p.parse().context(format!("Can't parse {p} into u32."))?, }) } } /// Implements conversion from `GenomePosition` to `VcfPosition`. /// /// This conversion adjusts for the difference between 0-based genomic coordinates /// and 1-based VCF coordinates. /// /// # Example /// ``` /// let genome_pos = GenomePosition { contig: 1, position: 999 }; /// let vcf_pos: VcfPosition = genome_pos.into(); /// assert_eq!(vcf_pos.contig, 1); /// assert_eq!(vcf_pos.position, 1000); /// ``` impl From for VcfPosition { fn from(value: GenomePosition) -> Self { Self { contig: value.contig, position: value.position + 1, } } } /// Implements conversion from `VcfPosition` to `(String, String)`. /// /// This conversion creates a tuple where the first element is the string /// representation of the contig (e.g., "chr1", "chrX") and the second /// element is the string representation of the position. /// /// # Example /// ``` /// let pos = VcfPosition { contig: 1, position: 1000 }; /// let (contig, position): (String, String) = pos.into(); /// assert_eq!(contig, "chr1"); /// assert_eq!(position, "1000"); /// ``` impl From for (String, String) { fn from(val: VcfPosition) -> Self { (num_to_contig(val.contig), val.position.to_string()) } } /// Represents a range in the genome using 0-based, half-open coordinates [start, end). /// /// This struct encapsulates a genomic range, consisting of a contig (chromosome) /// identifier and a range of positions within that contig. /// /// # Fields /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome /// * `range` - A `Range` representing the start (inclusive) and end (exclusive) positions /// /// # Derives /// This struct derives the following traits: /// * `Debug` for easier debugging and logging /// * `PartialEq` and `Eq` for equality comparisons /// * `Clone` for creating deep copies /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats) /// /// # Note /// The range is 0-based and half-open, meaning the start position is included but the end position is not. /// This is consistent with many programming languages and bioinformatics tools. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct GenomeRange { pub contig: u8, pub range: Range, } /// Implements `TryFrom<(&str, &str, &str)>` for `GenomeRange`. /// /// This allows creating a `GenomeRange` from a tuple of three string slices, /// where the first represents the contig, the second the start position, /// and the third the end position. /// /// # Errors /// Returns an `anyhow::Error` if: /// - The contig part can't be converted to a numeric representation /// - Either the start or end position can't be parsed as a `u32` /// /// # Example /// ``` /// use std::convert::TryFrom; /// let range = GenomeRange::try_from(("chr1", "1000", "2000")).unwrap(); /// assert_eq!(range.contig, 1); /// assert_eq!(range.range.start, 1000); /// assert_eq!(range.range.end, 2000); /// ``` impl TryFrom<(&str, &str, &str)> for GenomeRange { type Error = anyhow::Error; fn try_from((c, s, e): (&str, &str, &str)) -> anyhow::Result { Ok(Self { contig: contig_to_num(c), range: Range { start: s.parse().context(format!("Can't parse {s} into u32."))?, end: e.parse().context(format!("Can't parse {e} into u32."))?, }, }) } } impl GenomeRange { /// Creates a `GenomeRange` from 1-based inclusive start and end positions. /// /// This method is useful when working with data sources that use 1-based coordinates /// and inclusive ranges. It converts the input to the 0-based, half-open format used by `GenomeRange`. /// /// # Arguments /// * `contig` - A string slice representing the contig/chromosome /// * `start` - A string slice representing the 1-based inclusive start position /// * `end` - A string slice representing the 1-based inclusive end position /// /// # Returns /// A `Result` containing the `GenomeRange` if successful, or an `anyhow::Error` if parsing fails. /// /// # Errors /// Returns an `anyhow::Error` if: /// - The contig can't be converted to a numeric representation /// - Either the start or end position can't be parsed as a `u32` /// /// # Example /// ``` /// let range = GenomeRange::from_1_inclusive("chr1", "1000", "2000").unwrap(); /// assert_eq!(range.contig, 1); /// assert_eq!(range.range.start, 999); // Converted to 0-based /// assert_eq!(range.range.end, 2000); // End is exclusive in 0-based /// ``` pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result { Ok(Self { contig: contig_to_num(contig), range: Range { start: start .parse::() .context(format!("Can't parse {start} into u32."))? - 1, end: end .parse() .context(format!("Can't parse {end} into u32."))?, }, }) } } /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel. /// /// This function uses Rayon for parallel processing, dividing the positions into chunks /// and processing them concurrently. /// /// # Arguments /// * `positions` - A slice of references to `GenomePosition`s /// * `ranges` - A slice of references to `GenomeRange`s /// /// # Returns /// A `Vec` containing the indices of positions that overlap with any range. /// /// # Performance /// This function is optimized for scenarios where both `positions` and `ranges` are sorted /// by contig and position/range start. It uses this ordering to avoid unnecessary comparisons. /// /// # Example /// ``` /// let positions = vec![ /// &GenomePosition { contig: 1, position: 100 }, /// &GenomePosition { contig: 1, position: 200 }, /// &GenomePosition { contig: 2, position: 150 }, /// ]; /// let ranges = vec![ /// &GenomeRange { contig: 1, range: 50..150 }, /// &GenomeRange { contig: 2, range: 100..200 }, /// ]; /// let overlaps = overlaps_par(&positions, &ranges); /// assert_eq!(overlaps, vec!); /// ``` pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec { let chunk_size = (positions.len() / rayon::current_num_threads()).max(1); positions .par_chunks(chunk_size) .enumerate() .flat_map(|(chunk_idx, chunk)| { let mut result = Vec::new(); let mut range_idx = ranges.partition_point(|r| r.contig < chunk[0].contig); for (pos_idx, pos) in chunk.iter().enumerate() { while range_idx < ranges.len() && ranges[range_idx].contig < pos.contig { range_idx += 1; } if range_idx < ranges.len() && ranges[range_idx].contig == pos.contig { while range_idx < ranges.len() && ranges[range_idx].contig == pos.contig && ranges[range_idx].range.end <= pos.position { range_idx += 1; } if range_idx < ranges.len() && ranges[range_idx].contig == pos.contig && ranges[range_idx].range.contains(&pos.position) { result.push(chunk_idx * chunk_size + pos_idx); } } } result }) .collect() } /// A parallel version of the overlap finding function that works with any types /// implementing `GetGenomePosition` and `GetGenomeRange` traits. /// /// This function first converts the input slices to vectors of `GenomePosition` and `GenomeRange`, /// then calls `overlaps_par`. /// /// # Arguments /// * `a` - A slice of items implementing `GetGenomePosition` /// * `b` - A slice of items implementing `GetGenomeRange` /// /// # Returns /// A `Vec` containing the indices of positions that overlap with any range. /// /// # Example /// ``` /// struct MyPosition(GenomePosition); /// impl GetGenomePosition for MyPosition { /// fn position(&self) -> &GenomePosition { &self.0 } /// } /// /// struct MyRange(GenomeRange); /// impl GetGenomeRange for MyRange { /// fn range(&self) -> &GenomeRange { &self.0 } /// } /// /// let positions = vec![ /// MyPosition(GenomePosition { contig: 1, position: 100 }), /// MyPosition(GenomePosition { contig: 1, position: 200 }), /// ]; /// let ranges = vec![ /// MyRange(GenomeRange { contig: 1, range: 50..150 }), /// ]; /// let overlaps = par_overlaps(&positions, &ranges); /// assert_eq!(overlaps, vec!); /// ``` pub fn par_overlaps(a: &[impl GetGenomePosition], b: &[impl GetGenomeRange]) -> Vec { let a: Vec<_> = a.iter().map(|e| e.position()).collect(); let b: Vec<_> = b.iter().map(|e| e.range()).collect(); overlaps_par(&a, &b) } /// A trait for types that can provide a reference to a `GenomePosition`. /// /// Implement this trait for types that contain or can compute a `GenomePosition`. pub trait GetGenomePosition { // Returns a reference to the `GenomePosition` for this item. fn position(&self) -> &GenomePosition; } /// A trait for types that can provide a reference to a `GenomeRange`. /// /// Implement this trait for types that contain or can compute a `GenomeRange`. pub trait GetGenomeRange { /// Returns a reference to the `GenomeRange` for this item. fn range(&self) -> &GenomeRange; }