| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608 |
- use std::{cmp::Ordering, fmt::Display, ops::Range, str::FromStr};
- use anyhow::Context;
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- /// Represents a position in the genome using 0-based coordinates.
- ///
- /// This struct encapsulates a genomic position, consisting of a contig (chromosome)
- /// identifier and a position within that contig.
- ///
- /// # Fields
- /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
- /// * `position` - A 32-bit unsigned integer representing the 0-based position within the contig
- ///
- /// # Derives
- /// This struct derives the following traits:
- /// * `Debug` for easier debugging and logging
- /// * `PartialEq` and `Eq` for equality comparisons
- /// * `Clone` for creating deep copies
- /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
- /// * `Hash` for use in hash-based collections
- /// * `Default` for creating a default instance
- ///
- /// # Note
- /// The position is 0-based, meaning the first position in a contig is represented as 0.
- ///
- /// # Usage
- /// This struct is typically used to precisely locate genomic features or variants
- /// within a reference genome.
- ///
- /// # Example
- /// ```
- /// let position = GenomePosition {
- /// contig: 1, // Representing chromosome 1
- /// position: 1000, // 1001st base pair in the chromosome (remember it's 0-based)
- /// };
- /// ```
- #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default)]
- pub struct GenomePosition {
- pub contig: u8,
- pub position: u32,
- }
- impl GenomePosition {
- /// Returns the contig (chromosome) identifier as a human-readable string.
- ///
- /// This method converts the internal numeric representation of the contig
- /// to a string format commonly used in genomics, such as "chr1", "chrX", etc...
- ///
- /// # Returns
- /// A `String` representing the contig identifier.
- ///
- /// # Examples
- ///
- /// ```
- /// let pos = GenomePosition { contig: 1, position: 1000 };
- /// assert_eq!(pos.contig(), "chr1");
- ///
- /// let pos_x = GenomePosition { contig: 23, position: 500 };
- /// assert_eq!(pos_x.contig(), "chrX");
- /// ```
- ///
- /// # Note
- /// This method uses the `num_to_contig` function to perform the conversion.
- /// Special cases like X, Y chromosomes and mitochondrial DNA are handled appropriately.
- pub fn contig(&self) -> String {
- num_to_contig(self.contig)
- }
- }
- /// Implements the `Display` trait for `GenomePosition`.
- ///
- /// This implementation allows a `GenomePosition` to be formatted as a string
- /// in the form "contig:position", where contig is the string representation
- /// of the chromosome (e.g., "chr1", "chrX") and position is the 0-based position.
- ///
- /// # Example
- /// ```
- /// let pos = GenomePosition { contig: 1, position: 1000 };
- /// assert_eq!(format!("{}", pos), "chr1:1000");
- /// ```
- impl Display for GenomePosition {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- write!(f, "{}:{}", self.contig(), self.position)
- }
- }
- /// Implements the `FromStr` trait for `GenomePosition`.
- ///
- /// This allows creating a `GenomePosition` from a string in the format "contig:position".
- ///
- /// # Errors
- /// Returns an `anyhow::Error` if:
- /// - The input string doesn't contain a ':' character
- /// - The contig part can't be converted to a numeric representation
- /// - The position part can't be parsed as a `u32`
- ///
- /// # Example
- /// ```
- /// use std::str::FromStr;
- /// let pos = GenomePosition::from_str("chr1:1000").unwrap();
- /// assert_eq!(pos.contig, 1);
- /// assert_eq!(pos.position, 1000);
- /// ```
- impl FromStr for GenomePosition {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let (c, p) = s
- .split_once(":")
- .ok_or(anyhow::anyhow!("Can't split {s}"))?;
- Ok(Self {
- contig: contig_to_num(c),
- position: p.parse()?,
- })
- }
- }
- /// Implements `TryFrom<(&str, &str)>` for `GenomePosition`.
- ///
- /// This allows creating a `GenomePosition` from a tuple of two string slices,
- /// where the first represents the contig and the second represents the position.
- ///
- /// # Errors
- /// Returns an `anyhow::Error` if:
- /// - The contig part can't be converted to a numeric representation
- /// - The position part can't be parsed as a `u32`
- ///
- /// # Example
- /// ```
- /// use std::convert::TryFrom;
- /// let pos = GenomePosition::try_from(("chr1", "1000")).unwrap();
- /// assert_eq!(pos.contig, 1);
- /// assert_eq!(pos.position, 1000);
- /// ```
- impl TryFrom<(&str, &str)> for GenomePosition {
- type Error = anyhow::Error;
- fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
- Ok(GenomePosition {
- contig: contig_to_num(c),
- position: p.parse().context(format!("Can't parse {p} into u32."))?,
- })
- }
- }
- /// Implements conversion from `VcfPosition` to `GenomePosition`.
- ///
- /// This conversion adjusts for the difference between 1-based VCF coordinates
- /// and 0-based genomic coordinates. It also handles the special case of
- /// position 0 in VCF, which represents a variant inside telomeres.
- ///
- /// # Note
- /// This implementation assumes that `VcfPosition` uses 1-based coordinates.
- ///
- /// # Example
- /// ```
- /// let vcf_pos = VcfPosition { contig: 1, position: 1001 };
- /// let genome_pos: GenomePosition = vcf_pos.into();
- /// assert_eq!(genome_pos.contig, 1);
- /// assert_eq!(genome_pos.position, 1000);
- /// ```
- impl From<VcfPosition> for GenomePosition {
- fn from(value: VcfPosition) -> Self {
- let position = if value.position == 0 {
- // position 0 is a variant inside telomeres
- 0
- } else {
- value.position - 1
- };
- Self {
- contig: value.contig,
- position,
- }
- }
- }
- /// Implements conversion from `GenomePosition` to `(String, String)`.
- ///
- /// This conversion creates a tuple where the first element is the string
- /// representation of the contig (e.g., "chr1", "chrX") and the second
- /// element is the string representation of the position.
- ///
- /// # Example
- /// ```
- /// let pos = GenomePosition { contig: 1, position: 1000 };
- /// let (contig, position): (String, String) = pos.into();
- /// assert_eq!(contig, "chr1");
- /// assert_eq!(position, "1000");
- /// ```
- impl From<GenomePosition> for (String, String) {
- fn from(val: GenomePosition) -> Self {
- (num_to_contig(val.contig), val.position.to_string())
- }
- }
- /// Implements `PartialOrd` for `GenomePosition`.
- ///
- /// This implementation allows `GenomePosition` instances to be compared
- /// and ordered based first on their contig, then on their position.
- impl PartialOrd for GenomePosition {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
- }
- /// Implements `Ord` for `GenomePosition`.
- ///
- /// This implementation defines a total ordering for `GenomePosition` instances.
- /// Positions are first compared by their contig, and if the contigs are equal,
- /// then by their position.
- ///
- /// # Example
- /// ```
- /// let pos1 = GenomePosition { contig: 1, position: 1000 };
- /// let pos2 = GenomePosition { contig: 1, position: 2000 };
- /// let pos3 = GenomePosition { contig: 2, position: 500 };
- /// assert!(pos1 < pos2);
- /// assert!(pos2 < pos3);
- /// ```
- impl Ord for GenomePosition {
- fn cmp(&self, other: &Self) -> Ordering {
- match self.contig.cmp(&other.contig) {
- Ordering::Equal => self.position.cmp(&other.position),
- other => other,
- }
- }
- }
- /// Converts a contig (chromosome) identifier string to its numeric representation.
- ///
- /// # Arguments
- /// * `contig` - A string slice representing the contig identifier (e.g., "chr1", "chrX")
- ///
- /// # Returns
- /// A `u8` representing the numeric value of the contig:
- /// * 1-22 for autosomes
- /// * 23 for X chromosome
- /// * 24 for Y chromosome
- /// * 25 for mitochondrial DNA
- /// * `u8::MAX` (255) for any unrecognized input
- ///
- /// # Examples
- /// ```
- /// assert_eq!(contig_to_num("chr1"), 1);
- /// assert_eq!(contig_to_num("chrX"), 23);
- /// assert_eq!(contig_to_num("chrM"), 25);
- /// assert_eq!(contig_to_num("chr_unknown"), u8::MAX);
- /// ```
- ///
- /// # Note
- /// This function assumes the input string starts with "chr" for standard chromosomes.
- /// The "chr" prefix is trimmed before parsing numeric values.
- pub fn contig_to_num(contig: &str) -> u8 {
- match contig {
- "chrX" => 23,
- "chrY" => 24,
- "chrM" => 25,
- c => c.trim_start_matches("chr").parse().unwrap_or(u8::MAX),
- }
- }
- /// Converts a numeric contig (chromosome) representation to its string identifier.
- ///
- /// # Arguments
- /// * `num` - A `u8` representing the numeric value of the contig
- ///
- /// # Returns
- /// A `String` representing the contig identifier:
- /// * "chr1" to "chr22" for autosomes
- /// * "chrX" for 23
- /// * "chrY" for 24
- /// * "chrM" for 25
- /// * "chr{num}" for any other input
- ///
- /// # Examples
- /// ```
- /// assert_eq!(num_to_contig(1), "chr1");
- /// assert_eq!(num_to_contig(23), "chrX");
- /// assert_eq!(num_to_contig(25), "chrM");
- /// ```
- ///
- /// # Note
- /// This function always prepends "chr" to the output string.
- pub fn num_to_contig(num: u8) -> String {
- match num {
- 23 => "chrX".to_string(),
- 24 => "chrY".to_string(),
- 25 => "chrM".to_string(),
- c => format!("chr{c}"),
- }
- }
- /// Represents a position in a VCF (Variant Call Format) file using 1-based coordinates.
- ///
- /// This struct encapsulates a genomic position as represented in VCF files, consisting of a contig (chromosome)
- /// identifier and a position within that contig.
- ///
- /// # Fields
- /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
- /// * `position` - A 32-bit unsigned integer representing the 1-based position within the contig
- ///
- /// # Derives
- /// This struct derives the following traits:
- /// * `Debug` for easier debugging and logging
- /// * `PartialEq` and `Eq` for equality comparisons
- /// * `Clone` for creating deep copies
- /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
- ///
- /// # Note
- /// The position is 1-based, as per VCF file format specifications.
- #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
- pub struct VcfPosition {
- pub contig: u8,
- pub position: u32,
- }
- /// Implements `TryFrom<(&str, &str)>` for `VcfPosition`.
- ///
- /// This allows creating a `VcfPosition` from a tuple of two string slices,
- /// where the first represents the contig and the second represents the position.
- ///
- /// # Errors
- /// Returns an `anyhow::Error` if:
- /// - The contig part can't be converted to a numeric representation
- /// - The position part can't be parsed as a `u32`
- ///
- /// # Example
- /// ```
- /// use std::convert::TryFrom;
- /// let pos = VcfPosition::try_from(("chr1", "1000")).unwrap();
- /// assert_eq!(pos.contig, 1);
- /// assert_eq!(pos.position, 1000);
- /// ```
- impl TryFrom<(&str, &str)> for VcfPosition {
- type Error = anyhow::Error;
- fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
- Ok(VcfPosition {
- contig: contig_to_num(c),
- position: p.parse().context(format!("Can't parse {p} into u32."))?,
- })
- }
- }
- /// Implements conversion from `GenomePosition` to `VcfPosition`.
- ///
- /// This conversion adjusts for the difference between 0-based genomic coordinates
- /// and 1-based VCF coordinates.
- ///
- /// # Example
- /// ```
- /// let genome_pos = GenomePosition { contig: 1, position: 999 };
- /// let vcf_pos: VcfPosition = genome_pos.into();
- /// assert_eq!(vcf_pos.contig, 1);
- /// assert_eq!(vcf_pos.position, 1000);
- /// ```
- impl From<GenomePosition> for VcfPosition {
- fn from(value: GenomePosition) -> Self {
- Self {
- contig: value.contig,
- position: value.position + 1,
- }
- }
- }
- /// Implements conversion from `VcfPosition` to `(String, String)`.
- ///
- /// This conversion creates a tuple where the first element is the string
- /// representation of the contig (e.g., "chr1", "chrX") and the second
- /// element is the string representation of the position.
- ///
- /// # Example
- /// ```
- /// let pos = VcfPosition { contig: 1, position: 1000 };
- /// let (contig, position): (String, String) = pos.into();
- /// assert_eq!(contig, "chr1");
- /// assert_eq!(position, "1000");
- /// ```
- impl From<VcfPosition> for (String, String) {
- fn from(val: VcfPosition) -> Self {
- (num_to_contig(val.contig), val.position.to_string())
- }
- }
- /// Represents a range in the genome using 0-based, half-open coordinates [start, end).
- ///
- /// This struct encapsulates a genomic range, consisting of a contig (chromosome)
- /// identifier and a range of positions within that contig.
- ///
- /// # Fields
- /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
- /// * `range` - A `Range<u32>` representing the start (inclusive) and end (exclusive) positions
- ///
- /// # Derives
- /// This struct derives the following traits:
- /// * `Debug` for easier debugging and logging
- /// * `PartialEq` and `Eq` for equality comparisons
- /// * `Clone` for creating deep copies
- /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
- ///
- /// # Note
- /// The range is 0-based and half-open, meaning the start position is included but the end position is not.
- /// This is consistent with many programming languages and bioinformatics tools.
- #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
- pub struct GenomeRange {
- pub contig: u8,
- pub range: Range<u32>,
- }
- /// Implements `TryFrom<(&str, &str, &str)>` for `GenomeRange`.
- ///
- /// This allows creating a `GenomeRange` from a tuple of three string slices,
- /// where the first represents the contig, the second the start position,
- /// and the third the end position.
- ///
- /// # Errors
- /// Returns an `anyhow::Error` if:
- /// - The contig part can't be converted to a numeric representation
- /// - Either the start or end position can't be parsed as a `u32`
- ///
- /// # Example
- /// ```
- /// use std::convert::TryFrom;
- /// let range = GenomeRange::try_from(("chr1", "1000", "2000")).unwrap();
- /// assert_eq!(range.contig, 1);
- /// assert_eq!(range.range.start, 1000);
- /// assert_eq!(range.range.end, 2000);
- /// ```
- impl TryFrom<(&str, &str, &str)> for GenomeRange {
- type Error = anyhow::Error;
- fn try_from((c, s, e): (&str, &str, &str)) -> anyhow::Result<Self> {
- Ok(Self {
- contig: contig_to_num(c),
- range: Range {
- start: s.parse().context(format!("Can't parse {s} into u32."))?,
- end: e.parse().context(format!("Can't parse {e} into u32."))?,
- },
- })
- }
- }
- impl GenomeRange {
- /// Creates a `GenomeRange` from 1-based inclusive start and end positions.
- ///
- /// This method is useful when working with data sources that use 1-based coordinates
- /// and inclusive ranges. It converts the input to the 0-based, half-open format used by `GenomeRange`.
- ///
- /// # Arguments
- /// * `contig` - A string slice representing the contig/chromosome
- /// * `start` - A string slice representing the 1-based inclusive start position
- /// * `end` - A string slice representing the 1-based inclusive end position
- ///
- /// # Returns
- /// A `Result` containing the `GenomeRange` if successful, or an `anyhow::Error` if parsing fails.
- ///
- /// # Errors
- /// Returns an `anyhow::Error` if:
- /// - The contig can't be converted to a numeric representation
- /// - Either the start or end position can't be parsed as a `u32`
- ///
- /// # Example
- /// ```
- /// let range = GenomeRange::from_1_inclusive("chr1", "1000", "2000").unwrap();
- /// assert_eq!(range.contig, 1);
- /// assert_eq!(range.range.start, 999); // Converted to 0-based
- /// assert_eq!(range.range.end, 2000); // End is exclusive in 0-based
- /// ```
- pub fn from_1_inclusive(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
- Ok(Self {
- contig: contig_to_num(contig),
- range: Range {
- start: start
- .parse::<u32>()
- .context(format!("Can't parse {start} into u32."))?
- - 1,
- end: end
- .parse()
- .context(format!("Can't parse {end} into u32."))?,
- },
- })
- }
- }
- /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
- ///
- /// This function uses Rayon for parallel processing, dividing the positions into chunks
- /// and processing them concurrently.
- ///
- /// # Arguments
- /// * `positions` - A slice of references to `GenomePosition`s
- /// * `ranges` - A slice of references to `GenomeRange`s
- ///
- /// # Returns
- /// A `Vec<usize>` containing the indices of positions that overlap with any range.
- ///
- /// # Performance
- /// This function is optimized for scenarios where both `positions` and `ranges` are sorted
- /// by contig and position/range start. It uses this ordering to avoid unnecessary comparisons.
- ///
- /// # Example
- /// ```
- /// let positions = vec![
- /// &GenomePosition { contig: 1, position: 100 },
- /// &GenomePosition { contig: 1, position: 200 },
- /// &GenomePosition { contig: 2, position: 150 },
- /// ];
- /// let ranges = vec![
- /// &GenomeRange { contig: 1, range: 50..150 },
- /// &GenomeRange { contig: 2, range: 100..200 },
- /// ];
- /// let overlaps = overlaps_par(&positions, &ranges);
- /// assert_eq!(overlaps, vec!);
- /// ```
- pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec<usize> {
- let chunk_size = (positions.len() / rayon::current_num_threads()).max(1);
- positions
- .par_chunks(chunk_size)
- .enumerate()
- .flat_map(|(chunk_idx, chunk)| {
- let mut result = Vec::new();
- let mut range_idx = ranges.partition_point(|r| r.contig < chunk[0].contig);
- for (pos_idx, pos) in chunk.iter().enumerate() {
- while range_idx < ranges.len() && ranges[range_idx].contig < pos.contig {
- range_idx += 1;
- }
- if range_idx < ranges.len() && ranges[range_idx].contig == pos.contig {
- while range_idx < ranges.len()
- && ranges[range_idx].contig == pos.contig
- && ranges[range_idx].range.end <= pos.position
- {
- range_idx += 1;
- }
- if range_idx < ranges.len()
- && ranges[range_idx].contig == pos.contig
- && ranges[range_idx].range.contains(&pos.position)
- {
- result.push(chunk_idx * chunk_size + pos_idx);
- }
- }
- }
- result
- })
- .collect()
- }
- /// A parallel version of the overlap finding function that works with any types
- /// implementing `GetGenomePosition` and `GetGenomeRange` traits.
- ///
- /// This function first converts the input slices to vectors of `GenomePosition` and `GenomeRange`,
- /// then calls `overlaps_par`.
- ///
- /// # Arguments
- /// * `a` - A slice of items implementing `GetGenomePosition`
- /// * `b` - A slice of items implementing `GetGenomeRange`
- ///
- /// # Returns
- /// A `Vec<usize>` containing the indices of positions that overlap with any range.
- ///
- /// # Example
- /// ```
- /// struct MyPosition(GenomePosition);
- /// impl GetGenomePosition for MyPosition {
- /// fn position(&self) -> &GenomePosition { &self.0 }
- /// }
- ///
- /// struct MyRange(GenomeRange);
- /// impl GetGenomeRange for MyRange {
- /// fn range(&self) -> &GenomeRange { &self.0 }
- /// }
- ///
- /// let positions = vec![
- /// MyPosition(GenomePosition { contig: 1, position: 100 }),
- /// MyPosition(GenomePosition { contig: 1, position: 200 }),
- /// ];
- /// let ranges = vec![
- /// MyRange(GenomeRange { contig: 1, range: 50..150 }),
- /// ];
- /// let overlaps = par_overlaps(&positions, &ranges);
- /// assert_eq!(overlaps, vec!);
- /// ```
- pub fn par_overlaps(a: &[impl GetGenomePosition], b: &[impl GetGenomeRange]) -> Vec<usize> {
- let a: Vec<_> = a.iter().map(|e| e.position()).collect();
- let b: Vec<_> = b.iter().map(|e| e.range()).collect();
- overlaps_par(&a, &b)
- }
- /// A trait for types that can provide a reference to a `GenomePosition`.
- ///
- /// Implement this trait for types that contain or can compute a `GenomePosition`.
- pub trait GetGenomePosition {
- // Returns a reference to the `GenomePosition` for this item.
- fn position(&self) -> &GenomePosition;
- }
- /// A trait for types that can provide a reference to a `GenomeRange`.
- ///
- /// Implement this trait for types that contain or can compute a `GenomeRange`.
- pub trait GetGenomeRange {
- /// Returns a reference to the `GenomeRange` for this item.
- fn range(&self) -> &GenomeRange;
- }
|