positions.rs 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. use std::{
  2. cmp::{max, Ordering},
  3. fmt::Display,
  4. ops::Range,
  5. str::FromStr,
  6. };
  7. use anyhow::Context;
  8. use bitcode::{Decode, Encode};
  9. use rayon::prelude::*;
  10. use serde::{Deserialize, Serialize};
  11. /// Represents a position in the genome using 0-based coordinates.
  12. ///
  13. /// This struct encapsulates a genomic position, consisting of a contig (chromosome)
  14. /// identifier and a position within that contig.
  15. ///
  16. /// # Fields
  17. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  18. /// * `position` - A 32-bit unsigned integer representing the 0-based position within the contig
  19. ///
  20. /// # Derives
  21. /// This struct derives the following traits:
  22. /// * `Debug` for easier debugging and logging
  23. /// * `PartialEq` and `Eq` for equality comparisons
  24. /// * `Clone` for creating deep copies
  25. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  26. /// * `Hash` for use in hash-based collections
  27. /// * `Default` for creating a default instance
  28. ///
  29. /// # Note
  30. /// The position is 0-based, meaning the first position in a contig is represented as 0.
  31. ///
  32. /// # Usage
  33. /// This struct is typically used to precisely locate genomic features or variants
  34. /// within a reference genome.
  35. ///
  36. /// # Example
  37. /// ```
  38. /// let position = GenomePosition {
  39. /// contig: 1, // Representing chromosome 1
  40. /// position: 1000, // 1001st base pair in the chromosome (remember it's 0-based)
  41. /// };
  42. /// ```
  43. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default, Encode, Decode)]
  44. pub struct GenomePosition {
  45. pub contig: u8,
  46. pub position: u32,
  47. }
  48. impl GenomePosition {
  49. /// Returns the contig (chromosome) identifier as a human-readable string.
  50. ///
  51. /// This method converts the internal numeric representation of the contig
  52. /// to a string format commonly used in genomics, such as "chr1", "chrX", etc...
  53. ///
  54. /// # Returns
  55. /// A `String` representing the contig identifier.
  56. ///
  57. /// # Examples
  58. ///
  59. /// ```
  60. /// let pos = GenomePosition { contig: 1, position: 1000 };
  61. /// assert_eq!(pos.contig(), "chr1");
  62. ///
  63. /// let pos_x = GenomePosition { contig: 23, position: 500 };
  64. /// assert_eq!(pos_x.contig(), "chrX");
  65. /// ```
  66. ///
  67. /// # Note
  68. /// This method uses the `num_to_contig` function to perform the conversion.
  69. /// Special cases like X, Y chromosomes and mitochondrial DNA are handled appropriately.
  70. pub fn contig(&self) -> String {
  71. num_to_contig(self.contig)
  72. }
  73. }
  74. /// Implements the `Display` trait for `GenomePosition`.
  75. ///
  76. /// This implementation allows a `GenomePosition` to be formatted as a string
  77. /// in the form "contig:position", where contig is the string representation
  78. /// of the chromosome (e.g., "chr1", "chrX") and position is the 0-based position.
  79. ///
  80. /// # Example
  81. /// ```
  82. /// let pos = GenomePosition { contig: 1, position: 1000 };
  83. /// assert_eq!(format!("{}", pos), "chr1:1000");
  84. /// ```
  85. impl Display for GenomePosition {
  86. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  87. write!(f, "{}:{}", self.contig(), self.position)
  88. }
  89. }
  90. /// Implements the `FromStr` trait for `GenomePosition`.
  91. ///
  92. /// This allows creating a `GenomePosition` from a string in the format "contig:position".
  93. ///
  94. /// # Errors
  95. /// Returns an `anyhow::Error` if:
  96. /// - The input string doesn't contain a ':' character
  97. /// - The contig part can't be converted to a numeric representation
  98. /// - The position part can't be parsed as a `u32`
  99. ///
  100. /// # Example
  101. /// ```
  102. /// use std::str::FromStr;
  103. /// let pos = GenomePosition::from_str("chr1:1000").unwrap();
  104. /// assert_eq!(pos.contig, 1);
  105. /// assert_eq!(pos.position, 1000);
  106. /// ```
  107. impl FromStr for GenomePosition {
  108. type Err = anyhow::Error;
  109. fn from_str(s: &str) -> anyhow::Result<Self> {
  110. let (c, p) = s
  111. .split_once(":")
  112. .ok_or(anyhow::anyhow!("Can't split {s}"))?;
  113. Ok(Self {
  114. contig: contig_to_num(c),
  115. position: p.parse()?,
  116. })
  117. }
  118. }
  119. /// Implements `TryFrom<(&str, &str)>` for `GenomePosition`.
  120. ///
  121. /// This allows creating a `GenomePosition` from a tuple of two string slices,
  122. /// where the first represents the contig and the second represents the position.
  123. ///
  124. /// # Errors
  125. /// Returns an `anyhow::Error` if:
  126. /// - The contig part can't be converted to a numeric representation
  127. /// - The position part can't be parsed as a `u32`
  128. ///
  129. /// # Example
  130. /// ```
  131. /// use std::convert::TryFrom;
  132. /// let pos = GenomePosition::try_from(("chr1", "1000")).unwrap();
  133. /// assert_eq!(pos.contig, 1);
  134. /// assert_eq!(pos.position, 1000);
  135. /// ```
  136. impl TryFrom<(&str, &str)> for GenomePosition {
  137. type Error = anyhow::Error;
  138. fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
  139. Ok(GenomePosition {
  140. contig: contig_to_num(c),
  141. position: p.parse().context(format!("Can't parse {p} into u32."))?,
  142. })
  143. }
  144. }
  145. /// Implements conversion from `VcfPosition` to `GenomePosition`.
  146. ///
  147. /// This conversion adjusts for the difference between 1-based VCF coordinates
  148. /// and 0-based genomic coordinates. It also handles the special case of
  149. /// position 0 in VCF, which represents a variant inside telomeres.
  150. ///
  151. /// # Note
  152. /// This implementation assumes that `VcfPosition` uses 1-based coordinates.
  153. ///
  154. /// # Example
  155. /// ```
  156. /// let vcf_pos = VcfPosition { contig: 1, position: 1001 };
  157. /// let genome_pos: GenomePosition = vcf_pos.into();
  158. /// assert_eq!(genome_pos.contig, 1);
  159. /// assert_eq!(genome_pos.position, 1000);
  160. /// ```
  161. impl From<VcfPosition> for GenomePosition {
  162. fn from(value: VcfPosition) -> Self {
  163. let position = if value.position == 0 {
  164. // position 0 is a variant inside telomeres
  165. 0
  166. } else {
  167. value.position - 1
  168. };
  169. Self {
  170. contig: value.contig,
  171. position,
  172. }
  173. }
  174. }
  175. /// Implements conversion from `GenomePosition` to `(String, String)`.
  176. ///
  177. /// This conversion creates a tuple where the first element is the string
  178. /// representation of the contig (e.g., "chr1", "chrX") and the second
  179. /// element is the string representation of the position.
  180. ///
  181. /// # Example
  182. /// ```
  183. /// let pos = GenomePosition { contig: 1, position: 1000 };
  184. /// let (contig, position): (String, String) = pos.into();
  185. /// assert_eq!(contig, "chr1");
  186. /// assert_eq!(position, "1000");
  187. /// ```
  188. impl From<GenomePosition> for (String, String) {
  189. fn from(val: GenomePosition) -> Self {
  190. (num_to_contig(val.contig), val.position.to_string())
  191. }
  192. }
  193. /// Implements `PartialOrd` for `GenomePosition`.
  194. ///
  195. /// This implementation allows `GenomePosition` instances to be compared
  196. /// and ordered based first on their contig, then on their position.
  197. impl PartialOrd for GenomePosition {
  198. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  199. Some(self.cmp(other))
  200. }
  201. }
  202. /// Implements `Ord` for `GenomePosition`.
  203. ///
  204. /// This implementation defines a total ordering for `GenomePosition` instances.
  205. /// Positions are first compared by their contig, and if the contigs are equal,
  206. /// then by their position.
  207. ///
  208. /// # Example
  209. /// ```
  210. /// let pos1 = GenomePosition { contig: 1, position: 1000 };
  211. /// let pos2 = GenomePosition { contig: 1, position: 2000 };
  212. /// let pos3 = GenomePosition { contig: 2, position: 500 };
  213. /// assert!(pos1 < pos2);
  214. /// assert!(pos2 < pos3);
  215. /// ```
  216. impl Ord for GenomePosition {
  217. fn cmp(&self, other: &Self) -> Ordering {
  218. match self.contig.cmp(&other.contig) {
  219. Ordering::Equal => self.position.cmp(&other.position),
  220. other => other,
  221. }
  222. }
  223. }
  224. /// Converts a contig (chromosome) identifier string to its numeric representation.
  225. ///
  226. /// # Arguments
  227. /// * `contig` - A string slice representing the contig identifier (e.g., "chr1", "chrX")
  228. ///
  229. /// # Returns
  230. /// A `u8` representing the numeric value of the contig:
  231. /// * 1-22 for autosomes
  232. /// * 23 for X chromosome
  233. /// * 24 for Y chromosome
  234. /// * 25 for mitochondrial DNA
  235. /// * `u8::MAX` (255) for any unrecognized input
  236. ///
  237. /// # Examples
  238. /// ```
  239. /// assert_eq!(contig_to_num("chr1"), 1);
  240. /// assert_eq!(contig_to_num("chrX"), 23);
  241. /// assert_eq!(contig_to_num("chrM"), 25);
  242. /// assert_eq!(contig_to_num("chr_unknown"), u8::MAX);
  243. /// ```
  244. ///
  245. /// # Note
  246. /// This function assumes the input string starts with "chr" for standard chromosomes.
  247. /// The "chr" prefix is trimmed before parsing numeric values.
  248. pub fn contig_to_num(contig: &str) -> u8 {
  249. match contig {
  250. "chrX" => 23,
  251. "chrY" => 24,
  252. "chrM" => 25,
  253. c => c.trim_start_matches("chr").parse().unwrap_or(u8::MAX),
  254. }
  255. }
  256. /// Converts a numeric contig (chromosome) representation to its string identifier.
  257. ///
  258. /// # Arguments
  259. /// * `num` - A `u8` representing the numeric value of the contig
  260. ///
  261. /// # Returns
  262. /// A `String` representing the contig identifier:
  263. /// * "chr1" to "chr22" for autosomes
  264. /// * "chrX" for 23
  265. /// * "chrY" for 24
  266. /// * "chrM" for 25
  267. /// * "chr{num}" for any other input
  268. ///
  269. /// # Examples
  270. /// ```
  271. /// assert_eq!(num_to_contig(1), "chr1");
  272. /// assert_eq!(num_to_contig(23), "chrX");
  273. /// assert_eq!(num_to_contig(25), "chrM");
  274. /// ```
  275. ///
  276. /// # Note
  277. /// This function always prepends "chr" to the output string.
  278. pub fn num_to_contig(num: u8) -> String {
  279. match num {
  280. 23 => "chrX".to_string(),
  281. 24 => "chrY".to_string(),
  282. 25 => "chrM".to_string(),
  283. c => format!("chr{c}"),
  284. }
  285. }
  286. /// Represents a position in a VCF (Variant Call Format) file using 1-based coordinates.
  287. ///
  288. /// This struct encapsulates a genomic position as represented in VCF files, consisting of a contig (chromosome)
  289. /// identifier and a position within that contig.
  290. ///
  291. /// # Fields
  292. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  293. /// * `position` - A 32-bit unsigned integer representing the 1-based position within the contig
  294. ///
  295. /// # Derives
  296. /// This struct derives the following traits:
  297. /// * `Debug` for easier debugging and logging
  298. /// * `PartialEq` and `Eq` for equality comparisons
  299. /// * `Clone` for creating deep copies
  300. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  301. ///
  302. /// # Note
  303. /// The position is 1-based, as per VCF file format specifications.
  304. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
  305. pub struct VcfPosition {
  306. pub contig: u8,
  307. pub position: u32,
  308. }
  309. /// Implements `TryFrom<(&str, &str)>` for `VcfPosition`.
  310. ///
  311. /// This allows creating a `VcfPosition` from a tuple of two string slices,
  312. /// where the first represents the contig and the second represents the position.
  313. ///
  314. /// # Errors
  315. /// Returns an `anyhow::Error` if:
  316. /// - The contig part can't be converted to a numeric representation
  317. /// - The position part can't be parsed as a `u32`
  318. ///
  319. /// # Example
  320. /// ```
  321. /// use std::convert::TryFrom;
  322. /// let pos = VcfPosition::try_from(("chr1", "1000")).unwrap();
  323. /// assert_eq!(pos.contig, 1);
  324. /// assert_eq!(pos.position, 1000);
  325. /// ```
  326. impl TryFrom<(&str, &str)> for VcfPosition {
  327. type Error = anyhow::Error;
  328. fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
  329. Ok(VcfPosition {
  330. contig: contig_to_num(c),
  331. position: p.parse().context(format!("Can't parse {p} into u32."))?,
  332. })
  333. }
  334. }
  335. /// Implements conversion from `GenomePosition` to `VcfPosition`.
  336. ///
  337. /// This conversion adjusts for the difference between 0-based genomic coordinates
  338. /// and 1-based VCF coordinates.
  339. ///
  340. /// # Example
  341. /// ```
  342. /// let genome_pos = GenomePosition { contig: 1, position: 999 };
  343. /// let vcf_pos: VcfPosition = genome_pos.into();
  344. /// assert_eq!(vcf_pos.contig, 1);
  345. /// assert_eq!(vcf_pos.position, 1000);
  346. /// ```
  347. impl From<GenomePosition> for VcfPosition {
  348. fn from(value: GenomePosition) -> Self {
  349. Self {
  350. contig: value.contig,
  351. position: value.position + 1,
  352. }
  353. }
  354. }
  355. /// Implements conversion from `VcfPosition` to `(String, String)`.
  356. ///
  357. /// This conversion creates a tuple where the first element is the string
  358. /// representation of the contig (e.g., "chr1", "chrX") and the second
  359. /// element is the string representation of the position.
  360. ///
  361. /// # Example
  362. /// ```
  363. /// let pos = VcfPosition { contig: 1, position: 1000 };
  364. /// let (contig, position): (String, String) = pos.into();
  365. /// assert_eq!(contig, "chr1");
  366. /// assert_eq!(position, "1000");
  367. /// ```
  368. impl From<VcfPosition> for (String, String) {
  369. fn from(val: VcfPosition) -> Self {
  370. (num_to_contig(val.contig), val.position.to_string())
  371. }
  372. }
  373. /// Represents a range in the genome using 0-based, half-open coordinates [start, end).
  374. ///
  375. /// This struct encapsulates a genomic range, consisting of a contig (chromosome)
  376. /// identifier and a range of positions within that contig.
  377. ///
  378. /// # Fields
  379. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  380. /// * `range` - A `Range<u32>` representing the start (inclusive) and end (exclusive) positions
  381. ///
  382. /// # Derives
  383. /// This struct derives the following traits:
  384. /// * `Debug` for easier debugging and logging
  385. /// * `PartialEq` and `Eq` for equality comparisons
  386. /// * `Clone` for creating deep copies
  387. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  388. ///
  389. /// # Note
  390. /// The range is 0-based and half-open, meaning the start position is included but the end position is not.
  391. /// This is consistent with many programming languages and bioinformatics tools.
  392. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
  393. pub struct GenomeRange {
  394. pub contig: u8,
  395. pub range: Range<u32>,
  396. }
  397. /// Implements `TryFrom<(&str, &str, &str)>` for `GenomeRange`.
  398. ///
  399. /// This allows creating a `GenomeRange` from a tuple of three string slices,
  400. /// where the first represents the contig, the second the start position,
  401. /// and the third the end position.
  402. ///
  403. /// # Errors
  404. /// Returns an `anyhow::Error` if:
  405. /// - The contig part can't be converted to a numeric representation
  406. /// - Either the start or end position can't be parsed as a `u32`
  407. ///
  408. /// # Example
  409. /// ```
  410. /// use std::convert::TryFrom;
  411. /// let range = GenomeRange::try_from(("chr1", "1000", "2000")).unwrap();
  412. /// assert_eq!(range.contig, 1);
  413. /// assert_eq!(range.range.start, 1000);
  414. /// assert_eq!(range.range.end, 2000);
  415. /// ```
  416. impl TryFrom<(&str, &str, &str)> for GenomeRange {
  417. type Error = anyhow::Error;
  418. fn try_from((c, s, e): (&str, &str, &str)) -> anyhow::Result<Self> {
  419. Ok(Self {
  420. contig: contig_to_num(c),
  421. range: Range {
  422. start: s.parse().context(format!("Can't parse {s} into u32."))?,
  423. end: e.parse().context(format!("Can't parse {e} into u32."))?,
  424. },
  425. })
  426. }
  427. }
  428. impl GenomeRange {
  429. pub fn new(contig: &str, start: u32, end: u32) -> Self {
  430. Self {
  431. contig: contig_to_num(contig),
  432. range: start..end,
  433. }
  434. }
  435. /// Creates a `GenomeRange` from 1-based inclusive start and end positions.
  436. ///
  437. /// This method is useful when working with data sources that use 1-based coordinates
  438. /// and inclusive ranges. It converts the input to the 0-based, half-open format used by `GenomeRange`.
  439. ///
  440. /// # Arguments
  441. /// * `contig` - A string slice representing the contig/chromosome
  442. /// * `start` - A string slice representing the 1-based inclusive start position
  443. /// * `end` - A string slice representing the 1-based inclusive end position
  444. ///
  445. /// # Returns
  446. /// A `Result` containing the `GenomeRange` if successful, or an `anyhow::Error` if parsing fails.
  447. ///
  448. /// # Errors
  449. /// Returns an `anyhow::Error` if:
  450. /// - The contig can't be converted to a numeric representation
  451. /// - Either the start or end position can't be parsed as a `u32`
  452. ///
  453. /// # Example
  454. /// ```
  455. /// let range = GenomeRange::from_1_inclusive("chr1", "1000", "2000").unwrap();
  456. /// assert_eq!(range.contig, 1);
  457. /// assert_eq!(range.range.start, 999); // Converted to 0-based
  458. /// assert_eq!(range.range.end, 2000); // End is exclusive in 0-based
  459. /// ```
  460. pub fn from_1_inclusive_str(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
  461. Ok(Self {
  462. contig: contig_to_num(contig),
  463. range: Range {
  464. start: start
  465. .parse::<u32>()
  466. .context(format!("Can't parse {start} into u32."))?
  467. .saturating_sub(1),
  468. end: end
  469. .parse()
  470. .context(format!("Can't parse {end} into u32."))?,
  471. },
  472. })
  473. }
  474. pub fn from_1_inclusive(contig: &str, start: u32, end: u32) -> Self {
  475. Self {
  476. contig: contig_to_num(contig),
  477. range: Range {
  478. start: start.saturating_sub(1),
  479. end,
  480. },
  481. }
  482. }
  483. pub fn length(&self) -> u32 {
  484. self.range.end - self.range.start
  485. }
  486. }
  487. impl GetGenomePosition for GenomePosition {
  488. fn position(&self) -> &GenomePosition {
  489. self
  490. }
  491. }
  492. impl GetGenomeRange for GenomeRange {
  493. fn range(&self) -> &GenomeRange {
  494. self
  495. }
  496. }
  497. pub fn merge_overlapping_genome_ranges(ranges: &[GenomeRange]) -> Vec<GenomeRange> {
  498. if ranges.is_empty() {
  499. return vec![];
  500. }
  501. // ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  502. let chunk_size = (ranges.len() / rayon::current_num_threads()).max(1);
  503. ranges
  504. .par_chunks(chunk_size)
  505. .map(|chunk| {
  506. let mut merged = Vec::new();
  507. let mut current = chunk[0].clone();
  508. for range in chunk.iter().skip(1) {
  509. if current.contig == range.contig && current.range.end >= range.range.start {
  510. current.range.end = max(current.range.end, range.range.end);
  511. } else {
  512. merged.push(current);
  513. current = range.clone();
  514. }
  515. }
  516. merged.push(current);
  517. merged
  518. })
  519. .reduce(Vec::new, |mut acc, mut chunk| {
  520. if let (Some(last), Some(first)) = (acc.last_mut(), chunk.first()) {
  521. if last.contig == first.contig && last.range.end >= first.range.start {
  522. last.range.end = max(last.range.end, first.range.end);
  523. chunk.remove(0);
  524. }
  525. }
  526. acc.extend(chunk);
  527. acc
  528. })
  529. }
  530. /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
  531. ///
  532. /// This function uses Rayon for parallel processing, dividing the positions into chunks
  533. /// and processing them concurrently.
  534. ///
  535. /// # Arguments
  536. /// * `positions` - A slice of references to `GenomePosition`s
  537. /// * `ranges` - A slice of references to `GenomeRange`s
  538. ///
  539. /// # Returns
  540. /// A `Vec<usize>` containing the indices of positions that overlap with any range.
  541. ///
  542. /// # Performance
  543. /// This function is optimized for scenarios where both `positions` and `ranges` are sorted
  544. /// by contig and position/range start. It uses this ordering to avoid unnecessary comparisons.
  545. ///
  546. /// # Example
  547. /// ```
  548. /// let positions = vec![
  549. /// &GenomePosition { contig: 1, position: 100 },
  550. /// &GenomePosition { contig: 1, position: 200 },
  551. /// &GenomePosition { contig: 2, position: 150 },
  552. /// ];
  553. /// let ranges = vec![
  554. /// &GenomeRange { contig: 1, range: 50..150 },
  555. /// &GenomeRange { contig: 2, range: 100..200 },
  556. /// ];
  557. /// let overlaps = overlaps_par(&positions, &ranges);
  558. /// assert_eq!(overlaps, vec!);
  559. /// ```
  560. pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec<usize> {
  561. let chunk_size = (positions.len() / rayon::current_num_threads()).max(1);
  562. positions
  563. .par_chunks(chunk_size)
  564. .enumerate()
  565. .flat_map(|(chunk_idx, chunk)| {
  566. let mut result = Vec::new();
  567. let mut range_idx = ranges.partition_point(|r| r.contig < chunk[0].contig);
  568. for (pos_idx, pos) in chunk.iter().enumerate() {
  569. while range_idx < ranges.len() && ranges[range_idx].contig < pos.contig {
  570. range_idx += 1;
  571. }
  572. if range_idx < ranges.len() && ranges[range_idx].contig == pos.contig {
  573. while range_idx < ranges.len()
  574. && ranges[range_idx].contig == pos.contig
  575. && ranges[range_idx].range.end <= pos.position
  576. {
  577. range_idx += 1;
  578. }
  579. if range_idx < ranges.len()
  580. && ranges[range_idx].contig == pos.contig
  581. && ranges[range_idx].range.contains(&pos.position)
  582. {
  583. result.push(chunk_idx * chunk_size + pos_idx);
  584. }
  585. }
  586. }
  587. result
  588. })
  589. .collect()
  590. }
  591. //
  592. // pub fn range_intersection_par(a: &[GenomeRange], b: &[GenomeRange]) -> Vec<GenomeRange> {
  593. // // Group ranges by contig for both inputs
  594. // let mut a_map = std::collections::HashMap::new();
  595. // for range in a {
  596. // a_map.entry(&range.contig).or_insert(Vec::new()).push(range);
  597. // }
  598. //
  599. // let mut b_map = std::collections::HashMap::new();
  600. // for range in b {
  601. // b_map.entry(&range.contig).or_insert(Vec::new()).push(range);
  602. // }
  603. //
  604. // // Process common contigs in parallel
  605. // a_map
  606. // .into_par_iter()
  607. // .filter_map(|(contig, mut a_ranges)| {
  608. // // Sort ranges by start position (end is only used for tie-breaking)
  609. // a_ranges.sort_by_key(|r| (r.range.start, r.range.end));
  610. //
  611. // // Get corresponding ranges from b
  612. // let mut b_ranges = match b_map.get(contig) {
  613. // Some(ranges) => ranges.clone(),
  614. // None => return None,
  615. // };
  616. // b_ranges.sort_by_key(|r| (r.range.start, r.range.end));
  617. //
  618. // let mut intersections = Vec::new();
  619. // let mut i = 0;
  620. // let mut j = 0;
  621. //
  622. // // Two-pointer intersection algorithm for 0-based, end-exclusive ranges
  623. // while i < a_ranges.len() && j < b_ranges.len() {
  624. // let a_range = &a_ranges[i];
  625. // let b_range = &b_ranges[j];
  626. //
  627. // // Check if ranges are completely non-overlapping
  628. // if a_range.range.end <= b_range.range.start {
  629. // // |a_range| ends before |b_range| starts
  630. // i += 1;
  631. // } else if b_range.range.end <= a_range.range.start {
  632. // // |b_range| ends before |a_range| starts
  633. // j += 1;
  634. // } else {
  635. // // Calculate intersection of overlapping ranges
  636. // let start = a_range.range.start.max(b_range.range.start);
  637. // let end = a_range.range.end.min(b_range.range.end);
  638. //
  639. // // Only create a range if start < end (non-zero length)
  640. // if start < end {
  641. // intersections.push(GenomeRange {
  642. // contig: *contig,
  643. // range: start..end,
  644. // });
  645. // }
  646. //
  647. // // Move the pointer with the earlier end coordinate
  648. // if a_range.range.end < b_range.range.end {
  649. // i += 1;
  650. // } else {
  651. // j += 1;
  652. // }
  653. // }
  654. // }
  655. //
  656. // Some(intersections)
  657. // })
  658. // .flatten()
  659. // .collect()
  660. // }
  661. /// Returns every region that **overlaps** between two sorted
  662. /// slices of genome ranges, distributing the work across
  663. /// threads with **Rayon**.
  664. ///
  665. /// Both inputs **must**
  666. /// - be sorted by `(contig, start)`
  667. /// - contain non-overlapping, half-open ranges (`start < end`)
  668. /// - use the same coordinate system.
  669. ///
  670. /// # Parameters
  671. /// * `a` – Slice of references to `GenomeRange`.
  672. /// * `b` – Slice of references to `GenomeRange`.
  673. ///
  674. /// # Returns
  675. /// A `Vec<GenomeRange>` containing one entry for each pairwise
  676. /// intersection.
  677. /// The output is *not* guaranteed to be sorted.
  678. ///
  679. /// # Complexity
  680. /// `O(|a| + |b|)` total work; memory ≤ the number of produced
  681. /// intersections. Work is split per contig and executed in
  682. /// parallel.
  683. ///
  684. /// # Panics
  685. /// Never panics if the pre-conditions above are met.
  686. ///
  687. /// # Example
  688. /// ```
  689. /// # use pandora_lib_promethion::{GenomeRange, range_intersection_par};
  690. /// let a_range = GenomeRange { contig: 1, range: 100..200 };
  691. /// let b_range = GenomeRange { contig: 1, range: 150..250 };
  692. ///
  693. /// let out = range_intersection_par(&[&a_range], &[&b_range]);
  694. /// assert_eq!(out, vec![GenomeRange { contig: 1, range: 150..200 }]);
  695. /// ```
  696. pub fn range_intersection_par(a: &[&GenomeRange], b: &[&GenomeRange]) -> Vec<GenomeRange> {
  697. let (a_contigs, b_contigs) = rayon::join(
  698. || extract_contig_indices(a),
  699. || extract_contig_indices(b),
  700. );
  701. a_contigs.into_par_iter()
  702. .filter_map(|(contig, a_start, a_end)| {
  703. let (b_start, b_end) = find_contig_indices(&b_contigs, contig)?;
  704. let a_ranges = &a[a_start..a_end];
  705. let b_ranges = &b[b_start..b_end];
  706. let mut intersections = Vec::new();
  707. let (mut i, mut j) = (0, 0);
  708. while i < a_ranges.len() && j < b_ranges.len() {
  709. let a_range = &a_ranges[i].range;
  710. let b_range = &b_ranges[j].range;
  711. match (a_range.end <= b_range.start, b_range.end <= a_range.start) {
  712. (true, _) => i += 1,
  713. (_, true) => j += 1,
  714. _ => {
  715. let start = a_range.start.max(b_range.start);
  716. let end = a_range.end.min(b_range.end);
  717. if start < end {
  718. intersections.push(GenomeRange {
  719. contig,
  720. range: start..end,
  721. });
  722. }
  723. if a_range.end < b_range.end {
  724. i += 1;
  725. } else {
  726. j += 1;
  727. }
  728. }
  729. }
  730. }
  731. Some(intersections)
  732. })
  733. .flatten()
  734. .collect()
  735. }
  736. // Helper to create (contig, start index, end index) tuples
  737. pub fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
  738. let mut indices = Vec::new();
  739. let mut current_contig = None;
  740. let mut start_idx = 0;
  741. for (i, range) in ranges.iter().enumerate() {
  742. if current_contig != Some(range.contig) {
  743. if let Some(contig) = current_contig {
  744. indices.push((contig, start_idx, i));
  745. }
  746. current_contig = Some(range.contig);
  747. start_idx = i;
  748. }
  749. }
  750. if let Some(contig) = current_contig {
  751. indices.push((contig, start_idx, ranges.len()));
  752. }
  753. indices
  754. }
  755. // Binary search to find contig indices in precomputed list
  756. pub fn find_contig_indices(contigs: &[(u8, usize, usize)], target: u8) -> Option<(usize, usize)> {
  757. contigs.binary_search_by(|(c, _, _)| c.cmp(&target))
  758. .ok()
  759. .map(|idx| (contigs[idx].1, contigs[idx].2))
  760. }
  761. /// A parallel version of the overlap finding function that works with any types
  762. /// implementing `GetGenomePosition` and `GetGenomeRange` traits.
  763. ///
  764. /// This function first converts the input slices to vectors of `GenomePosition` and `GenomeRange`,
  765. /// then calls `overlaps_par`.
  766. ///
  767. /// # Arguments
  768. /// * `a` - A slice of items implementing `GetGenomePosition`
  769. /// * `b` - A slice of items implementing `GetGenomeRange`
  770. ///
  771. /// # Returns
  772. /// A `Vec<usize>` containing the indices of positions that overlap with any range.
  773. ///
  774. /// # Example
  775. /// ```
  776. /// struct MyPosition(GenomePosition);
  777. /// impl GetGenomePosition for MyPosition {
  778. /// fn position(&self) -> &GenomePosition { &self.0 }
  779. /// }
  780. ///
  781. /// struct MyRange(GenomeRange);
  782. /// impl GetGenomeRange for MyRange {
  783. /// fn range(&self) -> &GenomeRange { &self.0 }
  784. /// }
  785. ///
  786. /// let positions = vec![
  787. /// MyPosition(GenomePosition { contig: 1, position: 100 }),
  788. /// MyPosition(GenomePosition { contig: 1, position: 200 }),
  789. /// ];
  790. /// let ranges = vec![
  791. /// MyRange(GenomeRange { contig: 1, range: 50..150 }),
  792. /// ];
  793. /// let overlaps = par_overlaps(&positions, &ranges);
  794. /// assert_eq!(overlaps, vec!);
  795. /// ```
  796. pub fn par_overlaps(a: &[impl GetGenomePosition], b: &[impl GetGenomeRange]) -> Vec<usize> {
  797. let a: Vec<_> = a.iter().map(|e| e.position()).collect();
  798. let b: Vec<_> = b.iter().map(|e| e.range()).collect();
  799. overlaps_par(&a, &b)
  800. }
  801. /// A trait for types that can provide a reference to a `GenomePosition`.
  802. ///
  803. /// Implement this trait for types that contain or can compute a `GenomePosition`.
  804. pub trait GetGenomePosition {
  805. // Returns a reference to the `GenomePosition` for this item.
  806. fn position(&self) -> &GenomePosition;
  807. }
  808. /// A trait for types that can provide a reference to a `GenomeRange`.
  809. ///
  810. /// Implement this trait for types that contain or can compute a `GenomeRange`.
  811. pub trait GetGenomeRange {
  812. /// Returns a reference to the `GenomeRange` for this item.
  813. fn range(&self) -> &GenomeRange;
  814. }
  815. pub fn sort_ranges(ranges: &mut [GenomeRange]) {
  816. ranges.par_sort_by_key(|r| (r.contig, r.range.start, r.range.end));
  817. }