positions.rs 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884
  1. use std::{
  2. cmp::{max, Ordering},
  3. fmt::Display,
  4. ops::Range,
  5. str::FromStr,
  6. };
  7. use anyhow::Context;
  8. use bitcode::{Decode, Encode};
  9. use rayon::prelude::*;
  10. use serde::{Deserialize, Serialize};
  11. /// Represents a position in the genome using 0-based coordinates.
  12. ///
  13. /// This struct encapsulates a genomic position, consisting of a contig (chromosome)
  14. /// identifier and a position within that contig.
  15. ///
  16. /// # Fields
  17. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  18. /// * `position` - A 32-bit unsigned integer representing the 0-based position within the contig
  19. ///
  20. /// # Derives
  21. /// This struct derives the following traits:
  22. /// * `Debug` for easier debugging and logging
  23. /// * `PartialEq` and `Eq` for equality comparisons
  24. /// * `Clone` for creating deep copies
  25. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  26. /// * `Hash` for use in hash-based collections
  27. /// * `Default` for creating a default instance
  28. ///
  29. /// # Note
  30. /// The position is 0-based, meaning the first position in a contig is represented as 0.
  31. ///
  32. /// # Usage
  33. /// This struct is typically used to precisely locate genomic features or variants
  34. /// within a reference genome.
  35. ///
  36. /// # Example
  37. /// ```
  38. /// let position = GenomePosition {
  39. /// contig: 1, // Representing chromosome 1
  40. /// position: 1000, // 1001st base pair in the chromosome (remember it's 0-based)
  41. /// };
  42. /// ```
  43. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash, Default, Encode, Decode)]
  44. pub struct GenomePosition {
  45. pub contig: u8,
  46. pub position: u32,
  47. }
  48. impl GenomePosition {
  49. /// Returns the contig (chromosome) identifier as a human-readable string.
  50. ///
  51. /// This method converts the internal numeric representation of the contig
  52. /// to a string format commonly used in genomics, such as "chr1", "chrX", etc...
  53. ///
  54. /// # Returns
  55. /// A `String` representing the contig identifier.
  56. ///
  57. /// # Examples
  58. ///
  59. /// ```
  60. /// let pos = GenomePosition { contig: 1, position: 1000 };
  61. /// assert_eq!(pos.contig(), "chr1");
  62. ///
  63. /// let pos_x = GenomePosition { contig: 23, position: 500 };
  64. /// assert_eq!(pos_x.contig(), "chrX");
  65. /// ```
  66. ///
  67. /// # Note
  68. /// This method uses the `num_to_contig` function to perform the conversion.
  69. /// Special cases like X, Y chromosomes and mitochondrial DNA are handled appropriately.
  70. pub fn contig(&self) -> String {
  71. num_to_contig(self.contig)
  72. }
  73. }
  74. /// Implements the `Display` trait for `GenomePosition`.
  75. ///
  76. /// This implementation allows a `GenomePosition` to be formatted as a string
  77. /// in the form "contig:position", where contig is the string representation
  78. /// of the chromosome (e.g., "chr1", "chrX") and position is the 0-based position.
  79. ///
  80. /// # Example
  81. /// ```
  82. /// let pos = GenomePosition { contig: 1, position: 1000 };
  83. /// assert_eq!(format!("{}", pos), "chr1:1000");
  84. /// ```
  85. impl Display for GenomePosition {
  86. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  87. write!(f, "{}:{}", self.contig(), self.position)
  88. }
  89. }
  90. /// Implements the `FromStr` trait for `GenomePosition`.
  91. ///
  92. /// This allows creating a `GenomePosition` from a string in the format "contig:position".
  93. ///
  94. /// # Errors
  95. /// Returns an `anyhow::Error` if:
  96. /// - The input string doesn't contain a ':' character
  97. /// - The contig part can't be converted to a numeric representation
  98. /// - The position part can't be parsed as a `u32`
  99. ///
  100. /// # Example
  101. /// ```
  102. /// use std::str::FromStr;
  103. /// let pos = GenomePosition::from_str("chr1:1000").unwrap();
  104. /// assert_eq!(pos.contig, 1);
  105. /// assert_eq!(pos.position, 1000);
  106. /// ```
  107. impl FromStr for GenomePosition {
  108. type Err = anyhow::Error;
  109. fn from_str(s: &str) -> anyhow::Result<Self> {
  110. let (c, p) = s
  111. .split_once(":")
  112. .ok_or(anyhow::anyhow!("Can't split {s}"))?;
  113. Ok(Self {
  114. contig: contig_to_num(c),
  115. position: p.parse()?,
  116. })
  117. }
  118. }
  119. /// Implements `TryFrom<(&str, &str)>` for `GenomePosition`.
  120. ///
  121. /// This allows creating a `GenomePosition` from a tuple of two string slices,
  122. /// where the first represents the contig and the second represents the position.
  123. ///
  124. /// # Errors
  125. /// Returns an `anyhow::Error` if:
  126. /// - The contig part can't be converted to a numeric representation
  127. /// - The position part can't be parsed as a `u32`
  128. ///
  129. /// # Example
  130. /// ```
  131. /// use std::convert::TryFrom;
  132. /// let pos = GenomePosition::try_from(("chr1", "1000")).unwrap();
  133. /// assert_eq!(pos.contig, 1);
  134. /// assert_eq!(pos.position, 1000);
  135. /// ```
  136. impl TryFrom<(&str, &str)> for GenomePosition {
  137. type Error = anyhow::Error;
  138. fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
  139. Ok(GenomePosition {
  140. contig: contig_to_num(c),
  141. position: p.parse().context(format!("Can't parse {p} into u32."))?,
  142. })
  143. }
  144. }
  145. /// Implements conversion from `VcfPosition` to `GenomePosition`.
  146. ///
  147. /// This conversion adjusts for the difference between 1-based VCF coordinates
  148. /// and 0-based genomic coordinates. It also handles the special case of
  149. /// position 0 in VCF, which represents a variant inside telomeres.
  150. ///
  151. /// # Note
  152. /// This implementation assumes that `VcfPosition` uses 1-based coordinates.
  153. ///
  154. /// # Example
  155. /// ```
  156. /// let vcf_pos = VcfPosition { contig: 1, position: 1001 };
  157. /// let genome_pos: GenomePosition = vcf_pos.into();
  158. /// assert_eq!(genome_pos.contig, 1);
  159. /// assert_eq!(genome_pos.position, 1000);
  160. /// ```
  161. impl From<VcfPosition> for GenomePosition {
  162. fn from(value: VcfPosition) -> Self {
  163. let position = if value.position == 0 {
  164. // position 0 is a variant inside telomeres
  165. 0
  166. } else {
  167. value.position - 1
  168. };
  169. Self {
  170. contig: value.contig,
  171. position,
  172. }
  173. }
  174. }
  175. /// Implements conversion from `GenomePosition` to `(String, String)`.
  176. ///
  177. /// This conversion creates a tuple where the first element is the string
  178. /// representation of the contig (e.g., "chr1", "chrX") and the second
  179. /// element is the string representation of the position.
  180. ///
  181. /// # Example
  182. /// ```
  183. /// let pos = GenomePosition { contig: 1, position: 1000 };
  184. /// let (contig, position): (String, String) = pos.into();
  185. /// assert_eq!(contig, "chr1");
  186. /// assert_eq!(position, "1000");
  187. /// ```
  188. impl From<GenomePosition> for (String, String) {
  189. fn from(val: GenomePosition) -> Self {
  190. (num_to_contig(val.contig), val.position.to_string())
  191. }
  192. }
  193. /// Implements `PartialOrd` for `GenomePosition`.
  194. ///
  195. /// This implementation allows `GenomePosition` instances to be compared
  196. /// and ordered based first on their contig, then on their position.
  197. impl PartialOrd for GenomePosition {
  198. fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
  199. Some(self.cmp(other))
  200. }
  201. }
  202. /// Implements `Ord` for `GenomePosition`.
  203. ///
  204. /// This implementation defines a total ordering for `GenomePosition` instances.
  205. /// Positions are first compared by their contig, and if the contigs are equal,
  206. /// then by their position.
  207. ///
  208. /// # Example
  209. /// ```
  210. /// let pos1 = GenomePosition { contig: 1, position: 1000 };
  211. /// let pos2 = GenomePosition { contig: 1, position: 2000 };
  212. /// let pos3 = GenomePosition { contig: 2, position: 500 };
  213. /// assert!(pos1 < pos2);
  214. /// assert!(pos2 < pos3);
  215. /// ```
  216. impl Ord for GenomePosition {
  217. fn cmp(&self, other: &Self) -> Ordering {
  218. match self.contig.cmp(&other.contig) {
  219. Ordering::Equal => self.position.cmp(&other.position),
  220. other => other,
  221. }
  222. }
  223. }
  224. /// Converts a contig (chromosome) identifier string to its numeric representation.
  225. ///
  226. /// # Arguments
  227. /// * `contig` - A string slice representing the contig identifier (e.g., "chr1", "chrX")
  228. ///
  229. /// # Returns
  230. /// A `u8` representing the numeric value of the contig:
  231. /// * 1-22 for autosomes
  232. /// * 23 for X chromosome
  233. /// * 24 for Y chromosome
  234. /// * 25 for mitochondrial DNA
  235. /// * `u8::MAX` (255) for any unrecognized input
  236. ///
  237. /// # Examples
  238. /// ```
  239. /// assert_eq!(contig_to_num("chr1"), 1);
  240. /// assert_eq!(contig_to_num("chrX"), 23);
  241. /// assert_eq!(contig_to_num("chrM"), 25);
  242. /// assert_eq!(contig_to_num("chr_unknown"), u8::MAX);
  243. /// ```
  244. ///
  245. /// # Note
  246. /// This function assumes the input string starts with "chr" for standard chromosomes.
  247. /// The "chr" prefix is trimmed before parsing numeric values.
  248. pub fn contig_to_num(contig: &str) -> u8 {
  249. match contig {
  250. "chrX" => 23,
  251. "chrY" => 24,
  252. "chrM" => 25,
  253. c => c.trim_start_matches("chr").parse().unwrap_or(u8::MAX),
  254. }
  255. }
  256. /// Converts a numeric contig (chromosome) representation to its string identifier.
  257. ///
  258. /// # Arguments
  259. /// * `num` - A `u8` representing the numeric value of the contig
  260. ///
  261. /// # Returns
  262. /// A `String` representing the contig identifier:
  263. /// * "chr1" to "chr22" for autosomes
  264. /// * "chrX" for 23
  265. /// * "chrY" for 24
  266. /// * "chrM" for 25
  267. /// * "chr{num}" for any other input
  268. ///
  269. /// # Examples
  270. /// ```
  271. /// assert_eq!(num_to_contig(1), "chr1");
  272. /// assert_eq!(num_to_contig(23), "chrX");
  273. /// assert_eq!(num_to_contig(25), "chrM");
  274. /// ```
  275. ///
  276. /// # Note
  277. /// This function always prepends "chr" to the output string.
  278. pub fn num_to_contig(num: u8) -> String {
  279. match num {
  280. 23 => "chrX".to_string(),
  281. 24 => "chrY".to_string(),
  282. 25 => "chrM".to_string(),
  283. c => format!("chr{c}"),
  284. }
  285. }
  286. /// Represents a position in a VCF (Variant Call Format) file using 1-based coordinates.
  287. ///
  288. /// This struct encapsulates a genomic position as represented in VCF files, consisting of a contig (chromosome)
  289. /// identifier and a position within that contig.
  290. ///
  291. /// # Fields
  292. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  293. /// * `position` - A 32-bit unsigned integer representing the 1-based position within the contig
  294. ///
  295. /// # Derives
  296. /// This struct derives the following traits:
  297. /// * `Debug` for easier debugging and logging
  298. /// * `PartialEq` and `Eq` for equality comparisons
  299. /// * `Clone` for creating deep copies
  300. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  301. ///
  302. /// # Note
  303. /// The position is 1-based, as per VCF file format specifications.
  304. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
  305. pub struct VcfPosition {
  306. pub contig: u8,
  307. pub position: u32,
  308. }
  309. /// Implements `TryFrom<(&str, &str)>` for `VcfPosition`.
  310. ///
  311. /// This allows creating a `VcfPosition` from a tuple of two string slices,
  312. /// where the first represents the contig and the second represents the position.
  313. ///
  314. /// # Errors
  315. /// Returns an `anyhow::Error` if:
  316. /// - The contig part can't be converted to a numeric representation
  317. /// - The position part can't be parsed as a `u32`
  318. ///
  319. /// # Example
  320. /// ```
  321. /// use std::convert::TryFrom;
  322. /// let pos = VcfPosition::try_from(("chr1", "1000")).unwrap();
  323. /// assert_eq!(pos.contig, 1);
  324. /// assert_eq!(pos.position, 1000);
  325. /// ```
  326. impl TryFrom<(&str, &str)> for VcfPosition {
  327. type Error = anyhow::Error;
  328. fn try_from((c, p): (&str, &str)) -> anyhow::Result<Self> {
  329. Ok(VcfPosition {
  330. contig: contig_to_num(c),
  331. position: p.parse().context(format!("Can't parse {p} into u32."))?,
  332. })
  333. }
  334. }
  335. /// Implements conversion from `GenomePosition` to `VcfPosition`.
  336. ///
  337. /// This conversion adjusts for the difference between 0-based genomic coordinates
  338. /// and 1-based VCF coordinates.
  339. ///
  340. /// # Example
  341. /// ```
  342. /// let genome_pos = GenomePosition { contig: 1, position: 999 };
  343. /// let vcf_pos: VcfPosition = genome_pos.into();
  344. /// assert_eq!(vcf_pos.contig, 1);
  345. /// assert_eq!(vcf_pos.position, 1000);
  346. /// ```
  347. impl From<GenomePosition> for VcfPosition {
  348. fn from(value: GenomePosition) -> Self {
  349. Self {
  350. contig: value.contig,
  351. position: value.position + 1,
  352. }
  353. }
  354. }
  355. /// Implements conversion from `VcfPosition` to `(String, String)`.
  356. ///
  357. /// This conversion creates a tuple where the first element is the string
  358. /// representation of the contig (e.g., "chr1", "chrX") and the second
  359. /// element is the string representation of the position.
  360. ///
  361. /// # Example
  362. /// ```
  363. /// let pos = VcfPosition { contig: 1, position: 1000 };
  364. /// let (contig, position): (String, String) = pos.into();
  365. /// assert_eq!(contig, "chr1");
  366. /// assert_eq!(position, "1000");
  367. /// ```
  368. impl From<VcfPosition> for (String, String) {
  369. fn from(val: VcfPosition) -> Self {
  370. (num_to_contig(val.contig), val.position.to_string())
  371. }
  372. }
  373. /// Represents a range in the genome using 0-based, half-open coordinates [start, end).
  374. ///
  375. /// This struct encapsulates a genomic range, consisting of a contig (chromosome)
  376. /// identifier and a range of positions within that contig.
  377. ///
  378. /// # Fields
  379. /// * `contig` - An 8-bit unsigned integer representing the contig/chromosome
  380. /// * `range` - A `Range<u32>` representing the start (inclusive) and end (exclusive) positions
  381. ///
  382. /// # Derives
  383. /// This struct derives the following traits:
  384. /// * `Debug` for easier debugging and logging
  385. /// * `PartialEq` and `Eq` for equality comparisons
  386. /// * `Clone` for creating deep copies
  387. /// * `Serialize` and `Deserialize` for serialization (e.g., to JSON or other formats)
  388. ///
  389. /// # Note
  390. /// The range is 0-based and half-open, meaning the start position is included but the end position is not.
  391. /// This is consistent with many programming languages and bioinformatics tools.
  392. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
  393. pub struct GenomeRange {
  394. pub contig: u8,
  395. pub range: Range<u32>,
  396. }
  397. /// Implements `TryFrom<(&str, &str, &str)>` for `GenomeRange`.
  398. ///
  399. /// This allows creating a `GenomeRange` from a tuple of three string slices,
  400. /// where the first represents the contig, the second the start position,
  401. /// and the third the end position.
  402. ///
  403. /// # Errors
  404. /// Returns an `anyhow::Error` if:
  405. /// - The contig part can't be converted to a numeric representation
  406. /// - Either the start or end position can't be parsed as a `u32`
  407. ///
  408. /// # Example
  409. /// ```
  410. /// use std::convert::TryFrom;
  411. /// let range = GenomeRange::try_from(("chr1", "1000", "2000")).unwrap();
  412. /// assert_eq!(range.contig, 1);
  413. /// assert_eq!(range.range.start, 1000);
  414. /// assert_eq!(range.range.end, 2000);
  415. /// ```
  416. impl TryFrom<(&str, &str, &str)> for GenomeRange {
  417. type Error = anyhow::Error;
  418. fn try_from((c, s, e): (&str, &str, &str)) -> anyhow::Result<Self> {
  419. Ok(Self {
  420. contig: contig_to_num(c),
  421. range: Range {
  422. start: s.parse().context(format!("Can't parse {s} into u32."))?,
  423. end: e.parse().context(format!("Can't parse {e} into u32."))?,
  424. },
  425. })
  426. }
  427. }
  428. impl GenomeRange {
  429. pub fn new(contig: &str, start: u32, end: u32) -> Self {
  430. Self {
  431. contig: contig_to_num(contig),
  432. range: start..end,
  433. }
  434. }
  435. /// Creates a `GenomeRange` from 1-based inclusive start and end positions.
  436. ///
  437. /// This method is useful when working with data sources that use 1-based coordinates
  438. /// and inclusive ranges. It converts the input to the 0-based, half-open format used by `GenomeRange`.
  439. ///
  440. /// # Arguments
  441. /// * `contig` - A string slice representing the contig/chromosome
  442. /// * `start` - A string slice representing the 1-based inclusive start position
  443. /// * `end` - A string slice representing the 1-based inclusive end position
  444. ///
  445. /// # Returns
  446. /// A `Result` containing the `GenomeRange` if successful, or an `anyhow::Error` if parsing fails.
  447. ///
  448. /// # Errors
  449. /// Returns an `anyhow::Error` if:
  450. /// - The contig can't be converted to a numeric representation
  451. /// - Either the start or end position can't be parsed as a `u32`
  452. ///
  453. /// # Example
  454. /// ```
  455. /// let range = GenomeRange::from_1_inclusive("chr1", "1000", "2000").unwrap();
  456. /// assert_eq!(range.contig, 1);
  457. /// assert_eq!(range.range.start, 999); // Converted to 0-based
  458. /// assert_eq!(range.range.end, 2000); // End is exclusive in 0-based
  459. /// ```
  460. pub fn from_1_inclusive_str(contig: &str, start: &str, end: &str) -> anyhow::Result<Self> {
  461. Ok(Self {
  462. contig: contig_to_num(contig),
  463. range: Range {
  464. start: start
  465. .parse::<u32>()
  466. .context(format!("Can't parse {start} into u32."))?
  467. .saturating_sub(1),
  468. end: end
  469. .parse()
  470. .context(format!("Can't parse {end} into u32."))?,
  471. },
  472. })
  473. }
  474. pub fn from_1_inclusive(contig: &str, start: u32, end: u32) -> Self {
  475. Self {
  476. contig: contig_to_num(contig),
  477. range: Range {
  478. start: start.saturating_sub(1),
  479. end,
  480. },
  481. }
  482. }
  483. pub fn length(&self) -> u32 {
  484. self.range.end - self.range.start
  485. }
  486. }
  487. pub trait GenomeRangeExt {
  488. /// Total length in bp for 0-based, half-open ranges [start, end)
  489. fn total_len(&self) -> u64;
  490. }
  491. impl GenomeRangeExt for [GenomeRange] {
  492. fn total_len(&self) -> u64 {
  493. self.iter()
  494. .map(|r| (r.end - r.start) as u64)
  495. .sum()
  496. }
  497. }
  498. impl GetGenomePosition for GenomePosition {
  499. fn position(&self) -> &GenomePosition {
  500. self
  501. }
  502. }
  503. impl GetGenomeRange for GenomeRange {
  504. fn range(&self) -> &GenomeRange {
  505. self
  506. }
  507. }
  508. pub fn merge_overlapping_genome_ranges(ranges: &[GenomeRange]) -> Vec<GenomeRange> {
  509. if ranges.is_empty() {
  510. return vec![];
  511. }
  512. // ranges.par_sort_by_key(|r| (r.contig, r.range.start));
  513. let chunk_size = (ranges.len() / rayon::current_num_threads()).max(1);
  514. ranges
  515. .par_chunks(chunk_size)
  516. .map(|chunk| {
  517. let mut merged = Vec::new();
  518. let mut current = chunk[0].clone();
  519. for range in chunk.iter().skip(1) {
  520. if current.contig == range.contig && current.range.end >= range.range.start {
  521. current.range.end = max(current.range.end, range.range.end);
  522. } else {
  523. merged.push(current);
  524. current = range.clone();
  525. }
  526. }
  527. merged.push(current);
  528. merged
  529. })
  530. .reduce(Vec::new, |mut acc, mut chunk| {
  531. if let (Some(last), Some(first)) = (acc.last_mut(), chunk.first()) {
  532. if last.contig == first.contig && last.range.end >= first.range.start {
  533. last.range.end = max(last.range.end, first.range.end);
  534. chunk.remove(0);
  535. }
  536. }
  537. acc.extend(chunk);
  538. acc
  539. })
  540. }
  541. /// Finds overlaps between a list of genome positions and a list of genome ranges in parallel.
  542. ///
  543. /// This function uses Rayon for parallel processing, dividing the positions into chunks
  544. /// and processing them concurrently.
  545. ///
  546. /// # Arguments
  547. /// * `positions` - A slice of references to `GenomePosition`s
  548. /// * `ranges` - A slice of references to `GenomeRange`s
  549. ///
  550. /// # Returns
  551. /// A `Vec<usize>` containing the indices of positions that overlap with any range.
  552. ///
  553. /// # Performance
  554. /// This function is optimized for scenarios where both `positions` and `ranges` are sorted
  555. /// by contig and position/range start. It uses this ordering to avoid unnecessary comparisons.
  556. ///
  557. /// # Example
  558. /// ```
  559. /// let positions = vec![
  560. /// &GenomePosition { contig: 1, position: 100 },
  561. /// &GenomePosition { contig: 1, position: 200 },
  562. /// &GenomePosition { contig: 2, position: 150 },
  563. /// ];
  564. /// let ranges = vec![
  565. /// &GenomeRange { contig: 1, range: 50..150 },
  566. /// &GenomeRange { contig: 2, range: 100..200 },
  567. /// ];
  568. /// let overlaps = overlaps_par(&positions, &ranges);
  569. /// assert_eq!(overlaps, vec!);
  570. /// ```
  571. pub fn overlaps_par(positions: &[&GenomePosition], ranges: &[&GenomeRange]) -> Vec<usize> {
  572. let chunk_size = (positions.len() / rayon::current_num_threads()).max(1);
  573. positions
  574. .par_chunks(chunk_size)
  575. .enumerate()
  576. .flat_map(|(chunk_idx, chunk)| {
  577. let mut result = Vec::new();
  578. let mut range_idx = ranges.partition_point(|r| r.contig < chunk[0].contig);
  579. for (pos_idx, pos) in chunk.iter().enumerate() {
  580. while range_idx < ranges.len() && ranges[range_idx].contig < pos.contig {
  581. range_idx += 1;
  582. }
  583. if range_idx < ranges.len() && ranges[range_idx].contig == pos.contig {
  584. while range_idx < ranges.len()
  585. && ranges[range_idx].contig == pos.contig
  586. && ranges[range_idx].range.end <= pos.position
  587. {
  588. range_idx += 1;
  589. }
  590. if range_idx < ranges.len()
  591. && ranges[range_idx].contig == pos.contig
  592. && ranges[range_idx].range.contains(&pos.position)
  593. {
  594. result.push(chunk_idx * chunk_size + pos_idx);
  595. }
  596. }
  597. }
  598. result
  599. })
  600. .collect()
  601. }
  602. //
  603. // pub fn range_intersection_par(a: &[GenomeRange], b: &[GenomeRange]) -> Vec<GenomeRange> {
  604. // // Group ranges by contig for both inputs
  605. // let mut a_map = std::collections::HashMap::new();
  606. // for range in a {
  607. // a_map.entry(&range.contig).or_insert(Vec::new()).push(range);
  608. // }
  609. //
  610. // let mut b_map = std::collections::HashMap::new();
  611. // for range in b {
  612. // b_map.entry(&range.contig).or_insert(Vec::new()).push(range);
  613. // }
  614. //
  615. // // Process common contigs in parallel
  616. // a_map
  617. // .into_par_iter()
  618. // .filter_map(|(contig, mut a_ranges)| {
  619. // // Sort ranges by start position (end is only used for tie-breaking)
  620. // a_ranges.sort_by_key(|r| (r.range.start, r.range.end));
  621. //
  622. // // Get corresponding ranges from b
  623. // let mut b_ranges = match b_map.get(contig) {
  624. // Some(ranges) => ranges.clone(),
  625. // None => return None,
  626. // };
  627. // b_ranges.sort_by_key(|r| (r.range.start, r.range.end));
  628. //
  629. // let mut intersections = Vec::new();
  630. // let mut i = 0;
  631. // let mut j = 0;
  632. //
  633. // // Two-pointer intersection algorithm for 0-based, end-exclusive ranges
  634. // while i < a_ranges.len() && j < b_ranges.len() {
  635. // let a_range = &a_ranges[i];
  636. // let b_range = &b_ranges[j];
  637. //
  638. // // Check if ranges are completely non-overlapping
  639. // if a_range.range.end <= b_range.range.start {
  640. // // |a_range| ends before |b_range| starts
  641. // i += 1;
  642. // } else if b_range.range.end <= a_range.range.start {
  643. // // |b_range| ends before |a_range| starts
  644. // j += 1;
  645. // } else {
  646. // // Calculate intersection of overlapping ranges
  647. // let start = a_range.range.start.max(b_range.range.start);
  648. // let end = a_range.range.end.min(b_range.range.end);
  649. //
  650. // // Only create a range if start < end (non-zero length)
  651. // if start < end {
  652. // intersections.push(GenomeRange {
  653. // contig: *contig,
  654. // range: start..end,
  655. // });
  656. // }
  657. //
  658. // // Move the pointer with the earlier end coordinate
  659. // if a_range.range.end < b_range.range.end {
  660. // i += 1;
  661. // } else {
  662. // j += 1;
  663. // }
  664. // }
  665. // }
  666. //
  667. // Some(intersections)
  668. // })
  669. // .flatten()
  670. // .collect()
  671. // }
  672. /// Returns every region that **overlaps** between two sorted
  673. /// slices of genome ranges, distributing the work across
  674. /// threads with **Rayon**.
  675. ///
  676. /// Both inputs **must**
  677. /// - be sorted by `(contig, start)`
  678. /// - contain non-overlapping, half-open ranges (`start < end`)
  679. /// - use the same coordinate system.
  680. ///
  681. /// # Parameters
  682. /// * `a` – Slice of references to `GenomeRange`.
  683. /// * `b` – Slice of references to `GenomeRange`.
  684. ///
  685. /// # Returns
  686. /// A `Vec<GenomeRange>` containing one entry for each pairwise
  687. /// intersection.
  688. /// The output is *not* guaranteed to be sorted.
  689. ///
  690. /// # Complexity
  691. /// `O(|a| + |b|)` total work; memory ≤ the number of produced
  692. /// intersections. Work is split per contig and executed in
  693. /// parallel.
  694. ///
  695. /// # Panics
  696. /// Never panics if the pre-conditions above are met.
  697. ///
  698. /// # Example
  699. /// ```
  700. /// # use pandora_lib_promethion::{GenomeRange, range_intersection_par};
  701. /// let a_range = GenomeRange { contig: 1, range: 100..200 };
  702. /// let b_range = GenomeRange { contig: 1, range: 150..250 };
  703. ///
  704. /// let out = range_intersection_par(&[&a_range], &[&b_range]);
  705. /// assert_eq!(out, vec![GenomeRange { contig: 1, range: 150..200 }]);
  706. /// ```
  707. pub fn range_intersection_par(a: &[&GenomeRange], b: &[&GenomeRange]) -> Vec<GenomeRange> {
  708. let (a_contigs, b_contigs) = rayon::join(
  709. || extract_contig_indices(a),
  710. || extract_contig_indices(b),
  711. );
  712. a_contigs.into_par_iter()
  713. .filter_map(|(contig, a_start, a_end)| {
  714. let (b_start, b_end) = find_contig_indices(&b_contigs, contig)?;
  715. let a_ranges = &a[a_start..a_end];
  716. let b_ranges = &b[b_start..b_end];
  717. let mut intersections = Vec::new();
  718. let (mut i, mut j) = (0, 0);
  719. while i < a_ranges.len() && j < b_ranges.len() {
  720. let a_range = &a_ranges[i].range;
  721. let b_range = &b_ranges[j].range;
  722. match (a_range.end <= b_range.start, b_range.end <= a_range.start) {
  723. (true, _) => i += 1,
  724. (_, true) => j += 1,
  725. _ => {
  726. let start = a_range.start.max(b_range.start);
  727. let end = a_range.end.min(b_range.end);
  728. if start < end {
  729. intersections.push(GenomeRange {
  730. contig,
  731. range: start..end,
  732. });
  733. }
  734. if a_range.end < b_range.end {
  735. i += 1;
  736. } else {
  737. j += 1;
  738. }
  739. }
  740. }
  741. }
  742. Some(intersections)
  743. })
  744. .flatten()
  745. .collect()
  746. }
  747. // Helper to create (contig, start index, end index) tuples
  748. pub fn extract_contig_indices(ranges: &[&GenomeRange]) -> Vec<(u8, usize, usize)> {
  749. let mut indices = Vec::new();
  750. let mut current_contig = None;
  751. let mut start_idx = 0;
  752. for (i, range) in ranges.iter().enumerate() {
  753. if current_contig != Some(range.contig) {
  754. if let Some(contig) = current_contig {
  755. indices.push((contig, start_idx, i));
  756. }
  757. current_contig = Some(range.contig);
  758. start_idx = i;
  759. }
  760. }
  761. if let Some(contig) = current_contig {
  762. indices.push((contig, start_idx, ranges.len()));
  763. }
  764. indices
  765. }
  766. // Binary search to find contig indices in precomputed list
  767. pub fn find_contig_indices(contigs: &[(u8, usize, usize)], target: u8) -> Option<(usize, usize)> {
  768. contigs.binary_search_by(|(c, _, _)| c.cmp(&target))
  769. .ok()
  770. .map(|idx| (contigs[idx].1, contigs[idx].2))
  771. }
  772. /// A parallel version of the overlap finding function that works with any types
  773. /// implementing `GetGenomePosition` and `GetGenomeRange` traits.
  774. ///
  775. /// This function first converts the input slices to vectors of `GenomePosition` and `GenomeRange`,
  776. /// then calls `overlaps_par`.
  777. ///
  778. /// # Arguments
  779. /// * `a` - A slice of items implementing `GetGenomePosition`
  780. /// * `b` - A slice of items implementing `GetGenomeRange`
  781. ///
  782. /// # Returns
  783. /// A `Vec<usize>` containing the indices of positions that overlap with any range.
  784. ///
  785. /// # Example
  786. /// ```
  787. /// struct MyPosition(GenomePosition);
  788. /// impl GetGenomePosition for MyPosition {
  789. /// fn position(&self) -> &GenomePosition { &self.0 }
  790. /// }
  791. ///
  792. /// struct MyRange(GenomeRange);
  793. /// impl GetGenomeRange for MyRange {
  794. /// fn range(&self) -> &GenomeRange { &self.0 }
  795. /// }
  796. ///
  797. /// let positions = vec![
  798. /// MyPosition(GenomePosition { contig: 1, position: 100 }),
  799. /// MyPosition(GenomePosition { contig: 1, position: 200 }),
  800. /// ];
  801. /// let ranges = vec![
  802. /// MyRange(GenomeRange { contig: 1, range: 50..150 }),
  803. /// ];
  804. /// let overlaps = par_overlaps(&positions, &ranges);
  805. /// assert_eq!(overlaps, vec!);
  806. /// ```
  807. pub fn par_overlaps(a: &[impl GetGenomePosition], b: &[impl GetGenomeRange]) -> Vec<usize> {
  808. let a: Vec<_> = a.iter().map(|e| e.position()).collect();
  809. let b: Vec<_> = b.iter().map(|e| e.range()).collect();
  810. overlaps_par(&a, &b)
  811. }
  812. /// A trait for types that can provide a reference to a `GenomePosition`.
  813. ///
  814. /// Implement this trait for types that contain or can compute a `GenomePosition`.
  815. pub trait GetGenomePosition {
  816. // Returns a reference to the `GenomePosition` for this item.
  817. fn position(&self) -> &GenomePosition;
  818. }
  819. /// A trait for types that can provide a reference to a `GenomeRange`.
  820. ///
  821. /// Implement this trait for types that contain or can compute a `GenomeRange`.
  822. pub trait GetGenomeRange {
  823. /// Returns a reference to the `GenomeRange` for this item.
  824. fn range(&self) -> &GenomeRange;
  825. }
  826. pub fn sort_ranges(ranges: &mut [GenomeRange]) {
  827. ranges.par_sort_by_key(|r| (r.contig, r.range.start, r.range.end));
  828. }