straglr.rs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. //! Straglr TSV output parser.
  2. //!
  3. //! Parses TSV files produced by the Straglr STR genotyper (`--genotype_in_size` mode).
  4. //! Aggregates per-read rows into one [`StraglrRow`] per unique locus and sorts results
  5. //! in correct genomic order (chr1 < chr2 < … < chrX, not lexicographic).
  6. use anyhow::Context;
  7. use log::warn;
  8. use std::{
  9. collections::HashMap,
  10. io::{BufRead, BufReader},
  11. str::FromStr,
  12. };
  13. use super::readers::get_reader;
  14. use crate::positions::GenomeRange;
  15. /// Represents a single STR locus genotyped by Straglr.
  16. ///
  17. /// Aggregated from per-read rows sharing the same (chrom, start, end, repeat_unit).
  18. /// The genotype field contains allele sizes in format "size(count)" or "size1(n1);size2(n2)".
  19. #[derive(Debug, Clone, PartialEq)]
  20. pub struct StraglrRow {
  21. /// Chromosome name (e.g., "chr4", "chrX")
  22. pub chrom: String,
  23. /// Start position (0-based)
  24. pub start: u64,
  25. /// End position (exclusive)
  26. pub end: u64,
  27. /// Repeat unit motif (e.g., "CAG", "GGCCCC")
  28. pub repeat_unit: String,
  29. /// Genotype string: "size(count)" or "size1(n1);size2(n2)" for het
  30. pub genotype: String,
  31. /// Total read support (sum of counts from genotype)
  32. pub support: u32,
  33. }
  34. /// Raw per-read row from Straglr output (internal).
  35. #[derive(Debug)]
  36. struct RawReadRow {
  37. chrom: String,
  38. start: u64,
  39. end: u64,
  40. repeat_unit: String,
  41. genotype: String,
  42. }
  43. impl FromStr for RawReadRow {
  44. type Err = anyhow::Error;
  45. /// Parses a single TSV line from Straglr output.
  46. ///
  47. /// Expected format (11 tab-separated fields):
  48. /// ```text
  49. /// #chrom start end repeat_unit genotype read copy_number size read_start strand allele
  50. /// chr1 100075838 100075872 TTTAATT 35.9(54) uuid 5.3 37 2186 + 35.9
  51. /// ```
  52. fn from_str(s: &str) -> anyhow::Result<Self> {
  53. let fields: Vec<&str> = s.split('\t').collect();
  54. if fields.len() < 11 {
  55. anyhow::bail!(
  56. "Invalid Straglr TSV line: expected 11 fields, got {}",
  57. fields.len()
  58. );
  59. }
  60. Ok(Self {
  61. chrom: fields[0].to_string(),
  62. start: fields[1]
  63. .parse()
  64. .context(format!("Failed to parse start: {}", fields[1]))?,
  65. end: fields[2]
  66. .parse()
  67. .context(format!("Failed to parse end: {}", fields[2]))?,
  68. repeat_unit: fields[3].to_string(),
  69. genotype: fields[4].to_string(),
  70. })
  71. }
  72. }
  73. impl StraglrRow {
  74. /// Returns the locus length in base pairs.
  75. pub fn locus_length(&self) -> u64 {
  76. self.end.saturating_sub(self.start)
  77. }
  78. /// Returns the motif length.
  79. pub fn motif_length(&self) -> usize {
  80. self.repeat_unit.len()
  81. }
  82. /// Returns allele sizes parsed from genotype string.
  83. pub fn allele_sizes(&self) -> Vec<f64> {
  84. parse_genotype_sizes(&self.genotype)
  85. }
  86. /// Returns copy numbers (size / motif_length) for each allele.
  87. pub fn copy_numbers(&self) -> Vec<f64> {
  88. let motif_len = self.motif_length() as f64;
  89. self.allele_sizes()
  90. .into_iter()
  91. .map(|s| s / motif_len)
  92. .collect()
  93. }
  94. /// Returns the maximum allele size in bp, or `None` if no alleles are present.
  95. pub fn max_allele_size(&self) -> Option<f64> {
  96. self.allele_sizes().into_iter().max_by(f64::total_cmp)
  97. }
  98. /// Returns the maximum copy number across all alleles, or `None` if no alleles are present.
  99. pub fn max_copy_number(&self) -> Option<f64> {
  100. self.copy_numbers().into_iter().max_by(f64::total_cmp)
  101. }
  102. /// Returns `true` if any allele has a copy number ≥ `threshold_cn`.
  103. pub fn is_expanded(&self, threshold_cn: f64) -> bool {
  104. self.copy_numbers().into_iter().any(|cn| cn >= threshold_cn)
  105. }
  106. /// Returns true if the locus is homozygous (single allele).
  107. pub fn is_homozygous(&self) -> bool {
  108. self.allele_sizes().len() <= 1
  109. }
  110. /// Returns the location as a string (e.g., "chr4:3074876-3074933").
  111. pub fn location_string(&self) -> String {
  112. format!("{}:{}-{}", self.chrom, self.start, self.end)
  113. }
  114. }
  115. /// Parses genotype string "size(count)" or "size1(n1);size2(n2)" into sizes.
  116. fn parse_genotype_sizes(genotype: &str) -> Vec<f64> {
  117. genotype
  118. .split(';')
  119. .filter_map(|part| {
  120. let part = part.trim();
  121. if part.is_empty() || part == "." {
  122. return None;
  123. }
  124. if let Some(paren_idx) = part.find('(') {
  125. part[..paren_idx].parse().ok()
  126. } else {
  127. part.parse().ok()
  128. }
  129. })
  130. .collect()
  131. }
  132. /// Parses genotype string and returns total read support.
  133. fn parse_genotype_support(genotype: &str) -> u32 {
  134. genotype
  135. .split(';')
  136. .filter_map(|part| {
  137. let part = part.trim();
  138. if part.is_empty() || part == "." {
  139. return None;
  140. }
  141. let open = part.find('(')?;
  142. let close = part.find(')')?;
  143. part[open + 1..close].parse::<u32>().ok()
  144. })
  145. .sum()
  146. }
  147. /// Unique key for grouping reads by locus.
  148. #[derive(Debug, Clone, PartialEq, Eq, Hash)]
  149. struct LocusKey {
  150. chrom: String,
  151. start: u64,
  152. end: u64,
  153. repeat_unit: String,
  154. }
  155. /// Parse a Straglr TSV output file into one [`StraglrRow`] per unique locus.
  156. ///
  157. /// Aggregates all per-read rows that share the same `(chrom, start, end, repeat_unit)`.
  158. /// The genotype string from the first seen read is kept (all reads at a locus report the
  159. /// same genotype in Straglr output). Header lines (`#`) and blank lines are skipped.
  160. /// Lines that fail to parse are logged as warnings and skipped.
  161. ///
  162. /// Results are sorted in genomic order using numeric contig indices — `chr1` before
  163. /// `chr2` before `chr10` etc.
  164. ///
  165. /// # Arguments
  166. ///
  167. /// * `path` - Path to the Straglr TSV file (plain text or gzip)
  168. ///
  169. /// # Returns
  170. ///
  171. /// One [`StraglrRow`] per unique locus, sorted by `(contig, start, end)`.
  172. ///
  173. /// # Errors
  174. ///
  175. /// Returns an error if the file cannot be opened.
  176. pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
  177. let reader = BufReader::new(get_reader(path)?);
  178. let mut locus_map: HashMap<LocusKey, String> = HashMap::new();
  179. for (line_num, line) in reader.lines().enumerate() {
  180. let line = match line {
  181. Ok(l) => l,
  182. Err(e) => {
  183. warn!("Failed to read line {}: {}", line_num + 1, e);
  184. continue;
  185. }
  186. };
  187. if line.starts_with('#') || line.trim().is_empty() {
  188. continue;
  189. }
  190. match line.parse::<RawReadRow>() {
  191. Ok(row) => {
  192. let key = LocusKey {
  193. chrom: row.chrom,
  194. start: row.start,
  195. end: row.end,
  196. repeat_unit: row.repeat_unit,
  197. };
  198. // All reads at same locus share genotype; store first occurrence
  199. locus_map.entry(key).or_insert(row.genotype);
  200. }
  201. Err(e) => warn!(
  202. "Failed to parse line {}: {} (error: {})",
  203. line_num + 1,
  204. line,
  205. e
  206. ),
  207. }
  208. }
  209. // Convert to StraglrRow
  210. let mut results: Vec<StraglrRow> = locus_map
  211. .into_iter()
  212. .map(|(key, genotype)| {
  213. let support = parse_genotype_support(&genotype);
  214. StraglrRow {
  215. chrom: key.chrom,
  216. start: key.start,
  217. end: key.end,
  218. repeat_unit: key.repeat_unit,
  219. genotype,
  220. support,
  221. }
  222. })
  223. .collect();
  224. // Sort by genomic position using numeric contig ordering (chr1 < chr2 < … < chrX)
  225. // to avoid lexicographic pitfalls (e.g. "chr10" sorting before "chr2").
  226. results.sort_by_key(|r| {
  227. let contig = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32).contig;
  228. (contig, r.start, r.end)
  229. });
  230. Ok(results)
  231. }
  232. /// Retain only loci whose maximum copy number is at or above `min_copy_number`.
  233. pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: f64) -> Vec<StraglrRow> {
  234. rows.into_iter()
  235. .filter(|row| row.is_expanded(min_copy_number))
  236. .collect()
  237. }
  238. /// Group loci by chromosome name.
  239. pub fn group_by_chromosome(rows: Vec<StraglrRow>) -> HashMap<String, Vec<StraglrRow>> {
  240. let mut map: HashMap<String, Vec<StraglrRow>> = HashMap::new();
  241. for row in rows {
  242. map.entry(row.chrom.clone()).or_default().push(row);
  243. }
  244. map
  245. }
  246. #[cfg(test)]
  247. mod tests {
  248. use super::*;
  249. #[test]
  250. fn test_parse_raw_read_row() {
  251. let line = "chr1\t100075838\t100075872\tTTTAATT\t35.9(54)\t40ef6b9e-e0d6-4220-8300-eb66f4bfa5ea\t5.3\t37\t2186\t+\t35.9";
  252. let row: RawReadRow = line.parse().unwrap();
  253. assert_eq!(row.chrom, "chr1");
  254. assert_eq!(row.start, 100075838);
  255. assert_eq!(row.end, 100075872);
  256. assert_eq!(row.repeat_unit, "TTTAATT");
  257. assert_eq!(row.genotype, "35.9(54)");
  258. }
  259. #[test]
  260. fn test_parse_genotype_sizes() {
  261. assert_eq!(parse_genotype_sizes("35.9(54)"), vec![35.9]);
  262. assert_eq!(parse_genotype_sizes("20.1(25);45.3(29)"), vec![20.1, 45.3]);
  263. assert_eq!(parse_genotype_sizes("."), Vec::<f64>::new());
  264. }
  265. #[test]
  266. fn test_parse_genotype_support() {
  267. assert_eq!(parse_genotype_support("35.9(54)"), 54);
  268. assert_eq!(parse_genotype_support("20.1(25);45.3(29)"), 54);
  269. assert_eq!(parse_genotype_support("."), 0);
  270. }
  271. #[test]
  272. fn test_straglr_row_methods() {
  273. let row = StraglrRow {
  274. chrom: "chr4".to_string(),
  275. start: 100,
  276. end: 200,
  277. repeat_unit: "CAG".to_string(),
  278. genotype: "60.0(30);90.0(25)".to_string(),
  279. support: 55,
  280. };
  281. assert_eq!(row.locus_length(), 100);
  282. assert_eq!(row.motif_length(), 3);
  283. assert_eq!(row.allele_sizes(), vec![60.0, 90.0]);
  284. let cns = row.copy_numbers();
  285. assert!((cns[0] - 20.0).abs() < 0.01);
  286. assert!((cns[1] - 30.0).abs() < 0.01);
  287. assert!((row.max_allele_size().unwrap() - 90.0).abs() < 0.01);
  288. assert!((row.max_copy_number().unwrap() - 30.0).abs() < 0.01);
  289. assert!(row.is_expanded(25.0));
  290. assert!(!row.is_expanded(35.0));
  291. assert!(!row.is_homozygous());
  292. assert_eq!(row.location_string(), "chr4:100-200");
  293. }
  294. #[test]
  295. fn test_homozygous() {
  296. let row = StraglrRow {
  297. chrom: "chr4".to_string(),
  298. start: 100,
  299. end: 200,
  300. repeat_unit: "CAG".to_string(),
  301. genotype: "60.0(55)".to_string(),
  302. support: 55,
  303. };
  304. assert!(row.is_homozygous());
  305. }
  306. }