| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- //! Straglr TSV output parser.
- //!
- //! Parses TSV files produced by the Straglr STR genotyper (`--genotype_in_size` mode).
- //! Aggregates per-read rows into one [`StraglrRow`] per unique locus and sorts results
- //! in correct genomic order (chr1 < chr2 < … < chrX, not lexicographic).
- use anyhow::Context;
- use log::warn;
- use std::{
- collections::HashMap,
- io::{BufRead, BufReader},
- str::FromStr,
- };
- use crate::positions::GenomeRange;
- use super::readers::get_reader;
- /// Represents a single STR locus genotyped by Straglr.
- ///
- /// Aggregated from per-read rows sharing the same (chrom, start, end, repeat_unit).
- /// The genotype field contains allele sizes in format "size(count)" or "size1(n1);size2(n2)".
- #[derive(Debug, Clone, PartialEq)]
- pub struct StraglrRow {
- /// Chromosome name (e.g., "chr4", "chrX")
- pub chrom: String,
- /// Start position (0-based)
- pub start: u64,
- /// End position (exclusive)
- pub end: u64,
- /// Repeat unit motif (e.g., "CAG", "GGCCCC")
- pub repeat_unit: String,
- /// Genotype string: "size(count)" or "size1(n1);size2(n2)" for het
- pub genotype: String,
- /// Total read support (sum of counts from genotype)
- pub support: u32,
- }
- /// Raw per-read row from Straglr output (internal).
- #[derive(Debug)]
- struct RawReadRow {
- chrom: String,
- start: u64,
- end: u64,
- repeat_unit: String,
- genotype: String,
- }
- impl FromStr for RawReadRow {
- type Err = anyhow::Error;
- /// Parses a single TSV line from Straglr output.
- ///
- /// Expected format (11 tab-separated fields):
- /// ```text
- /// #chrom start end repeat_unit genotype read copy_number size read_start strand allele
- /// chr1 100075838 100075872 TTTAATT 35.9(54) uuid 5.3 37 2186 + 35.9
- /// ```
- fn from_str(s: &str) -> anyhow::Result<Self> {
- let fields: Vec<&str> = s.split('\t').collect();
- if fields.len() < 11 {
- anyhow::bail!(
- "Invalid Straglr TSV line: expected 11 fields, got {}",
- fields.len()
- );
- }
- Ok(Self {
- chrom: fields[0].to_string(),
- start: fields[1]
- .parse()
- .context(format!("Failed to parse start: {}", fields[1]))?,
- end: fields[2]
- .parse()
- .context(format!("Failed to parse end: {}", fields[2]))?,
- repeat_unit: fields[3].to_string(),
- genotype: fields[4].to_string(),
- })
- }
- }
- impl StraglrRow {
- /// Returns the locus length in base pairs.
- pub fn locus_length(&self) -> u64 {
- self.end.saturating_sub(self.start)
- }
- /// Returns the motif length.
- pub fn motif_length(&self) -> usize {
- self.repeat_unit.len()
- }
- /// Returns allele sizes parsed from genotype string.
- pub fn allele_sizes(&self) -> Vec<f64> {
- parse_genotype_sizes(&self.genotype)
- }
- /// Returns copy numbers (size / motif_length) for each allele.
- pub fn copy_numbers(&self) -> Vec<f64> {
- let motif_len = self.motif_length() as f64;
- self.allele_sizes()
- .into_iter()
- .map(|s| s / motif_len)
- .collect()
- }
- /// Returns the maximum allele size in bp, or `None` if no alleles are present.
- pub fn max_allele_size(&self) -> Option<f64> {
- self.allele_sizes()
- .into_iter()
- .max_by(f64::total_cmp)
- }
- /// Returns the maximum copy number across all alleles, or `None` if no alleles are present.
- pub fn max_copy_number(&self) -> Option<f64> {
- self.copy_numbers()
- .into_iter()
- .max_by(f64::total_cmp)
- }
- /// Returns `true` if any allele has a copy number ≥ `threshold_cn`.
- pub fn is_expanded(&self, threshold_cn: f64) -> bool {
- self.copy_numbers().into_iter().any(|cn| cn >= threshold_cn)
- }
- /// Returns true if the locus is homozygous (single allele).
- pub fn is_homozygous(&self) -> bool {
- self.allele_sizes().len() <= 1
- }
- /// Returns the location as a string (e.g., "chr4:3074876-3074933").
- pub fn location_string(&self) -> String {
- format!("{}:{}-{}", self.chrom, self.start, self.end)
- }
- }
- /// Parses genotype string "size(count)" or "size1(n1);size2(n2)" into sizes.
- fn parse_genotype_sizes(genotype: &str) -> Vec<f64> {
- genotype
- .split(';')
- .filter_map(|part| {
- let part = part.trim();
- if part.is_empty() || part == "." {
- return None;
- }
- if let Some(paren_idx) = part.find('(') {
- part[..paren_idx].parse().ok()
- } else {
- part.parse().ok()
- }
- })
- .collect()
- }
- /// Parses genotype string and returns total read support.
- fn parse_genotype_support(genotype: &str) -> u32 {
- genotype
- .split(';')
- .filter_map(|part| {
- let part = part.trim();
- if part.is_empty() || part == "." {
- return None;
- }
- let open = part.find('(')?;
- let close = part.find(')')?;
- part[open + 1..close].parse::<u32>().ok()
- })
- .sum()
- }
- /// Unique key for grouping reads by locus.
- #[derive(Debug, Clone, PartialEq, Eq, Hash)]
- struct LocusKey {
- chrom: String,
- start: u64,
- end: u64,
- repeat_unit: String,
- }
- /// Parse a Straglr TSV output file into one [`StraglrRow`] per unique locus.
- ///
- /// Aggregates all per-read rows that share the same `(chrom, start, end, repeat_unit)`.
- /// The genotype string from the first seen read is kept (all reads at a locus report the
- /// same genotype in Straglr output). Header lines (`#`) and blank lines are skipped.
- /// Lines that fail to parse are logged as warnings and skipped.
- ///
- /// Results are sorted in genomic order using numeric contig indices — `chr1` before
- /// `chr2` before `chr10` etc.
- ///
- /// # Arguments
- ///
- /// * `path` - Path to the Straglr TSV file (plain text or gzip)
- ///
- /// # Returns
- ///
- /// One [`StraglrRow`] per unique locus, sorted by `(contig, start, end)`.
- ///
- /// # Errors
- ///
- /// Returns an error if the file cannot be opened.
- pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
- let reader = BufReader::new(get_reader(path)?);
- let mut locus_map: HashMap<LocusKey, String> = HashMap::new();
- for (line_num, line) in reader.lines().enumerate() {
- let line = match line {
- Ok(l) => l,
- Err(e) => {
- warn!("Failed to read line {}: {}", line_num + 1, e);
- continue;
- }
- };
- if line.starts_with('#') || line.trim().is_empty() {
- continue;
- }
- match line.parse::<RawReadRow>() {
- Ok(row) => {
- let key = LocusKey {
- chrom: row.chrom,
- start: row.start,
- end: row.end,
- repeat_unit: row.repeat_unit,
- };
- // All reads at same locus share genotype; store first occurrence
- locus_map.entry(key).or_insert(row.genotype);
- }
- Err(e) => warn!("Failed to parse line {}: {} (error: {})", line_num + 1, line, e),
- }
- }
- // Convert to StraglrRow
- let mut results: Vec<StraglrRow> = locus_map
- .into_iter()
- .map(|(key, genotype)| {
- let support = parse_genotype_support(&genotype);
- StraglrRow {
- chrom: key.chrom,
- start: key.start,
- end: key.end,
- repeat_unit: key.repeat_unit,
- genotype,
- support,
- }
- })
- .collect();
- // Sort by genomic position using numeric contig ordering (chr1 < chr2 < … < chrX)
- // to avoid lexicographic pitfalls (e.g. "chr10" sorting before "chr2").
- results.sort_by_key(|r| {
- let contig = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32).contig;
- (contig, r.start, r.end)
- });
- Ok(results)
- }
- /// Retain only loci whose maximum copy number is at or above `min_copy_number`.
- pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: f64) -> Vec<StraglrRow> {
- rows.into_iter()
- .filter(|row| row.is_expanded(min_copy_number))
- .collect()
- }
- /// Group loci by chromosome name.
- pub fn group_by_chromosome(rows: Vec<StraglrRow>) -> HashMap<String, Vec<StraglrRow>> {
- let mut map: HashMap<String, Vec<StraglrRow>> = HashMap::new();
- for row in rows {
- map.entry(row.chrom.clone()).or_default().push(row);
- }
- map
- }
- #[cfg(test)]
- mod tests {
- use super::*;
- #[test]
- fn test_parse_raw_read_row() {
- let line = "chr1\t100075838\t100075872\tTTTAATT\t35.9(54)\t40ef6b9e-e0d6-4220-8300-eb66f4bfa5ea\t5.3\t37\t2186\t+\t35.9";
- let row: RawReadRow = line.parse().unwrap();
- assert_eq!(row.chrom, "chr1");
- assert_eq!(row.start, 100075838);
- assert_eq!(row.end, 100075872);
- assert_eq!(row.repeat_unit, "TTTAATT");
- assert_eq!(row.genotype, "35.9(54)");
- }
- #[test]
- fn test_parse_genotype_sizes() {
- assert_eq!(parse_genotype_sizes("35.9(54)"), vec![35.9]);
- assert_eq!(parse_genotype_sizes("20.1(25);45.3(29)"), vec![20.1, 45.3]);
- assert_eq!(parse_genotype_sizes("."), Vec::<f64>::new());
- }
- #[test]
- fn test_parse_genotype_support() {
- assert_eq!(parse_genotype_support("35.9(54)"), 54);
- assert_eq!(parse_genotype_support("20.1(25);45.3(29)"), 54);
- assert_eq!(parse_genotype_support("."), 0);
- }
- #[test]
- fn test_straglr_row_methods() {
- let row = StraglrRow {
- chrom: "chr4".to_string(),
- start: 100,
- end: 200,
- repeat_unit: "CAG".to_string(),
- genotype: "60.0(30);90.0(25)".to_string(),
- support: 55,
- };
- assert_eq!(row.locus_length(), 100);
- assert_eq!(row.motif_length(), 3);
- assert_eq!(row.allele_sizes(), vec![60.0, 90.0]);
-
- let cns = row.copy_numbers();
- assert!((cns[0] - 20.0).abs() < 0.01);
- assert!((cns[1] - 30.0).abs() < 0.01);
-
- assert!((row.max_allele_size().unwrap() - 90.0).abs() < 0.01);
- assert!((row.max_copy_number().unwrap() - 30.0).abs() < 0.01);
- assert!(row.is_expanded(25.0));
- assert!(!row.is_expanded(35.0));
- assert!(!row.is_homozygous());
- assert_eq!(row.location_string(), "chr4:100-200");
- }
- #[test]
- fn test_homozygous() {
- let row = StraglrRow {
- chrom: "chr4".to_string(),
- start: 100,
- end: 200,
- repeat_unit: "CAG".to_string(),
- genotype: "60.0(55)".to_string(),
- support: 55,
- };
- assert!(row.is_homozygous());
- }
- }
|