//! Straglr TSV output parser. //! //! Parses TSV files produced by the Straglr STR genotyper (`--genotype_in_size` mode). //! Aggregates per-read rows into one [`StraglrRow`] per unique locus and sorts results //! in correct genomic order (chr1 < chr2 < … < chrX, not lexicographic). use anyhow::Context; use log::warn; use std::{ collections::HashMap, io::{BufRead, BufReader}, str::FromStr, }; use super::readers::get_reader; use crate::positions::GenomeRange; /// Represents a single STR locus genotyped by Straglr. /// /// Aggregated from per-read rows sharing the same (chrom, start, end, repeat_unit). /// The genotype field contains allele sizes in format "size(count)" or "size1(n1);size2(n2)". #[derive(Debug, Clone, PartialEq)] pub struct StraglrRow { /// Chromosome name (e.g., "chr4", "chrX") pub chrom: String, /// Start position (0-based) pub start: u64, /// End position (exclusive) pub end: u64, /// Repeat unit motif (e.g., "CAG", "GGCCCC") pub repeat_unit: String, /// Genotype string: "size(count)" or "size1(n1);size2(n2)" for het pub genotype: String, /// Total read support (sum of counts from genotype) pub support: u32, } /// Raw per-read row from Straglr output (internal). #[derive(Debug)] struct RawReadRow { chrom: String, start: u64, end: u64, repeat_unit: String, genotype: String, } impl FromStr for RawReadRow { type Err = anyhow::Error; /// Parses a single TSV line from Straglr output. /// /// Expected format (11 tab-separated fields): /// ```text /// #chrom start end repeat_unit genotype read copy_number size read_start strand allele /// chr1 100075838 100075872 TTTAATT 35.9(54) uuid 5.3 37 2186 + 35.9 /// ``` fn from_str(s: &str) -> anyhow::Result { let fields: Vec<&str> = s.split('\t').collect(); if fields.len() < 11 { anyhow::bail!( "Invalid Straglr TSV line: expected 11 fields, got {}", fields.len() ); } Ok(Self { chrom: fields[0].to_string(), start: fields[1] .parse() .context(format!("Failed to parse start: {}", fields[1]))?, end: fields[2] .parse() .context(format!("Failed to parse end: {}", fields[2]))?, repeat_unit: fields[3].to_string(), genotype: fields[4].to_string(), }) } } impl StraglrRow { /// Returns the locus length in base pairs. pub fn locus_length(&self) -> u64 { self.end.saturating_sub(self.start) } /// Returns the motif length. pub fn motif_length(&self) -> usize { self.repeat_unit.len() } /// Returns allele sizes parsed from genotype string. pub fn allele_sizes(&self) -> Vec { parse_genotype_sizes(&self.genotype) } /// Returns copy numbers (size / motif_length) for each allele. pub fn copy_numbers(&self) -> Vec { let motif_len = self.motif_length() as f64; self.allele_sizes() .into_iter() .map(|s| s / motif_len) .collect() } /// Returns the maximum allele size in bp, or `None` if no alleles are present. pub fn max_allele_size(&self) -> Option { self.allele_sizes().into_iter().max_by(f64::total_cmp) } /// Returns the maximum copy number across all alleles, or `None` if no alleles are present. pub fn max_copy_number(&self) -> Option { self.copy_numbers().into_iter().max_by(f64::total_cmp) } /// Returns `true` if any allele has a copy number ≥ `threshold_cn`. pub fn is_expanded(&self, threshold_cn: f64) -> bool { self.copy_numbers().into_iter().any(|cn| cn >= threshold_cn) } /// Returns true if the locus is homozygous (single allele). pub fn is_homozygous(&self) -> bool { self.allele_sizes().len() <= 1 } /// Returns the location as a string (e.g., "chr4:3074876-3074933"). pub fn location_string(&self) -> String { format!("{}:{}-{}", self.chrom, self.start, self.end) } } /// Parses genotype string "size(count)" or "size1(n1);size2(n2)" into sizes. fn parse_genotype_sizes(genotype: &str) -> Vec { genotype .split(';') .filter_map(|part| { let part = part.trim(); if part.is_empty() || part == "." { return None; } if let Some(paren_idx) = part.find('(') { part[..paren_idx].parse().ok() } else { part.parse().ok() } }) .collect() } /// Parses genotype string and returns total read support. fn parse_genotype_support(genotype: &str) -> u32 { genotype .split(';') .filter_map(|part| { let part = part.trim(); if part.is_empty() || part == "." { return None; } let open = part.find('(')?; let close = part.find(')')?; part[open + 1..close].parse::().ok() }) .sum() } /// Unique key for grouping reads by locus. #[derive(Debug, Clone, PartialEq, Eq, Hash)] struct LocusKey { chrom: String, start: u64, end: u64, repeat_unit: String, } /// Parse a Straglr TSV output file into one [`StraglrRow`] per unique locus. /// /// Aggregates all per-read rows that share the same `(chrom, start, end, repeat_unit)`. /// The genotype string from the first seen read is kept (all reads at a locus report the /// same genotype in Straglr output). Header lines (`#`) and blank lines are skipped. /// Lines that fail to parse are logged as warnings and skipped. /// /// Results are sorted in genomic order using numeric contig indices — `chr1` before /// `chr2` before `chr10` etc. /// /// # Arguments /// /// * `path` - Path to the Straglr TSV file (plain text or gzip) /// /// # Returns /// /// One [`StraglrRow`] per unique locus, sorted by `(contig, start, end)`. /// /// # Errors /// /// Returns an error if the file cannot be opened. pub fn read_straglr_tsv(path: &str) -> anyhow::Result> { let reader = BufReader::new(get_reader(path)?); let mut locus_map: HashMap = HashMap::new(); for (line_num, line) in reader.lines().enumerate() { let line = match line { Ok(l) => l, Err(e) => { warn!("Failed to read line {}: {}", line_num + 1, e); continue; } }; if line.starts_with('#') || line.trim().is_empty() { continue; } match line.parse::() { Ok(row) => { let key = LocusKey { chrom: row.chrom, start: row.start, end: row.end, repeat_unit: row.repeat_unit, }; // All reads at same locus share genotype; store first occurrence locus_map.entry(key).or_insert(row.genotype); } Err(e) => warn!( "Failed to parse line {}: {} (error: {})", line_num + 1, line, e ), } } // Convert to StraglrRow let mut results: Vec = locus_map .into_iter() .map(|(key, genotype)| { let support = parse_genotype_support(&genotype); StraglrRow { chrom: key.chrom, start: key.start, end: key.end, repeat_unit: key.repeat_unit, genotype, support, } }) .collect(); // Sort by genomic position using numeric contig ordering (chr1 < chr2 < … < chrX) // to avoid lexicographic pitfalls (e.g. "chr10" sorting before "chr2"). results.sort_by_key(|r| { let contig = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32).contig; (contig, r.start, r.end) }); Ok(results) } /// Retain only loci whose maximum copy number is at or above `min_copy_number`. pub fn filter_expanded(rows: Vec, min_copy_number: f64) -> Vec { rows.into_iter() .filter(|row| row.is_expanded(min_copy_number)) .collect() } /// Group loci by chromosome name. pub fn group_by_chromosome(rows: Vec) -> HashMap> { let mut map: HashMap> = HashMap::new(); for row in rows { map.entry(row.chrom.clone()).or_default().push(row); } map } #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_raw_read_row() { let line = "chr1\t100075838\t100075872\tTTTAATT\t35.9(54)\t40ef6b9e-e0d6-4220-8300-eb66f4bfa5ea\t5.3\t37\t2186\t+\t35.9"; let row: RawReadRow = line.parse().unwrap(); assert_eq!(row.chrom, "chr1"); assert_eq!(row.start, 100075838); assert_eq!(row.end, 100075872); assert_eq!(row.repeat_unit, "TTTAATT"); assert_eq!(row.genotype, "35.9(54)"); } #[test] fn test_parse_genotype_sizes() { assert_eq!(parse_genotype_sizes("35.9(54)"), vec![35.9]); assert_eq!(parse_genotype_sizes("20.1(25);45.3(29)"), vec![20.1, 45.3]); assert_eq!(parse_genotype_sizes("."), Vec::::new()); } #[test] fn test_parse_genotype_support() { assert_eq!(parse_genotype_support("35.9(54)"), 54); assert_eq!(parse_genotype_support("20.1(25);45.3(29)"), 54); assert_eq!(parse_genotype_support("."), 0); } #[test] fn test_straglr_row_methods() { let row = StraglrRow { chrom: "chr4".to_string(), start: 100, end: 200, repeat_unit: "CAG".to_string(), genotype: "60.0(30);90.0(25)".to_string(), support: 55, }; assert_eq!(row.locus_length(), 100); assert_eq!(row.motif_length(), 3); assert_eq!(row.allele_sizes(), vec![60.0, 90.0]); let cns = row.copy_numbers(); assert!((cns[0] - 20.0).abs() < 0.01); assert!((cns[1] - 30.0).abs() < 0.01); assert!((row.max_allele_size().unwrap() - 90.0).abs() < 0.01); assert!((row.max_copy_number().unwrap() - 30.0).abs() < 0.01); assert!(row.is_expanded(25.0)); assert!(!row.is_expanded(35.0)); assert!(!row.is_homozygous()); assert_eq!(row.location_string(), "chr4:100-200"); } #[test] fn test_homozygous() { let row = StraglrRow { chrom: "chr4".to_string(), start: 100, end: 200, repeat_unit: "CAG".to_string(), genotype: "60.0(55)".to_string(), support: 55, }; assert!(row.is_homozygous()); } }