|
|
@@ -1,7 +1,8 @@
|
|
|
-//! Straglr TSV output parser
|
|
|
-//! Straglr TSV output parser
|
|
|
+//! Straglr TSV output parser.
|
|
|
//!
|
|
|
-//! Parses TSV files produced by Straglr STR genotyper with `--genotype_in_size`.
|
|
|
+//! Parses TSV files produced by the Straglr STR genotyper (`--genotype_in_size` mode).
|
|
|
+//! Aggregates per-read rows into one [`StraglrRow`] per unique locus and sorts results
|
|
|
+//! in correct genomic order (chr1 < chr2 < … < chrX, not lexicographic).
|
|
|
|
|
|
use anyhow::Context;
|
|
|
use log::warn;
|
|
|
@@ -11,6 +12,7 @@ use std::{
|
|
|
str::FromStr,
|
|
|
};
|
|
|
|
|
|
+use crate::positions::GenomeRange;
|
|
|
use super::readers::get_reader;
|
|
|
|
|
|
/// Represents a single STR locus genotyped by Straglr.
|
|
|
@@ -102,23 +104,23 @@ impl StraglrRow {
|
|
|
.collect()
|
|
|
}
|
|
|
|
|
|
- /// Returns the maximum allele size in bp.
|
|
|
+ /// Returns the maximum allele size in bp, or `None` if no alleles are present.
|
|
|
pub fn max_allele_size(&self) -> Option<f64> {
|
|
|
self.allele_sizes()
|
|
|
.into_iter()
|
|
|
- .max_by(|a, b| a.partial_cmp(b).unwrap())
|
|
|
+ .max_by(f64::total_cmp)
|
|
|
}
|
|
|
|
|
|
- /// Returns the maximum copy number across all alleles.
|
|
|
+ /// Returns the maximum copy number across all alleles, or `None` if no alleles are present.
|
|
|
pub fn max_copy_number(&self) -> Option<f64> {
|
|
|
self.copy_numbers()
|
|
|
.into_iter()
|
|
|
- .max_by(|a, b| a.partial_cmp(b).unwrap())
|
|
|
+ .max_by(f64::total_cmp)
|
|
|
}
|
|
|
|
|
|
- /// Returns true if this locus has an expanded repeat.
|
|
|
+ /// Returns `true` if any allele has a copy number ≥ `threshold_cn`.
|
|
|
pub fn is_expanded(&self, threshold_cn: f64) -> bool {
|
|
|
- self.copy_numbers().iter().any(|&cn| cn >= threshold_cn)
|
|
|
+ self.copy_numbers().into_iter().any(|cn| cn >= threshold_cn)
|
|
|
}
|
|
|
|
|
|
/// Returns true if the locus is homozygous (single allele).
|
|
|
@@ -175,16 +177,27 @@ struct LocusKey {
|
|
|
repeat_unit: String,
|
|
|
}
|
|
|
|
|
|
-/// Reads and parses a Straglr TSV output file.
|
|
|
+/// Parse a Straglr TSV output file into one [`StraglrRow`] per unique locus.
|
|
|
///
|
|
|
-/// Aggregates per-read rows into unique loci.
|
|
|
-/// Skips header lines (starting with `#`) and empty lines.
|
|
|
+/// Aggregates all per-read rows that share the same `(chrom, start, end, repeat_unit)`.
|
|
|
+/// The genotype string from the first seen read is kept (all reads at a locus report the
|
|
|
+/// same genotype in Straglr output). Header lines (`#`) and blank lines are skipped.
|
|
|
+/// Lines that fail to parse are logged as warnings and skipped.
|
|
|
+///
|
|
|
+/// Results are sorted in genomic order using numeric contig indices — `chr1` before
|
|
|
+/// `chr2` before `chr10` etc.
|
|
|
///
|
|
|
/// # Arguments
|
|
|
-/// * `path` - Path to the Straglr TSV file (can be gzipped)
|
|
|
+///
|
|
|
+/// * `path` - Path to the Straglr TSV file (plain text or gzip)
|
|
|
///
|
|
|
/// # Returns
|
|
|
-/// `Ok(Vec<StraglrRow>)` with one entry per unique locus
|
|
|
+///
|
|
|
+/// One [`StraglrRow`] per unique locus, sorted by `(contig, start, end)`.
|
|
|
+///
|
|
|
+/// # Errors
|
|
|
+///
|
|
|
+/// Returns an error if the file cannot be opened.
|
|
|
pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
let reader = BufReader::new(get_reader(path)?);
|
|
|
let mut locus_map: HashMap<LocusKey, String> = HashMap::new();
|
|
|
@@ -233,25 +246,24 @@ pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
|
|
|
})
|
|
|
.collect();
|
|
|
|
|
|
- // Sort by genomic position
|
|
|
- results.sort_by(|a, b| {
|
|
|
- a.chrom
|
|
|
- .cmp(&b.chrom)
|
|
|
- .then(a.start.cmp(&b.start))
|
|
|
- .then(a.end.cmp(&b.end))
|
|
|
+ // Sort by genomic position using numeric contig ordering (chr1 < chr2 < … < chrX)
|
|
|
+ // to avoid lexicographic pitfalls (e.g. "chr10" sorting before "chr2").
|
|
|
+ results.sort_by_key(|r| {
|
|
|
+ let contig = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32).contig;
|
|
|
+ (contig, r.start, r.end)
|
|
|
});
|
|
|
|
|
|
Ok(results)
|
|
|
}
|
|
|
|
|
|
-/// Filters Straglr results to keep only loci with copy numbers above a threshold.
|
|
|
+/// Retain only loci whose maximum copy number is at or above `min_copy_number`.
|
|
|
pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: f64) -> Vec<StraglrRow> {
|
|
|
rows.into_iter()
|
|
|
.filter(|row| row.is_expanded(min_copy_number))
|
|
|
.collect()
|
|
|
}
|
|
|
|
|
|
-/// Groups Straglr results by chromosome.
|
|
|
+/// Group loci by chromosome name.
|
|
|
pub fn group_by_chromosome(rows: Vec<StraglrRow>) -> HashMap<String, Vec<StraglrRow>> {
|
|
|
let mut map: HashMap<String, Vec<StraglrRow>> = HashMap::new();
|
|
|
for row in rows {
|