Selaa lähdekoodia

fix and audit src/io/straglr.rs

- Fix chromosome sort bug: lexicographic sort ordered chr10 before chr2;
  now uses GenomeRange::new().contig (numeric u8 key) for correct genomic
  order (chr1 < chr2 < ... < chrX)
- Fix max_allele_size / max_copy_number: replace partial_cmp().unwrap()
  with f64::total_cmp to avoid panic on NaN values in malformed input
- Fix is_expanded: into_iter().any(|cn|) instead of iter().any(|&cn|)
- Fix duplicate //! module doc line
- Update rustdoc on read_straglr_tsv (explain genotype-first-occurrence
  semantics and numeric sort), filter_expanded, group_by_chromosome

All 5 unit tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 kuukausi sitten
vanhempi
commit
11ec9f65cd
1 muutettua tiedostoa jossa 34 lisäystä ja 22 poistoa
  1. 34 22
      src/io/straglr.rs

+ 34 - 22
src/io/straglr.rs

@@ -1,7 +1,8 @@
-//! Straglr TSV output parser
-//! Straglr TSV output parser
+//! Straglr TSV output parser.
 //!
-//! Parses TSV files produced by Straglr STR genotyper with `--genotype_in_size`.
+//! Parses TSV files produced by the Straglr STR genotyper (`--genotype_in_size` mode).
+//! Aggregates per-read rows into one [`StraglrRow`] per unique locus and sorts results
+//! in correct genomic order (chr1 < chr2 < … < chrX, not lexicographic).
 
 use anyhow::Context;
 use log::warn;
@@ -11,6 +12,7 @@ use std::{
     str::FromStr,
 };
 
+use crate::positions::GenomeRange;
 use super::readers::get_reader;
 
 /// Represents a single STR locus genotyped by Straglr.
@@ -102,23 +104,23 @@ impl StraglrRow {
             .collect()
     }
 
-    /// Returns the maximum allele size in bp.
+    /// Returns the maximum allele size in bp, or `None` if no alleles are present.
     pub fn max_allele_size(&self) -> Option<f64> {
         self.allele_sizes()
             .into_iter()
-            .max_by(|a, b| a.partial_cmp(b).unwrap())
+            .max_by(f64::total_cmp)
     }
 
-    /// Returns the maximum copy number across all alleles.
+    /// Returns the maximum copy number across all alleles, or `None` if no alleles are present.
     pub fn max_copy_number(&self) -> Option<f64> {
         self.copy_numbers()
             .into_iter()
-            .max_by(|a, b| a.partial_cmp(b).unwrap())
+            .max_by(f64::total_cmp)
     }
 
-    /// Returns true if this locus has an expanded repeat.
+    /// Returns `true` if any allele has a copy number ≥ `threshold_cn`.
     pub fn is_expanded(&self, threshold_cn: f64) -> bool {
-        self.copy_numbers().iter().any(|&cn| cn >= threshold_cn)
+        self.copy_numbers().into_iter().any(|cn| cn >= threshold_cn)
     }
 
     /// Returns true if the locus is homozygous (single allele).
@@ -175,16 +177,27 @@ struct LocusKey {
     repeat_unit: String,
 }
 
-/// Reads and parses a Straglr TSV output file.
+/// Parse a Straglr TSV output file into one [`StraglrRow`] per unique locus.
 ///
-/// Aggregates per-read rows into unique loci.
-/// Skips header lines (starting with `#`) and empty lines.
+/// Aggregates all per-read rows that share the same `(chrom, start, end, repeat_unit)`.
+/// The genotype string from the first seen read is kept (all reads at a locus report the
+/// same genotype in Straglr output). Header lines (`#`) and blank lines are skipped.
+/// Lines that fail to parse are logged as warnings and skipped.
+///
+/// Results are sorted in genomic order using numeric contig indices — `chr1` before
+/// `chr2` before `chr10` etc.
 ///
 /// # Arguments
-/// * `path` - Path to the Straglr TSV file (can be gzipped)
+///
+/// * `path` - Path to the Straglr TSV file (plain text or gzip)
 ///
 /// # Returns
-/// `Ok(Vec<StraglrRow>)` with one entry per unique locus
+///
+/// One [`StraglrRow`] per unique locus, sorted by `(contig, start, end)`.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened.
 pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
     let reader = BufReader::new(get_reader(path)?);
     let mut locus_map: HashMap<LocusKey, String> = HashMap::new();
@@ -233,25 +246,24 @@ pub fn read_straglr_tsv(path: &str) -> anyhow::Result<Vec<StraglrRow>> {
         })
         .collect();
 
-    // Sort by genomic position
-    results.sort_by(|a, b| {
-        a.chrom
-            .cmp(&b.chrom)
-            .then(a.start.cmp(&b.start))
-            .then(a.end.cmp(&b.end))
+    // Sort by genomic position using numeric contig ordering (chr1 < chr2 < … < chrX)
+    // to avoid lexicographic pitfalls (e.g. "chr10" sorting before "chr2").
+    results.sort_by_key(|r| {
+        let contig = GenomeRange::new(&r.chrom, r.start as u32, r.end as u32).contig;
+        (contig, r.start, r.end)
     });
 
     Ok(results)
 }
 
-/// Filters Straglr results to keep only loci with copy numbers above a threshold.
+/// Retain only loci whose maximum copy number is at or above `min_copy_number`.
 pub fn filter_expanded(rows: Vec<StraglrRow>, min_copy_number: f64) -> Vec<StraglrRow> {
     rows.into_iter()
         .filter(|row| row.is_expanded(min_copy_number))
         .collect()
 }
 
-/// Groups Straglr results by chromosome.
+/// Group loci by chromosome name.
 pub fn group_by_chromosome(rows: Vec<StraglrRow>) -> HashMap<String, Vec<StraglrRow>> {
     let mut map: HashMap<String, Vec<StraglrRow>> = HashMap::new();
     for row in rows {