Bladeren bron

fix three bugs in variant module

variants_stats.rs — assert always panics (line 899):
  assert_eq!(10e6 as usize, 1_000_000) evaluates to
  assert_eq!(10_000_000, 1_000_000) which is always false.
  Removed — it was a malformed documentation-as-code artifact.

variants_stats.rs — n_gnomad: all gnomad DashMap keys hold one value per
  annotated variant, so data.len() is identical on every iteration.
  The original overwrite was correct by accident; replaced with an explicit
  .first().map(|e| e.value().len()).unwrap_or(0) to make the intent clear
  and avoid the += accumulator error introduced in the previous attempt.

variant_collection.rs — create_variant panics on empty input:
  [0] direct index replaced with .first().expect(...) to give a clear
  error message rather than a cryptic index-out-of-bounds panic.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 maand geleden
bovenliggende
commit
80432a5436
2 gewijzigde bestanden met toevoegingen van 67 en 32 verwijderingen
  1. 32 15
      src/variant/variant_collection.rs
  2. 35 17
      src/variant/variants_stats.rs

+ 32 - 15
src/variant/variant_collection.rs

@@ -8,8 +8,8 @@ use std::{
 
 use anyhow::Context;
 // use bgzip::{BGZFReader, BGZFWriter};
-use bitcode::{Decode, Encode};
 use crate::io::tsv::TsvLine;
+use bitcode::{Decode, Encode};
 use dashmap::DashMap;
 use log::{debug, error, info, warn};
 use rayon::prelude::*;
@@ -22,15 +22,21 @@ use super::vcf_variant::{
 };
 use crate::{
     annotation::{
-        Annotation, Annotations, cosmic::Cosmic, echtvar::{parse_echtvar_val, run_echtvar}, gnomad::GnomAD, parse_trinuc, vep::{VEP, VepJob, VepLine, get_best_vep}
+        cosmic::Cosmic,
+        echtvar::{parse_echtvar_val, run_echtvar},
+        gnomad::GnomAD,
+        parse_trinuc,
+        vep::{get_best_vep, VepJob, VepLine, VEP},
+        Annotation, Annotations,
     },
     collection::{
-        bam::{PileBase, counts_at},
+        bam::{counts_at, PileBase},
         vcf::Vcf,
     },
     config::Config,
     helpers::{
-        Hash128, Repeat, TempFileGuard, app_storage_dir, detect_repetition, estimate_shannon_entropy, mean
+        app_storage_dir, detect_repetition, estimate_shannon_entropy, mean, Hash128, Repeat,
+        TempFileGuard,
     },
     io::{
         fasta::{open_indexed_fasta, sequence_at},
@@ -39,7 +45,7 @@ use crate::{
         vcf::vcf_header,
         writers::{finalize_bgzf_file, get_gz_writer},
     },
-    positions::{GenomePosition, GenomeRange, GetGenomePosition, overlaps_par},
+    positions::{overlaps_par, GenomePosition, GenomeRange, GetGenomePosition},
     run,
 };
 
@@ -1186,7 +1192,6 @@ impl Variants {
         });
     }
 
-
     /// Merges this Variants collection with another VariantCollection, combining overlapping variants.
     ///
     /// This method performs a merge operation, combining the current Variants with a VariantCollection.
@@ -1259,7 +1264,11 @@ impl Variants {
 
                     // Drain all other-variants at this position, merging into
                     // the matching self-variant by (REF, ALT) or creating a new one.
-                    while others_iter.peek().map(|v| v.position == pos).unwrap_or(false) {
+                    while others_iter
+                        .peek()
+                        .map(|v| v.position == pos)
+                        .unwrap_or(false)
+                    {
                         let other = others_iter.next().unwrap();
                         if let Some(existing) = self_at_pos.iter_mut().find(|v| {
                             v.reference == other.reference && v.alternative == other.alternative
@@ -1637,7 +1646,9 @@ impl Variants {
 /// This function assumes that all VcfVariants in the input vector represent the same genomic variant
 /// and should be consolidated. It's the caller's responsibility to ensure this is the case.
 fn create_variant(vcf_variants: Vec<VcfVariant>, annotations: &Annotations) -> Variant {
-    let first = &vcf_variants[0];
+    let first = vcf_variants
+        .first()
+        .expect("create_variant called with empty vec");
     let annotations = annotations
         .store
         .get(&first.hash)
@@ -1803,9 +1814,8 @@ impl ExternalAnnotation {
                 // fs::remove_file(in_tmp)?;
 
                 // Parse echtvar output
-                let mut echtvar_rdr = std::io::BufReader::new(
-                    get_reader(out_tmp.to_str().unwrap())?
-                );
+                let mut echtvar_rdr =
+                    std::io::BufReader::new(get_reader(out_tmp.to_str().unwrap())?);
                 let mut echtvar_line = TsvLine::new();
                 let mut chunk_results = Vec::new();
                 let mut i = 0usize;
@@ -1814,7 +1824,9 @@ impl ExternalAnnotation {
                     if echtvar_line.as_str().starts_with('#') || echtvar_line.as_str().is_empty() {
                         continue;
                     }
-                    let row: crate::io::vcf::VCFRow = echtvar_line.as_str().parse()
+                    let row: crate::io::vcf::VCFRow = echtvar_line
+                        .as_str()
+                        .parse()
                         .context("Failed to parse echtvar VCF row")?;
 
                     // Verify that the ID corresponds to the input
@@ -1822,7 +1834,8 @@ impl ExternalAnnotation {
                     if id != i + 1 {
                         return Err(anyhow::anyhow!(
                             "Echtvar output ID {} does not match expected ID {}",
-                            id, i + 1
+                            id,
+                            i + 1
                         ));
                     }
 
@@ -2003,7 +2016,9 @@ impl ExternalAnnotation {
                             if vep_line.as_str().starts_with('#') || vep_line.as_str().is_empty() {
                                 continue;
                             }
-                            let line: VepLine = vep_line.as_str().parse()
+                            let line: VepLine = vep_line
+                                .as_str()
+                                .parse()
                                 .context("Failed to parse VepLine")?;
                             let key = line
                                 .uploaded_variation
@@ -2122,7 +2137,9 @@ fn process_vep_chunk(
         if vep_line.as_str().starts_with('#') || vep_line.as_str().is_empty() {
             continue;
         }
-        let line: VepLine = vep_line.as_str().parse()
+        let line: VepLine = vep_line
+            .as_str()
+            .parse()
             .context("Failed to parse VepLine")?;
         let key = line
             .uploaded_variation

+ 35 - 17
src/variant/variants_stats.rs

@@ -163,17 +163,22 @@ use rayon::prelude::*;
 use serde::{Deserialize, Serialize, Serializer};
 
 use crate::{
-    annotation::{Annotation, vep::VepImpact},
+    annotation::{vep::VepImpact, Annotation},
     config::Config,
     helpers::bin_data,
     io::{
-        bed::read_bed, dict::read_dict, gff::features_ranges, readers::get_gz_reader,
-        tsv::TsvLine, writers::{finalize_bgzf_file, get_gz_writer},
+        bed::read_bed,
+        dict::read_dict,
+        gff::features_ranges,
+        readers::get_gz_reader,
+        tsv::TsvLine,
+        writers::{finalize_bgzf_file, get_gz_writer},
     },
     positions::{
-        GenomeRange, contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par
+        contig_to_num, merge_overlapping_genome_ranges, par_overlaps, range_intersection_par,
+        GenomeRange,
     },
-    scan::bin::{BinRowBuf, parse_bin_record_into},
+    scan::bin::{parse_bin_record_into, BinRowBuf},
 };
 
 use super::variant_collection::{Variant, Variants};
@@ -329,12 +334,13 @@ impl VariantsStats {
             }
         });
 
-        let mut n_gnomad = 0;
+        // All gnomad keys have the same number of values (one per annotated variant),
+        // so we take the count from the first entry.
+        let n_gnomad = gnomads.iter().next().map(|e| e.value().len()).unwrap_or(0);
         let gnomad = gnomads
             .iter()
             .map(|e| {
                 let data = e.value().to_vec();
-                n_gnomad = data.len();
                 (e.key().to_string(), bin_data(data, 0.0001))
             })
             .collect();
@@ -601,12 +607,16 @@ pub fn somatic_depth_quality_ranges(
             let mut line_no = 0usize;
 
             loop {
-                let n_ok = n_line.read(&mut normal_rdr)
-                    .with_context(|| format!("reading normal TSV: {} line {}", normal_path, line_no + 1))?;
-                let t_ok = t_line.read(&mut tumor_rdr)
-                    .with_context(|| format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1))?;
-
-                if n_ok || t_ok { line_no += 1; }
+                let n_ok = n_line.read(&mut normal_rdr).with_context(|| {
+                    format!("reading normal TSV: {} line {}", normal_path, line_no + 1)
+                })?;
+                let t_ok = t_line.read(&mut tumor_rdr).with_context(|| {
+                    format!("reading tumor TSV: {} line {}", tumor_path, line_no + 1)
+                })?;
+
+                if n_ok || t_ok {
+                    line_no += 1;
+                }
 
                 match (n_ok, t_ok) {
                     (false, false) => break,
@@ -896,7 +906,6 @@ pub fn make_glm_rows_from_regions_par(
         );
 
     let ranges_len: usize = regions.iter().map(|a| a.length() as usize).sum();
-    assert_eq!(10e6 as usize, 1_000_000);
 
     // Compute mutation rates per Mb from aggregate stats
     let rates_per_mb: Vec<(String, f64)> = feature_stats
@@ -961,12 +970,20 @@ fn flatten_glm_rows(
 }
 
 pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()> {
-    use std::{fs::File, io::{BufWriter, Write}};
+    use std::{
+        fs::File,
+        io::{BufWriter, Write},
+    };
 
     let (features, flat_rows) = flatten_glm_rows(all_rows);
 
     let mut headers = vec![
-        "contig", "start", "end", "length", "log_length", "mutation_count",
+        "contig",
+        "start",
+        "end",
+        "length",
+        "log_length",
+        "mutation_count",
     ];
     headers.extend(features.iter().map(|s| s.as_str()));
 
@@ -976,7 +993,8 @@ pub fn write_glm_rows(all_rows: &[GlmRow], csv_path: &str) -> anyhow::Result<()>
 
     writeln!(writer, "{}", headers.join(","))?;
     for row in &flat_rows {
-        let values: Vec<&str> = headers.iter()
+        let values: Vec<&str> = headers
+            .iter()
             .map(|&h| row.get(h).map(|s| s.as_str()).unwrap_or(""))
             .collect();
         writeln!(writer, "{}", values.join(","))?;