Ver código fonte

fix variant duplication bug in Variants::merge

The Equal branch advanced both iterators by exactly one step and compared
only that pair by (REF, ALT). When N self-variants and M other-variants
shared the same genomic position (multi-allelic sites, same position called
with different ALTs by different callers), only the first pair was compared
and the rest were appended raw — producing N+M entries instead of the
correct merged set.

Fix: when positions are equal, drain ALL variants at that position from both
sides, then match each other-variant to an existing self-variant by (REF, ALT).
Matched variants are merged into the existing entry; unmatched ones create a
new entry. This correctly handles the N:M case at every position.

The downstream sort() + in_place_merge() dedup_by could partially recover
consecutive duplicates but was not reliable for non-adjacent ones after the
position sort.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Thomas 1 mês atrás
pai
commit
9fe2d09789
1 arquivos alterados com 27 adições e 13 exclusões
  1. 27 13
      src/variant/variant_collection.rs

+ 27 - 13
src/variant/variant_collection.rs

@@ -1230,7 +1230,12 @@ impl Variants {
         let mut self_iter = self.data.drain(..).peekable(); // Iterator for self.data
         let mut others_iter = others.variants.into_iter().peekable(); // Iterator for others.variants
 
-        // Merge using two-pointer technique
+        // Merge using two-pointer technique.
+        // When positions are equal we collect ALL variants at that position from
+        // both sides before matching by (REF, ALT). Advancing only one element at
+        // a time was the original bug: with N self-variants and M other-variants at
+        // the same position, only the first pair was compared and the rest were
+        // appended raw, producing N+M entries instead of the correct max(N,M).
         while let (Some(self_variant), Some(other_variant)) = (self_iter.peek(), others_iter.peek())
         {
             match self_variant.position.cmp(&other_variant.position) {
@@ -1244,20 +1249,29 @@ impl Variants {
                     ));
                 }
                 std::cmp::Ordering::Equal => {
-                    let self_variant = self_iter.next().unwrap();
-                    let other_variant = others_iter.next().unwrap();
+                    let pos = self_iter.peek().unwrap().position.clone();
 
-                    if self_variant.reference == other_variant.reference
-                        && self_variant.alternative == other_variant.alternative
-                    {
-                        let mut merged_variant = self_variant;
-                        merged_variant.vcf_variants.push(other_variant);
-                        n_merged += 1;
-                        result.push(merged_variant);
-                    } else {
-                        result.push(self_variant);
-                        result.push(create_variant(vec![other_variant], annotations));
+                    // Drain all self-variants at this position.
+                    let mut self_at_pos: Vec<Variant> = Vec::new();
+                    while self_iter.peek().map(|v| v.position == pos).unwrap_or(false) {
+                        self_at_pos.push(self_iter.next().unwrap());
                     }
+
+                    // Drain all other-variants at this position, merging into
+                    // the matching self-variant by (REF, ALT) or creating a new one.
+                    while others_iter.peek().map(|v| v.position == pos).unwrap_or(false) {
+                        let other = others_iter.next().unwrap();
+                        if let Some(existing) = self_at_pos.iter_mut().find(|v| {
+                            v.reference == other.reference && v.alternative == other.alternative
+                        }) {
+                            existing.vcf_variants.push(other);
+                            n_merged += 1;
+                        } else {
+                            self_at_pos.push(create_variant(vec![other], annotations));
+                        }
+                    }
+
+                    result.extend(self_at_pos);
                 }
             }
         }