Pārlūkot izejas kodu

Infos.freq_maf()

Thomas 2 nedēļas atpakaļ
vecāks
revīzija
a631c652db
1 mainītis faili ar 39 papildinājumiem un 0 dzēšanām
  1. 39 0
      src/variant/vcf_variant.rs

+ 39 - 0
src/variant/vcf_variant.rs

@@ -1312,6 +1312,45 @@ impl Infos {
             _ => None,
         }
     }
+
+    pub fn freq_maf(&self) -> Option<f32> {
+        self.0.iter().find_map(|i| {
+            if let Info::FREQ(s) = i { parse_maf_from_freq(s) } else { None }
+        })
+    }
+
+}
+
+/// Average ALT frequency across real population sources in a dbSNP FREQ field.
+///
+/// Format: `KOREAN:0.92,0.08|TOMMO:0.94,0.06|SGDP_PRJ:0.5,0.5|...`
+/// - Index 0 = REF freq, index 1 = ALT freq
+/// - Excludes SGDP_PRJ (encodes presence as 0.5, not real frequency)
+/// - Excludes dbGaP_PopFreq (ascertainment bias from disease cohorts)
+/// - Skips sources with ALT freq == 0.0 (not observed, not absent)
+///
+/// Returns `None` if no valid sources found.
+pub fn parse_maf_from_freq(freq: &str) -> Option<f32> {
+    const EXCLUDED: &[&str] = &["SGDP_PRJ", "dbGaP_PopFreq"];
+
+    let (sum, count) = freq
+        .split('|')
+        .filter_map(|source| {
+            let (name, alleles) = source.split_once(':')?;
+            if EXCLUDED.contains(&name) {
+                return None;
+            }
+            let alt = alleles
+                .split(',')
+                .nth(1)?
+                .parse::<f32>()
+                .ok()?;
+            // 0.0 means not observed in this source, skip rather than pulling MAF down
+            if alt <= 0.0 { None } else { Some(alt) }
+        })
+        .fold((0.0_f32, 0usize), |(s, c), af| (s + af, c + 1));
+
+    if count == 0 { None } else { Some(sum / count as f32) }
 }
 
 /// Enum representing a single INFO field in a VCF record.