|
|
@@ -107,7 +107,10 @@
|
|
|
//! ```
|
|
|
|
|
|
use crate::{
|
|
|
- annotation::Annotations,
|
|
|
+ annotation::{
|
|
|
+ dbsnp::{DbSnpFreq, DbSnpFreqEntry},
|
|
|
+ Annotations,
|
|
|
+ },
|
|
|
helpers::{estimate_shannon_entropy, mean, revcomp, Hash128},
|
|
|
io::fasta::sequence_range,
|
|
|
pipes::ShouldRun,
|
|
|
@@ -826,6 +829,154 @@ impl core::fmt::Display for BNDDesc {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+// --------------------- Normalize Multiallelic
|
|
|
+// #[derive(Debug, Clone, PartialEq)]
|
|
|
+// pub enum VcfNumber {
|
|
|
+// A, // one per ALT allele
|
|
|
+// R, // one per allele (REF + ALTs)
|
|
|
+// G, // one per genotype — leave alone
|
|
|
+// Dot, // unknown — leave alone
|
|
|
+// Fixed(usize), // fixed count — leave alone
|
|
|
+// }
|
|
|
+
|
|
|
+/// Split a multiallelic VcfVariant into N biallelic VcfVariants.
|
|
|
+/// Rewrites A/R-number INFO fields per allele using header metadata.
|
|
|
+pub fn from_multiallelic(variant: VcfVariant) -> Vec<VcfVariant> {
|
|
|
+ let alt_str = variant.alternative.to_string();
|
|
|
+ let alts: Vec<&str> = alt_str.split(',').collect();
|
|
|
+
|
|
|
+ // Already biallelic
|
|
|
+ if alts.len() == 1 {
|
|
|
+ return vec![variant];
|
|
|
+ }
|
|
|
+
|
|
|
+ alts.iter()
|
|
|
+ .enumerate()
|
|
|
+ .map(|(i, alt)| {
|
|
|
+ let mut v = variant.clone();
|
|
|
+
|
|
|
+ // Rewrite ALT
|
|
|
+ v.alternative = alt.parse().expect("Failed to parse split ALT");
|
|
|
+
|
|
|
+ // Rewrite INFO fields (A/R-number)
|
|
|
+ v.infos = rewrite_infos(&variant.infos, i);
|
|
|
+
|
|
|
+ // Rewrite FORMAT/GT
|
|
|
+ v.formats = rewrite_formats(&variant.formats, i);
|
|
|
+
|
|
|
+ // Recompute hash (pos + ref + new alt)
|
|
|
+ let mut hasher = blake3::Hasher::new();
|
|
|
+ hasher.update(&v.position.contig.to_ne_bytes());
|
|
|
+ hasher.update(&v.position.position.to_ne_bytes());
|
|
|
+ hasher.update(v.reference.to_string().as_bytes());
|
|
|
+ hasher.update(v.alternative.to_string().as_bytes());
|
|
|
+ let hash = hasher.finalize();
|
|
|
+ v.hash = Hash128::new(hash.as_bytes()[..16].try_into().unwrap());
|
|
|
+
|
|
|
+ v
|
|
|
+ })
|
|
|
+ .collect()
|
|
|
+}
|
|
|
+
|
|
|
+fn rewrite_infos(infos: &Infos, allele_idx: usize) -> Infos {
|
|
|
+ let rewritten = infos
|
|
|
+ .0
|
|
|
+ .iter()
|
|
|
+ .map(|info| {
|
|
|
+ match info {
|
|
|
+ Info::FREQ(DbSnpFreq(entries)) => {
|
|
|
+ let new_entries = entries
|
|
|
+ .iter()
|
|
|
+ .map(|entry| {
|
|
|
+ // R-length: [REF, ALT1, ALT2, ...] → keep REF + allele_idx-th ALT
|
|
|
+ let ref_val = entry.values.first().copied().flatten();
|
|
|
+ let alt_val = entry.values.get(allele_idx + 1).copied().flatten();
|
|
|
+ DbSnpFreqEntry {
|
|
|
+ source: entry.source.clone(),
|
|
|
+ values: vec![ref_val, alt_val],
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+ Info::FREQ(DbSnpFreq(new_entries))
|
|
|
+ }
|
|
|
+ // All other typed Info variants: pass through unchanged
|
|
|
+ other => other.clone(),
|
|
|
+ }
|
|
|
+ })
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ Infos(rewritten)
|
|
|
+}
|
|
|
+
|
|
|
+fn rewrite_formats(formats: &Formats, allele_idx: usize) -> Formats {
|
|
|
+ if formats.0.is_empty() {
|
|
|
+ return formats.clone();
|
|
|
+ }
|
|
|
+ // Reconstruct a fake FORMAT+sample row to reuse existing FromStr
|
|
|
+ let (fmt_str, sample_str): (String, String) = formats.clone().into();
|
|
|
+ let fmt_keys: Vec<&str> = fmt_str.split(':').collect();
|
|
|
+ let rewritten_sample = rewrite_sample(&sample_str, &fmt_keys, allele_idx);
|
|
|
+ (fmt_str.as_str(), rewritten_sample.as_str())
|
|
|
+ .try_into()
|
|
|
+ .expect("Failed to parse rewritten FORMAT")
|
|
|
+}
|
|
|
+
|
|
|
+/// Recode a sample's GT from multiallelic to biallelic representation.
|
|
|
+/// e.g. GT "0/2" with allele_idx=1 → "0/1" (allele 2 becomes allele 1 in new row)
|
|
|
+/// GT "1/2" with allele_idx=1 → "0/1" (allele 1 in original becomes REF-equivalent here)
|
|
|
+/// GT "2/2" with allele_idx=1 → "1/1"
|
|
|
+fn rewrite_sample(sample: &str, format_keys: &[&str], allele_idx: usize) -> String {
|
|
|
+ let values: Vec<&str> = sample.split(':').collect();
|
|
|
+ let mut out = Vec::with_capacity(values.len());
|
|
|
+
|
|
|
+ for (k, &key) in format_keys.iter().enumerate() {
|
|
|
+ let val = values.get(k).copied().unwrap_or(".");
|
|
|
+ if key == "GT" {
|
|
|
+ out.push(recode_gt(val, allele_idx));
|
|
|
+ } else {
|
|
|
+ out.push(val.to_string());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ out.join(":")
|
|
|
+}
|
|
|
+
|
|
|
/// Recode a GT value for the record split out for ALT number `allele_idx`
/// (0-based).
///
/// Each allele token is mapped as follows:
/// - the original ALT number `allele_idx + 1` → `1`
/// - any other numeric allele (REF or another ALT) → `0`
/// - `.`, an empty token, or anything that is not a non-negative integer → `.`
///
/// Both `/` and `|` are recognised as separators and each one is written back
/// exactly where it was found, so phasing is preserved per position even when
/// the two are mixed in a single GT (e.g. `0/1|2`). The number of alleles in
/// the output always equals the number in the input.
fn recode_gt(gt: &str, allele_idx: usize) -> String {
    const SEPARATORS: &[char] = &['/', '|'];

    let target = allele_idx + 1; // 1-based ALT index in the original record

    // `split` yields one more token than `matches` yields separators, so the
    // two iterators can be interleaved to rebuild the string faithfully.
    let mut separators = gt.matches(SEPARATORS);
    let mut recoded = String::with_capacity(gt.len());

    for (pos, allele) in gt.split(SEPARATORS).enumerate() {
        if pos > 0 {
            if let Some(sep) = separators.next() {
                recoded.push_str(sep);
            }
        }
        recoded.push_str(match allele.parse::<usize>() {
            Ok(n) if n == target => "1",
            Ok(_) => "0",
            // Unknown stays unknown: never turn a missing or malformed allele
            // into a reference call.
            Err(_) => ".",
        });
    }

    recoded
}
|
|
|
+
|
|
|
/// Return the `idx`-th (0-based) item of a comma-separated list, or "." when
/// the list has no item at that position.
fn pick_nth(values: &str, idx: usize) -> &str {
    let mut items = values.split(',');
    match items.nth(idx) {
        Some(item) => item,
        None => ".",
    }
}
|
|
|
+
|
|
|
/// Reduce a comma-separated per-allele list laid out as `REF,ALT1,ALT2,...`
/// to `REF,ALT` for the ALT number `allele_idx` (0-based).
///
/// The first item is always taken as the REF value; if the requested ALT is
/// not present, "." is used in its place.
fn pick_r(values: &str, allele_idx: usize) -> String {
    // Separate the leading REF value from whatever ALT values follow it.
    let (ref_val, alt_list) = match values.split_once(',') {
        Some((head, tail)) => (head, Some(tail)),
        None => (values, None),
    };
    let alt_val = alt_list
        .and_then(|alts| alts.split(',').nth(allele_idx))
        .unwrap_or(".");

    let mut picked = String::with_capacity(ref_val.len() + 1 + alt_val.len());
    picked.push_str(ref_val);
    picked.push(',');
    picked.push_str(alt_val);
    picked
}
|
|
|
+
|
|
|
+// ---------------------- VcfVariant SV graph
|
|
|
+
|
|
|
use petgraph::graph::{DiGraph, NodeIndex};
|
|
|
use petgraph::visit::{IntoNodeIdentifiers, NodeIndexable};
|
|
|
use petgraph::Direction;
|
|
|
@@ -1321,8 +1472,8 @@ impl Infos {
|
|
|
|
|
|
pub fn freq_maf(&self) -> Option<f32> {
|
|
|
self.0.iter().find_map(|i| {
|
|
|
- if let Info::FREQ(s) = i {
|
|
|
- parse_maf_from_freq(s)
|
|
|
+ if let Info::FREQ(freq) = i {
|
|
|
+ freq.maf()
|
|
|
} else {
|
|
|
None
|
|
|
}
|
|
|
@@ -1330,42 +1481,6 @@ impl Infos {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-/// Average ALT frequency across real population sources in a dbSNP FREQ field.
|
|
|
-///
|
|
|
-/// Format: `KOREAN:0.92,0.08|TOMMO:0.94,0.06|SGDP_PRJ:0.5,0.5|...`
|
|
|
-/// - Index 0 = REF freq, index 1 = ALT freq
|
|
|
-/// - Excludes SGDP_PRJ (encodes presence as 0.5, not real frequency)
|
|
|
-/// - Excludes dbGaP_PopFreq (ascertainment bias from disease cohorts)
|
|
|
-/// - Skips sources with ALT freq == 0.0 (not observed, not absent)
|
|
|
-///
|
|
|
-/// Returns `None` if no valid sources found.
|
|
|
-pub fn parse_maf_from_freq(freq: &str) -> Option<f32> {
|
|
|
- const EXCLUDED: &[&str] = &["SGDP_PRJ", "dbGaP_PopFreq"];
|
|
|
-
|
|
|
- let (sum, count) = freq
|
|
|
- .split('|')
|
|
|
- .filter_map(|source| {
|
|
|
- let (name, alleles) = source.split_once(':')?;
|
|
|
- if EXCLUDED.contains(&name) {
|
|
|
- return None;
|
|
|
- }
|
|
|
- let alt = alleles.split(',').nth(1)?.parse::<f32>().ok()?;
|
|
|
- // 0.0 means not observed in this source, skip rather than pulling MAF down
|
|
|
- if alt <= 0.0 {
|
|
|
- None
|
|
|
- } else {
|
|
|
- Some(alt)
|
|
|
- }
|
|
|
- })
|
|
|
- .fold((0.0_f32, 0usize), |(s, c), af| (s + af, c + 1));
|
|
|
-
|
|
|
- if count == 0 {
|
|
|
- None
|
|
|
- } else {
|
|
|
- Some(sum / count as f32)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
/// Enum representing a single INFO field in a VCF record.
|
|
|
///
|
|
|
/// Supports both standard fields and Severus-specific structural variant annotations.
|
|
|
@@ -1441,7 +1556,7 @@ pub enum Info {
|
|
|
INSIDE_VNTR(String),
|
|
|
ALINGED_POS(String),
|
|
|
// dbSNP
|
|
|
- FREQ(String),
|
|
|
+ FREQ(DbSnpFreq),
|
|
|
COMMON,
|
|
|
RS(u32),
|
|
|
}
|
|
|
@@ -1524,7 +1639,11 @@ impl FromStr for Info {
|
|
|
"MATE_ID" => Info::MATE_ID(value.to_string()),
|
|
|
"INSIDE_VNTR" => Info::INSIDE_VNTR(value.to_string()),
|
|
|
"ALINGED_POS" => Info::ALINGED_POS(value.to_string()),
|
|
|
- "FREQ" => Info::FREQ(value.to_string()),
|
|
|
+ "FREQ" => Info::FREQ(
|
|
|
+ value
|
|
|
+ .parse()
|
|
|
+ .with_context(|| format!("Failed to parse FREQ: `{value}`"))?,
|
|
|
+ ),
|
|
|
"RS" => Info::RS(parse_value(value, key)?),
|
|
|
|
|
|
_ => Info::Empty,
|