Ver Fonte

bug: GATK mutect2 use SM tag but no tag if no barcode by dorado. solution: add sm tag and reheader

Thomas há 3 semanas atrás
pai
commit
83f7501240
3 ficheiros alterados com 149 adições e 8 exclusões
  1. 15 5
      src/callers/gatk.rs
  2. 93 0
      src/commands/samtools.rs
  3. 41 3
      src/io/bam.rs

+ 15 - 5
src/callers/gatk.rs

@@ -133,11 +133,13 @@ use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::vcf::Vcf,
     commands::{
-        Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner, bcftools::{BcftoolsConcat, BcftoolsIndex, BcftoolsKeepPass}
+        bcftools::{BcftoolsConcat, BcftoolsIndex, BcftoolsKeepPass},
+        Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams,
+        SlurmRunner,
     },
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists, singularity_bind_flags},
-    io::{bam::read_sm_tag, bed::BedRow, vcf::read_vcf},
+    io::{bam::read_sm_tag_or_inject, bed::BedRow, vcf::read_vcf},
     locker::SampleLock,
     pipes::{Initialize, ShouldRun, Version},
     run, run_many,
@@ -226,9 +228,17 @@ impl Initialize for Mutect2 {
             bed_path.display()
         );
 
-        let normal_bam = config.normal_bam(&id);
-        let normal_sample = read_sm_tag(&normal_bam)
-            .with_context(|| format!("Failed to read @RG SM from normal BAM: {normal_bam}"))?;
+        let normal_sample = read_sm_tag_or_inject(
+            &config.normal_bam(&id),
+            &format!("{id}_{}", config.normal_name),
+            config,
+        )?;
+        // Also fix tumor BAM (GATK reads SM from both)
+        let _tumor_sample = read_sm_tag_or_inject(
+            &config.tumoral_bam(&id),
+            &format!("{id}_{}", config.tumoral_name),
+            config,
+        )?;
         info!("Normal sample name from BAM header: {normal_sample}");
 
         let mutect2 = Self {

+ 93 - 0
src/commands/samtools.rs

@@ -92,6 +92,99 @@ impl Run for SamtoolsIndex {
     }
 }
 
+/// Wrapper around `samtools reheader` to inject a missing `SM` tag into `@RG` lines.
+///
+/// This is needed for ONT BAMs produced by dorado/MinKNOW where the `@RG` header
+/// may lack an `SM` field (e.g., when barcode is "unclassified"). GATK Mutect2
+/// requires `SM` on every `@RG` line and will crash without it.
+///
+/// The operation is **header-only** — no read data is rewritten, making it
+/// nearly instant even on large BAMs.
+///
+/// Produces a command of the form:
+///
+/// ```text
+/// bash -c '<bin> view -H <bam> | sed "s/^\(@RG.*\)/\1\tSM:<sample>/" | <bin> reheader --in-place - <bam>'
+/// ```
+///
+/// **Important**: After reheader, the BAM index (`.bai`) should be regenerated
+/// via [`SamtoolsIndex`] since header offsets may change.
+#[derive(Debug)]
+pub struct SamtoolsReheader {
+    /// Path to the `samtools` binary.
+    pub bin: String,
+    /// Path to the BAM file to modify in-place.
+    pub bam: String,
+    /// Sample name to inject as `SM:<sample>` in all `@RG` lines.
+    pub sample: String,
+    slurm: bool,
+}
+
+impl super::Command for SamtoolsReheader {
+    fn cmd(&self) -> String {
+        // Uses --in-place to avoid tmp file + rename dance.
+        // The sed appends SM:<sample> to every @RG line that doesn't already have one.
+        format!(
+            "bash -c '{bin} view -H {bam} \
+             | sed \"/^@RG/{{/SM:/!s/$/\\tSM:{sample}/}}\" \
+             | {bin} reheader --in-place - {bam}'",
+            bin = self.bin,
+            bam = self.bam,
+            sample = self.sample,
+        )
+    }
+}
+
+impl super::LocalRunner for SamtoolsReheader {}
+impl super::LocalBatchRunner for SamtoolsReheader {}
+
+impl super::SlurmRunner for SamtoolsReheader {
+    fn slurm_args(&self) -> Vec<String> {
+        SlurmParams {
+            job_name: Some("samtools_reheader".into()),
+            cpus_per_task: Some(1),
+            mem: Some("4G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+        .to_args()
+    }
+}
+
+impl super::SbatchRunner for SamtoolsReheader {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some("samtools_reheader".into()),
+            cpus_per_task: Some(1),
+            mem: Some("4G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+    }
+}
+
+impl SamtoolsReheader {
+    pub fn from_config(config: &Config, bam: &str, sample: &str) -> Self {
+        Self {
+            bin: config.align.samtools_bin.clone(),
+            bam: bam.to_string(),
+            sample: sample.to_string(),
+            slurm: config.slurm_runner,
+        }
+    }
+}
+
+impl Run for SamtoolsReheader {
+    fn run(&mut self) -> anyhow::Result<()> {
+        if self.slurm {
+            let _output = SlurmRunner::exec(self)?;
+        } else {
+            let _output = LocalRunner::exec(self)?;
+        }
+        Ok(())
+    }
+}
+
 /// Wrapper around a `samtools merge` invocation used to append one BAM into
 /// another while preserving read group (RG) uniqueness.
 ///

+ 41 - 3
src/io/bam.rs

@@ -1,9 +1,11 @@
 use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
-use log::warn;
+use log::{info, warn};
 use rust_htslib::bam::{self, record::Aux, record::Cigar, IndexedReader, Read, Record};
 
+use crate::{commands::samtools::{SamtoolsIndex, SamtoolsReheader}, config::Config, runners::Run};
+
 /// Parses an SA tag string and extracts chromosome and position information.
 ///
 /// # Arguments
@@ -751,7 +753,29 @@ pub fn fb_inv_from_record(
     })
 }
 
-pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
+// pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
+//     let reader = bam::Reader::from_path(bam_path)
+//         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
+//     let header = bam::Header::from_template(reader.header());
+//     let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
+//
+//     for line in header_text.lines() {
+//         if line.starts_with("@RG") {
+//             for field in line.split('\t') {
+//                 if let Some(sm) = field.strip_prefix("SM:") {
+//                     return Ok(sm.to_string());
+//                 }
+//             }
+//         }
+//     }
+//     anyhow::bail!("No SM tag found in @RG header of {bam_path}")
+// }
+
+pub fn read_sm_tag_or_inject(
+    bam_path: &str,
+    fallback_sample: &str,
+    config: &Config,
+) -> anyhow::Result<String> {
     let reader = bam::Reader::from_path(bam_path)
         .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
     let header = bam::Header::from_template(reader.header());
@@ -766,5 +790,19 @@ pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
             }
         }
     }
-    anyhow::bail!("No SM tag found in @RG header of {bam_path}")
+
+    // SM missing (dorado/MinKNOW unclassified barcode) — inject it
+    info!("No @RG SM tag in {bam_path}, injecting SM:{fallback_sample}");
+
+    let mut reheader = SamtoolsReheader::from_config(config, bam_path, fallback_sample);
+    reheader
+        .run()
+        .with_context(|| format!("Failed to inject SM into {bam_path}"))?;
+
+    let mut index = SamtoolsIndex::from_config(config, bam_path);
+    index
+        .run()
+        .with_context(|| format!("Failed to re-index {bam_path}"))?;
+
+    Ok(fallback_sample.to_string())
 }