Sfoglia il codice sorgente

bug: GATK mutect2 SM tag in normal bam not corresponding to sample_name in CMD line. solution: parse SM tag and use it as sample_name. caveat: no verification vbut as long names are generic and attributed by ssystem ok/

Thomas 3 settimane fa
parent
commit
4690b66eea
2 ha cambiato i file con 30 aggiunte e 5 eliminazioni
  1. 11 5
      src/callers/gatk.rs
  2. 19 0
      src/io/bam.rs

+ 11 - 5
src/callers/gatk.rs

@@ -133,13 +133,11 @@ use crate::{
     annotation::{Annotation, Annotations, Caller, CallerCat, Sample},
     collection::vcf::Vcf,
     commands::{
-        bcftools::{BcftoolsConcat, BcftoolsIndex, BcftoolsKeepPass},
-        Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams,
-        SlurmRunner,
+        Command as JobCommand, LocalBatchRunner, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner, bcftools::{BcftoolsConcat, BcftoolsIndex, BcftoolsKeepPass}
     },
     config::Config,
     helpers::{is_file_older, remove_dir_if_exists, singularity_bind_flags},
-    io::{bed::BedRow, vcf::read_vcf},
+    io::{bam::read_sm_tag, bed::BedRow, vcf::read_vcf},
     locker::SampleLock,
     pipes::{Initialize, ShouldRun, Version},
     run, run_many,
@@ -185,6 +183,8 @@ pub struct Mutect2 {
     pub id: String,
     /// BED interval file for this run (full BED or chunk sub-BED).
     pub bed_path: PathBuf,
+    /// Normal sample name as found in the `@RG SM` tag of the normal BAM header.
+    pub normal_sample: String,
     /// Directory for log file storage.
     pub log_dir: String,
     /// Global pipeline configuration.
@@ -226,9 +226,15 @@ impl Initialize for Mutect2 {
             bed_path.display()
         );
 
+        let normal_bam = config.normal_bam(&id);
+        let normal_sample = read_sm_tag(&normal_bam)
+            .with_context(|| format!("Failed to read @RG SM from normal BAM: {normal_bam}"))?;
+        info!("Normal sample name from BAM header: {normal_sample}");
+
         let mutect2 = Self {
             id,
             bed_path,
+            normal_sample,
             log_dir,
             config: config.clone(),
             part_index: None,
@@ -276,7 +282,7 @@ impl JobCommand for Mutect2 {
         let bed = self.bed_path.display();
 
         // Normal sample name must match @RG SM tag in normal BAM header.
-        let sample_name_normal = format!("{}_{}", self.id, self.config.normal_name);
+        let sample_name_normal = &self.normal_sample;
 
         let bind_flags = singularity_bind_flags([
             &self.config.normal_bam(&self.id),

+ 19 - 0
src/io/bam.rs

@@ -1,5 +1,6 @@
 use std::collections::{HashMap, HashSet};
 
+use anyhow::Context;
 use log::warn;
 use rust_htslib::bam::{self, record::Aux, record::Cigar, IndexedReader, Read, Record};
 
@@ -749,3 +750,21 @@ pub fn fb_inv_from_record(
         sa_qend,
     })
 }
+
+pub fn read_sm_tag(bam_path: &str) -> anyhow::Result<String> {
+    let reader = bam::Reader::from_path(bam_path)
+        .with_context(|| format!("Failed to open BAM: {bam_path}"))?;
+    let header = bam::Header::from_template(reader.header());
+    let header_text = String::from_utf8_lossy(&header.to_bytes()).to_string();
+
+    for line in header_text.lines() {
+        if line.starts_with("@RG") {
+            for field in line.split('\t') {
+                if let Some(sm) = field.strip_prefix("SM:") {
+                    return Ok(sm.to_string());
+                }
+            }
+        }
+    }
+    anyhow::bail!("No SM tag found in @RG header of {bam_path}")
+}