Browse source code

improved BAM INS detection

Thomas 1 week ago
parent
commit
478f26daf0

+ 22 - 0
Cargo.lock

@@ -1714,6 +1714,19 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "noodles-tabix"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bf2eaf575add136ea3e2f72983e175c549fc09e267efbec1ff89f2bc8c92821"
+dependencies = [
+ "bstr",
+ "indexmap",
+ "noodles-bgzf",
+ "noodles-core",
+ "noodles-csi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.6"
@@ -1908,10 +1921,12 @@ dependencies = [
  "itertools 0.14.0",
  "lazy_static",
  "log",
+ "noodles-bgzf",
  "noodles-core",
  "noodles-csi",
  "noodles-fasta",
  "noodles-gff",
+ "noodles-tabix",
  "num-format",
  "ordered-float",
  "pandora_lib_assembler",
@@ -1929,6 +1944,7 @@ dependencies = [
  "thiserror 2.0.17",
  "toml 0.9.10+spec-1.1.0",
  "tracing",
+ "triple_accel",
  "uuid",
  "walkdir",
 ]
@@ -2905,6 +2921,12 @@ dependencies = [
  "once_cell",
 ]
 
+[[package]]
+name = "triple_accel"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.22"

+ 3 - 0
Cargo.toml

@@ -49,6 +49,9 @@ hex = "0.4.3"
 walkdir = "2.5.0"
 thiserror = "2.0.17"
 hostname = "0.4.2"
+noodles-tabix = "0.59.0"
+noodles-bgzf = "0.45.0"
+triple_accel = "0.4.0"
 
 [profile.dev]
 opt-level = 0

+ 1 - 1
src/annotation/echtvar.rs

@@ -124,7 +124,7 @@ mod tests {
         test_init();
 
         let s = "gnomad_ac=1;gnomad_an=-1;gnomad_af=-1;gnomad_af_oth=-1;gnomad_af_ami=-1;gnomad_af_sas=-1;gnomad_af_fin=-1;gnomad_af_eas=-1;gnomad_af_amr=-1;gnomad_af_afr=-1;gnomad_af_mid=-1;gnomad_af_asj=-1;gnomad_af_nfe=-1;CNT=188";
-        let (cosmic, gnomad) = parse_echtvar_val(s)?;
+        let (cosmic, _gnomad) = parse_echtvar_val(s)?;
 
         println!("{cosmic:#?}");
 

+ 5 - 1
src/callers/mod.rs

@@ -140,7 +140,11 @@ use crate::{
     callers::{
         clairs::ClairS, deep_somatic::DeepSomatic, deep_variant::DeepVariant, nanomonsv::NanomonSV,
         savana::Savana, severus::Severus, straglr::Straglr,
-    }, commands::longphase::run_phasing_somatic, config::Config, pipes::{Initialize, InitializeSolo}, runners::Run
+    },
+    commands::longphase::run_phasing_somatic,
+    config::Config,
+    pipes::{Initialize, InitializeSolo},
+    runners::Run,
 };
 
 pub mod clairs;

+ 158 - 13
src/collection/bam.rs

@@ -953,7 +953,6 @@ impl fmt::Display for WGSBamStats {
     }
 }
 
-
 /// Result of querying a read at a reference position.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub enum PileBase {
@@ -963,7 +962,10 @@ pub enum PileBase {
     T,
     N,
     /// insertion immediately after the queried base
-    Ins,
+    Ins {
+        len: u32,
+        seq: Option<Vec<u8>>,
+    },
     /// deletion covering the queried base: (qname, del_len)
     Del((Vec<u8>, u32)),
     /// reference skip (N)
@@ -995,6 +997,74 @@ fn decode(n: u8) -> PileBase {
     }
 }
 
+#[inline]
+fn decode_str(n: u8) -> u8 {
+    match n & 0x0f {
+        1 => b'A',
+        2 => b'C',
+        4 => b'G',
+        8 => b'T',
+        15 => b'N', // also use N for ambiguous (e.g. 3,5,6,...)
+        _ => b'N',
+    }
+}
+
+
+// pub fn base_at_new(
+//     record: &rust_htslib::bam::Record,
+//     at_pos: i64,
+//     with_next_ins: bool,
+// ) -> Option<PileBase> {
+//     if record.pos() > at_pos {
+//         return None;
+//     }
+//
+//     let mut ref_pos = record.pos();
+//     let mut read_pos: i64 = 0;
+//     let cigar = record.cigar();
+//
+//     for (i, op) in cigar.iter().enumerate() {
+//         let (consume_read, consume_ref) = match *op {
+//             Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (l as i64, l as i64),
+//             Cigar::Ins(l) | Cigar::SoftClip(l) => (l as i64, 0),
+//             Cigar::Del(l) | Cigar::RefSkip(l) => (0, l as i64),
+//             Cigar::HardClip(_) | Cigar::Pad(_) => (0, 0),
+//         };
+//
+//         // Look-ahead: insertion immediately AFTER the queried base?
+//         if with_next_ins
+//             && ref_pos + consume_ref == at_pos + 1
+//             && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
+//         {
+//             return Some(PileBase::Ins);
+//         }
+//
+//         // Does the queried reference coordinate fall inside this CIGAR op?
+//         if ref_pos + consume_ref > at_pos {
+//             return Some(match *op {
+//                 Cigar::Del(l) => PileBase::Del((record.qname().to_vec(), l)),
+//                 Cigar::RefSkip(_) => PileBase::Skip,
+//                 // Match/mismatch or soft-clipped covering (only match/mismatch should consume ref)
+//                 _ => {
+//                     let offset = (at_pos - ref_pos) as usize;
+//                     // Guard against malformed records
+//                     let seq_len = record.seq_len();
+//                     if (read_pos as usize) + offset >= seq_len {
+//                         return None;
+//                     }
+//                     let b4 = record.seq().encoded_base((read_pos as usize) + offset);
+//                     decode(b4)
+//                 }
+//             });
+//         }
+//
+//         read_pos += consume_read;
+//         ref_pos += consume_ref;
+//     }
+//
+//     None // beyond alignment
+// }
+
 pub fn base_at_new(
     record: &rust_htslib::bam::Record,
     at_pos: i64,
@@ -1008,6 +1078,28 @@ pub fn base_at_new(
     let mut read_pos: i64 = 0;
     let cigar = record.cigar();
 
+    // Helper: extract insertion seq if available
+    let extract_ins_seq = |start: usize, len: usize| -> Option<Vec<u8>> {
+        let seq_len = record.seq_len();
+        if seq_len == 0 || start + len > seq_len {
+            return None;
+        }
+        // Build ASCII bases without allocating full read-as-bytes
+        let mut out = Vec::with_capacity(len);
+        for i in 0..len {
+            let eb = record.seq().encoded_base(start + i);
+            out.push(match eb {
+                1 => b'A',
+                2 => b'C',
+                4 => b'G',
+                8 => b'T',
+                15 => b'N',
+                _ => b'N',
+            });
+        }
+        Some(out)
+    };
+
     for (i, op) in cigar.iter().enumerate() {
         let (consume_read, consume_ref) = match *op {
             Cigar::Match(l) | Cigar::Equal(l) | Cigar::Diff(l) => (l as i64, l as i64),
@@ -1021,7 +1113,18 @@ pub fn base_at_new(
             && ref_pos + consume_ref == at_pos + 1
             && matches!(cigar.get(i + 1), Some(Cigar::Ins(_)))
         {
-            return Some(PileBase::Ins);
+            if let Some(Cigar::Ins(l)) = cigar.get(i + 1) {
+                let len_u32 = *l;
+                let len = len_u32 as usize;
+
+                // insertion starts at the read position right after this op
+                let ins_start = (read_pos + consume_read) as usize;
+
+                return Some(PileBase::Ins {
+                    len: len_u32,
+                    seq: extract_ins_seq(ins_start, len),
+                });
+            }
         }
 
         // Does the queried reference coordinate fall inside this CIGAR op?
@@ -1029,15 +1132,17 @@ pub fn base_at_new(
             return Some(match *op {
                 Cigar::Del(l) => PileBase::Del((record.qname().to_vec(), l)),
                 Cigar::RefSkip(_) => PileBase::Skip,
-                // Match/mismatch or soft-clipped covering (only match/mismatch should consume ref)
                 _ => {
+                    // Match/mismatch covering this ref base (softclip/ins don't consume ref, so
+                    // in well-formed data they shouldn't land here; keep the guard anyway).
                     let offset = (at_pos - ref_pos) as usize;
-                    // Guard against malformed records
-                    let seq_len = record.seq_len();
-                    if (read_pos as usize) + offset >= seq_len {
+                    let idx = (read_pos as usize) + offset;
+
+                    if record.seq_len() == 0 || idx >= record.seq_len() {
                         return None;
                     }
-                    let b4 = record.seq().encoded_base((read_pos as usize) + offset);
+
+                    let b4 = record.seq().encoded_base(idx);
                     decode(b4)
                 }
             });
@@ -1047,7 +1152,7 @@ pub fn base_at_new(
         ref_pos += consume_ref;
     }
 
-    None // beyond alignment
+    None
 }
 
 /// Return the pile-up of **all reads** at `chr:pos` (0-based, inclusive).
@@ -1065,6 +1170,7 @@ pub fn nt_pileup_new(
     pos: u32,
     with_next_ins: bool,
 ) -> anyhow::Result<Vec<PileBase>> {
+    // debug!("Pileup @ {chr}:{pos}");
     let mut pile = Vec::new();
 
     // Restrict BAM iterator to the one-base span of interest.
@@ -1082,10 +1188,32 @@ pub fn nt_pileup_new(
 
         for aln in pileup.alignments() {
             match aln.indel() {
-                rust_htslib::bam::pileup::Indel::Ins(_) => {
-                    // insertion *starts at* this reference position
-                    pile.push(PileBase::Ins);
+                rust_htslib::bam::pileup::Indel::Ins(len) => {
+                    let rec = aln.record();
+                    let seq = if rec.seq_len() > 0 {
+                        let qpos = match aln.qpos() {
+                            Some(q) => q + 1,
+                            None => continue,
+                        };
+                        let l = len as usize;
+
+                        if qpos + l <= rec.seq_len() {
+                            let mut v = Vec::with_capacity(l);
+                            for i in 0..l {
+                                let b = rec.seq().encoded_base(qpos + i);
+                                v.push(decode_str(b));
+                            }
+                            Some(v)
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    };
+
+                    pile.push(PileBase::Ins { len, seq });
                 }
+
                 rust_htslib::bam::pileup::Indel::Del(e) => {
                     let qname = aln.record().qname().to_vec();
                     pile.push(PileBase::Del((qname, e)));
@@ -1107,7 +1235,7 @@ mod tests {
 
     use crate::{
         collection::{
-            bam::{bam_composition, WGSBam},
+            bam::{bam_composition, nt_pileup_new, WGSBam},
             run::PromRuns,
         },
         config::Config,
@@ -1167,4 +1295,21 @@ mod tests {
         rs.stats();
         Ok(())
     }
+
+    #[test]
+    fn bam_pileup() -> anyhow::Result<()> {
+        test_init();
+
+        let config = Config::default();
+        let constit_bam_path = config.normal_bam("CHAHA");
+
+        let mut bam = rust_htslib::bam::IndexedReader::from_path(&constit_bam_path)
+            .map_err(|e| anyhow::anyhow!("Failed to open BAM file: {e}"))?;
+
+        let pu = nt_pileup_new(&mut bam, "chr1", 285, false)?;
+
+        dbg!(pu);
+
+        Ok(())
+    }
 }
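
The enriched PileBase::Ins { len, seq } variant above carries the insertion length and, when recoverable, the inserted bases themselves. A minimal, self-contained sketch of how downstream code might consume it; the trimmed PileBase mirror here is hypothetical and only stands in for the real enum in src/collection/bam.rs:

// Trimmed, hypothetical mirror of the crate's PileBase enum, just enough for
// this sketch to compile on its own.
#[derive(Debug)]
enum PileBase {
    A,
    Ins { len: u32, seq: Option<Vec<u8>> },
    Del((Vec<u8>, u32)),
}

/// Count pileup entries that are insertions whose length is within `tol` bases
/// of `expected_len`, whether or not the inserted sequence was recovered.
fn count_ins_near(pile: &[PileBase], expected_len: u32, tol: u32) -> usize {
    pile.iter()
        .filter(|p| matches!(p, PileBase::Ins { len, .. } if len.abs_diff(expected_len) <= tol))
        .count()
}

fn main() {
    let pile = vec![
        PileBase::A,
        PileBase::Ins { len: 5, seq: Some(b"ACGTA".to_vec()) },
        PileBase::Ins { len: 4, seq: None }, // length known, bases not recovered
        PileBase::Del((b"read_7".to_vec(), 2)),
    ];
    assert_eq!(count_ins_near(&pile, 5, 1), 2);
    println!("insertions near len 5: {}", count_ins_near(&pile, 5, 1));
}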

+ 2 - 2
src/collection/bam_stats.rs

@@ -1359,9 +1359,9 @@ mod tests {
 
         let config = Config::default();
 
-        let stats = WGSBamStats::open("CHAHA", "diag", &config)?;
+        let stats = WGSBamStats::open("36167", "norm", &config)?;
         println!("{stats}");
-        let stats = WGSBamStats::open("DUMCO", "diag", &config)?;
+        let stats = WGSBamStats::open("36434", "norm", &config)?;
         println!("{stats}");
         Ok(())
     }

+ 1 - 0
src/collection/prom_run.rs

@@ -1350,6 +1350,7 @@ fn atomic_replace(source: &Path, destination: &Path) -> anyhow::Result<()> {
 
     Ok(())
 }
+
 /// Indexes a BAM file using samtools.
 fn index_bam(bam: &Path, config: &Config) -> anyhow::Result<()> {
     let mut index_cmd = SamtoolsIndex::from_config(config, bam.to_string_lossy().as_ref());

+ 1 - 1
src/collection/vcf.rs

@@ -31,7 +31,7 @@ impl Vcf {
         let caller = stem_splt[2..stem_splt.len() - 1].join("_");
 
         if !PathBuf::from(format!("{}.csi", path.display())).exists() {
-            anyhow::bail!("CSI index for VCF doesn't exist.")
+            anyhow::bail!("CSI index for {} doesn't exist.", path.display())
         }
 
         let n_variants = n_variants(path.to_str().context("Can't convert path to str")?)?;

+ 407 - 6
src/commands/modkit.rs

@@ -1,12 +1,21 @@
-use std::{fs::File, io::{BufRead, BufReader}, path::Path, str::FromStr};
+use std::{
+    cmp::Ordering,
+    fs::File,
+    io::{BufRead, BufReader, BufWriter, Write},
+    path::Path,
+    str::FromStr,
+};
 
 use anyhow::Context;
 
 use crate::{
     commands::{Command, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner},
     config::Config,
-    pipes::InitializeSolo,
+    helpers::diverging_rgb,
+    io::bed::convert_bgz_with_tabix,
+    pipes::{Initialize, InitializeSolo},
     run,
+    runners::Run,
 };
 
 pub struct ModkitSummary {
@@ -174,6 +183,7 @@ pub struct ModkitPileupMarlin {
     time: String,
     config: Config,
 }
+
 impl InitializeSolo for ModkitPileupMarlin {
     fn initialize(id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
         Ok(Self {
@@ -189,7 +199,12 @@ impl Command for ModkitPileupMarlin {
     fn cmd(&self) -> String {
         let hp_bam = self.config.solo_haplotagged_bam(&self.id, &self.time);
         let bam = self.config.solo_bam(&self.id, &self.time);
-        let out = format!("{}/{}_{}_MARLIN.bed", self.config.solo_dir(&self.id, &self.time), &self.id, &self.time);
+        let out = format!(
+            "{}/{}_{}_MARLIN.bed",
+            self.config.solo_dir(&self.id, &self.time),
+            &self.id,
+            &self.time
+        );
         let bam = if Path::new(&hp_bam).exists() {
             hp_bam
         } else {
@@ -235,14 +250,352 @@ impl SbatchRunner for ModkitPileupMarlin {
 impl ModkitPileupMarlin {
     pub fn run(&mut self) -> anyhow::Result<()> {
         let report = run!(&self.config, self)?;
-        let log_prefix = format!("{}/{}/log/modkit_pileup_{}", self.config.result_dir, self.id, self.time);
+        let log_prefix = format!(
+            "{}/{}/log/modkit_pileup_{}",
+            self.config.result_dir, self.id, self.time
+        );
         report.save_to_file(&log_prefix).context(format!(
-            "Error while writing Severus logs into {log_prefix}"
+            "Error while writing Modkit Pileup logs into {log_prefix}"
+        ))?;
+        Ok(())
+    }
+}
+
+pub struct ModkitPileup {
+    id: String,
+    time: String,
+    config: Config,
+    bam: String,
+    out: String,
+}
+
+impl InitializeSolo for ModkitPileup {
+    fn initialize(id: &str, time: &str, config: &Config) -> anyhow::Result<Self> {
+        let hp_bam = config.solo_haplotagged_bam(id, time);
+        let bam = config.solo_bam(id, time);
+        let out = format!(
+            "{}/{}_{}_modkit_pileup.bed",
+            config.solo_dir(id, time),
+            id,
+            time
+        );
+        let bam = if Path::new(&hp_bam).exists() {
+            hp_bam
+        } else {
+            bam
+        };
+
+        Ok(Self {
+            id: id.to_string(),
+            time: time.to_string(),
+
+            config: config.clone(),
+            bam,
+            out,
+        })
+    }
+}
+
+impl Command for ModkitPileup {
+    fn cmd(&self) -> String {
+        format!(
+            "{modkit} pileup {bam} {out} -t {threads} --combine-mods --ref {reference} --cpg --modified-bases 5mC",
+            modkit = self.config.modkit_bin,
+            bam = self.bam,
+            out = self.out,
+            threads = self.config.modkit_summary_threads,
+            reference = self.config.reference,
+        )
+    }
+}
+
+impl Run for ModkitPileup {
+    fn run(&mut self) -> anyhow::Result<()> {
+        run!(&self.config, self)?;
+        convert_bgz_with_tabix(&self.out, true)?;
+        Ok(())
+    }
+}
+
+impl LocalRunner for ModkitPileup {}
+impl SlurmRunner for ModkitPileup {
+    fn slurm_args(&self) -> Vec<String> {
+        SlurmParams {
+            job_name: Some(format!("modkit_pileup_{}_{}", self.id, self.time)),
+            cpus_per_task: Some(self.config.modkit_summary_threads.into()),
+            mem: Some("40G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+        .to_args()
+    }
+}
+impl SbatchRunner for ModkitPileup {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some(format!("modkit_pileup_{}_{}", self.id, self.time)),
+            cpus_per_task: Some(self.config.modkit_summary_threads.into()),
+            mem: Some("40G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+    }
+}
+
+impl ModkitPileup {
+    pub fn run(&mut self) -> anyhow::Result<()> {
+        let report = run!(&self.config, self)?;
+        let log_prefix = format!(
+            "{}/{}/log/modkit_pileup_{}",
+            self.config.result_dir, self.id, self.time
+        );
+        report.save_to_file(&log_prefix).context(format!(
+            "Error while writing Modkit Pileup logs into {log_prefix}"
         ))?;
         Ok(())
     }
 }
 
+pub struct ModkitDMR {
+    id: String,
+    config: Config,
+    pileup_nor: String,
+    pileup_tum: String,
+    out: String,
+}
+
+impl Initialize for ModkitDMR {
+    fn initialize(id: &str, config: &Config) -> anyhow::Result<Self> {
+        let pileup_nor = format!(
+            "{}/{}_{}_modkit_pileup.bed.gz",
+            config.normal_dir(id),
+            id,
+            config.normal_name
+        );
+        let pileup_tum = format!(
+            "{}/{}_{}_modkit_pileup.bed.gz",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+        let out = format!(
+            "{}/{}_{}_modkit_dmr_tum_nor.tsv",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+
+        Ok(Self {
+            id: id.to_string(),
+            config: config.clone(),
+            pileup_nor,
+            pileup_tum,
+            out,
+        })
+    }
+}
+
+impl Command for ModkitDMR {
+    fn cmd(&self) -> String {
+        format!(
+            "{modkit} dmr pair -a {normal} -b {tumoral} -o {out} -t {threads} --ref {reference} --base C -r {regions}",
+            modkit = self.config.modkit_bin,
+            normal = self.pileup_nor,
+            tumoral = self.pileup_tum,
+            out = self.out,
+            threads = self.config.modkit_summary_threads,
+            reference = self.config.reference,
+            regions = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed"
+        )
+    }
+}
+
+impl Run for ModkitDMR {
+    fn run(&mut self) -> anyhow::Result<()> {
+        run!(&self.config, self)?;
+        convert_bgz_with_tabix(&self.out, true)?;
+        Ok(())
+    }
+}
+
+impl LocalRunner for ModkitDMR {}
+impl SlurmRunner for ModkitDMR {
+    fn slurm_args(&self) -> Vec<String> {
+        SlurmParams {
+            job_name: Some(format!("modkit_dmr_{}", self.id)),
+            cpus_per_task: Some(self.config.modkit_summary_threads.into()),
+            mem: Some("40G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+        .to_args()
+    }
+}
+impl SbatchRunner for ModkitDMR {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some(format!("modkit_dmr_{}", self.id)),
+            cpus_per_task: Some(self.config.modkit_summary_threads.into()),
+            mem: Some("40G".into()),
+            partition: Some("shortq".into()),
+            gres: None,
+        }
+    }
+}
+
+impl ModkitDMR {
+    pub fn run(&mut self) -> anyhow::Result<()> {
+        let report = run!(&self.config, self)?;
+        let log_prefix = format!(
+            "{}/{}/log/modkit_dmr_{}",
+            self.config.result_dir, self.id, self.config.tumoral_name
+        );
+        report.save_to_file(&log_prefix).context(format!(
+            "Error while writing Modkit DMR logs into {log_prefix}"
+        ))?;
+        Ok(())
+    }
+}
+
+/// One DMR genomic interval with an associated numeric value
+/// (e.g. delta methylation or test statistic).
+#[derive(Debug, Clone)]
+pub struct DmrInterval {
+    pub chrom: String,
+    pub start: u64,
+    pub end: u64,
+    pub name: String,
+    pub value: f64,
+}
+
+/// Read a modkit DMR TSV file and extract genomic intervals plus one numeric column.
+///
+/// The TSV is expected to contain at least:
+/// 1. chrom
+/// 2. start
+/// 3. end
+/// 4. name
+///
+/// # Arguments
+/// * `path` – path to the TSV file
+/// * `value_col_1based` – 1-based index of the column used as `value`
+///
+/// # Errors
+/// Returns an error if the file cannot be read, required fields are missing,
+/// or the value column cannot be parsed as `f64`.
+pub fn read_dmr_tsv(path: &str, value_col_1based: usize) -> anyhow::Result<Vec<DmrInterval>> {
+    if value_col_1based == 0 {
+        return Err(anyhow::anyhow!("value_col_1based must be >= 1"));
+    }
+    let value_idx = value_col_1based - 1;
+
+    let mut rdr = csv::ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(false)
+        .flexible(true)
+        .from_path(path)
+        .with_context(|| format!("Failed to open input TSV: {path}"))?;
+
+    let mut intervals = Vec::new();
+
+    for (i, row) in rdr.records().enumerate() {
+        let row = row.with_context(|| format!("CSV parse error at line {}", i + 1))?;
+
+        let chrom = row
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("Missing chrom at line {}", i + 1))?
+            .to_string();
+
+        let start: u64 = row
+            .get(1)
+            .ok_or_else(|| anyhow::anyhow!("Missing start at line {}", i + 1))?
+            .parse()
+            .with_context(|| format!("Invalid start at line {}", i + 1))?;
+
+        let end: u64 = row
+            .get(2)
+            .ok_or_else(|| anyhow::anyhow!("Missing end at line {}", i + 1))?
+            .parse()
+            .with_context(|| format!("Invalid end at line {}", i + 1))?;
+
+        let name = row.get(3).unwrap_or(".").to_string();
+
+        let value_str = row.get(value_idx).ok_or_else(|| {
+            anyhow::anyhow!(
+                "Missing value column {} at line {}",
+                value_col_1based,
+                i + 1
+            )
+        })?;
+
+        let value: f64 = value_str.parse().with_context(|| {
+            format!(
+                "Invalid float in value col {} at line {}",
+                value_col_1based,
+                i + 1
+            )
+        })?;
+
+        intervals.push(DmrInterval {
+            chrom,
+            start,
+            end,
+            name,
+            value,
+        });
+    }
+
+    Ok(intervals)
+}
+
+/// Sort DMR intervals in-place by chromosome and start coordinate.
+///
+/// IGV requires BED files to be sorted lexicographically by chrom,
+/// then numerically by start.
+pub fn sort_intervals(intervals: &mut [DmrInterval]) {
+    intervals.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
+        Ordering::Equal => a.start.cmp(&b.start),
+        other => other,
+    });
+}
+
+/// Write an IGV-compatible BED9 file with `itemRgb=On`.
+///
+/// Each interval is rendered as a single box whose color reflects
+/// `DmrInterval::value` using [`diverging_rgb`].
+///
+/// BED9 fields written:
+/// `chrom start end name score strand thickStart thickEnd itemRgb`
+///
+/// # Arguments
+/// * `path` – output BED path
+/// * `track_name` – IGV track name
+/// * `intervals` – sorted (recommended) DMR intervals
+/// * `clip` – symmetric clipping bound for coloring
+///
+/// # Errors
+/// Returns an error if the output file cannot be written.
+pub fn write_bed9_itemrgb(
+    path: &str,
+    track_name: &str,
+    intervals: &[DmrInterval],
+    clip: f64,
+) -> anyhow::Result<()> {
+    let out = File::create(path).with_context(|| format!("Failed to create output BED: {path}"))?;
+    let mut w = BufWriter::new(out);
+
+    writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
+
+    for r in intervals {
+        let (rr, gg, bb) = diverging_rgb(r.value, clip);
+        writeln!(
+            w,
+            "{}\t{}\t{}\t{}\t0\t.\t{}\t{}\t{},{},{}",
+            r.chrom, r.start, r.end, r.name, r.start, r.end, rr, gg, bb
+        )?;
+    }
+    Ok(())
+}
 
 #[cfg(test)]
 mod tests {
@@ -265,7 +618,6 @@ mod tests {
         let mut caller = ModkitSummary::initialize("DUMCO", "diag", &config)?;
         caller.run()?;
 
-
         Ok(())
     }
 
@@ -277,7 +629,56 @@ mod tests {
         let mut caller = ModkitPileupMarlin::initialize("DUMCO", "diag", &config)?;
         caller.run()?;
 
+        Ok(())
+    }
+
+    #[test]
+    fn modkit_pileup() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let mut caller = ModkitPileup::initialize("DUMCO", "diag", &config)?;
+        caller.run()?;
+        let mut caller = ModkitPileup::initialize("DUMCO", "norm", &config)?;
+        caller.run()?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn modkit_dmr() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+
+        let mut caller = ModkitDMR::initialize("DUMCO", &config)?;
+        caller.run()?;
 
         Ok(())
     }
+
+    // #[test]
+    // fn modkit_dmr_igv() -> anyhow::Result<()> {
+    //     test_init();
+    //     let config = Config::default();
+    //
+    //     let id = "CHAHA";
+    //     let path = format!(
+    //         "{}/{}_{}_modkit_dmr_tum_nor.tsv",
+    //         config.tumoral_dir(id),
+    //         id,
+    //         config.tumoral_name
+    //     );
+    //
+    //     let out = format!(
+    //         "{}/{}_{}_modkit_dmr_tum_nor_igv.bed",
+    //         config.tumoral_dir(id),
+    //         id,
+    //         config.tumoral_name
+    //     );
+    //
+    //     let mut dmr = read_dmr_tsv(&path, 13)?;
+    //     sort_intervals(&mut dmr);
+    //     write_bed9_itemrgb(&out, "DMR delta", &dmr, 0.3)?;
+    //     Ok(())
+    // }
 }
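
read_dmr_tsv, sort_intervals and write_bed9_itemrgb above turn a modkit DMR table into an IGV track. A self-contained sketch of the single BED9 row format the writer emits; the interval, gene name and color are made up for illustration, and the real color comes from diverging_rgb in src/helpers.rs:

struct DmrInterval {
    chrom: String,
    start: u64,
    end: u64,
    name: String,
    value: f64,
}

// chrom start end name score strand thickStart thickEnd itemRgb; score is fixed
// to 0 and strand to ".", and the colored box spans the full interval.
fn bed9_line(r: &DmrInterval, rgb: (u8, u8, u8)) -> String {
    format!(
        "{}\t{}\t{}\t{}\t0\t.\t{}\t{}\t{},{},{}",
        r.chrom, r.start, r.end, r.name, r.start, r.end, rgb.0, rgb.1, rgb.2
    )
}

fn main() {
    let r = DmrInterval {
        chrom: "chr1".into(),
        start: 1_000_000,
        end: 1_002_000,
        name: "GENE_A_prom".into(),
        value: -0.3, // at the clip bound, so diverging_rgb(-0.3, 0.3) gives (0, 0, 255)
    };
    let line = bed9_line(&r, (0, 0, 255)); // pure blue: negative end of the scale
    println!("value={} -> {}", r.value, line);
}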

+ 37 - 0
src/helpers.rs

@@ -1132,6 +1132,43 @@ pub fn remove_bam_with_index(bam: &Path) -> anyhow::Result<()> {
     Ok(())
 }
 
+/// Convert a numeric value to an RGB triplet using a diverging palette.
+///
+/// Values are clipped to `[-clip, +clip]` and mapped as:
+/// - negative → blue
+/// - zero → white
+/// - positive → red
+///
+/// # Arguments
+/// * `v` – value to color (e.g. delta methylation)
+/// * `clip` – symmetric clipping bound (must be > 0)
+///
+/// # Returns
+/// `(r, g, b)` as 8-bit RGB components.
+///
+/// # Panics
+/// Panics if `clip <= 0.0`.
+pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
+    assert!(clip > 0.0, "clip must be > 0");
+
+    let x = v.clamp(-clip, clip);
+    let t = (x + clip) / (2.0 * clip); // [-clip,+clip] → [0,1]
+
+    if t < 0.5 {
+        // blue → white
+        let u = t / 0.5;
+        let r = (255.0 * u).round() as u8;
+        let g = (255.0 * u).round() as u8;
+        (r, g, 255)
+    } else {
+        // white → red
+        let u = (t - 0.5) / 0.5;
+        let g = (255.0 * (1.0 - u)).round() as u8;
+        let b = (255.0 * (1.0 - u)).round() as u8;
+        (255, g, b)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use log::info;
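
Since diverging_rgb drives the colors written in the DMR and activity tracks elsewhere in this commit, a small sanity check of the mapping may be worth keeping; this is a sketch assuming it sits as a unit test next to the function in src/helpers.rs:

#[test]
fn diverging_rgb_endpoints() {
    // With clip = 0.3: -clip maps to pure blue, 0 to white, +clip to pure red.
    assert_eq!(diverging_rgb(-0.3, 0.3), (0, 0, 255));
    assert_eq!(diverging_rgb(0.0, 0.3), (255, 255, 255));
    assert_eq!(diverging_rgb(0.3, 0.3), (255, 0, 0));
    // Values beyond the clip saturate rather than wrap.
    assert_eq!(diverging_rgb(-1.0, 0.3), (0, 0, 255));
    // Halfway between 0 and +clip fades white toward red.
    assert_eq!(diverging_rgb(0.5, 1.0), (255, 128, 128));
}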

+ 110 - 3
src/io/bed.rs

@@ -7,18 +7,24 @@
 //! - A pre-indexed BED structure for fast gene queries
 
 use std::{
-    io::{BufRead, BufReader},
-    str::FromStr, sync::Arc,
+    io::{BufRead, BufReader}, path::Path, str::FromStr, sync::Arc
 };
 
 use anyhow::Context;
 use log::warn;
 use rayon::prelude::*;
 
-use crate::{annotation::Annotation, positions::{extract_contig_indices, find_contig_indices, overlaps_par, GenomePosition, GenomeRange, GetGenomeRange}, variant::variant_collection::Variants};
+use crate::{annotation::Annotation, io::writers::get_gz_writer, positions::{GenomePosition, GenomeRange, GetGenomeRange, extract_contig_indices, find_contig_indices, overlaps_par}, variant::variant_collection::Variants};
 
 use super::readers::get_reader;
 
+use noodles_bgzf as nbgzf;
+use noodles_core::Position;
+use noodles_csi::binning_index::index::reference_sequence::bin::Chunk;
+use noodles_csi::binning_index::index::Header;
+use noodles_tabix as tabix;
+use std::io::Write;
+
 /// One row of a BED file.
 ///
 /// Represents a genomic interval with optional name, score, and strand
@@ -279,3 +285,104 @@ impl GenesBedIndex {
         out
     }
 }
+
+/// Convert BED -> BGZF (.gz) and create a Tabix index (.tbi) in the same pass.
+///
+/// Notes:
+/// - Tabix expects the file to be tab-delimited, grouped by contig, and coordinate-sorted.
+/// - BED is 0-based, half-open; Tabix uses 1-based positions. We convert as:
+///   start_pos = start0 + 1
+///   end_pos   = end0
+pub fn convert_bgz_with_tabix(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
+    let input = input.as_ref();
+    let out_bgz = format!("{}.gz", input.display());
+    let out_tbi = format!("{}.tbi", out_bgz);
+
+    let reader = get_reader(&input.to_string_lossy())?;
+    let mut reader = BufReader::new(reader);
+
+    let mut writer = get_gz_writer(&out_bgz, force)?;
+
+    // Build a tabix index while we write.
+    let mut indexer = tabix::index::Indexer::default();
+    indexer.set_header(Header::default()); // minimal header; usable for most tools
+
+    let mut line = String::new();
+    loop {
+        line.clear();
+        let n = reader.read_line(&mut line)?;
+        if n == 0 {
+            break;
+        }
+
+        // Record start virtual offset for this line in the *output* bgzf stream.
+        let chunk_start = writer.bgzf_pos(); // BGZF virtual offset
+
+        // Write line as-is.
+        writer.write_all(line.as_bytes())?;
+
+        // Record end virtual offset after writing the line.
+        let chunk_end = writer.bgzf_pos();
+
+        // Add to tabix index if it's a data line (skip meta/header lines).
+        if !line.starts_with('#') && !line.trim().is_empty() {
+            let mut it = line.split('\t');
+            let rname = it
+                .next()
+                .context("BED: missing contig")?;
+
+            let start0: u32 = it
+                .next()
+                .context("BED: missing start")?
+                .parse()
+                .context("BED: invalid start")?;
+
+            let end0: u32 = it
+                .next()
+                .context("BED: missing end")?
+                .parse()
+                .context("BED: invalid end")?;
+
+            // BED 0-based half-open -> 1-based inclusive-ish interval for tabix
+            let start1 = start0
+                .checked_add(1)
+                .context("BED: start overflow")?;
+
+            let start = Position::try_from(start1 as usize)
+                .context("BED: start must be >= 1")?;
+            let end = Position::try_from(end0 as usize)
+                .context("BED: end must be >= 1 for tabix indexing")?;
+
+            let chunk = Chunk::new(
+                nbgzf::VirtualPosition::from(chunk_start),
+                nbgzf::VirtualPosition::from(chunk_end),
+            ); // chunk is [start, end) in virtual offsets
+
+            indexer
+                .add_record(rname, start, end, chunk)
+                .with_context(|| format!("tabix: failed to add record for {rname}:{start0}-{end0}"))?;
+        }
+    }
+
+    writer.close()?; // writes EOF marker
+
+    let index = indexer.build();
+    tabix::fs::write(&out_tbi, &index)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::helpers::test_init;
+
+    #[test]
+    fn bed_to_bgz() -> anyhow::Result<()> {
+        test_init();
+
+        let a = "/mnt/beegfs02/scratch/t_steimle/data/wgs/CHAHA/norm/CHAHA_norm_modkit_pileup.bed";
+        convert_bgz_with_tabix(a, true)?;
+        Ok(())
+    }
+}
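
convert_bgz_with_tabix above converts BED's 0-based half-open intervals to Tabix's 1-based positions while writing BGZF. The shift in isolation, as a standalone sketch with no noodles dependency (the helper name is made up for illustration):

/// BED stores [start0, end0) in 0-based coordinates; Tabix wants 1-based
/// positions, so the record spans [start0 + 1, end0] inclusive.
fn bed_to_tabix_1based(line: &str) -> Option<(String, u64, u64)> {
    let mut it = line.split('\t');
    let chrom = it.next()?.to_string();
    let start0: u64 = it.next()?.parse().ok()?;
    let end0: u64 = it.next()?.parse().ok()?;
    Some((chrom, start0 + 1, end0))
}

fn main() {
    // chr1:0-1000 in BED covers bases 1..=1000 once converted to 1-based.
    let (chrom, start1, end1) = bed_to_tabix_1based("chr1\t0\t1000\tregion_a").unwrap();
    assert_eq!((chrom.as_str(), start1, end1), ("chr1", 1, 1000));
    println!("{chrom}:{start1}-{end1}");
}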

+ 1 - 0
src/io/mod.rs

@@ -10,3 +10,4 @@ pub mod bam;
 pub mod writers;
 pub mod straglr;
 pub mod tsv;
+pub mod modkit;

+ 526 - 0
src/io/modkit.rs

@@ -0,0 +1,526 @@
+use anyhow::{anyhow, Context, Result};
+use csv::ReaderBuilder;
+use std::{
+    cmp::Ordering, collections::HashMap, fs::File, io::{BufWriter, Write}
+};
+
+use crate::io::readers::get_gz_reader;
+
+/// One methylation call row from a modkit pileup (bedMethyl-like) file.
+///
+/// We only keep what we need to aggregate region methylation:
+/// - `start` (0-based position; pileup rows are single-base intervals: [start, start+1))
+/// - `nvalid_cov` (col 10)
+/// - `nmod` (col 12)
+#[derive(Debug, Clone)]
+pub struct PileupSite {
+    pub start: u64,
+    pub nvalid_cov: u64,
+    pub nmod: u64,
+}
+
+/// One region from a BED4 file: chrom, start, end, name.
+#[derive(Debug, Clone)]
+pub struct BedRegion {
+    pub chrom: String,
+    pub start: u64,
+    pub end: u64,
+    pub name: String,
+}
+
+/// Promoter/body classification inferred from region name.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RegionKind {
+    Promoter,
+    GeneBody,
+    Other,
+}
+
+/// Parsed region name into gene key + kind.
+#[derive(Debug, Clone)]
+pub struct ParsedName {
+    pub gene_key: String,
+    pub kind: RegionKind,
+}
+
+/// Parse region `name` for `_prom` or `_body` suffix.
+///
+/// - `FOO_prom` => gene_key `FOO`, kind `Promoter`
+/// - `FOO_body` => gene_key `FOO`, kind `GeneBody`
+pub fn parse_region_name(name: &str) -> ParsedName {
+    if let Some(g) = name.strip_suffix("_prom") {
+        ParsedName { gene_key: g.to_string(), kind: RegionKind::Promoter }
+    } else if let Some(g) = name.strip_suffix("_body") {
+        ParsedName { gene_key: g.to_string(), kind: RegionKind::GeneBody }
+    } else {
+        ParsedName { gene_key: name.to_string(), kind: RegionKind::Other }
+    }
+}
+
+/// Diverging blue→white→red palette for IGV itemRgb.
+///
+/// Negative values → blue, zero → white, positive → red.
+/// Values are clipped to `[-clip, +clip]`.
+pub fn diverging_rgb(v: f64, clip: f64) -> (u8, u8, u8) {
+    assert!(clip > 0.0, "clip must be > 0");
+    let x = v.clamp(-clip, clip);
+    let t = (x + clip) / (2.0 * clip);
+    if t < 0.5 {
+        let u = t / 0.5;
+        let r = (255.0 * u).round() as u8;
+        let g = (255.0 * u).round() as u8;
+        (r, g, 255)
+    } else {
+        let u = (t - 0.5) / 0.5;
+        let g = (255.0 * (1.0 - u)).round() as u8;
+        let b = (255.0 * (1.0 - u)).round() as u8;
+        (255, g, b)
+    }
+}
+
+/// Read a BED4 regions file.
+///
+/// # Errors
+/// Fails on malformed lines or invalid coordinates.
+pub fn read_bed4(path: &str) -> Result<Vec<BedRegion>> {
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(false)
+        .flexible(true)
+        .from_path(path)
+        .with_context(|| format!("Failed to open BED: {path}"))?;
+
+    let mut out = Vec::new();
+    for (i, row) in rdr.records().enumerate() {
+        let row = row.with_context(|| format!("BED parse error at line {}", i + 1))?;
+        let chrom = row.get(0).ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string();
+        let start: u64 = row.get(1).ok_or_else(|| anyhow!("Missing start at line {}", i + 1))?
+            .parse().with_context(|| format!("Bad start at line {}", i + 1))?;
+        let end: u64 = row.get(2).ok_or_else(|| anyhow!("Missing end at line {}", i + 1))?
+            .parse().with_context(|| format!("Bad end at line {}", i + 1))?;
+        if end <= start {
+            return Err(anyhow!("Invalid BED interval end<=start at line {}", i + 1));
+        }
+        let name = row.get(3).unwrap_or(".").to_string();
+        out.push(BedRegion { chrom, start, end, name });
+    }
+    Ok(out)
+}
+
+/// Read a modkit pileup file and index it per chromosome.
+///
+/// Expected columns (1-based):
+/// - chrom (1)
+/// - start (2)
+/// - Nvalid_cov (10)
+/// - Nmod (12)
+///
+/// # Notes
+/// The file is typically sorted; we sort per chromosome just in case.
+///
+/// # Errors
+/// Fails on malformed lines or invalid integers.
+pub fn read_pileup_index(path: &str) -> Result<HashMap<String, Vec<PileupSite>>> {
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .has_headers(false)
+        .flexible(true)
+        .from_reader(get_gz_reader(path)?);
+
+    let mut map: HashMap<String, Vec<PileupSite>> = HashMap::new();
+
+    for (i, row) in rdr.records().enumerate() {
+        let row = row.with_context(|| format!("Pileup parse error at line {}", i + 1))?;
+
+        let chrom = row.get(0).ok_or_else(|| anyhow!("Missing chrom at line {}", i + 1))?.to_string();
+        let start: u64 = row.get(1).ok_or_else(|| anyhow!("Missing start at line {}", i + 1))?
+            .parse().with_context(|| format!("Bad start at line {}", i + 1))?;
+
+        // col10 = Nvalid_cov (index 9), col12 = Nmod (index 11)
+        let nvalid_cov: u64 = row.get(9).ok_or_else(|| anyhow!("Missing Nvalid_cov (col10) at line {}", i + 1))?
+            .parse().with_context(|| format!("Bad Nvalid_cov at line {}", i + 1))?;
+        let nmod: u64 = row.get(11).ok_or_else(|| anyhow!("Missing Nmod (col12) at line {}", i + 1))?
+            .parse().with_context(|| format!("Bad Nmod at line {}", i + 1))?;
+
+        // Skip sites with zero valid coverage to avoid useless entries.
+        if nvalid_cov == 0 {
+            continue;
+        }
+
+        map.entry(chrom).or_default().push(PileupSite { start, nvalid_cov, nmod });
+    }
+
+    // Ensure sorted by position for binary searching.
+    for v in map.values_mut() {
+        v.sort_by(|a, b| a.start.cmp(&b.start));
+    }
+
+    Ok(map)
+}
+
+/// Aggregate methylation fraction over `[start, end)` from a per-chrom pileup index.
+///
+/// Fraction is computed as: `sum(Nmod) / sum(Nvalid_cov)`.
+///
+/// # Returns
+/// `(fraction, sum_nvalid_cov, sum_nmod)`; returns `None` if no sites overlap.
+pub fn region_fraction_modified(
+    chrom_sites: &[PileupSite],
+    start: u64,
+    end: u64,
+) -> Option<(f64, u64, u64)> {
+    // lower_bound for first site with pos >= start
+    let mut lo = 0usize;
+    let mut hi = chrom_sites.len();
+    while lo < hi {
+        let mid = (lo + hi) / 2;
+        if chrom_sites[mid].start < start { lo = mid + 1 } else { hi = mid }
+    }
+    let mut i = lo;
+
+    let mut sum_cov: u64 = 0;
+    let mut sum_mod: u64 = 0;
+
+    while i < chrom_sites.len() {
+        let s = &chrom_sites[i];
+        if s.start >= end {
+            break;
+        }
+        sum_cov = sum_cov.saturating_add(s.nvalid_cov);
+        sum_mod = sum_mod.saturating_add(s.nmod);
+        i += 1;
+    }
+
+    if sum_cov == 0 {
+        None
+    } else {
+        Some((sum_mod as f64 / sum_cov as f64, sum_cov, sum_mod))
+    }
+}
+
+/// Gene-level activity proxy computed from one sample’s pileup.
+///
+/// Activity score:
+/// `log2((body_frac + eps) / (prom_frac + eps))`
+///
+/// This is written as one BED feature per gene (using promoter coordinates).
+#[derive(Debug, Clone)]
+pub struct GeneActivity {
+    pub gene_key: String,
+    pub chrom: String,
+    pub start: u64,
+    pub end: u64,
+    pub prom_frac: f64,
+    pub body_frac: f64,
+    pub score: f64,
+}
+
+/// Compute a per-gene “epigenetic activity” proxy from a single sample’s modkit pileup.
+///
+/// ## Overview
+/// This function derives a gene-level activity-like score by contrasting
+/// **promoter methylation** and **gene-body methylation** within the *same sample*.
+///
+/// It is intended to capture the commonly observed epigenetic pattern where
+/// actively transcribed genes tend to have:
+/// - relatively **low methylation at the promoter**, and
+/// - relatively **higher methylation across the gene body**.
+///
+/// ## Region methylation estimation
+/// For each genomic region \(R = [start, end)\), methylation is estimated by
+/// pooling per-site counts from the pileup:
+///
+/// \[
+/// f(R) = \frac{\sum_{i \in R} Nmod_i}{\sum_{i \in R} Nvalid\_cov_i}
+/// \]
+///
+/// where:
+/// - \(Nmod_i\) is the number of modified calls at site \(i\)
+/// - \(Nvalid\_cov_i\) is the number of valid modification calls at site \(i\)
+///
+/// This is a **coverage-weighted fraction**, which avoids biases that arise
+/// from simply averaging per-site percentages.
+///
+/// ## Activity score
+/// For each gene \(g\) with both a promoter region \(P_g\) and a gene-body region \(B_g\),
+/// the activity proxy is defined as:
+///
+/// \[
+/// A_g = \log_2\left(\frac{f(B_g)}{f(P_g)}\right)
+/// \]
+///
+/// ## Skipping rules
+/// To avoid undefined or numerically unstable values, a gene is **skipped** if:
+/// - either the promoter or gene body has **no overlapping pileup sites**
+/// - \(\sum Nvalid\_cov = 0\) in either region
+/// - \(f(P_g) = 0\) or \(f(B_g) = 0\)
+///
+/// In other words, the log-ratio is computed **only when both regions have
+/// strictly positive methylation fractions**.
+///
+/// ## Interpretation (heuristic)
+/// - \(A_g > 0\): gene-body methylation exceeds promoter methylation  
+///   → “more active-like” epigenetic state
+/// - \(A_g < 0\): promoter methylation exceeds gene-body methylation  
+///   → “more repressed-like” epigenetic state
+/// - \(A_g \approx 0\): similar promoter/body methylation  
+///   → ambiguous or context-dependent
+///
+/// **Important:** this score is a proxy and does not directly measure transcription.
+/// When RNA-seq data are available, expression-based measures should be preferred.
+///
+/// ## Output coordinates
+/// Each output [`GeneActivity`] entry uses the **promoter coordinates** for display
+/// (anchoring the signal near the transcription start site in genome browsers),
+/// while the score itself reflects the promoter/body contrast.
+///
+/// ## Errors
+/// Returns an error if:
+/// - any region has invalid coordinates (`end <= start`)
+///
+/// # Arguments
+/// * `pileup` – per-chromosome pileup sites indexed by chromosome
+/// * `regions` – BED4 regions with names ending in `_prom` and `_body`
+pub fn compute_gene_activity_from_pileup(
+    pileup: &HashMap<String, Vec<PileupSite>>,
+    regions: &[BedRegion],
+    min_sum_cov: u64,
+) -> Result<Vec<GeneActivity>> {
+    #[derive(Clone)]
+    struct Part {
+        chrom: String,
+        start: u64,
+        end: u64,
+        frac: f64,
+        sum_cov: u64,
+    }
+
+    #[derive(Default)]
+    struct Pair {
+        prom: Option<Part>,
+        body: Option<Part>,
+    }
+
+    let mut by_gene: HashMap<String, Pair> = HashMap::new();
+
+    for r in regions {
+        if r.end <= r.start {
+            return Err(anyhow!(
+                "Invalid region end<=start {}:{}-{} ({})",
+                r.chrom, r.start, r.end, r.name
+            ));
+        }
+
+        let ParsedName { gene_key, kind } = parse_region_name(&r.name);
+        if kind == RegionKind::Other {
+            continue;
+        }
+
+        let chrom_sites = match pileup.get(&r.chrom) {
+            Some(v) => v,
+            None => continue,
+        };
+
+        let (frac, sum_cov, _) =
+            match region_fraction_modified(chrom_sites, r.start, r.end) {
+                Some(x) => x,
+                None => continue,
+            };
+
+        // Enforce minimum coverage and positive fraction
+        if sum_cov < min_sum_cov || frac <= 0.0 {
+            continue;
+        }
+
+        let part = Part {
+            chrom: r.chrom.clone(),
+            start: r.start,
+            end: r.end,
+            frac,
+            sum_cov,
+        };
+
+        let entry = by_gene.entry(gene_key).or_default();
+        match kind {
+            RegionKind::Promoter => entry.prom = Some(part),
+            RegionKind::GeneBody => entry.body = Some(part),
+            RegionKind::Other => {}
+        }
+    }
+
+    let mut out = Vec::new();
+
+    for (gene, pair) in by_gene {
+        let (prom, body) = match (pair.prom, pair.body) {
+            (Some(p), Some(b)) => (p, b),
+            _ => continue,
+        };
+
+        let score = (body.frac / prom.frac).log2();
+
+        out.push(GeneActivity {
+            gene_key: gene,
+            chrom: prom.chrom,
+            start: prom.start,
+            end: prom.end,
+            prom_frac: prom.frac,
+            body_frac: body.frac,
+            score,
+        });
+    }
+
+    // Sort for IGV
+    out.sort_by(|a, b| match a.chrom.cmp(&b.chrom) {
+        Ordering::Equal => a.start.cmp(&b.start),
+        o => o,
+    });
+
+    Ok(out)
+}
+
+/// Convert a log2-ratio value to a BED `score` (0..=1000) using symmetric clipping.
+///
+/// Mapping:
+/// - clip to [-clip, +clip]
+/// - scale to [0, 1000]
+///
+/// If `v = -clip` → 0  
+/// If `v = 0`     → 500  
+/// If `v = +clip` → 1000
+pub fn bed_score_from_log2_ratio(v: f64, clip: f64) -> u16 {
+    assert!(clip > 0.0, "clip must be > 0");
+    let x = v.clamp(-clip, clip);
+    let t = (x + clip) / (2.0 * clip); // 0..1
+    let s = (t * 1000.0).round();
+    s.clamp(0.0, 1000.0) as u16
+}
+
+/// Write gene activity as an IGV BED9 track with `itemRgb=On` (diverging colors),
+/// and store the (clipped+scaled) log2-ratio in BED column 5 (`score`).
+///
+/// - Color encodes `GeneActivity::score` (log2(body/prom)) clipped to `[-clip, +clip]`.
+/// - BED `score` encodes the same quantity scaled to [0,1000].
+pub fn write_gene_activity_bed9_itemrgb(
+    path: &str,
+    track_name: &str,
+    activity: &[GeneActivity],
+    clip: f64,
+) -> Result<()> {
+    if !(clip > 0.0) {
+        return Err(anyhow!("clip must be > 0"));
+    }
+
+    let out = File::create(path).with_context(|| format!("Failed to create BED: {path}"))?;
+    let mut w = BufWriter::new(out);
+
+    writeln!(w, r#"track name="{}" itemRgb="On""#, track_name)?;
+
+    for a in activity {
+        let (rr, gg, bb) = diverging_rgb(a.score, clip);
+        let bed_score = bed_score_from_log2_ratio(a.score, clip);
+
+        writeln!(
+            w,
+            "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\t{},{},{}",
+            a.chrom,
+            a.start,
+            a.end,
+            format!("{}={}", a.gene_key, a.score),
+            bed_score,
+            a.start,
+            a.end,
+            rr,
+            gg,
+            bb
+        )?;
+    }
+
+    Ok(())
+}
+
+/// High-level helper: pileup + regions BED4 -> IGV BED9 activity track (no pseudocounts).
+///
+/// Pipeline:
+/// 1) Read pileup and build a per-chromosome index (`read_pileup_index`)
+/// 2) Read regions BED4 (`read_bed4`)
+/// 3) Compute gene activity (`compute_gene_activity_from_pileup`) using
+///    `score = log2(body_frac / prom_frac)` and **skipping** genes where either fraction is 0
+/// 4) Write BED9 with `itemRgb=On` (`write_gene_activity_bed9_itemrgb`)
+///
+/// Returns the number of written gene features.
+pub fn pileup_regions_to_activity_bed9_itemrgb(
+    pileup_path: &str,
+    regions_bed4_path: &str,
+    min_sum_cov: u64,
+    out_bed9_path: &str,
+    track_name: &str,
+    clip: f64,
+) -> Result<usize> {
+    let pile = read_pileup_index(pileup_path)?;
+    let regions = read_bed4(regions_bed4_path)?;
+    let activity = compute_gene_activity_from_pileup(&pile, &regions, min_sum_cov)?;
+    let n = activity.len();
+    write_gene_activity_bed9_itemrgb(out_bed9_path, track_name, &activity, clip)?;
+    Ok(n)
+}
+
+#[cfg(test)]
+mod tests {
+    use log::info;
+
+    use super::*;
+    use crate::{config::Config, helpers::test_init};
+
+    #[test]
+    fn modkit_activity_igv() -> anyhow::Result<()> {
+        test_init();
+
+        let config = Config::default();
+
+        let id = "DUMCO";
+        let path = format!(
+            "{}/{}_{}_modkit_pileup.bed.gz",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+
+        let out = format!(
+            "{}/{}_{}_modkit_activity.bed",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+
+        let track_name = format!("{id} Gene Activity");
+
+        let regions_bed4_path = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
+
+        let n = pileup_regions_to_activity_bed9_itemrgb(&path, regions_bed4_path, 100, &out, &track_name, 2.0)?;
+        info!("{n} genes activities written");
+
+        let id = "CHAHA";
+        let path = format!(
+            "{}/{}_{}_modkit_pileup.bed.gz",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+
+        let out = format!(
+            "{}/{}_{}_modkit_activity.bed",
+            config.tumoral_dir(id),
+            id,
+            config.tumoral_name
+        );
+
+        let track_name = format!("{id} Gene Activity");
+
+        let regions_bed4_path = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_genes_prom_body.bed";
+
+        let n = pileup_regions_to_activity_bed9_itemrgb(&path, regions_bed4_path, 100, &out, &track_name, 2.0)?;
+        info!("{n} genes activities written");
+
+        Ok(())
+    }
+}
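
A worked numeric example of the pooled fraction and activity score described in the compute_gene_activity_from_pileup doc comment; the counts are invented for illustration, but the arithmetic matches region_fraction_modified and the log2 ratio:

fn main() {
    // Promoter region: pool counts over its CpG sites, f = sum(Nmod) / sum(Nvalid_cov).
    let prom_mod: u64 = 2 + 1; // 3 modified calls
    let prom_cov: u64 = 30 + 20; // 50 valid calls
    let prom_frac = prom_mod as f64 / prom_cov as f64; // 0.06

    // Gene body region, pooled the same way.
    let body_mod: u64 = 40 + 35; // 75
    let body_cov: u64 = 60 + 50; // 110
    let body_frac = body_mod as f64 / body_cov as f64; // ~0.68

    // Activity proxy A = log2(body / prom); positive looks "active-like"
    // (low promoter methylation, higher gene-body methylation).
    let score = (body_frac / prom_frac).log2();
    println!("prom={prom_frac:.3} body={body_frac:.3} score={score:.2}");
    assert!(score > 3.0); // log2(0.682 / 0.060) ≈ 3.5
}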

+ 37 - 1
src/io/writers.rs

@@ -1,11 +1,15 @@
 use std::{
-    fs::{self, File, OpenOptions}, io::BufWriter, path::Path
+    fs::{self, File, OpenOptions},
+    io::BufWriter,
+    path::Path,
 };
 
 use anyhow::Context;
 use bgzip::{BGZFWriter, Compression};
 use log::info;
 
+use crate::io::readers::get_reader;
+
 pub fn get_gz_writer(path: &str, force: bool) -> anyhow::Result<BGZFWriter<File>> {
     if !path.ends_with(".gz") {
         anyhow::bail!("The file should end with gz");
@@ -37,3 +41,35 @@ pub fn get_writer(path: &str) -> anyhow::Result<BufWriter<File>> {
     info!("Writing into {path}");
     Ok(BufWriter::new(file))
 }
+
+pub fn convert_bgz(input: impl AsRef<Path>, force: bool) -> anyhow::Result<()> {
+    let input = input.as_ref();
+
+    let mut reader = get_reader(&input.to_string_lossy())?;
+    let mut writer = get_gz_writer(&format!("{}.gz", input.display()), force)?;
+
+    std::io::copy(&mut reader, &mut writer)?;
+
+    writer.close()?; // or writer.finish()? if gzip encoder
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::helpers::test_init;
+
+    #[test]
+    fn bgz_convert() -> anyhow::Result<()> {
+        test_init();
+
+        let a = "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/diag/DUMCO_diag_modkit_pileup.bed";
+        convert_bgz(a, true)?;
+
+        let a = "/mnt/beegfs02/scratch/t_steimle/data/wgs/DUMCO/norm/DUMCO_norm_modkit_pileup.bed";
+        convert_bgz(a, true)?;
+
+        Ok(())
+    }
+}

+ 13 - 15
src/pipes/somatic.rs

@@ -10,14 +10,12 @@ use crate::{
         vcf_variant::{run_if_required, ShouldRunBox},
     },
 };
-use itertools::Itertools;
 use log::info;
 use rayon::slice::ParallelSliceMut;
 use serde::Serialize;
 use std::{
-    collections::{BTreeMap, HashMap},
+    collections::BTreeMap,
     fs::{self, File},
-    io::Write,
     path::{Path, PathBuf},
 };
 
@@ -294,7 +292,7 @@ impl Run for SomaticPipe {
 
         // Filter out germline and solo constitutional variants
         info!(
-            "Keeping somatic variants (variants neither in solo {} nor in germline).",
+            "Keeping somatic variants (variants neither in solo normal {} nor in germline).",
             config.normal_name
         );
         somatic_stats.n_constit_germline =
@@ -634,17 +632,17 @@ impl SomaticPipeStats {
     /// [
     ///   {
     ///     "caller_name": "DeepVariant SoloTumor",
-    ///     "germline": [
-    ///       {"ClairS Germline": 99978},
-    ///       {"ClairS Germline + DeepVariant SoloConstit": 4710570}
-    ///     ]
+    ///     "germline": {
+    ///       "ClairS Germline": 99978,
+    ///       "ClairS Germline + DeepVariant SoloConstit": 4710570
+    ///     }
     ///   },
     ///   {
     ///     "caller_name": "ClairS Somatic",
-    ///     "germline": [
-    ///       {"ClairS Germline": 944},
-    ///       {"ClairS Germline + DeepVariant SoloConstit": 362}
-    ///     ]
+    ///     "germline": {
+    ///       "ClairS Germline": 944,
+    ///       "ClairS Germline + DeepVariant SoloConstit": 362
+    ///     }
     ///   }
     /// ]
     /// ```
@@ -892,11 +890,11 @@ mod tests {
         test_init();
         let config = Config::default();
 
-        ThreadPoolBuilder::new()
+        let pool = ThreadPoolBuilder::new()
             .num_threads(config.threads.into())
-            .build_global()?;
+            .build()?;
 
-        SomaticPipe::initialize("DUMCO", &config)?.run()?;
+        pool.install(|| SomaticPipe::initialize("CHAHA", &config)?.run())?;
         Ok(())
     }
 }

+ 52 - 33
src/variant/variant_collection.rs

@@ -14,6 +14,7 @@ use dashmap::DashMap;
 use log::{debug, error, info, warn};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
+use triple_accel::levenshtein_exp;
 use uuid::Uuid;
 
 use super::vcf_variant::{
@@ -29,7 +30,7 @@ use crate::{
         Annotation, Annotations,
     },
     collection::{
-        bam::{counts_at, counts_ins_at},
+        bam::{counts_at, PileBase},
         vcf::Vcf,
     },
     config::Config,
@@ -334,7 +335,7 @@ impl VariantCollection {
                     // so we can find the variant index inside `seq` even if start was clamped.
                     let position1 = pos0 + 1;
                     let start1 = position1.saturating_sub(seq_len / 2).max(1);
-                    let locus_idx = position1.saturating_sub(start1) as usize; // 0-based index into seq
+                    let locus_idx = position1.saturating_sub(start1); // 0-based index into seq
 
                     // Compute outside map lock
                     let entropy = if seq.len() == seq_len {
@@ -475,16 +476,6 @@ impl VariantCollection {
                 .collect()
         }
 
-        fn single_char_repeat(s: &str) -> Option<(char, usize)> {
-            let mut chars = s.chars();
-            let first = chars.next()?;
-            if chars.all(|c| c == first) {
-                Some((first, s.len()))
-            } else {
-                None
-            }
-        }
-
         self.variants
             .par_chunks(self.chunk_size(max_threads))
             .try_for_each(|chunk| {
@@ -584,36 +575,36 @@ impl VariantCollection {
                                 }
                             }
                             AlterationCategory::INS => {
-                                let pileup = counts_ins_at(
-                                    &mut bam,
-                                    &var.position.contig(),
-                                    var.position.position,
-                                )?;
-                                let depth = crate::collection::bam::nt_pileup_new(
+                                let normal_pileup = crate::collection::bam::nt_pileup_new(
                                     &mut bam,
                                     &var.position.contig(),
                                     var.position.position,
                                     false,
-                                )?
-                                .len();
+                                )?;
+                                let normal_depth = normal_pileup.len();
 
                                 let alt_seq = var.inserted_seq().unwrap_or_default();
-
-                                let (_, alt) = match single_char_repeat(&alt_seq) {
-                                    Some((repeated, n)) if alt_seq.len() > 1 => {
-                                        // If stretch of same nt consider eq +/- 3 nt
-                                        let pv = pileup.clone().into_iter().collect::<Vec<_>>();
-                                        let res = match_repeats(&pv, repeated, n, 3);
-                                        let depth = pileup.len() as u32;
-                                        let alt = res.iter().map(|(_, n)| *n as u32).sum();
-                                        (depth, alt)
+                                let alt_seq = alt_seq.as_bytes().to_vec();
+
+                                let mut normal_n_alt = 0;
+                                for pile in normal_pileup {
+                                    if let PileBase::Ins { len, seq } = pile {
+                                        if let Some(seq) = seq {
+                                            let dist = levenshtein_exp(&alt_seq, &seq);
+                                            let dist_frac = dist as f64 / len as f64;
+                                            // dbg!(dist_frac);
+                                            if dist_frac < 0.1 {
+                                                normal_n_alt += 1;
+                                            }
+                                        } else if alt_seq.len() as u32 == len {
+                                            normal_n_alt += 1;
+                                        }
                                     }
-                                    _ => pileup.clone().into_iter().fold((0, 0), folder(&alt_seq)),
-                                };
+                                }
 
                                 // debug!("{} {alt} / {depth} ", var.variant_id());
-                                anns.push(Annotation::ConstitAlt(alt as u16));
-                                anns.push(Annotation::ConstitDepth(depth as u16));
+                                anns.push(Annotation::ConstitAlt(normal_n_alt as u16));
+                                anns.push(Annotation::ConstitDepth(normal_depth as u16));
                             }
                             _ => (),
                         }
@@ -1990,3 +1981,31 @@ fn process_vep_chunk(
 
     Ok(chunk_results)
 }
+
+//
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{annotation::Caller, helpers::test_init};
+
+    #[test]
+    fn annotate_constit() -> anyhow::Result<()> {
+        test_init();
+        let config = Config::default();
+        let ins: VcfVariant = "chr1\t286\t.\tC\tCT\t27.4\tPASS\t.\tGT:GQ:DP:AD:VAF:MID:PL\t1/1:25:24:0,22:0.916667:deepvariant:27,28,0".parse()?;
+        let ins_2: VcfVariant = "chr1\t1000188\t.\tT\tTGGTGCAGGCAGAGAACAGACGTCGCGATGGGCCCGACGGTGCTGGCTCCATGGGAACCGAGACCCAACACCCAAAGGAGTCCCACAGGCTCAGGGG\t8.9\tPASS\t.\tGT:GQ:DP:AD:VAF:MID:PL\t0/1:8:48:31,16:0.333333:deepvariant:8,0,13".parse()?;
+        let vcf_path = "/mnt/beegfs02/scratch/t_steimle/data/wgs/CHAHA/norm/DeepVariant/CHAHA_norm_DeepVariant_PASSED.vcf.gz";
+        let coll = VariantCollection {
+            variants: vec![ins, ins_2],
+            vcf: Vcf::new(vcf_path.into())?,
+            caller: Annotation::Callers(Caller::DeepVariant, crate::annotation::Sample::Somatic),
+        };
+
+        let constit_bam_path = &config.normal_bam("CHAHA");
+        let annotations = Annotations::default();
+        coll.annotate_with_constit_bam(&annotations, constit_bam_path, 1)?;
+
+        println!("{annotations:#?}");
+        Ok(())
+    }
+}
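
The INS re-counting above compares each recovered insertion sequence against the ALT insertion with triple_accel::levenshtein_exp and accepts it when the edit distance is under 10% of the observed insertion length. A standalone sketch of that acceptance rule (sequences shortened and invented for illustration):

use triple_accel::levenshtein_exp;

/// True if an observed insertion supports the expected ALT insertion:
/// edit distance below 10% of the observed insertion length, mirroring the
/// dist_frac < 0.1 check in annotate_with_constit_bam.
fn supports_insertion(expected: &[u8], observed: &[u8]) -> bool {
    let dist = levenshtein_exp(expected, observed);
    (dist as f64) / (observed.len() as f64) < 0.1
}

fn main() {
    let alt = b"GGTGCAGGCAGAGAACAGACGTCGCGATGGGCCCGACG";

    // Identical insertion: distance 0.
    assert!(supports_insertion(alt, alt));

    // One mismatch over 38 bases (~2.6%): still counted as supporting.
    let noisy = b"GGTGCAGGCAGAGAACAGACGTCTCGATGGGCCCGACG";
    assert!(supports_insertion(alt, noisy));

    // Unrelated insertion: rejected.
    let other = b"TTTTTTTTTTTTTTTTTTTT";
    assert!(!supports_insertion(alt, other));
}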