Thomas 3 часов назад
Родитель
Сommit
44067c4310

+ 10 - 1
pandora-config.example.toml

@@ -384,7 +384,7 @@ coral_dir = "/home/t_steimle/somatic_pipe_tools/CoRAL"
 # Memory usage scales with amplicon complexity and BAM depth.
 # 32G is sufficient for typical WGS at 30–60×; increase to 64G
 # for highly rearranged genomes (chromothripsis, high ecDNA copy number).
-coral_slurm_mem = "64G"
+coral_slurm_mem = "32G"
 
 # SLURM partition to use for CoRAL jobs.
 #
@@ -428,6 +428,15 @@ coral_min_seed_size = 100000
 # of adjacent independent amplicons.
 coral_max_seg_gap = 100000
 
+#######################################
+# Flye
+#######################################
+flye_bin = "/usr/bin/python /home/t_steimle/somatic_pipe_tools/Flye/bin/flye"
+
+flye_threads = 12
+
+flye_slurm_mem = "16G"
+
 #######################################
 # Marlin
 #######################################

+ 273 - 0
src/aligner/minimap.rs

@@ -0,0 +1,273 @@
+use std::path::{Path, PathBuf};
+
+use anyhow::Context;
+
+use crate::{
+    commands::{
+        samtools::{SamtoolsIndex, SamtoolsSort},
+        Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner,
+    },
+    config::Config,
+    helpers::TempFileGuard,
+    runners::Run,
+};
+
+// ─── Minimap2BuildMmi ─────────────────────────────────────────────────────────
+
+#[derive(Clone)]
+pub enum Minimap2Preset {
+    /// Raw ONT reads → reference genome.
+    /// Reference is taken from config.reference — MMI always precomputed.
+    MapOntRef,
+    /// Assembled contigs → single-contig FASTA.
+    /// Reference is provided explicitly — no MMI.
+    Asm5(PathBuf),
+}
+
+impl Minimap2Preset {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Minimap2Preset::MapOntRef => "map-ont",
+            Minimap2Preset::Asm5(_) => "asm5",
+        }
+    }
+}
+
+/// Precompute a minimap2 index (.mmi) for a reference FASTA.
+///
+/// Only dispatched if the MMI does not already exist — callers should check
+/// `Minimap2BuildMmi::needed()` before constructing and running this job.
+///
+/// Runs on the cluster (SbatchRunner) when slurm_runner is set in config,
+/// locally otherwise — consistent with all other Pandora jobs.
+pub struct Minimap2BuildMmi {
+    pub reference: PathBuf,
+    pub out_mmi: PathBuf,
+    pub minimap2_bin: String,
+    pub threads: u8,
+    pub slurm_mem: Option<String>,
+    pub log_dir: String,
+}
+
+impl Minimap2BuildMmi {
+    pub fn init(config: &Config, reference: PathBuf) -> Self {
+        let out_mmi = reference.with_extension("map-ont.mmi");
+
+        let log_dir = format!("{}/log/minimap2_index", config.result_dir);
+        let slurm_mem = if config.slurm_runner {
+            Some(config.minimap2_slurm_mem.clone())
+        } else {
+            None
+        };
+
+        Self {
+            reference,
+            out_mmi,
+            minimap2_bin: config.minimap2_bin.clone(),
+            threads: config.minimap2_threads,
+            slurm_mem,
+            log_dir,
+        }
+    }
+
+    /// Returns true if the MMI needs to be built.
+    pub fn needed(&self) -> bool {
+        !self.out_mmi.exists()
+    }
+
+    /// Returns the MMI path regardless of whether it exists yet.
+    /// Use after `run()` to get the reference path for downstream jobs.
+    pub fn mmi_path(&self) -> &Path {
+        &self.out_mmi
+    }
+}
+
+impl JobCommand for Minimap2BuildMmi {
+    fn cmd(&self) -> String {
+        format!(
+            "{bin} -x map-ont -t {threads} -d {out} {reference}",
+            bin = self.minimap2_bin,
+            threads = self.threads,
+            out = self.out_mmi.display(),
+            reference = self.reference.display(),
+        )
+    }
+}
+
+impl SbatchRunner for Minimap2BuildMmi {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some("minimap2_index".to_string()),
+            cpus_per_task: Some(self.threads.into()),
+            // MMI construction is memory-intensive for large references (~14 GB
+            // for a human genome); use the same mem config as alignment jobs.
+            mem: self.slurm_mem.clone(),
+            partition: Some("shortq".to_string()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for Minimap2BuildMmi {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl LocalRunner for Minimap2BuildMmi {}
+
+impl Run for Minimap2BuildMmi {
+    fn run(&mut self) -> anyhow::Result<()> {
+        if !self.needed() {
+            tracing::info!(
+                path = %self.out_mmi.display(),
+                "Reusing existing MMI index"
+            );
+            return Ok(());
+        }
+
+        tracing::info!(
+            reference = %self.reference.display(),
+            "Building minimap2 MMI index"
+        );
+
+        let res = if self.slurm_mem.is_some() {
+            SbatchRunner::exec(self)
+        } else {
+            LocalRunner::exec(self)
+        }
+        .context("minimap2 index failed")?;
+
+        res.save_to_file(format!("{}/minimap2_index_", self.log_dir))?;
+        Ok(())
+    }
+}
+
+/// Per-contig realignment job. One instance per contig emitted by the assembler.
+pub struct Minimap2AlignOnt {
+    pub reads: PathBuf,
+    pub reference: PathBuf,
+    pub out_sam: PathBuf,
+    pub final_bam: PathBuf,
+    pub preset: Minimap2Preset,
+    pub minimap2_bin: String,
+    pub minimap2_threads: u8,
+    pub minimap2_slurm_mem: Option<String>,
+    pub log_dir: String,
+    guard: TempFileGuard,
+    sort_job: SamtoolsSort,
+    index_job: SamtoolsIndex,
+}
+
+impl JobCommand for Minimap2AlignOnt {
+    fn cmd(&self) -> String {
+        format!(
+            "{bin} -a -x {preset} --MD -t {threads} {reference} {reads} > {out_sam}",
+            bin = self.minimap2_bin,
+            preset = self.preset.as_str(),
+            threads = self.minimap2_threads,
+            reference = self.reference.display(),
+            reads = self.reads.display(),
+            out_sam = self.out_sam.display(),
+        )
+    }
+}
+
+impl SbatchRunner for Minimap2AlignOnt {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some("minimap2".to_string()),
+            cpus_per_task: Some(self.minimap2_threads.into()),
+            mem: self.minimap2_slurm_mem.clone(),
+            partition: Some("shortq".to_string()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for Minimap2AlignOnt {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl LocalRunner for Minimap2AlignOnt {}
+
+impl Minimap2AlignOnt {
+    pub fn init(
+        id: &str,
+        config: &Config,
+        reads: PathBuf,
+        final_bam: PathBuf,
+        preset: Minimap2Preset,
+    ) -> anyhow::Result<Self> {
+        let reference = match &preset {
+            Minimap2Preset::MapOntRef => {
+                let mut mmi_job = Minimap2BuildMmi::init(config, config.reference.clone().into());
+                mmi_job.run()?;
+                mmi_job.out_mmi
+            }
+            Minimap2Preset::Asm5(contig_fasta) => contig_fasta.clone(),
+        };
+
+        let log_dir = format!("{}/{}/log/minimap2", config.result_dir, &id);
+
+        let minimap2_slurm_mem = if config.slurm_runner {
+            Some(config.minimap2_slurm_mem.clone())
+        } else {
+            None
+        };
+
+        let tmp_dir = PathBuf::from(config.tmp_dir.clone());
+        let mut guard = TempFileGuard::new();
+        let out_sam = guard.tmp_path(".tmp.sam", &tmp_dir);
+
+        let sort_job = SamtoolsSort::from_config(config, &out_sam, &final_bam);
+        let index_job = SamtoolsIndex::from_config(config, final_bam.to_str().unwrap());
+
+        Ok(Self {
+            reads,
+            reference,
+            out_sam,
+            final_bam,
+            preset,
+            minimap2_bin: config.minimap2_bin.clone(),
+            minimap2_threads: config.minimap2_threads,
+            minimap2_slurm_mem,
+            guard,
+            log_dir,
+            sort_job,
+            index_job,
+        })
+    }
+}
+
+impl Run for Minimap2AlignOnt {
+    fn run(&mut self) -> anyhow::Result<()> {
+        // Align
+        let res = if self.minimap2_slurm_mem.is_some() {
+            SbatchRunner::exec(self)
+        } else {
+            LocalRunner::exec(self)
+        }?;
+        res.save_to_file(format!("{}/minimap2_", self.log_dir))?;
+
+        let res = if self.minimap2_slurm_mem.is_some() {
+            SbatchRunner::exec(&mut self.sort_job)
+        } else {
+            LocalRunner::exec(&mut self.sort_job)
+        }?;
+        res.save_to_file(format!("{}/samtools_sort_", self.log_dir))?;
+
+        let res = if self.minimap2_slurm_mem.is_some() {
+            SbatchRunner::exec(&mut self.index_job)
+        } else {
+            LocalRunner::exec(&mut self.index_job)
+        }?;
+        res.save_to_file(format!("{}/samtools_index_", self.log_dir))?;
+
+        self.guard.cleanup();
+
+        Ok(())
+    }
+}

+ 2 - 0
src/aligner/mod.rs

@@ -0,0 +1,2 @@
+pub mod minimap;
+

+ 1 - 1
src/commands/samtools.rs

@@ -721,7 +721,7 @@ impl SamtoolsSort {
 impl super::Command for SamtoolsSort {
     fn init(&mut self) -> anyhow::Result<()> {
         if !self.input_bam.exists() {
-            anyhow::bail!("No BAM file at: {}", self.input_bam.display());
+            anyhow::bail!("No BAM/SAM file at: {}", self.input_bam.display());
         }
 
         if self.output_bam.exists() {

+ 14 - 0
src/config.rs

@@ -521,6 +521,20 @@ pub struct Config {
 
     /// Path to VEP sorted GFF
     pub vep_gff: String,
+
+    // === Flye ===
+    /// "python_bin_path flye_bin_path" ex.: "/usr/bin/python /home/t_steimle/somatic_pipe_tools/Flye/bin/flye"
+    pub flye_bin: String,
+
+    /// Flye threads
+    pub flye_threads: u8,
+
+    /// Slurm max mem
+    pub flye_slurm_mem: String,
+
+    pub minimap2_bin: String,
+    pub minimap2_threads: u8,
+    pub minimap2_slurm_mem: String,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]

+ 281 - 0
src/de_novo/de_novo_pipe.rs

@@ -0,0 +1,281 @@
+use std::{
+    collections::{HashMap, HashSet},
+    path::{Path, PathBuf},
+};
+
+use anyhow::Context;
+use rust_htslib::bam::{self, Read};
+
+use crate::{
+    aligner::minimap::{Minimap2AlignOnt, Minimap2Preset},
+    commands::samtools::{SamtoolsIndex, SamtoolsSort},
+    config::Config,
+    de_novo::{Assembler, Polisher},
+    helpers::TempFileGuard,
+    io::{fasta::split_fasta, fastq::write_fastq},
+    run, run_many,
+    runners::Run,
+};
+
+/// Result for a single assembled contig.
+/// The function returns one of these per contig — caller reasons about them
+/// independently.
+pub struct ContigAssemblyResult {
+    pub contig_name: String,
+    /// Single-contig polished FASTA used as alignment reference.
+    pub contig_fasta: PathBuf,
+    /// Reads realigned to this contig with MM/ML/MN tags from original records.
+    pub aligned_bam: PathBuf,
+    /// Read names excluded because they produced supplementary alignments on
+    /// this contig. Non-empty signals that the input region should be tightened.
+    /// Note: a read suspicious on contig A but absent here was cleanly placed
+    /// on this contig — it likely bridges the two contigs.
+    pub suspicious_reads: Vec<Vec<u8>>,
+    pub contig_ref_bam: PathBuf, // contig → reference genome
+}
+
+// ─── Config ───────────────────────────────────────────────────────────────────
+
+pub struct LocalAssemblyConfig {
+    /// Minimum number of records required before attempting assembly.
+    /// Flye produces fragmented or empty output silently below ~30×.
+    /// Lower for targeted subclonal loci with expected sparse coverage.
+    /// Default: 30.
+    pub min_records: usize,
+    pub case_id: String,
+    pub config: Config,
+}
+
+impl LocalAssemblyConfig {
+    fn from_config(case_id: String, min_records: usize, config: Config) -> Self {
+        Self {
+            min_records,
+            case_id,
+            config,
+        }
+    }
+}
+
+// ─── Pipeline orchestrator ────────────────────────────────────────────────────
+
+/// Orchestrate local de novo assembly from a set of primary htslib records.
+///
+/// Returns one `ContigAssemblyResult` per contig emitted by the assembler.
+/// All path dependencies must be set at construction time by the caller.
+///
+/// Steps 1, 5, 6 run locally (pure Rust, Windows-safe).
+/// Steps 2–4 are dispatched via `run!` / `run_many!` through SlurmRunner.
+pub fn run_local_assembly(
+    records: &[bam::Record],
+    mut assembler: impl Assembler,
+    mut polisher: impl Polisher,
+    work_dir: PathBuf,
+    config: LocalAssemblyConfig,
+) -> anyhow::Result<Vec<ContigAssemblyResult>> {
+    if records.len() < config.min_records {
+        anyhow::bail!(
+            "Too few records for local assembly: {} (minimum {})",
+            records.len(),
+            config.min_records
+        );
+    }
+
+    // Step 1 — write FASTQ (local)
+    let reads_path = work_dir.join("reads.fastq");
+    write_fastq(records, &reads_path).context("FASTQ write failed")?;
+
+    // Step 2 — assemble (cluster)
+    run!(&config.config, &mut assembler)?;
+
+    // Step 3 — polish (cluster)
+    // draft set at construction to assembler.assembly_fasta()
+    run!(&config.config, &mut polisher)?;
+
+    // Step 4 — split polished FASTA into per-contig files (local)
+    let contigs_dir = work_dir.join("contigs");
+    let contigs =
+        split_fasta(&polisher.polished_fasta(), &contigs_dir).context("FASTA split failed")?;
+
+    if contigs.is_empty() {
+        anyhow::bail!("Assembler produced no contigs");
+    }
+
+    tracing::info!(
+        count = contigs.len(),
+        "Assembly produced {} contig(s)",
+        contigs.len()
+    );
+
+    let tmp_dir = PathBuf::from(config.config.tmp_dir.clone());
+    let mut guard = TempFileGuard::new();
+
+    // Step 5 — realign reads to each contig independently (cluster, sequential)
+    // Asm5(contig_fasta) — no MMI, reference is the single-contig FASTA
+    let mut realigners: Vec<Minimap2AlignOnt> = contigs
+        .iter()
+        .map(|c| {
+            Minimap2AlignOnt::init(
+                &config.case_id,
+                &config.config,
+                reads_path.clone(),
+                guard.tmp_path(".tmp.bam", &tmp_dir),
+                Minimap2Preset::Asm5(c.fasta_path.clone()),
+            )
+        })
+        .collect::<anyhow::Result<Vec<_>>>()?;
+
+    for realigner in &mut realigners {
+        realigner.run()?;
+    }
+
+    // Step 6 — align contigs to reference genome (cluster, sequential)
+    // MapOnt — MMI precomputed inside init if not already present
+    let mut contig_aligners: Vec<Minimap2AlignOnt> = contigs
+        .iter()
+        .map(|c| {
+            Minimap2AlignOnt::init(
+                &config.case_id,
+                &config.config,
+                c.fasta_path.clone(),
+                work_dir.join(format!("{}.ref.bam", c.name)),
+                Minimap2Preset::MapOntRef,
+            )
+        })
+        .collect::<anyhow::Result<Vec<_>>>()?;
+
+    for aligner in &mut contig_aligners {
+        aligner.run()?;
+    }
+
+    // Step 7 — per-contig: supplementary scan + tag transfer (local)
+    let original_index = index_records_by_name(records);
+
+    let results = contigs
+        .into_iter()
+        .zip(realigners.iter())
+        .zip(contig_aligners.iter())
+        .map(|((contig, realigner), contig_aligner)| {
+            let suspicious_qnames = collect_supplementary_qnames(&realigner.final_bam)
+                .with_context(|| {
+                    format!("Supplementary scan failed for contig {}", contig.name)
+                })?;
+
+            if !suspicious_qnames.is_empty() {
+                tracing::warn!(
+                    contig = %contig.name,
+                    count  = suspicious_qnames.len(),
+                    "Suspicious reads on contig — consider tightening input region"
+                );
+            }
+
+            let out_bam = guard.tmp_path(".tmp.bam", &tmp_dir);
+            transfer_methylation_tags(
+                &realigner.final_bam,
+                &original_index,
+                &suspicious_qnames,
+                &out_bam,
+            )
+            .with_context(|| {
+                format!("MM/ML tag transfer failed for contig {}", contig.name)
+            })?;
+
+            let final_bam = work_dir.join(format!("{}.bam", contig.name));
+            let mut sort_job = SamtoolsSort::from_config(&config.config, &out_bam, &final_bam);
+            let _log = run!(&config.config, &mut sort_job)?;
+
+            let mut index_job =
+                SamtoolsIndex::from_config(&config.config, final_bam.to_str().unwrap());
+            let _log = run!(&config.config, &mut index_job)?;
+
+            // contig_aligner.run() already performed sort + index internally
+            Ok(ContigAssemblyResult {
+                contig_name:      contig.name,
+                contig_fasta:     contig.fasta_path,
+                aligned_bam:      final_bam,
+                contig_ref_bam:   contig_aligner.final_bam.clone(),
+                suspicious_reads: suspicious_qnames.into_iter().collect(),
+            })
+        })
+        .collect::<anyhow::Result<Vec<_>>>()?;
+
+    guard.cleanup();
+
+    Ok(results)
+}
+
+// ─── Supplementary detection ──────────────────────────────────────────────────
+
+const BAM_FSUPPLEMENTARY: u16 = 0x800;
+
+/// Collect read names that have at least one supplementary alignment in `bam`.
+///
+/// Because the reference is a single contig, any supplementary alignment is
+/// unambiguously suspicious — the read spans a discontinuity within that contig,
+/// indicating the input record selection was too broad.
+fn collect_supplementary_qnames(bam: &Path) -> anyhow::Result<HashSet<Vec<u8>>> {
+    let mut reader = bam::Reader::from_path(bam)
+        .with_context(|| format!("Cannot open BAM: {}", bam.display()))?;
+
+    let mut supp_names = HashSet::new();
+
+    for result in reader.records() {
+        let rec = result.context("Error reading record during supplementary scan")?;
+        if rec.flags() & BAM_FSUPPLEMENTARY != 0 {
+            supp_names.insert(rec.qname().to_vec());
+        }
+    }
+
+    Ok(supp_names)
+}
+
+// ─── MM / ML tag transfer ─────────────────────────────────────────────────────
+
+/// Tags carrying base modification data.
+/// MN is the sequence length at call time — required for correct MM parsing.
+const METHYLATION_TAGS: &[&[u8]] = &[b"MM", b"ML", b"MN"];
+
+pub fn index_records_by_name(records: &[bam::Record]) -> HashMap<Vec<u8>, &bam::Record> {
+    records.iter().map(|r| (r.qname().to_vec(), r)).collect()
+}
+
+/// Copy MM/ML/MN tags from original records onto the realigned BAM records.
+/// Reads in `suspicious_qnames` are excluded from the output.
+fn transfer_methylation_tags(
+    raw_bam: &Path,
+    original_index: &HashMap<Vec<u8>, &bam::Record>,
+    suspicious_qnames: &HashSet<Vec<u8>>,
+    out_bam: &Path,
+) -> anyhow::Result<()> {
+    let mut reader = bam::Reader::from_path(raw_bam)
+        .with_context(|| format!("Cannot open BAM: {}", raw_bam.display()))?;
+    let header = bam::Header::from_template(reader.header());
+    let mut writer = bam::Writer::from_path(out_bam, &header, bam::Format::Bam)
+        .with_context(|| format!("Cannot create BAM: {}", out_bam.display()))?;
+
+    for result in reader.records() {
+        let mut rec = result.context("Error reading BAM record")?;
+
+        if suspicious_qnames.contains(rec.qname()) {
+            continue;
+        }
+
+        if let Some(&orig) = original_index.get(rec.qname()) {
+            for tag in METHYLATION_TAGS {
+                rec.remove_aux(tag)?;
+                if let Ok(aux) = orig.aux(tag) {
+                    rec.push_aux(tag, aux).with_context(|| {
+                        format!(
+                            "Failed to push {} tag for read {}",
+                            std::str::from_utf8(tag).unwrap_or("??"),
+                            std::str::from_utf8(rec.qname()).unwrap_or("??"),
+                        )
+                    })?;
+                }
+            }
+        }
+
+        writer.write(&rec).context("Error writing BAM record")?;
+    }
+
+    Ok(())
+}

+ 82 - 0
src/de_novo/flye.rs

@@ -0,0 +1,82 @@
+use std::path::{Path, PathBuf};
+
+use crate::{
+    commands::{Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner},
+    config::Config,
+    de_novo::Assembler,
+};
+
+pub struct FlyeAssembler {
+    pub reads: PathBuf,
+    pub out_dir: PathBuf,
+    pub genome_size: String,
+    pub threads: u8,
+    pub flye_bin: String,
+    pub flye_slurm_mem: String,
+}
+
+impl FlyeAssembler {
+    pub fn from_config(
+        config: &Config,
+        reads: PathBuf,
+        out_dir: PathBuf,
+        genome_size: String,
+    ) -> Self {
+        Self {
+            reads,
+            out_dir,
+            genome_size,
+            threads: config.flye_threads,
+            flye_bin: config.flye_bin.clone(),
+            flye_slurm_mem: config.flye_slurm_mem.clone(),
+        }
+    }
+}
+
+impl JobCommand for FlyeAssembler {
+    fn cmd(&self) -> String {
+        format!(
+            "{flye_bin} --nano-hq {reads} --out-dir {out_dir} --threads {threads} --genome-size {genome_size}",
+            flye_bin = self.flye_bin,
+            reads = self.reads.display(),
+            out_dir = self.out_dir.display(),
+            threads = self.threads,
+            genome_size = self.genome_size,
+        )
+    }
+}
+
+impl SbatchRunner for FlyeAssembler {
+    fn slurm_params(&self) -> crate::commands::SlurmParams {
+        SlurmParams {
+            job_name: Some("flye".to_string()),
+            cpus_per_task: Some(self.threads.into()),
+            mem: Some(self.flye_slurm_mem.clone()),
+            partition: Some("shortq".to_string()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for FlyeAssembler {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl LocalRunner for FlyeAssembler {}
+
+impl Assembler for FlyeAssembler {
+    fn reads_input(&self) -> &Path {
+        &self.reads
+    }
+    fn output_dir(&self) -> &Path {
+        &self.out_dir
+    }
+    fn assembly_fasta(&self) -> PathBuf {
+        self.out_dir.join("assembly.fasta")
+    }
+    fn assembly_graph(&self) -> Option<PathBuf> {
+        Some(self.out_dir.join("assembly_graph.gfa"))
+    }
+}

+ 75 - 0
src/de_novo/medaka.rs

@@ -0,0 +1,75 @@
+use std::path::{Path, PathBuf};
+
+use crate::{
+    commands::{Command as JobCommand, LocalRunner, SbatchRunner, SlurmParams, SlurmRunner},
+    de_novo::Polisher,
+};
+
+pub struct MedakaPolisher {
+    pub reads: PathBuf,
+    /// Set at construction from assembler.assembly_fasta()
+    pub draft: PathBuf,
+    pub out_dir: PathBuf,
+    pub model: String,
+    pub threads: u32,
+    pub conda_sh: String,
+    pub slurm_mem: String,
+    pub medaka_env: String,
+    pub medaka_consensus_bin: String,
+}
+
+impl JobCommand for MedakaPolisher {
+    fn cmd(&self) -> String {
+        format!(
+            "source {conda_sh} && conda activate {medaka_env} && {medaka_consensus_bin} \
+                -i {reads} \
+                -d {draft} \
+                -o {out_dir} \
+                -t {threads} \
+                -m {model}",
+            conda_sh = self.conda_sh,
+            medaka_env = self.medaka_env,
+            medaka_consensus_bin = self.medaka_consensus_bin,
+            reads = self.reads.to_str().unwrap(),
+            draft = self.draft.to_str().unwrap(),
+            out_dir = self.out_dir.to_str().unwrap(),
+            threads = self.threads,
+            model = self.model,
+        )
+    }
+}
+
+impl Polisher for MedakaPolisher {
+    fn reads_input(&self) -> &Path {
+        &self.reads
+    }
+    fn draft_input(&self) -> &Path {
+        &self.draft
+    }
+    fn output_dir(&self) -> &Path {
+        &self.out_dir
+    }
+    fn polished_fasta(&self) -> PathBuf {
+        self.out_dir.join("consensus.fasta")
+    }
+}
+
+impl SbatchRunner for MedakaPolisher {
+    fn slurm_params(&self) -> SlurmParams {
+        SlurmParams {
+            job_name: Some("medaka".to_string()),
+            cpus_per_task: Some(self.threads),
+            mem: Some(self.slurm_mem.clone()),
+            partition: Some("shortq".to_string()),
+            gres: None,
+        }
+    }
+}
+
+impl SlurmRunner for MedakaPolisher {
+    fn slurm_args(&self) -> Vec<String> {
+        self.slurm_params().to_args()
+    }
+}
+
+impl LocalRunner for MedakaPolisher {}

+ 21 - 0
src/de_novo/mod.rs

@@ -0,0 +1,21 @@
+pub mod flye;
+pub mod de_novo_pipe;
+pub mod medaka;
+
+use crate::commands::{Command as JobCommand, LocalRunner, SbatchRunner, SlurmRunner};
+use std::path::{Path, PathBuf};
+
+// ─── Assembler / Polisher traits ──────────────────────────────────────────────
+pub trait Assembler: JobCommand + SlurmRunner + SbatchRunner + LocalRunner {
+    fn reads_input(&self) -> &Path;
+    fn output_dir(&self) -> &Path;
+    fn assembly_fasta(&self) -> PathBuf;
+    fn assembly_graph(&self) -> Option<PathBuf>;
+}
+
+pub trait Polisher: JobCommand + SlurmRunner + SbatchRunner + LocalRunner {
+    fn reads_input(&self) -> &Path;
+    fn draft_input(&self) -> &Path;
+    fn output_dir(&self) -> &Path;
+    fn polished_fasta(&self) -> PathBuf;
+}

+ 52 - 1
src/io/fasta.rs

@@ -1,5 +1,7 @@
-use std::{fs::File, path::Path};
+use std::io::{BufRead, Write};
+use std::{fs::File, io::BufReader, path::{Path, PathBuf}};
 
+use anyhow::Context;
 use noodles_fasta::io::{BufReader as NoodlesBufReader, IndexedReader};
 
 /// Open FASTA with its .fai index.
@@ -57,3 +59,52 @@ pub fn sequence_range(
 
     Ok(s)
 }
+
+pub struct ContigFasta {
+    pub name:       String,
+    pub fasta_path: PathBuf,
+}
+ 
+/// Split a multi-contig FASTA into individual single-contig files under `out_dir`.
+/// Returns one entry per contig in the order they appear in the input.
+pub fn split_fasta(fasta: &Path, out_dir: &Path) -> anyhow::Result<Vec<ContigFasta>> {
+    std::fs::create_dir_all(out_dir)
+        .with_context(|| format!("Cannot create contig dir: {}", out_dir.display()))?;
+ 
+    let f = std::fs::File::open(fasta)
+        .with_context(|| format!("Cannot open FASTA: {}", fasta.display()))?;
+    let reader = BufReader::new(f);
+ 
+    let mut contigs  = Vec::new();
+    let mut current_name: Option<String> = None;
+    let mut current_writer: Option<std::fs::File> = None;
+ 
+    for line in reader.lines() {
+        let line = line.context("Error reading FASTA")?;
+        if let Some(name) = line.strip_prefix('>') {
+            // Sanitise contig name for use as filename
+            let safe_name: String = name
+                .split_whitespace()
+                .next()
+                .unwrap_or("contig")
+                .chars()
+                .map(|c| if c.is_alphanumeric() || c == '_' || c == '-' { c } else { '_' })
+                .collect();
+ 
+            let path = out_dir.join(format!("{}.fasta", safe_name));
+            let mut w = std::fs::File::create(&path)
+                .with_context(|| format!("Cannot create contig FASTA: {}", path.display()))?;
+            writeln!(w, ">{}", name)?;
+ 
+            contigs.push(ContigFasta { name: safe_name.clone(), fasta_path: path });
+            current_name   = Some(safe_name);
+            current_writer = Some(w);
+        } else if let Some(ref mut w) = current_writer {
+            writeln!(w, "{}", line)?;
+        }
+    }
+ 
+    let _ = current_name; // suppress unused warning
+    Ok(contigs)
+}
+

+ 35 - 0
src/io/fastq.rs

@@ -0,0 +1,35 @@
+use std::{io::Write, path::Path};
+
+use anyhow::Context;
+use rust_htslib::bam;
+
+// ─── FASTQ writer ─────────────────────────────────────────────────────────────
+ 
+/// Write htslib records to FASTQ.
+///
+/// - htslib always stores the forward-strand sequence regardless of flag
+/// - Converts phred+0 qual to phred+33 ASCII
+/// - Skips records with empty sequence
+pub fn write_fastq(records: &[bam::Record], out: &Path) -> anyhow::Result<()> {
+    let mut f = std::fs::File::create(out)
+        .with_context(|| format!("Cannot create FASTQ: {}", out.display()))?;
+ 
+    for rec in records {
+        let seq = rec.seq().as_bytes();
+        if seq.is_empty() {
+            continue;
+        }
+ 
+        let name       = std::str::from_utf8(rec.qname()).context("Non-UTF8 read name")?;
+        let qual_ascii: Vec<u8> = rec.qual().iter().map(|&q| q + 33).collect();
+ 
+        writeln!(f, "@{}", name)?;
+        f.write_all(&seq)?;
+        writeln!(f)?;
+        writeln!(f, "+")?;
+        f.write_all(&qual_ascii)?;
+        writeln!(f)?;
+    }
+ 
+    Ok(())
+}

+ 1 - 0
src/io/mod.rs

@@ -12,3 +12,4 @@ pub mod straglr;
 pub mod tsv;
 pub mod modkit;
 pub mod liftover;
+pub mod fastq;

+ 2 - 0
src/lib.rs

@@ -148,6 +148,8 @@ pub mod runners;
 pub mod scan;
 pub mod slurm_helpers;
 pub mod variant;
+pub mod de_novo;
+pub mod aligner;
 
 #[macro_use]
 extern crate lazy_static;