Browse Source

Update runs for unligned BAM files and not only POD5

Thomas 2 weeks ago
parent
commit
40524fefc7
2 changed files with 124 additions and 47 deletions
  1. 7 1
      src/commands/mod.rs
  2. 117 46
      src/pipes/somatic_slurm.rs

+ 7 - 1
src/commands/mod.rs

@@ -336,6 +336,9 @@ impl SlurmParams {
         args
     }
 }
+fn printable(s: &str) -> String {
+    s.chars().filter(|c| !c.is_control()).collect()
+}
 
 pub trait SbatchRunner: Command {
     fn sbatch_bin(&self) -> &str {
@@ -420,7 +423,10 @@ pub trait SbatchRunner: Command {
                             let mut remaining = Vec::new();
                             if f.read_to_end(&mut remaining).is_ok() && !remaining.is_empty() {
                                 let s = String::from_utf8_lossy(&remaining);
-                                s.split("\n").for_each(|v| info!("{job_id_clone}: {v}"));
+                                s.split("\n")
+                                    .map(printable)
+                                    .filter(|v| !v.is_empty())
+                                    .for_each(|v| info!("{job_id_clone}: {v}"));
                                 if let Ok(mut c) = captured_stdout_clone.lock() {
                                     c.push_str(&s);
                                 }

+ 117 - 46
src/pipes/somatic_slurm.rs

@@ -80,66 +80,137 @@ fn collect_bams_recursively(root: &Path) -> anyhow::Result<Vec<PathBuf>> {
     Ok(bams)
 }
 
-/// Run the full Nanopore POD5 → aligned, sorted and indexed BAM pipeline for a single run.
+/// Run the full POD5 → unaligned BAM → aligned, merged, sorted and indexed BAM
+/// pipeline for a single sequencing run on a flowcell.
 ///
-/// This function orchestrates the end-to-end processing of a `Pod5sRun`,
-/// starting from raw POD5 files and producing per–case, per–sample-type
-/// coordinate–sorted, indexed BAM files in `config.result_dir`.
+/// This function orchestrates a five–step process:
 ///
-/// High-level steps:
+/// 1. **Inspect cases and define barcode mappings**
+///    - Verifies that `run` has at least one associated case.
+///    - Logs all cases and builds a `barcode → case` map using
+///      [`normalize_barcode`] on each case’s `barcode`.
 ///
-/// 1. **Basecalling**  
-///    All POD5 files from the run are basecalled with `DoradoBasecall` into a
-///    single temporary BAM in `tmp_dir`.
+/// 2. **Reuse existing unaligned BAMs and decide which POD5 files to basecall**
+///    - If `run.bams_pass` is set, recursively collects all BAM files under
+///      that directory via [`collect_bams_recursively`] and treats them as
+///      already split per barcode. The BAM file stem (filename without
+///      extension) is used as the POD5 stem key.
+///    - For every POD5 in `run.pod5s`, compares its file stem against the set
+///      of existing BAM stems:
+///        * if a matching BAM exists, the POD5 is considered already
+///          basecalled and is **not** reprocessed;
+///        * otherwise, the POD5 is queued for basecalling.
 ///
-/// 2. **Splitting by read group**  
-///    The basecalled BAM is split by read group using `SamtoolsSplit`, producing
-///    one BAM per barcode/read group in a temporary split directory.
+/// 3. **Basecall missing POD5 files and split by read group (if needed)**
+///    - If there are POD5 files without corresponding BAMs:
+///        * Creates a temporary directory under `tmp_dir` containing only
+///          symlinks to the required POD5s (Linux/Unix only).
+///        * Basecalls this mini–run using [`DoradoBasecall::from_config`],
+///          producing a temporary combined BAM.
+///        * Splits the combined BAM by read group into per–read–group BAMs
+///          (typically per–barcode) via [`SamtoolsSplit::from_config`].
+///        * Deletes the combined basecalled BAM and adds all split BAMs to the
+///          pool of unaligned BAMs (`split_bams`).
+///    - If all POD5 files already had corresponding BAMs, no basecalling is
+///      performed.
 ///
-/// 3. **Barcode → case mapping and alignment job setup**  
-///    Each split BAM is:
-///    - matched to a `IdInput` case via its (normalized) barcode,  
-///    - assigned to a per–case / per–sample-type output directory under
-///      `config.result_dir`, and  
-///    - turned into a `DoradoAlign` job that aligns the reads to the configured
-///      reference.  
+/// 4. **Match unaligned BAMs to cases and schedule alignment jobs**
+///    - Iterates over all unaligned split BAMs:
+///        * Skips files smaller than or equal to `min_bam_size` bytes to avoid
+///          trivial or corrupted inputs.
+///        * Extracts and normalizes a barcode from the BAM filename using
+///          [`extract_and_normalize_barcode`].
+///        * If the normalized barcode matches a case from the earlier
+///          `barcode → case` map:
+///            - Computes a sample–type–specific subdirectory with
+///              [`map_sample_type_to_dir`].
+///            - Ensures that
+///              `config.result_dir/<case_id>/<sample_type_dir>` exists.
+///            - Allocates a temporary output BAM path in `tmp_dir` for the
+///              aligned BAM.
+///            - Constructs a [`DoradoAlign`] job from `config`, the unaligned
+///              BAM, and the temporary aligned BAM path.
+///            - Groups the aligned BAM paths in an internal map keyed by
+///              `<case_id>_<sample_type_dir>` to later merge per
+///              case/sample–type.
+///        * BAMs whose barcode cannot be extracted or does not match any case
+///          are recorded as unmatched and logged.
+///    - If there are both cases without barcodes and unmatched BAMs, a warning
+///      is emitted indicating that manual association may be required.
+///    - If no alignment jobs are scheduled, the function bails with an error
+///      indicating that there are no BAM files to align for this run.
+///    - Otherwise, all alignment jobs are submitted and executed via
+///      [`run_many_sbatch`], which is expected to block until completion.
 ///
-///    The paths of aligned BAMs are tracked in `case_bam_map` for later merging.
-///    Split BAMs below `min_bam_size` are skipped.
+/// 5. **Merge, sort, and index final case–level BAMs**
+///    - For each case with a non-empty barcode:
+///        * Derives a key `<case_id>_<sample_type_dir>` and looks up the
+///          corresponding aligned BAMs from the internal map.
+///        * If no aligned BAMs are found, a warning is logged and the case is
+///          skipped.
+///        * Otherwise, constructs a final BAM path:
+///          `config.result_dir/<case_id>/<sample_type_dir>/\
+///           <case_id>_<sample_type_dir>_<reference_name>.bam`.
+///        * For the first aligned BAM:
+///            - Moves it into place as the initial `final_bam_path`
+///              (`fs::rename`).
+///            - Indexes it using [`SamtoolsIndex`].
+///        * For subsequent aligned BAMs:
+///            - Merges them into `final_bam_path` using [`SamtoolsMerge`].
+///        * After all aligned BAMs for a case/sample–type are merged:
+///            - Sorts the final BAM into a temporary
+///              `*.sorted.bam` file via [`SamtoolsSort`].
+///            - Renames the sorted BAM back to `final_bam_path`.
+///            - Indexes the sorted BAM using [`SamtoolsIndex`] again.
+///    - The resulting, per–case, per–sample–type BAMs are aligned, merged,
+///      coordinate–sorted, and indexed, ready for downstream analysis.
 ///
-/// 4. **Alignment**  
-///    All `DoradoAlign` jobs are submitted via `run_many_sbatch`. At the end of
-///    this step, each case / sample-type may have one or more temporary aligned
-///    BAMs in `tmp_dir`.
+/// 6. **Cleanup**
+///    - Removes the temporary split BAM directory and temporary POD5 directory
+///      created in step 3, if they exist.
+///    - Logs a final summary including the number of cases and the number of
+///      case/sample–type groups that produced final BAMs.
 ///
-/// 5. **Merge, sort and index final BAMs**  
-///    For each case with a barcode:
-///    - all aligned BAMs for a given `(case_id, sample_type)` key are merged
-///      into a single BAM at:
-///      `config.result_dir/<case_id>/<sample_type_dir>/<case>_<sample_type>_<reference>.bam`  
-///      using `SamtoolsMerge`,
-///    - the merged BAM is coordinate–sorted with `SamtoolsSort`, and  
-///    - the sorted BAM is indexed with `SamtoolsIndex`.
+/// # Arguments
 ///
-/// Finally, all temporary split files and directories are removed.  
-/// The function fails early if:
-/// - the run has no cases,  
-/// - no BAMs are eligible for alignment, or  
-/// - any of the Slurm / samtools / filesystem operations fail.
+/// * `run` – Description of a sequencing run, including POD5 input files,
+///   optional existing unaligned BAMs (`bams_pass`), and associated cases with
+///   barcodes and sample types.
+/// * `config` – Global configuration object with paths, reference name,
+///   and command–line options for Dorado, samtools and Slurm submission.
+/// * `tmp_dir` – Scratch directory used to create temporary POD5 directories,
+///   intermediate unaligned/ aligned BAMs, and split BAMs. It should be on a
+///   filesystem accessible to the Slurm jobs and support `rename`.
+/// * `min_bam_size` – Minimum BAM file size (in bytes) required for an
+///   unaligned BAM to be considered for alignment. Smaller files are skipped.
 ///
-/// # Arguments
+/// # Returns
 ///
-/// * `run` – Description of the POD5 run, associated cases and barcodes.
-/// * `config` – Global pipeline configuration (paths, samtools/dorado binaries,
-///   Slurm parameters, reference name, etc.).
-/// * `tmp_dir` – Directory for intermediate BAMs (basecalled, split and aligned).
-/// * `min_bam_size` – Minimum size (in bytes) for a split BAM to be kept and aligned.
+/// Returns `Ok(())` if all stages of the pipeline complete successfully and
+/// final BAMs are produced (where possible) for the run’s cases. On any error,
+/// returns an [`anyhow::Error`] with contextual information.
 ///
 /// # Errors
 ///
-/// Returns an error if the run has no associated cases, if no BAM files are
-/// eligible for alignment, or if any external command (Dorado, samtools) or
-/// filesystem operation fails.
+/// This function will return an error if:
+///
+/// * The run has no associated cases.
+/// * Collecting existing BAMs, creating temporary directories, or symlinking
+///   POD5 files fails.
+/// * Any external command submission or execution (Dorado basecalling,
+///   samtools split/merge/sort/index, alignment jobs) fails.
+/// * Merging, sorting, indexing, or moving BAM files fails.
+/// * Cleanup of temporary directories fails.
+///
+/// # Notes
+///
+/// * This function assumes a Unix-like environment for the use of filesystem
+///   symlinks (`std::os::unix::fs::symlink`).
+/// * Duplicate normalized barcodes in `run.cases` will cause later entries to
+///   overwrite earlier ones in the `barcode → case` mapping.
+/// * BAM indexing is currently performed both before and after sorting; this is
+///   redundant but harmless and can be optimized by indexing only the final
+///   sorted BAM.
 pub fn basecall_align_split(
     run: &Pod5sRun,
     config: &Config,