|
|
@@ -80,66 +80,137 @@ fn collect_bams_recursively(root: &Path) -> anyhow::Result<Vec<PathBuf>> {
|
|
|
Ok(bams)
|
|
|
}
|
|
|
|
|
|
-/// Run the full Nanopore POD5 → aligned, sorted and indexed BAM pipeline for a single run.
|
|
|
+/// Run the full POD5 → unaligned BAM → aligned, merged, sorted and indexed BAM
|
|
|
+/// pipeline for a single sequencing run on a flowcell.
|
|
|
///
|
|
|
-/// This function orchestrates the end-to-end processing of a `Pod5sRun`,
|
|
|
-/// starting from raw POD5 files and producing per–case, per–sample-type
|
|
|
-/// coordinate–sorted, indexed BAM files in `config.result_dir`.
|
|
|
+/// This function orchestrates a five–step process:
|
|
|
///
|
|
|
-/// High-level steps:
|
|
|
+/// 1. **Inspect cases and define barcode mappings**
|
|
|
+/// - Verifies that `run` has at least one associated case.
|
|
|
+/// - Logs all cases and builds a `barcode → case` map using
|
|
|
+/// [`normalize_barcode`] on each case’s `barcode`.
|
|
|
///
|
|
|
-/// 1. **Basecalling**
|
|
|
-/// All POD5 files from the run are basecalled with `DoradoBasecall` into a
|
|
|
-/// single temporary BAM in `tmp_dir`.
|
|
|
+/// 2. **Reuse existing unaligned BAMs and decide which POD5 files to basecall**
|
|
|
+/// - If `run.bams_pass` is set, recursively collects all BAM files under
|
|
|
+/// that directory via [`collect_bams_recursively`] and treats them as
|
|
|
+/// already split per barcode. The BAM file stem (filename without
|
|
|
+/// extension) is used as the POD5 stem key.
|
|
|
+/// - For every POD5 in `run.pod5s`, compares its file stem against the set
|
|
|
+/// of existing BAM stems:
|
|
|
+/// * if a matching BAM exists, the POD5 is considered already
|
|
|
+/// basecalled and is **not** reprocessed;
|
|
|
+/// * otherwise, the POD5 is queued for basecalling.
|
|
|
///
|
|
|
-/// 2. **Splitting by read group**
|
|
|
-/// The basecalled BAM is split by read group using `SamtoolsSplit`, producing
|
|
|
-/// one BAM per barcode/read group in a temporary split directory.
|
|
|
+/// 3. **Basecall missing POD5 files and split by read group (if needed)**
|
|
|
+/// - If there are POD5 files without corresponding BAMs:
|
|
|
+/// * Creates a temporary directory under `tmp_dir` containing only
|
|
|
+/// symlinks to the required POD5s (Linux/Unix only).
|
|
|
+/// * Basecalls this mini–run using [`DoradoBasecall::from_config`],
|
|
|
+/// producing a temporary combined BAM.
|
|
|
+/// * Splits the combined BAM by read group into per–read–group BAMs
|
|
|
+/// (typically per–barcode) via [`SamtoolsSplit::from_config`].
|
|
|
+/// * Deletes the combined basecalled BAM and adds all split BAMs to the
|
|
|
+/// pool of unaligned BAMs (`split_bams`).
|
|
|
+/// - If all POD5 files already had corresponding BAMs, no basecalling is
|
|
|
+/// performed.
|
|
|
///
|
|
|
-/// 3. **Barcode → case mapping and alignment job setup**
|
|
|
-/// Each split BAM is:
|
|
|
-/// - matched to a `IdInput` case via its (normalized) barcode,
|
|
|
-/// - assigned to a per–case / per–sample-type output directory under
|
|
|
-/// `config.result_dir`, and
|
|
|
-/// - turned into a `DoradoAlign` job that aligns the reads to the configured
|
|
|
-/// reference.
|
|
|
+/// 4. **Match unaligned BAMs to cases and schedule alignment jobs**
|
|
|
+/// - Iterates over all unaligned split BAMs:
|
|
|
+/// * Skips files smaller than or equal to `min_bam_size` bytes to avoid
|
|
|
+/// trivial or corrupted inputs.
|
|
|
+/// * Extracts and normalizes a barcode from the BAM filename using
|
|
|
+/// [`extract_and_normalize_barcode`].
|
|
|
+/// * If the normalized barcode matches a case from the earlier
|
|
|
+/// `barcode → case` map:
|
|
|
+/// - Computes a sample–type–specific subdirectory with
|
|
|
+/// [`map_sample_type_to_dir`].
|
|
|
+/// - Ensures that
|
|
|
+/// `config.result_dir/<case_id>/<sample_type_dir>` exists.
|
|
|
+/// - Allocates a temporary output BAM path in `tmp_dir` for the
|
|
|
+/// aligned BAM.
|
|
|
+/// - Constructs a [`DoradoAlign`] job from `config`, the unaligned
|
|
|
+/// BAM, and the temporary aligned BAM path.
|
|
|
+/// - Groups the aligned BAM paths in an internal map keyed by
|
|
|
+/// `<case_id>_<sample_type_dir>` to later merge per
|
|
|
+/// case/sample–type.
|
|
|
+/// * BAMs whose barcode cannot be extracted or does not match any case
|
|
|
+/// are recorded as unmatched and logged.
|
|
|
+/// - If there are both cases without barcodes and unmatched BAMs, a warning
|
|
|
+/// is emitted indicating that manual association may be required.
|
|
|
+/// - If no alignment jobs are scheduled, the function bails with an error
|
|
|
+/// indicating that there are no BAM files to align for this run.
|
|
|
+/// - Otherwise, all alignment jobs are submitted and executed via
|
|
|
+/// [`run_many_sbatch`], which is expected to block until completion.
|
|
|
///
|
|
|
-/// The paths of aligned BAMs are tracked in `case_bam_map` for later merging.
|
|
|
-/// Split BAMs below `min_bam_size` are skipped.
|
|
|
+/// 5. **Merge, sort, and index final case–level BAMs**
|
|
|
+/// - For each case with a non-empty barcode:
|
|
|
+/// * Derives a key `<case_id>_<sample_type_dir>` and looks up the
|
|
|
+/// corresponding aligned BAMs from the internal map.
|
|
|
+/// * If no aligned BAMs are found, a warning is logged and the case is
|
|
|
+/// skipped.
|
|
|
+/// * Otherwise, constructs a final BAM path:
|
|
|
+/// `config.result_dir/<case_id>/<sample_type_dir>/\
|
|
|
+/// <case_id>_<sample_type_dir>_<reference_name>.bam`.
|
|
|
+/// * For the first aligned BAM:
|
|
|
+/// - Moves it into place as the initial `final_bam_path`
|
|
|
+/// (`fs::rename`).
|
|
|
+/// - Indexes it using [`SamtoolsIndex`].
|
|
|
+/// * For subsequent aligned BAMs:
|
|
|
+/// - Merges them into `final_bam_path` using [`SamtoolsMerge`].
|
|
|
+/// * After all aligned BAMs for a case/sample–type are merged:
|
|
|
+/// - Sorts the final BAM into a temporary
|
|
|
+/// `*.sorted.bam` file via [`SamtoolsSort`].
|
|
|
+/// - Renames the sorted BAM back to `final_bam_path`.
|
|
|
+/// - Indexes the sorted BAM using [`SamtoolsIndex`] again.
|
|
|
+/// - The resulting, per–case, per–sample–type BAMs are aligned, merged,
|
|
|
+/// coordinate–sorted, and indexed, ready for downstream analysis.
|
|
|
///
|
|
|
-/// 4. **Alignment**
|
|
|
-/// All `DoradoAlign` jobs are submitted via `run_many_sbatch`. At the end of
|
|
|
-/// this step, each case / sample-type may have one or more temporary aligned
|
|
|
-/// BAMs in `tmp_dir`.
|
|
|
+/// 6. **Cleanup**
|
|
|
+/// - Removes the temporary split BAM directory and temporary POD5 directory
|
|
|
+/// created in step 3, if they exist.
|
|
|
+/// - Logs a final summary including the number of cases and the number of
|
|
|
+/// case/sample–type groups that produced final BAMs.
|
|
|
///
|
|
|
-/// 5. **Merge, sort and index final BAMs**
|
|
|
-/// For each case with a barcode:
|
|
|
-/// - all aligned BAMs for a given `(case_id, sample_type)` key are merged
|
|
|
-/// into a single BAM at:
|
|
|
-/// `config.result_dir/<case_id>/<sample_type_dir>/<case>_<sample_type>_<reference>.bam`
|
|
|
-/// using `SamtoolsMerge`,
|
|
|
-/// - the merged BAM is coordinate–sorted with `SamtoolsSort`, and
|
|
|
-/// - the sorted BAM is indexed with `SamtoolsIndex`.
|
|
|
+/// # Arguments
|
|
|
///
|
|
|
-/// Finally, all temporary split files and directories are removed.
|
|
|
-/// The function fails early if:
|
|
|
-/// - the run has no cases,
|
|
|
-/// - no BAMs are eligible for alignment, or
|
|
|
-/// - any of the Slurm / samtools / filesystem operations fail.
|
|
|
+/// * `run` – Description of a sequencing run, including POD5 input files,
|
|
|
+/// optional existing unaligned BAMs (`bams_pass`), and associated cases with
|
|
|
+/// barcodes and sample types.
|
|
|
+/// * `config` – Global configuration object with paths, reference name,
|
|
|
+/// and command–line options for Dorado, samtools and Slurm submission.
|
|
|
+/// * `tmp_dir` – Scratch directory used to create temporary POD5 directories,
|
|
|
+/// intermediate unaligned/ aligned BAMs, and split BAMs. It should be on a
|
|
|
+/// filesystem accessible to the Slurm jobs and support `rename`.
|
|
|
+/// * `min_bam_size` – Minimum BAM file size (in bytes) required for an
|
|
|
+/// unaligned BAM to be considered for alignment. Smaller files are skipped.
|
|
|
///
|
|
|
-/// # Arguments
|
|
|
+/// # Returns
|
|
|
///
|
|
|
-/// * `run` – Description of the POD5 run, associated cases and barcodes.
|
|
|
-/// * `config` – Global pipeline configuration (paths, samtools/dorado binaries,
|
|
|
-/// Slurm parameters, reference name, etc.).
|
|
|
-/// * `tmp_dir` – Directory for intermediate BAMs (basecalled, split and aligned).
|
|
|
-/// * `min_bam_size` – Minimum size (in bytes) for a split BAM to be kept and aligned.
|
|
|
+/// Returns `Ok(())` if all stages of the pipeline complete successfully and
|
|
|
+/// final BAMs are produced (where possible) for the run’s cases. On any error,
|
|
|
+/// returns an [`anyhow::Error`] with contextual information.
|
|
|
///
|
|
|
/// # Errors
|
|
|
///
|
|
|
-/// Returns an error if the run has no associated cases, if no BAM files are
|
|
|
-/// eligible for alignment, or if any external command (Dorado, samtools) or
|
|
|
-/// filesystem operation fails.
|
|
|
+/// This function will return an error if:
|
|
|
+///
|
|
|
+/// * The run has no associated cases.
|
|
|
+/// * Collecting existing BAMs, creating temporary directories, or symlinking
|
|
|
+/// POD5 files fails.
|
|
|
+/// * Any external command submission or execution (Dorado basecalling,
|
|
|
+/// samtools split/merge/sort/index, alignment jobs) fails.
|
|
|
+/// * Merging, sorting, indexing, or moving BAM files fails.
|
|
|
+/// * Cleanup of temporary directories fails.
|
|
|
+///
|
|
|
+/// # Notes
|
|
|
+///
|
|
|
+/// * This function assumes a Unix-like environment for the use of filesystem
|
|
|
+/// symlinks (`std::os::unix::fs::symlink`).
|
|
|
+/// * Duplicate normalized barcodes in `run.cases` will cause later entries to
|
|
|
+/// overwrite earlier ones in the `barcode → case` mapping.
|
|
|
+/// * BAM indexing is currently performed both before and after sorting; this is
|
|
|
+/// redundant but harmless and can be optimized by indexing only the final
|
|
|
+/// sorted BAM.
|
|
|
pub fn basecall_align_split(
|
|
|
run: &Pod5sRun,
|
|
|
config: &Config,
|