# Thomas / pandora_lib_promethion
# Pandora configuration template

#######################################
# General filesystem layout / I/O
#######################################

# Root directory where all results will be written.
result_dir = "/mnt/beegfs02/scratch/t_steimle/data/wgs"

# Temporary directory.
tmp_dir = "/mnt/beegfs02/scratch/t_steimle/tmp"

# Should use Slurm as runner
slurm_runner = true

# Slurm max parallel jobs
slurm_max_par = 20

# Run cache directory.
run_cache_dir = "/home/t_steimle/data/prom_runs"

# Software threads
threads = 5

# Singularity bin
singularity_bin = "module load singularity-ce && singularity"

# Path to the conda activation script.
conda_sh = "/mnt/beegfs02/software/recherche/miniconda/25.1.1/etc/profile.d/conda.sh"

#######################################
# Reference genome & annotations
#######################################

# Reference FASTA used throughout the pipeline.
reference = "/home/t_steimle/ref/hs1/chm13v2.0.fa"

# Short reference name used in filenames.
reference_name = "hs1"

# Pseudoautosomal regions (PARs) BED file.
pseudoautosomal_regions_bed = "/home/t_steimle/ref/hs1/chm13v2.0_PAR.bed"

# Sequence dictionary (.dict) for the reference.
dict_file = "/home/t_steimle/ref/hs1/chm13v2.0.dict"

# RefSeq GFF3 annotation (sorted/indexed).
refseq_gff = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz"

# dbSNP vcf.gz file (should be indexed)
db_snp = "/home/t_steimle/ref/hs1/chm13v2.0_dbSNPv155.vcf.gz"

# BED file with gene names in the 4th column (should be sorted).
genes_bed = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_Genes.bed"

# Cytobands BED file
cytobands_bed = "/home/t_steimle/ref/hs1/chm13v2.0_cytobands_allchrs.bed"

# Chromosome alias file
# ex: https://hgdownload.soe.ucsc.edu/hubs/GCA/009/914/755/GCA_009914755.4/GCA_009914755.4.chromAlias.txt
chromosomes_alias = "/home/t_steimle/ref/hs1/GCA_009914755.4.chromAlias.txt"

# Template for mask BED file (low-quality / filtered regions).
# {result_dir} -> global result directory
# {id}         -> case identifier
mask_bed = "{result_dir}/{id}/diag/mask.bed"

# Panels of interest: [ [name, bed_path], ... ]
panels = [
	["CM", "/home/t_steimle/ref/hs1/panel_cm_hs1.bed"],
]

# BED file of repeat regions.
# NOTE(review): presumably repeat annotations on the CHM13 (hs1) reference,
# judging by the file name — confirm which pipeline steps consume it.
repeats_bed = "/home/t_steimle/ref/hs1/all_repeats_chm13_final.bed"

#######################################
# Sample naming / BAM handling
#######################################

# Tumor sample label (used in paths & filenames).
tumoral_name = "diag"

# Normal sample label.
normal_name = "norm"

# BAM tag name used for haplotagged reads.
haplotagged_bam_tag_name = "HP"

# Minimum MAPQ for reads kept during BAM filtering.
bam_min_mapq = 40

# Number of threads for hts BAM reader decompression (should be adapted to I/O speed).
bam_n_threads = 4

# Number of reads sampled for BAM composition estimation.
bam_composition_sample_size = 20000

#######################################
# Coverage counting / somatic-scan
#######################################

# Name of directory (under each sample dir) where counts are stored.
count_dir_name = "counts"

# Bin size (bp) for count files.
count_bin_size = 1000

# Number of chunks used to split contigs for counting.
count_n_chunks = 1000

# Force recomputation of counting even if outputs exist.
somatic_scan_force = false

#######################################
# Somatic pipeline global settings
#######################################

# Force recomputation of the entire somatic pipeline.
somatic_pipe_force = true

# Default thread count for heavy tools.
somatic_pipe_threads = 15

# Template for somatic pipeline statistics directory.
# {result_dir}, {id}
somatic_pipe_stats = "{result_dir}/{id}/diag/somatic_pipe_stats"

#######################################
# Filtering / QC thresholds
#######################################

# Minimum depth in constitutional sample to consider site evaluable.
somatic_min_constit_depth = 5

# Maximum allowed ALT count in constitutional sample for a somatic call.
somatic_max_alt_constit = 1

# Window size (bp) for sequence entropy around variants.
entropy_seq_len = 10

# Minimum Shannon entropy threshold.
min_shannon_entropy = 1.0

# Max depth considered "low quality".
max_depth_low_quality = 20

# Min depth considered "high quality".
min_high_quality_depth = 14

# Minimum number of callers required to keep a variant.
min_n_callers = 1

#######################################
# DeepVariant configuration
#######################################

# DeepVariant output directory template.
# {result_dir}, {id}, {time}
deepvariant_output_dir = "{result_dir}/{id}/{time}/DeepVariant"

# Threads for DeepVariant.
deepvariant_threads = 20

# DeepVariant singularity image path
deepvariant_image = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/deepvariant_latest.sif"

# DeepVariant model type (e.g. ONT_R104).
deepvariant_model_type = "ONT_R104"

# Force DeepVariant recomputation.
deepvariant_force = false

#######################################
# DeepSomatic configuration
#######################################

# DeepSomatic output directory template.
# {result_dir}, {id}, {time}
deepsomatic_output_dir = "{result_dir}/{id}/{time}/DeepSomatic"

# Threads for DeepSomatic.
deepsomatic_threads = 20

# DeepSomatic singularity image path
deepsomatic_image = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/deepsomatic_latest.sif"

# DeepSomatic model type.
deepsomatic_model_type = "ONT"

# Force DeepSomatic recomputation.
deepsomatic_force = false

#######################################
# ClairS configuration
#######################################

# Threads for ClairS.
clairs_threads = 10

# ClairS singularity image path.
clairs_image = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/clairs_latest.sif"

# Force ClairS recomputation.
clairs_force = false

# Keep per-part directories after chunked ClairS merging.
# Set to true to retain intermediate VCFs (raw SNV/indel/germline) for reanalysis.
clairs_keep_parts = false

# Platform preset for ClairS.
clairs_platform = "ont_r10_dorado_sup_5khz_ssrs"

# ClairS output directory template.
# {result_dir}, {id}
clairs_output_dir = "{result_dir}/{id}/diag/ClairS"

#######################################
# GATK configuration
#######################################
# Path to the GATK container image (Singularity/Apptainer .sif, or a docker:// URI
# if you pull at runtime).
#
# Examples:
# - "/containers/gatk_4.6.0.0.sif"
gatk_image = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/gatk_latest.sif"

# Path to a BED file restricting analysis to target regions (0-based, half-open).
# Must match contig naming of the reference/BAMs (e.g. "chr9" vs "9").
#
# Used for targeted calling (e.g. Mutect2 `-L` or region chunking).
gatk_bed_path = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_Genes.bed"

# Local single-run CPU threads (non-Slurm execution).
# Used for full-run Mutect2 or other GATK tools.
# Typically forwarded to:
#   - `--native-pair-hmm-threads`
#   - `--reader-threads`
# Should match available cores on the node.
gatk_threads = 100

# Local single-run memory limit in GB.
# Used to size Java heap:
#   `--java-options "-Xmx{mem}g"`
# Should leave headroom for native memory (PairHMM, buffers).
gatk_mem_gb = 120

# Per-chunk CPU threads when running chunked under Slurm.
# Applies to each parallel job independently.
gatk_slurm_threads = 8

# Per-chunk memory (GB) when running under Slurm.
# Used both for scheduler request and Java heap sizing per chunk.
# Must be sufficient for interval-restricted Mutect2.
gatk_slurm_mem_gb = 32

# If true, force re-run of GATK steps by removing or ignoring existing outputs.
gatk_force = false

# GATK output directory template.
# {result_dir}, {id}, {tumoral_name}
gatk_output_dir = "{result_dir}/{id}/{tumoral_name}/GATK"

# GATK passed VCF.
# {output_dir}, {id}, {tumoral_name}, {reference_name}
gatk_passed_vcf = "{output_dir}/{id}_{tumoral_name}_{reference_name}_GATK_PASSED.vcf.gz"

#######################################
# Savana configuration
#######################################

# Savana binary (name or full path).
savana_bin = "/home/t_steimle/.conda/envs/savana_env/bin/savana"

# Threads for Savana.
savana_threads = 40

# RAM capacity used for running Savana with slurm (in GB).
savana_mem = 110

# Savana output directory template.
# {result_dir}, {id}
savana_output_dir = "{result_dir}/{id}/diag/savana"

# Savana copy-number output file.
# {output_dir}, {id}, {reference_name}, {haplotagged_bam_tag_name}
savana_copy_number = "{output_dir}/{id}_diag_{reference_name}_{haplotagged_bam_tag_name}_segmented_absolute_copy_number.tsv"

# Savana raw read counts file.
savana_read_counts = "{output_dir}/{id}_diag_{reference_name}_{haplotagged_bam_tag_name}_raw_read_counts.tsv"

# Savana passed VCF.
savana_passed_vcf = "{output_dir}/{id}_diag_savana_PASSED.vcf.gz"

# Force Savana recomputation.
savana_force = false

# Constitutional phased VCF template.
# {result_dir}, {id}
germline_phased_vcf = "{result_dir}/{id}/diag/{id}_variants_constit_phased.vcf.gz"


#######################################
# Severus configuration
#######################################

# Path to Severus script.
# Keep the value free of leading/trailing whitespace so it resolves as a
# filesystem path.
severus_bin = "/home/t_steimle/somatic_pipe_tools/Severus/severus.py"

# Force Severus recomputation.
severus_force = false

# Threads for Severus.
severus_threads = 32

# VNTRs BED for Severus.
vntrs_bed = "/home/t_steimle/ref/hs1/vntrs_chm13.bed"

# Path of the Severus panel of normals.
severus_pon = "/home/t_steimle/ref/hs1/PoN_1000G_chm13.tsv.gz"

# Paired Severus output directory.
# {result_dir}, {id}
severus_output_dir = "{result_dir}/{id}/diag/severus"

# Solo Severus output directory.
# {result_dir}, {id}, {time}
severus_solo_output_dir = "{result_dir}/{id}/{time}/severus"

#######################################
# Straglr configuration
#######################################

# Path to Straglr executable.
straglr_bin = "/home/t_steimle/.conda/envs/straglr_env/bin/straglr.py"

# Path to STR loci BED file for Straglr.
#
# RepeatMasker Simple_repeat
straglr_loci_bed = "/home/t_steimle/ref/hs1/simple_repeat_ucsc_hs1.bed"

# Minimum allele size difference in bp to report as changed between normal and tumoral
straglr_min_size_diff = 4

# Minimum read support required for an allele to be considered for 
# change between normal and tumoral
straglr_min_support_diff = 2

# Minimum read support for STR genotyping.
straglr_min_support = 2

# Minimum cluster size for STR detection.
straglr_min_cluster_size = 2

# Whether to genotype in size mode.
straglr_genotype_in_size = true

# Template for paired Straglr output directory.
#
# Placeholders: `{result_dir}`, `{id}`.
straglr_output_dir = "{result_dir}/{id}/diag/straglr"

# Template for solo Straglr output directory.
#
# Placeholders: `{result_dir}`, `{id}`, `{time}`.
straglr_solo_output_dir = "{result_dir}/{id}/{time}/straglr"

# Force Straglr recomputation.
straglr_force = false

#######################################
# CoRAL
#######################################
# Number of CPU threads for the CoRAL reconstruction job.
#
# CoRAL is CPU-bound during breakpoint graph construction and quadratic
# programming cycle extraction. 8–16 threads is sufficient for most
# focal amplification cases; increase for highly complex ecDNA with
# many amplicons.
coral_threads = 16

# Path to the cloned CoRAL repository directory (required).
coral_dir = "/home/t_steimle/somatic_pipe_tools/CoRAL"

# Memory allocation for the CoRAL SLURM job (e.g. `"32G"`).
#
# Memory usage scales with amplicon complexity and BAM depth.
# 32G is sufficient for typical WGS at 30–60×; increase to 64G
# for highly rearranged genomes (chromothripsis, high ecDNA copy number).
coral_slurm_mem = "32G"

# SLURM partition to use for CoRAL jobs.
#
# CoRAL requires only CPU — do not submit to a GPU partition.
coral_slurm_partition = "shortq"

# Minimum copy number gain threshold for a segment to be considered
# a focal amplification seed (CoRAL `--gain`).
#
# CoRAL applies this threshold to the raw absolute CN values from the
# cn_segs BED — do NOT pre-correct for purity or ploidy, as this may
# cause entire chromosome arms to exceed the threshold in aneuploid tumours.
#
# Default in CoRAL is 6.0 (diploid assumption). For hyperdiploid tumours
# (e.g. hyperploid ALL, CML blast crisis) consider lowering to 4.0–5.0.
coral_seed_gain = 6.0

# Minimum size in base pairs for a CN segment to qualify as a seed
# (CoRAL `--min-seed-size`).
#
# Segments below this size are discarded even if they exceed `coral_seed_gain`.
# Two merged proximal segments (see `coral_max_seg_gap`) are evaluated
# against this threshold as a single combined interval.
#
# Default in CoRAL is 100000 (100 kb). Reducing this risks including
# artefactual short high-copy segments; increasing it misses small focal
# amplifications (e.g. narrow EGFR or MYC peaks).
coral_min_seed_size = 100000

# Maximum gap in base pairs between two proximal CN segments to allow
# merging into a single seed candidate (CoRAL `--max-seg-gap`).
#
# If two amplified segments are separated by a gap smaller than this value,
# they are merged before the `coral_min_seed_size` filter is applied.
# This handles cases where a single focal amplicon is split by a low-coverage
# or diploid bin.
#
# Default in CoRAL is 300000 (300 kb). For haematological cancers with
# compact focal amplifications (e.g. NUP214::ABL1, ABL1 amplification in
# CML blast crisis) a tighter value such as 100000 reduces spurious merging
# of adjacent independent amplicons.
coral_max_seg_gap = 100000

#######################################
# Flye
#######################################
 
# Path to the Flye binary. Can be a python-prefixed call if Flye is not
# installed as a standalone executable.
flye_bin = "/usr/bin/python /home/t_steimle/somatic_pipe_tools/Flye/bin/flye"
 
# Number of threads allocated to Flye. 8–16 is sufficient for local assembly
# of a single locus; diminishing returns above 16.
flye_threads = 12
 
# Memory allocated to the Flye SLURM job. 16G is comfortable for local
# assembly (<1 Mb target). Increase to 32G+ for larger regions.
flye_slurm_mem = "16G"
 
#######################################
# Medaka
#######################################
 
# Name of the conda environment containing medaka.
# Activated via conda_sh before running medaka_consensus.
medaka_env = "medaka_env"
 
# Path to the medaka_consensus binary within the conda environment.
# Usually just "medaka_consensus" if the env is correctly activated.
medaka_consensus_bin = "medaka_consensus"
 
# Number of threads for medaka. Used for the minimap2 alignment step;
# the neural network inference step is GPU-bound when a GPU is available.
medaka_threads = 8
 
# Memory allocated to the Medaka SLURM job. 16G is sufficient for local
# polishing of a small assembly.
medaka_slurm_mem = "16G"
 
# Medaka model — MUST match the basecalling chemistry and Dorado version exactly.
# Using the wrong model silently degrades polishing quality.
#
# Model naming: {pore}_{chemistry}_{speed}bps_{caller}_{version}
#   r1041_e82 = R10.4.1 pore, E8.2 chemistry
#   400bps    = 400 bases/s translocation speed (standard; 260bps is legacy)
#   sup       = Dorado sup basecalling (use hac if basecalled with hac)
#
# Current default (medaka tools list_models): r1041_e82_400bps_sup_v5.2.0
#
# v5.2.0 also has dwell-time variants for improved homopolymer resolution:
#   r1041_e82_400bps_sup_v5.2.0_rl_lstm384_dwells    — use if Dorado called with dwell times
#   r1041_e82_400bps_sup_v5.2.0_rl_lstm384_no_dwells — use if Dorado called without dwell times
#
# For R9.4.1 data use r941_min_sup_g507 (MinION) or r941_prom_sup_g507 (PromethION).
# Run `medaka tools list_models` to list all available models.
medaka_model = "r1041_e82_400bps_sup_v5.2.0"

#######################################
# Minimap2
#######################################
 
# Path to the minimap2 binary. Use a versioned path to ensure reproducibility
# across pipeline runs — minimap2 output is version-sensitive.
minimap2_bin = "/home/t_steimle/somatic_pipe_tools/minimap2-2.30_x64-linux/minimap2"
 
# Number of threads for minimap2 alignment. Scales linearly up to ~16;
# 16 is appropriate for read→reference alignment on a full WGS BAM.
# For local assembly realignment (few hundred reads) 8 is sufficient.
minimap2_threads = 16
 
# Memory allocated to the minimap2 SLURM job.
# 32G is required for read→reference alignment against a human genome
# (minimap2 loads the MMI index into memory: ~14G for hg38 map-ont).
# Can be reduced to 8G for contig→contig or local assembly realignment.
minimap2_slurm_mem = "32G"

#######################################
# Marlin
#######################################

# BED file used for Marlin.
# NOTE(review): presumably the Marlin v1 probe regions in T2T (CHM13)
# coordinates, judging by the file name — confirm.
marlin_bed = "/home/t_steimle/ref/hs1/marlin_v1.probes_t2t.bed"

#######################################
# Echtvar
#######################################

# Path to echtvar binary.
echtvar_bin = "/home/t_steimle/somatic_pipe_tools/echtvar"

# Echtvar annotation archives (.echtvar.zip), one path per entry.
# Currently: a gnomAD archive and a COSMIC coding mutations archive.
echtvar_sources = [
	"/home/t_steimle/ref/hs1/gnomAD_4-2022_10-gnomad.echtvar.zip",
	"/home/t_steimle/ref/hs1/CosmicCodingMuts.echtvar.zip",
]

#######################################
# Bcftools configuration
#######################################

# Path to bcftools binary.
bcftools_bin = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/bcftools"

# Threads for bcftools.
bcftools_threads = 10

#######################################
# Longphase configuration
#######################################

# Path to longphase binary.
longphase_bin = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/longphase_linux-x64"

# Threads for longphase.
longphase_threads = 20

# Threads for longphase modcall step.
# limit memory usage here
longphase_modcall_threads = 4

# Force longphase recomputation (haplotagging/phasing).
longphase_force = false

# Longphase modcall VCF template.
# {result_dir}, {id}, {time}
longphase_modcall_vcf = "{result_dir}/{id}/{time}/5mC_5hmC/{id}_{time}_5mC_5hmC_modcall.vcf.gz"


#######################################
# Modkit configuration
#######################################

# Path to modkit binary.
modkit_bin = "/mnt/beegfs02/scratch/t_steimle/somatic_pipe_tools/modkit_latest/modkit"

# Threads for `modkit summary`.
modkit_summary_threads = 40

# Modkit summary file template.
# {result_dir}, {id}, {time}
modkit_summary_file = "{result_dir}/{id}/{time}/{id}_{time}_5mC_5hmC_summary.txt"

#######################################
# Nanomonsv configuration
#######################################

# Path to nanomonsv binary.
nanomonsv_bin = "/home/t_steimle/.conda/envs/nanomonsv_env/bin/nanomonsv"

# Paired nanomonsv output directory template.
# {result_dir}, {id}, {time}
nanomonsv_output_dir = "{result_dir}/{id}/{time}/nanomonsv"

# Force nanomonsv recomputation.
nanomonsv_force = false

# Threads for nanomonsv.
nanomonsv_threads = 40

# Paired nanomonsv PASSED VCF template.
# {output_dir}, {id}
nanomonsv_passed_vcf = "{output_dir}/{id}_diag_nanomonsv_PASSED.vcf.gz"

# Solo nanomonsv output directory template.
# {result_dir}, {id}, {time}
nanomonsv_solo_output_dir = "{result_dir}/{id}/{time}/nanomonsv-solo"

# Solo nanomonsv PASSED VCF template.
# {output_dir}, {id}, {time}
nanomonsv_solo_passed_vcf = "{output_dir}/{id}_{time}_nanomonsv-solo_PASSED.vcf.gz"

# Path to simple repeat BED file for nanomonsv.
# https://github.com/friend1ws/nanomonsv
# Warning: a TBI index should exist alongside this file.
nanomonsv_simple_repeat_bed = "/home/t_steimle/ref/hs1/human_chm13v2.0_simpleRepeat.bed.gz"

#######################################
# PromethION metadata
#######################################

# Directory containing PromethION run metadata.
promethion_runs_metadata_dir = "/data/promethion-runs-metadata"

# JSON file mapping flowcell IDs / runs for Pandora.
promethion_runs_input = "/data/pandora-flowcell-id.json"

#######################################
# VEP configuration
#######################################

# Path to VEP singularity image
vep_image = "/home/t_steimle/somatic_pipe_tools/vep_latest.sif"

# Path to the VEP cache directory
vep_cache_dir = "/home/t_steimle/ref/hs1/vepcache"

# Path to VEP sorted GFF
vep_gff = "/home/t_steimle/ref/hs1/chm13v2.0_RefSeq_Liftoff_v5.1_sorted.gff3.gz"

#######################################
# Alignment / basecalling (Dorado)
#######################################

[align]
# Path to Dorado binary.
dorado_bin = "/mnt/beegfs02/scratch/t_steimle/tools/dorado-latest-linux-x64/bin/dorado"

# Dorado basecalling arguments (device, model, modifications…).
dorado_basecall_arg = "-x 'cuda:all' sup,5mC_5hmC"

# Should Dorado re-align after demux?
dorado_should_realign = false

# Dorado aligner threads number
dorado_aligner_threads = 10

# Reference FASTA used for alignment.
ref_fa = "/mnt/beegfs02/scratch/t_steimle/ref/hs1/chm13v2.0.fa"

# Minimap2 index used for alignment.
ref_mmi = ""

# Path to samtools binary.
samtools_bin = "/mnt/beegfs02/scratch/t_steimle/tools/samtools"

# Threads for `samtools view`.
samtools_view_threads = 10

# Threads for `samtools sort`.
samtools_sort_threads = 20

# Threads for `samtools merge`.
samtools_merge_threads = 40

# Threads for `samtools split`.
samtools_split_threads = 20