[[PageOutline]]
= Job Checkpointing Examples =

== Checkpoint Runner ==

Here is a job script for a fully self-restarting job, for use with any of the accompanying minimal working examples of checkpointed applications in BASH, Python, or R.

=== checkpoint_runner.sh ===

{{{
#!/bin/bash
###############################################################################
# CHECKPOINT RUNNER: checkpoint + auto-requeue (Slurm 14.x-safe)
###############################################################################
# Runs APP_CMD under `timeout` so that it is stopped MARGIN_SEC seconds before
# TIME_LIMIT, then requeues this same job with `scontrol requeue` as long as
# the checkpoint advanced and fewer than MAX_RESTARTS restarts have occurred.
#
# MODULE_LIST     : modules loaded before running app command
# APP_CMD         : command to run (string)
# LAUNCH_MODE     : run app command directly or via srun
# SRUN_ARGS       : arguments to srun, if needed
# TIME_LIMIT      : wall time (D-HH:MM:SS or HH:MM:SS) sync w/ #SBATCH --time
# MARGIN_SEC      : seconds before wall time to checkpoint (checkpoint_timer)
# CKPT_PATH       : path to checkpoint file
# CHECKPOINT_EVERY: checkpoint after every this many application iterations
# MAX_ITER        : stop application after this many iterations total
# MAX_RESTARTS    : stop requeuing after this many restarts (safety)
###############################################################################
# Slurm directives (keep TIME_LIMIT in sync with #SBATCH --time)
#
# NOTE: sbatch stops reading #SBATCH directives at the first non-comment,
# non-blank line of the script, so this section must stay ABOVE the settings.
###############################################################################
#SBATCH --job-name=ckpt_requeue_demo
#SBATCH --output=log_%j.out
#SBATCH --error=log_%j.err
#SBATCH --open-mode=append
#SBATCH --time=00:03:00         # keep in sync with TIME_LIMIT below
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=512M
#SBATCH --requeue
#SBATCH --get-user-env
#SBATCH --partition=centos7     # change partition to defq if needed
#SBATCH --qos=normal            # best qos for node availability

###############################################################################
# User settings with defaults
###############################################################################
# Edit settings below
# Or pass values via...
#   sbatch --export=ALL,Var1=...,...,VarN=... checkpoint_runner.sh
# Or pass values via...
#   Var1=... VarN=... sbatch checkpoint_runner.sh
###############################################################################
MODULE_LIST="${MODULE_LIST:-anaconda3/2023.07}"  # space-delimited module list
APP_CMD="${APP_CMD:-python3 checkpoint_signal_iter.py}"
LAUNCH_MODE="${LAUNCH_MODE:-direct}"             # direct | srun
SRUN_ARGS="${SRUN_ARGS:--n 1}"                   # extra srun flags
TIME_LIMIT="${TIME_LIMIT:-00:03:00}"             # match #SBATCH --time above
MARGIN_SEC="${MARGIN_SEC:-60}"                   # checkpoint time before time limit
CKPT_PATH="${CKPT_PATH:-state_iter.txt}"         # checkpoint file path
CHECKPOINT_EVERY="${CHECKPOINT_EVERY:-20}"       # number of iter. before checkpoint
MAX_ITER="${MAX_ITER:-500}"                      # number of iter. total
MAX_RESTARTS="${MAX_RESTARTS:-10}"               # max. number of restarts

# Export the application-facing settings so that values edited above are also
# visible in the environment of the launched command (a plain assignment is
# only visible to child processes if the variable was already exported).
# NOTE(review): assumes the example applications read these from the
# environment -- confirm against the application scripts.
export CKPT_PATH CHECKPOINT_EVERY MAX_ITER

set -euo pipefail

# Fail early, with a clear message, when not running inside a Slurm job.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"

# Timestamp used as prefix of every log line.
dtstamp() { date +%Y%m%d-%H:%M:%S; }

# Log a message to stdout with a timestamp prefix.
info() { echo "Info[$(dtstamp)]: $*"; }

info "Start on $(hostname); JOB_ID=${SLURM_JOB_ID}; RESTARTS=${SLURM_RESTART_COUNT:-0}"
info "Settings:"
for setting in MODULE_LIST APP_CMD LAUNCH_MODE SRUN_ARGS TIME_LIMIT MARGIN_SEC \
               CKPT_PATH CHECKPOINT_EVERY MAX_ITER MAX_RESTARTS; do
  info "${setting}=${!setting}"
done

# Load site modules (available on Cypress workers)
module load slurm 2>/dev/null || true   # makes scontrol visible on worker

# MODULE_LIST is space-delimited, so each module is loaded on its own; the
# unquoted expansion is intentional (word splitting).
# shellcheck disable=SC2086
for mod in ${MODULE_LIST}; do
  module load "${mod}" || info "WARNING: could not load module '${mod}'"
done

# Tool paths
SCTRL="$(command -v scontrol || true)"

# Short diagnostics (best effort: a failure here must not abort the job)
if [[ -n "${SCTRL}" ]]; then
  echo "=== BEGIN JOB SNAPSHOT (scontrol) ==="
  "${SCTRL}" show job "${SLURM_JOB_ID}" \
    | grep -E "JobId=|Partition=|QOS=|TimeLimit=|StartTime=|EndTime=|RunTime=|State=|Restarts=" \
    || true
  echo "=== END JOB SNAPSHOT (scontrol) ==="
else
  info "WARNING: scontrol not found on this node; in-place requeue will not be attempted."
fi

###############################################################################
# Helpers
###############################################################################

# Print the first run of digits found in CKPT_PATH as a base-10 integer.
# Prints 0 when the file is missing, unreadable, or contains no digits.
get_ckpt_iter() {
  local v=""
  if [[ -f "${CKPT_PATH}" ]]; then
    v=$(awk 'match($0, /[0-9]+/) { print substr($0, RSTART, RLENGTH); exit }' \
          "${CKPT_PATH}" 2>/dev/null) || v=""
  fi
  if [[ "${v}" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so that leading zeros are not parsed as octal.
    echo $(( 10#${v} ))
  else
    echo 0
  fi
}

# Print the number of times this job has been restarted (always an integer).
# Uses SLURM_RESTART_COUNT when set, otherwise the Restarts= field reported by
# `scontrol show job`; prints 0 when neither is available.
get_restarts() {
  local n="${SLURM_RESTART_COUNT:-}" job_info=""
  if [[ -z "${n}" && -n "${SCTRL}" ]]; then
    job_info=$("${SCTRL}" show job "${SLURM_JOB_ID}" 2>/dev/null) || job_info=""
    if [[ "${job_info}" =~ (^|[[:space:]])Restarts=([0-9]+) ]]; then
      n="${BASH_REMATCH[2]}"
    fi
  fi
  [[ "${n}" =~ ^[0-9]+$ ]] || n=0
  echo "${n}"
}

# Convert a time limit to seconds.
# Arguments: $1 - time as D-HH:MM:SS or HH:MM:SS
# Outputs:   number of seconds on stdout
to_seconds() {
  local t="$1" d=0 h=0 m=0 s=0
  if [[ "$t" == *-*:*:* ]]; then
    IFS='-:' read -r d h m s <<<"$t"
  else
    IFS=':' read -r h m s <<<"$t"
  fi
  d=${d:-0}; h=${h:-0}; m=${m:-0}; s=${s:-0}
  # 10# forces base 10: fields such as "08" or "09" would otherwise be
  # rejected as invalid octal numbers by shell arithmetic.
  echo $(( 10#${d}*86400 + 10#${h}*3600 + 10#${m}*60 + 10#${s} ))
}

# Succeeds when an in-place requeue may be attempted: scontrol was found and
# the restart count is still below MAX_RESTARTS.
can_requeue() {
  local restarts
  restarts=$(get_restarts)
  [[ -n "${SCTRL}" ]] && (( restarts < MAX_RESTARTS ))
}

# Ask Slurm to requeue this job under the same JobID and log the outcome.
# A failed requeue is reported but never aborts the script.
requeue_job() {
  local rc_requeue=0
  "${SCTRL}" requeue "${SLURM_JOB_ID}" || rc_requeue=$?
  if (( rc_requeue == 0 )); then
    info "Requeued via scontrol."
  else
    info "WARNING: scontrol requeue failed (exit code ${rc_requeue})."
  fi
}

###############################################################################
# Trap (batch shell)
###############################################################################

# Handle TERM/INT delivered to the batch shell: forward TERM to the running
# application, wait for it, and requeue if it reports a checkpoint (exit 99).
# Always ends the script with exit status 0.
signal_handler() {
  info "TERM/INT caught in batch shell"
  local rc_local=0
  if [[ -n "${child_pid:-}" ]]; then
    local child_pgid
    child_pgid="$(ps -o pgid= -p "${child_pid}" 2>/dev/null | awk '{print $1}')" \
      || child_pgid=""
    if [[ -n "${child_pgid}" && "${child_pgid}" == "${child_pid}" ]]; then
      # The child leads its own process group: signal the whole group.
      kill -TERM -- "-${child_pgid}" 2>/dev/null || true
    else
      # Otherwise signal only the child, so that the batch shell's own
      # process group is never targeted.
      kill -TERM "${child_pid}" 2>/dev/null || true
    fi
    wait "${child_pid}" || rc_local=$?
  fi
  info "Program exit code (from trap): ${rc_local}"
  if (( rc_local == 99 && requeued == 0 )) && can_requeue; then
    requeued=1
    info "Checkpoint written (trap path). Requeueing in-place (same JobID)..."
    requeue_job
  fi
  exit 0
}

requeued=0      # set to 1 once a requeue has been requested
child_pid=""    # PID of the running `timeout` wrapper, empty when none
trap 'signal_handler' TERM INT

###############################################################################
# Launch via timeout
###############################################################################
TOTAL_SEC=$(to_seconds "${TIME_LIMIT}")
RUN_WINDOW_SEC=$(( TOTAL_SEC - MARGIN_SEC ))
if (( RUN_WINDOW_SEC <= 0 )); then
  info "WARNING: RUN_WINDOW_SEC <= 0; using 1s."
  RUN_WINDOW_SEC=1
fi

before_iter=$(get_ckpt_iter)

# The command is started in the background and then waited for: bash only runs
# a trap once the current foreground command has finished, whereas `wait`
# returns immediately when a trapped signal arrives, so signal_handler can
# forward the signal while the application is still running.
# Side effects of backgrounding in a non-interactive shell: the command's
# stdin is /dev/null and it ignores SIGINT (it is stopped with SIGTERM).
set +e
if [[ "${LAUNCH_MODE}" == "srun" ]]; then
  # timeout -> srun -> bash -lc "APP_CMD"
  # On expiry, timeout sends TERM to srun; srun forwards signals to its job step tasks.
  # SRUN_ARGS is a string of flags and is split into words on purpose.
  # shellcheck disable=SC2086
  timeout "${RUN_WINDOW_SEC}s" srun ${SRUN_ARGS} bash -lc "${APP_CMD}" &
else
  # direct mode: run as a child of the batch shell
  timeout "${RUN_WINDOW_SEC}s" bash -lc "${APP_CMD}" &
fi
child_pid=$!
wait "${child_pid}"
rc=$?
child_pid=""    # already reaped; the trap must not wait for it again
set -e

info "Program exit code (from timeout wrapper): ${rc}"

# Interpret coreutils 8.4 returns:
#   0   -> completed
#   99  -> app exited 99 before timeout expiry (valid)
#   124 -> timeout expired (TERM sent), treat as checkpoint cycle and requeue
#          only if the checkpoint advanced during this run
#   else-> unexpected -> propagate
case "${rc}" in
  0)
    info "Completed."
    exit 0
    ;;
  99)
    if can_requeue; then
      requeued=1
      info "Checkpoint written. Requeueing in-place (same JobID)..."
      requeue_job
    else
      info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)."
    fi
    exit 0
    ;;
  124)
    after_iter=$(get_ckpt_iter)
    if (( after_iter <= before_iter )); then
      info "Timeout TERM observed but checkpoint did not advance; marking as failure."
      exit 1
    fi
    if can_requeue; then
      requeued=1
      info "Timeout TERM observed; checkpoint advanced (${before_iter}->${after_iter}). Requeueing..."
      requeue_job
    else
      info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)."
    fi
    exit 0
    ;;
  *)
    info "Unexpected exit code: ${rc}"
    exit "${rc}"
    ;;
esac
}}}

== BASH Checkpointing Example ==

See [wiki:Workshops/JobCheckpointing/Examples/BASH BASH Checkpointing Example]

== Python Checkpointing Example ==

See [wiki:Workshops/JobCheckpointing/Examples/Python Python Checkpointing Example]

== R Checkpointing Example ==

See [wiki:Workshops/JobCheckpointing/Examples/R R Checkpointing Example]