| Version 4 (modified by , 2 days ago) ( diff ) |
|---|
Job Checkpointing Examples
Checkpoint Runner
Here is a job script for a fully self-restarting job, for use with any of the accompanying minimal working examples of checkpointed applications in BASH, Python, or R.
checkpoint_runner.sh
#!/bin/bash
###############################################################################
# CHECKPOINT RUNNER: checkpoint + auto-requeue (Slurm 14.x-safe)
###############################################################################
# MODULE_LIST : modules loaded before running app command
# APP_CMD : command to run (string)
# LAUNCH_MODE : run app command directly or via srun
# SRUN_ARGS : arguments to srun, if needed
# TIME_LIMIT : wall time (D-HH:MM:SS or HH:MM:SS) sync w/ #SBATCH --time
# MARGIN_SEC : seconds before wall time to checkpoint (checkpoint_timer)
# CKPT_PATH : path to checkpoint file
# CHECKPOINT_EVERY: checkpoint after every this many application iterations
# MAX_ITER : stop application after this many iterations total
# MAX_RESTARTS : stop requeuing after this many restarts (safety)
###############################################################################
# Slurm directives (keep #SBATCH --time in sync with TIME_LIMIT below)
###############################################################################
# NOTE: sbatch stops reading #SBATCH lines at the first executable command.
# These directives must therefore stay ABOVE the settings assignments, with
# only comments and blank lines before them; otherwise they are ignored.
#SBATCH --job-name=ckpt_requeue_demo
#SBATCH --output=log_%j.out
#SBATCH --error=log_%j.err
#SBATCH --open-mode=append
#SBATCH --time=00:03:00 # keep in sync with TIME_LIMIT below
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=512M
#SBATCH --requeue
#SBATCH --get-user-env
#SBATCH --partition=centos7 # change partition to defq if needed
#SBATCH --qos=normal # best qos for node availability
###############################################################################
# User settings with defaults
###############################################################################
# Edit settings below
# Or pass values via...
# sbatch --export=ALL,Var1=...,...,VarN=... checkpoint_runner.sh
# Or pass values via...
# Var1=... VarN=... sbatch checkpoint_runner.sh
###############################################################################
# Each setting keeps a value inherited from the environment and falls back to
# the default shown here only when it is unset or empty.
MODULE_LIST="${MODULE_LIST:-anaconda3/2023.07}"
# space-delimited module list
APP_CMD="${APP_CMD:-python3 checkpoint_signal_iter.py}"
LAUNCH_MODE="${LAUNCH_MODE:-direct}" # direct | srun
SRUN_ARGS="${SRUN_ARGS:--n 1}" # extra srun flags
TIME_LIMIT="${TIME_LIMIT:-00:03:00}" # match #SBATCH --time above
MARGIN_SEC="${MARGIN_SEC:-60}" # checkpoint time before time limit
CKPT_PATH="${CKPT_PATH:-state_iter.txt}" # checkpoint file path
CHECKPOINT_EVERY="${CHECKPOINT_EVERY:-20}" # number of iter. before checkpoint
MAX_ITER="${MAX_ITER:-500}" # number of iter. total
MAX_RESTARTS="${MAX_RESTARTS:-10}" # max. number of restarts
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail
# Non-interactive shells only expand aliases when this option is enabled.
shopt -s expand_aliases
alias dtstamp="date +%Y%m%d-%H:%M:%S"

# Print one timestamped informational line on stdout.
# Arguments: $* - message text
info() {
  printf 'Info[%s]: %s\n' "$(dtstamp)" "$*"
}

info "Start on $(hostname); JOB_ID=${SLURM_JOB_ID}; RESTARTS=${SLURM_RESTART_COUNT:-0}"
info "Settings:"
# Echo every user setting as NAME=value; ${!name} reads the variable whose
# name is stored in setting_name.
for setting_name in MODULE_LIST APP_CMD LAUNCH_MODE SRUN_ARGS TIME_LIMIT \
  MARGIN_SEC CKPT_PATH CHECKPOINT_EVERY MAX_ITER MAX_RESTARTS; do
  info "${setting_name}=${!setting_name}"
done
unset setting_name
# Load site modules (available on Cypress workers).
# Loading is best effort: a failure is reported but never aborts the job.
module load slurm 2>/dev/null || true # makes scontrol visible on worker
# MODULE_LIST is a space-delimited list, so it is split on whitespace and each
# module is loaded separately. Quoting the whole list would request a single
# module whose name contains spaces.
# shellcheck disable=SC2086 # word splitting is intended here
for module_name in ${MODULE_LIST}; do
  module load "${module_name}" \
    || info "WARNING: could not load module '${module_name}'."
done

# Tool paths: empty when scontrol is not on PATH.
SCTRL="$(command -v scontrol || true)"

# Short diagnostics
if [[ -n "${SCTRL}" ]]; then
  echo "=== BEGIN JOB SNAPSHOT (scontrol) ==="
  # The snapshot is informational only; under errexit/pipefail a failing
  # scontrol or an empty grep must not terminate the job.
  "${SCTRL}" show job "${SLURM_JOB_ID}" \
    | grep -E "JobId=|Partition=|QOS=|TimeLimit=|StartTime=|EndTime=|RunTime=|State=|Restarts=" \
    || info "WARNING: job snapshot unavailable."
  echo "=== END JOB SNAPSHOT (scontrol) ==="
else
  info "WARNING: scontrol not found on this node; in-place requeue will not be attempted."
fi
# Helpers
#######################################
# Print the iteration number stored in the checkpoint file.
# Globals:   CKPT_PATH (read)
# Arguments: none
# Outputs:   the first run of decimal digits found in CKPT_PATH, as a base-10
#            integer without leading zeros; 0 if the file is missing,
#            unreadable, or contains no digits
# Returns:   0 always
#######################################
get_ckpt_iter() {
  local content="" first_num=0
  if [[ -r "${CKPT_PATH}" ]]; then
    content="$(cat -- "${CKPT_PATH}" 2>/dev/null)" || content=""
  fi
  # Take only the leftmost digit run; concatenating every digit in the file
  # would turn e.g. "iter=20 total=500" into 20500.
  if [[ "${content}" =~ [0-9]+ ]]; then
    # 10# forces base 10 so a value like "008" is not rejected as octal.
    first_num=$(( 10#${BASH_REMATCH[0]} ))
  fi
  echo "${first_num}"
}
#######################################
# Print how many times this job has been restarted.
# Globals:   SLURM_RESTART_COUNT, SCTRL, SLURM_JOB_ID (read)
# Arguments: none
# Outputs:   a non-negative integer; SLURM_RESTART_COUNT when set, otherwise
#            the Restarts= field of `scontrol show job`, otherwise 0
# Returns:   0 always, so callers running under errexit are not aborted by a
#            failing scontrol query
#######################################
get_restarts() {
  local count="${SLURM_RESTART_COUNT:-}"
  if [[ -z "${count}" && -n "${SCTRL:-}" ]]; then
    # `|| true`: keep whatever was printed even if a pipeline stage fails
    # (scontrol error, or SIGPIPE after awk exits early under pipefail).
    count="$("${SCTRL}" show job "${SLURM_JOB_ID:-}" \
      | tr ' ' '\n' \
      | awk -F= '/^Restarts=/{print $2; exit}')" || true
  fi
  # Anything that is not a plain number (empty, missing field) counts as 0.
  [[ "${count}" =~ ^[0-9]+$ ]] || count=0
  echo "${count}"
}
#######################################
# Convert a Slurm-style time limit to seconds.
# Arguments: $1 - time as D-HH:MM:SS or HH:MM:SS
# Outputs:   total number of seconds on stdout
# Returns:   0 on success; non-zero if a field is not a decimal number
#######################################
to_seconds() {
  local t="$1"
  local d=0 h=0 m=0 s=0
  if [[ "$t" == *-*:*:* ]]; then
    IFS='-:' read -r d h m s <<<"$t"
  else
    IFS=':' read -r h m s <<<"$t"
  fi
  # Missing fields default to 0. The 10# prefix forces base 10: without it,
  # zero-padded fields such as "08" or "09" are parsed as invalid octal.
  echo $(( 10#${d:-0}*86400 + 10#${h:-0}*3600 + 10#${m:-0}*60 + 10#${s:-0} ))
}
# Trap (batch shell)
#######################################
# Handle TERM/INT delivered to the batch shell: forward TERM to the child (if
# one is recorded), collect its exit status, and requeue the job when the
# child reported a checkpoint (exit code 99).
# Globals:   child_pid (read), requeued (read/written), MAX_RESTARTS, SCTRL,
#            SLURM_JOB_ID (read)
# Arguments: none
# Returns:   does not return; always ends the script with exit status 0
#######################################
signal_handler() {
  info "TERM/INT caught in batch shell"
  local rc_local=0
  # NOTE(review): child_pid is not assigned anywhere in the visible part of
  # this script, so this branch only runs if a launcher elsewhere sets it.
  if [[ -n "${child_pid:-}" ]]; then
    # Assumes the child leads its own process group, so signalling the group
    # reaches its descendants too - TODO confirm for the launcher in use.
    local child_pgid
    # `|| true`: ps fails once the child is gone; under errexit+pipefail that
    # failure would otherwise abort the handler before the status is read.
    child_pgid="$(ps -o pgid= -p "${child_pid}" 2>/dev/null | awk '{print $1}')" || true
    if [[ -n "${child_pgid}" ]]; then
      kill -TERM "-${child_pgid}" 2>/dev/null || true
    else
      kill -TERM "${child_pid}" 2>/dev/null || true
    fi
    wait "${child_pid}" || rc_local=$?
  fi
  info "Program exit code (from trap): ${rc_local}"
  local restarts
  restarts=$(get_restarts)
  if [[ ${rc_local} -eq 99 && ${requeued:-0} -eq 0 && ${restarts} -lt ${MAX_RESTARTS} && -n "${SCTRL}" ]]; then
    # Set the flag first so a second signal cannot trigger another requeue.
    requeued=1
    info "Checkpoint written (trap path). Requeueing in-place (same JobID)..."
    # Report the real outcome rather than claiming success unconditionally.
    if "${SCTRL}" requeue "${SLURM_JOB_ID}"; then
      info "Requeued via scontrol."
    else
      info "WARNING: scontrol requeue failed; job was not requeued."
    fi
  fi
  exit 0
}
trap 'signal_handler' TERM INT
# Launch via timeout
# The application may run for the wall time minus the checkpoint margin;
# when that window elapses, timeout sends TERM to the launched command.
TOTAL_SEC="$(to_seconds "${TIME_LIMIT}")"
RUN_WINDOW_SEC=$(( TOTAL_SEC - MARGIN_SEC ))
if (( RUN_WINDOW_SEC < 1 )); then
  info "WARNING: RUN_WINDOW_SEC <= 0; using 1s."
  RUN_WINDOW_SEC=1
fi

# Checkpoint position before this run, kept for comparison after a timeout.
before_iter="$(get_ckpt_iter)"

# errexit is suspended so a non-zero application status can be captured in rc
# instead of terminating the batch script.
set +e
case "${LAUNCH_MODE}" in
  srun)
    # timeout -> srun -> bash -lc "APP_CMD"
    # On expiry, timeout sends TERM to srun; srun forwards signals to its job step tasks.
    # SRUN_ARGS is deliberately unquoted so it splits into separate flags.
    # shellcheck disable=SC2086
    timeout "${RUN_WINDOW_SEC}s" srun ${SRUN_ARGS} bash -lc "${APP_CMD}"
    rc=$?
    ;;
  *)
    # direct mode: run in the batch shell
    timeout "${RUN_WINDOW_SEC}s" bash -lc "${APP_CMD}"
    rc=$?
    ;;
esac
set -e
info "Program exit code (from timeout wrapper): ${rc}"
# Interpret coreutils 8.4 returns:
# 0 -> completed
# 99 -> app exited 99 before timeout expiry (valid)
# 124 -> timeout expired (TERM sent), treat as checkpoint cycle and requeue
# (only when the checkpoint advanced during this run)
# else-> unexpected -> propagate

#######################################
# Requeue this job in place when a restart is still allowed, and log the
# actual outcome of the attempt.
# Globals:   MAX_RESTARTS, SCTRL, SLURM_JOB_ID (read), requeued (written)
# Arguments: $1 - message logged just before the requeue attempt
# Returns:   0 (the outcome is reported through the log only)
#######################################
try_requeue() {
  local announce="$1"
  local restarts
  restarts=$(get_restarts)
  if (( restarts < MAX_RESTARTS )) && [[ -n "${SCTRL}" ]]; then
    requeued=1
    info "${announce}"
    # Only claim success when scontrol actually accepted the request.
    if "${SCTRL}" requeue "${SLURM_JOB_ID}"; then
      info "Requeued via scontrol."
    else
      info "WARNING: scontrol requeue failed; job was not requeued."
    fi
  else
    info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)."
  fi
}

requeued=0
case "${rc}" in
  0)
    info "Completed."
    exit 0
    ;;
  99)
    try_requeue "Checkpoint written. Requeueing in-place (same JobID)..."
    exit 0
    ;;
  124)
    after_iter=$(get_ckpt_iter)
    # 10# forces base 10 so zero-padded counters are not read as octal.
    if (( 10#${after_iter:-0} > 10#${before_iter:-0} )); then
      try_requeue "Timeout TERM observed; checkpoint advanced (${before_iter}->${after_iter}). Requeueing..."
      exit 0
    fi
    info "Timeout TERM observed but checkpoint did not advance; marking as failure."
    exit 1
    ;;
  *)
    info "Unexpected exit code: ${rc}"
    exit "${rc}"
    ;;
esac
BASH Checkpointing Example
See BASH Checkpointing Example
Python Checkpointing Example
See Python Checkpointing Example
R Checkpointing Example
Note:
See TracWiki
for help on using the wiki.
