wiki:Workshops/JobCheckpointing/Examples

Version 4 (modified by Carl Baribault, 2 days ago) ( diff )

Broke out the BASH, Python, and R examples into separate pages

Job Checkpointing Examples

Checkpoint Runner

Here is a job script for a fully self-restarting job, for use with any of the accompanying minimal working examples of checkpointed applications in BASH, Python, or R.

checkpoint_runner.sh

#!/bin/bash
###############################################################################
# CHECKPOINT RUNNER: checkpoint + auto-requeue (Slurm 14.x-safe)
###############################################################################
# Slurm directives (keep #SBATCH --time in sync with TIME_LIMIT below)
#
# These directives MUST stay above the first executable line of the script:
# sbatch stops processing #SBATCH lines once it reaches the first line that
# is neither a comment nor blank, so directives placed after the variable
# assignments further down would be silently ignored.
###############################################################################
#SBATCH --job-name=ckpt_requeue_demo
#SBATCH --output=log_%j.out
#SBATCH --error=log_%j.err
#SBATCH --open-mode=append
#SBATCH --time=00:03:00                 # keep in sync with TIME_LIMIT below
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=512M
#SBATCH --requeue
#SBATCH --get-user-env
#SBATCH --partition=centos7             # change partition to defq if needed
#SBATCH --qos=normal                    # best qos for node availability

###############################################################################
# User settings
###############################################################################
#   MODULE_LIST     : modules loaded before running app command
#   APP_CMD         : command to run (string)
#   LAUNCH_MODE     : run app command directly or via srun
#   SRUN_ARGS       : arguments to srun, if needed
#   TIME_LIMIT      : wall time (D-HH:MM:SS or HH:MM:SS) sync w/ #SBATCH --time
#   MARGIN_SEC      : seconds before wall time to checkpoint (checkpoint_timer)
#   CKPT_PATH       : path to checkpoint file
#   CHECKPOINT_EVERY: checkpoint after every this many application iterations
#   MAX_ITER        : stop application after this many iterations total
#   MAX_RESTARTS    : stop requeuing after this many restarts (safety)
###############################################################################
# User settings with defaults
###############################################################################
# Edit settings below
# Or pass values via...
#   sbatch --export=ALL,Var1=...,...,VarN=... checkpoint_runner.sh
# Or pass values via...
#   Var1=... VarN=... sbatch checkpoint_runner.sh
###############################################################################
# Each setting keeps a value already present in the environment and falls
# back to the default shown here only when it is unset or empty.
MODULE_LIST="${MODULE_LIST:-anaconda3/2023.07}"
                                           # space-delimited module list
APP_CMD="${APP_CMD:-python3 checkpoint_signal_iter.py}"
LAUNCH_MODE="${LAUNCH_MODE:-direct}"       # direct | srun
SRUN_ARGS="${SRUN_ARGS:--n 1}"             # extra srun flags
TIME_LIMIT="${TIME_LIMIT:-00:03:00}"       # match #SBATCH --time above
MARGIN_SEC="${MARGIN_SEC:-60}"             # checkpoint time before time limit
CKPT_PATH="${CKPT_PATH:-state_iter.txt}"   # checkpoint file path
CHECKPOINT_EVERY="${CHECKPOINT_EVERY:-20}" # number of iter. before checkpoint
MAX_ITER="${MAX_ITER:-500}"                # number of iter. total
MAX_RESTARTS="${MAX_RESTARTS:-10}"         # max. number of restarts

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Aliases are not expanded in non-interactive shells unless enabled explicitly.
shopt -s expand_aliases
alias dtstamp='date +%Y%m%d-%H:%M:%S'

# Print all arguments as one timestamped "Info[...]" line on stdout.
info() {
  printf 'Info[%s]: %s\n' "$(dtstamp)" "$*"
}

# Record where this attempt runs and every effective setting, so each
# restart of the job is traceable in the appended log.
info "Start on $(hostname); JOB_ID=${SLURM_JOB_ID}; RESTARTS=${SLURM_RESTART_COUNT:-0}"
info "Settings:"
for setting_name in \
  MODULE_LIST APP_CMD LAUNCH_MODE SRUN_ARGS TIME_LIMIT \
  MARGIN_SEC CKPT_PATH CHECKPOINT_EVERY MAX_ITER MAX_RESTARTS; do
  # ${!name} is indirect expansion: the value of the variable named by name.
  info "${setting_name}=${!setting_name}"
done
unset setting_name

# Load site modules (available on Cypress workers)
module load slurm 2>/dev/null || true   # makes scontrol visible on worker

# MODULE_LIST is a space-delimited list. Split it into an array so that each
# module reaches `module load` as its own argument; passing the whole list as
# one quoted word would request a single module whose name contains spaces.
module_names=()
read -r -a module_names <<<"${MODULE_LIST}" || true
if (( ${#module_names[@]} > 0 )); then
  # Best effort, as before: a failed load is reported but does not stop the
  # job, since the application may still be runnable without the module.
  module load "${module_names[@]}" \
    || info "WARNING: module load failed for: ${module_names[*]}"
fi

# Tool paths
# SCTRL holds the resolved scontrol command, or an empty string when it is
# not available; later code checks for empty before attempting a requeue.
SCTRL="$(command -v scontrol || true)"

# Short diagnostics
if [[ -n "${SCTRL}" ]]; then
  echo "=== BEGIN JOB SNAPSHOT (scontrol) ==="
  # This snapshot is informational only. Under `set -euo pipefail` a failing
  # scontrol call, or a grep that matches nothing, would otherwise abort the
  # whole job before the application starts, so the failure is reported
  # instead of being fatal.
  "${SCTRL}" show job "${SLURM_JOB_ID}" \
    | grep -E "JobId=|Partition=|QOS=|TimeLimit=|StartTime=|EndTime=|RunTime=|State=|Restarts=" \
    || info "WARNING: could not obtain job snapshot from scontrol."
  echo "=== END JOB SNAPSHOT (scontrol) ==="
else
  info "WARNING: scontrol not found on this node; in-place requeue will not be attempted."
fi

# Helpers
#######################################
# Print the iteration number recorded in the checkpoint file.
# Globals:   CKPT_PATH (read)
# Arguments: none
# Outputs:   the first run of decimal digits found in the file, normalised
#            to a plain base-10 integer (leading zeros removed); "0" when the
#            file is missing, unreadable, or contains no digits
# Returns:   0 always
#######################################
get_ckpt_iter() {
  [[ -f "${CKPT_PATH}" ]] || { echo 0; return 0; }

  local contents=""
  # Best effort: an unreadable file is treated like "no checkpoint yet".
  contents=$(cat -- "${CKPT_PATH}" 2>/dev/null) || contents=""

  # Take only the FIRST numeric token. Concatenating every digit in the file
  # would turn e.g. "iter=20 max=500" into 20500.
  if [[ "${contents}" =~ [0-9]+ ]]; then
    # 10# forces decimal so that a zero-padded value such as 0008 is not
    # rejected as invalid octal by later arithmetic comparisons.
    echo "$(( 10#${BASH_REMATCH[0]} ))"
  else
    echo 0
  fi
  return 0
}

#######################################
# Print how many times this job has been restarted.
# Globals:   SLURM_RESTART_COUNT (read, preferred source)
#            SCTRL, SLURM_JOB_ID (read, fallback via `scontrol show job`)
# Arguments: none
# Outputs:   a non-negative integer; "0" when the count cannot be determined
# Returns:   0 always, so `x=$(get_restarts)` cannot abort a `set -e` script
#######################################
get_restarts() {
  local count=""

  if [[ -n "${SLURM_RESTART_COUNT:-}" ]]; then
    count="${SLURM_RESTART_COUNT}"
  elif [[ -n "${SCTRL:-}" ]]; then
    # Split the "Key=Value Key=Value" output into one pair per line and pick
    # the Restarts field. The pipeline status is deliberately ignored: with
    # pipefail, awk exiting early or a scontrol error would otherwise turn a
    # lookup problem into a fatal error; the captured text is validated below.
    count=$("${SCTRL}" show job "${SLURM_JOB_ID:-}" \
      | tr ' ' '\n' \
      | awk -F= '/^Restarts=/{print $2; exit}') || true
  fi

  # Never hand callers an empty or non-numeric value.
  [[ "${count}" =~ ^[0-9]+$ ]] || count=0
  echo "${count}"
  return 0
}

#######################################
# Convert a wall-time string to a number of seconds.
# Arguments: $1 - time as D-HH:MM:SS or HH:MM:SS. Fields are read left to
#                 right; with fewer colon-separated fields the missing
#                 trailing ones count as 0 (so "5" means 5 hours here).
#                 NOTE(review): Slurm's own --time option accepts shorter
#                 forms with a different meaning; use the full forms above.
# Outputs:   total seconds on stdout
# Returns:   0 on success; 1 (with a message on stderr) if a field is not a
#            plain decimal number
#######################################
to_seconds() {
  local t="$1"
  local d=0 h=0 m=0 s=0

  if [[ "$t" == *-*:*:* ]]; then
    IFS='-:' read -r d h m s <<<"$t"
  else
    IFS=':' read -r h m s <<<"$t"
  fi
  d=${d:-0}; h=${h:-0}; m=${m:-0}; s=${s:-0}

  local field
  for field in "$d" "$h" "$m" "$s"; do
    if [[ ! "${field}" =~ ^[0-9]+$ ]]; then
      printf 'to_seconds: invalid time value: %s\n' "$t" >&2
      return 1
    fi
  done

  # 10# forces base 10: zero-padded fields such as 08 or 09 would otherwise
  # be parsed as (invalid) octal constants and make the arithmetic fail.
  echo $(( 10#${d}*86400 + 10#${h}*3600 + 10#${m}*60 + 10#${s} ))
}

# Trap (batch shell)
#######################################
# Handler for TERM/INT delivered to the batch shell (installed by the trap
# line directly below the function).
# Globals:   child_pid (read, optional), requeued (read/written),
#            MAX_RESTARTS, SCTRL, SLURM_JOB_ID (read)
# Behavior:  if child_pid is set, sends TERM to that process (or its process
#            group), waits for it and records its exit status; requeues the
#            job via scontrol when that status is 99 and a requeue is still
#            allowed; then always exits the script with status 0.
# NOTE(review): nothing in the visible script assigns child_pid, and the
#   application is launched in the foreground, so the forwarding branch
#   appears never to run here; rc_local then stays 0 and the requeue
#   condition (rc_local == 99) cannot be met. Confirm whether a background
#   launch that sets child_pid was intended.
#######################################
signal_handler () {
  info "TERM/INT caught in batch shell"
  local rc_local=0

  if [[ -n "${child_pid:-}" ]]; then
    # srun or bash are group leaders; forward to their process group
    # (assumption stated by the original author - TODO confirm for the
    # launch method actually used)
    local child_pgid
    # `ps -o pgid=` prints the process-group ID without a header line; awk
    # strips the surrounding whitespace.
    child_pgid="$(ps -o pgid= -p "${child_pid}" 2>/dev/null | awk '{print $1}')"
    if [[ -n "${child_pgid}" ]]; then
      # A negative PID argument makes kill signal the whole process group.
      kill -TERM "-${child_pgid}" 2>/dev/null || true
    else
      # Process group could not be determined: signal the child alone.
      kill -TERM "${child_pid}" 2>/dev/null || true
    fi
    # Capture the child's exit status without tripping `set -e`.
    wait "${child_pid}" || rc_local=$?
  fi

  info "Program exit code (from trap): ${rc_local}"
  local restarts; restarts=$(get_restarts)
  # Requeue only when all hold: the child exited 99, no requeue has been
  # issued yet (requeued defaults to 0 when unset), the restart count is
  # below MAX_RESTARTS, and scontrol was found.
  if [[ ${rc_local} -eq 99 && ${requeued:-0} -eq 0 && ${restarts} -lt ${MAX_RESTARTS} && -n "${SCTRL}" ]]; then
    requeued=1
    info "Checkpoint written (trap path). Requeueing in-place (same JobID)..."
    # `|| true` ignores a failed requeue, so the message below is logged
    # whether or not scontrol succeeded.
    "${SCTRL}" requeue "${SLURM_JOB_ID}" || true
    info "Requeued via scontrol."
  fi
  # The handler ends the script with status 0 in every case.
  exit 0
}
# Run signal_handler when the batch shell receives SIGTERM or SIGINT.
trap 'signal_handler' TERM INT

# Launch via timeout
# The application may run for TIME_LIMIT minus MARGIN_SEC seconds; after that
# `timeout` sends it TERM so that it can write its checkpoint.
TOTAL_SEC=$(to_seconds "${TIME_LIMIT}")
RUN_WINDOW_SEC=$(( TOTAL_SEC - MARGIN_SEC ))
if (( RUN_WINDOW_SEC < 1 )); then
  info "WARNING: RUN_WINDOW_SEC <= 0; using 1s."
  RUN_WINDOW_SEC=1
fi

# Iteration recorded in the checkpoint before this attempt starts; compared
# with the value after the run to tell whether progress was made.
before_iter=$(get_ckpt_iter)

# errexit is suspended so the application's exit status can be captured.
set +e
case "${LAUNCH_MODE}" in
  srun)
    # timeout -> srun -> bash -lc "APP_CMD"
    # On expiry, timeout sends TERM to srun; srun forwards signals to its job step tasks.
    # SRUN_ARGS is intentionally unquoted so that it splits into separate flags.
    timeout "${RUN_WINDOW_SEC}s" srun ${SRUN_ARGS} bash -lc "${APP_CMD}"
    rc=$?
    ;;
  *)
    # direct mode (any value other than "srun"): run in the batch shell
    timeout "${RUN_WINDOW_SEC}s" bash -lc "${APP_CMD}"
    rc=$?
    ;;
esac
set -e

info "Program exit code (from timeout wrapper): ${rc}"

# Interpret coreutils 8.4 returns:
#   0   -> completed
#   99  -> app exited 99 before timeout expiry (valid)
#   124 -> timeout expired (TERM sent), treat as checkpoint cycle and requeue
#          only if the checkpoint iteration advanced during this run
#   else-> unexpected -> propagate
requeued=0

#######################################
# Requeue this job in place (same JobID) when that is still permitted.
# Globals:   SCTRL, SLURM_JOB_ID, MAX_RESTARTS (read); requeued (written)
# Arguments: $1 - message logged just before the requeue is attempted
# Outputs:   progress, and the real outcome of the requeue, via info
# Returns:   0 always; callers decide the script's exit status
#######################################
requeue_in_place() {
  local announce="$1"
  local restarts
  restarts=$(get_restarts)

  if (( restarts < MAX_RESTARTS )) && [[ -n "${SCTRL}" ]]; then
    # Set before the attempt so the signal handler does not issue a second
    # requeue if a TERM arrives while scontrol is running.
    requeued=1
    info "${announce}"
    # Report what actually happened instead of claiming success after a
    # failed scontrol call.
    if "${SCTRL}" requeue "${SLURM_JOB_ID}"; then
      info "Requeued via scontrol."
    else
      info "WARNING: scontrol requeue failed (exit code $?); job was NOT requeued."
    fi
  else
    info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)."
  fi
  return 0
}

if [[ ${rc} -eq 0 ]]; then
  info "Completed."
  exit 0
elif [[ ${rc} -eq 99 ]]; then
  requeue_in_place "Checkpoint written. Requeueing in-place (same JobID)..."
  exit 0
elif [[ ${rc} -eq 124 ]]; then
  # Timeout expired: only requeue if the checkpoint moved forward, otherwise
  # the job would be restarted again and again without making progress.
  after_iter=$(get_ckpt_iter)
  if (( after_iter > before_iter )); then
    requeue_in_place "Timeout TERM observed; checkpoint advanced (${before_iter}->${after_iter}). Requeueing..."
    exit 0
  else
    info "Timeout TERM observed but checkpoint did not advance; marking as failure."
    exit 1
  fi
else
  info "Unexpected exit code: ${rc}"
  exit "${rc}"
fi

BASH Checkpointing Example

See BASH Checkpointing Example

Python Checkpointing Example

See Python Checkpointing Example

R Checkpointing Example

See R Checkpointing Example

Note: See TracWiki for help on using the wiki.