| 14 | | #SBATCH --job-name=checkpoint_example |
| 15 | | #SBATCH --partition=centos7 |
| 16 | | #SBATCH --qos=normal |
| 17 | | #SBATCH --nodes=1 |
| | 12 | ############################################################################### |
| | 13 | # CHECKPOINT RUNNER: checkpoint + auto-requeue (Slurm 14.x-safe) |
| | 14 | ############################################################################### |
| | 15 | # MODULE_LIST : modules loaded before running app command |
| | 16 | # APP_CMD : command to run (string) |
| | 17 | # LAUNCH_MODE : run app command directly or via srun |
| | 18 | # SRUN_ARGS : arguments to srun, if needed |
| | 19 | # TIME_LIMIT : wall time (D-HH:MM:SS or HH:MM:SS) sync w/ #SBATCH --time |
| | 20 | # MARGIN_SEC : seconds before wall time to checkpoint (checkpoint_timer) |
| | 21 | # CKPT_PATH : path to checkpoint file |
| | 22 | # CHECKPOINT_EVERY: checkpoint after every this many application iterations |
| | 23 | # MAX_ITER : stop application after this many iterations total |
| | 24 | # MAX_RESTARTS : stop requeuing after this many restarts (safety) |
| | 25 | ############################################################################### |
| | 26 | # User settings with defaults |
| | 27 | ############################################################################### |
| | 28 | # Edit settings below |
| | 29 | # Or pass values via... |
| | 30 | # sbatch --export=ALL,Var1=...,...,VarN=... checkpoint_runner.sh |
| | 31 | # Or pass values via... |
| | 32 | # Var1=... VarN=... sbatch checkpoint_runner.sh |
| | 33 | ############################################################################### |
| | 34 | MODULE_LIST="${MODULE_LIST:-anaconda3/2023.07}" |
| | 35 | # space-delimited module list |
| | 36 | APP_CMD="${APP_CMD:-python3 checkpoint_signal_iter.py}" |
| | 37 | LAUNCH_MODE="${LAUNCH_MODE:-direct}" # direct | srun |
| | 38 | SRUN_ARGS="${SRUN_ARGS:--n 1}" # extra srun flags |
| | 39 | TIME_LIMIT="${TIME_LIMIT:-00:03:00}" # match #SBATCH --time below |
| | 40 | MARGIN_SEC="${MARGIN_SEC:-60}" # checkpoint time before time limit |
| | 41 | CKPT_PATH="${CKPT_PATH:-state_iter.txt}" # checkpoint file path |
| | 42 | CHECKPOINT_EVERY="${CHECKPOINT_EVERY:-20}" # number of iter. before checkpoint |
| | 43 | MAX_ITER="${MAX_ITER:-500}" # number of iter. total |
| | 44 | MAX_RESTARTS="${MAX_RESTARTS:-10}" # max. number of restarts |
| | 45 | |
| | 46 | ############################################################################### |
| | 47 | # Slurm directives (keep TIME_LIMIT in sync with #SBATCH --time) |
| | 48 | ############################################################################### |
| | 49 | #SBATCH --job-name=ckpt_requeue_demo |
| | 50 | #SBATCH --output=log_%j.out |
| | 51 | #SBATCH --error=log_%j.err |
| | 52 | #SBATCH --open-mode=append |
| | 53 | #SBATCH --time=00:03:00 # keep in sync with TIME_LIMIT above |
| 35 | | |
| 36 | | echo "Job started at $(date)" |
| 37 | | echo "SLURM_JOB_ID = ${SLURM_JOB_ID}" |
| 38 | | echo "SLURM_RESTART_COUNT = ${SLURM_RESTART_COUNT:-0}" |
| 39 | | |
| 40 | | # --------------------------------------------- |
| 41 | | # Application-specific configuration |
| 42 | | # --------------------------------------------- |
| 43 | | CHECKPOINT_DIR="$PWD/checkpoints" |
| 44 | | CHECKPOINT_FILE="${CHECKPOINT_DIR}/state.chk" |
| 45 | | |
| 46 | | mkdir -p "${CHECKPOINT_DIR}" |
| 47 | | |
| 48 | | # --------------------------------------------- |
| 49 | | # Launch the application |
| 50 | | # --------------------------------------------- |
| 51 | | # Your application must: |
| 52 | | # 1) Load checkpoint if it exists |
| 53 | | # 2) Catch SIGTERM |
| 54 | | # 3) Write checkpoint |
| 55 | | # 4) exit(99) |
| 56 | | |
| 57 | | module load anaconda3/2023.07 |
| 58 | | |
| 59 | | srun ./my_simulation.py \ |
| 60 | | --checkpoint "${CHECKPOINT_FILE}" |
| 61 | | |
| 62 | | EXIT_CODE=$? |
| 63 | | |
| 64 | | echo "Application exited with code ${EXIT_CODE}" |
| 65 | | |
| 66 | | # --------------------------------------------- |
| 67 | | # Restart logic |
| 68 | | # --------------------------------------------- |
| 69 | | if [[ ${EXIT_CODE} -eq 0 ]]; then |
| 70 | | echo "INFO: Job completed successfully" |
| 71 | | exit 0 |
| 72 | | |
| 73 | | elif [[ ${EXIT_CODE} -eq 99 ]]; then |
| 74 | | echo "INFO: Checkpoint written, requeuing job" |
| 75 | | scontrol requeue "${SLURM_JOB_ID}" |
| 76 | | exit 0 |
| 77 | | |
| | 63 | shopt -s expand_aliases |
| | 64 | alias dtstamp="date +%Y%m%d-%H:%M:%S" |
| | 65 | info(){ echo "Info[$(dtstamp)]: $*"; } |
| | 66 | |
| | 67 | info "Start on $(hostname); JOB_ID=${SLURM_JOB_ID}; RESTARTS=${SLURM_RESTART_COUNT:-0}" |
| | 68 | info "Settings:" |
| | 69 | info "MODULE_LIST=${MODULE_LIST}" |
| | 70 | info "APP_CMD=${APP_CMD}" |
| | 71 | info "LAUNCH_MODE=${LAUNCH_MODE}" |
| | 72 | info "SRUN_ARGS=${SRUN_ARGS}" |
| | 73 | info "TIME_LIMIT=${TIME_LIMIT}" |
| | 74 | info "MARGIN_SEC=${MARGIN_SEC}" |
| | 75 | info "CKPT_PATH=${CKPT_PATH}" |
| | 76 | info "CHECKPOINT_EVERY=${CHECKPOINT_EVERY}" |
| | 77 | info "MAX_ITER=${MAX_ITER}" |
| | 78 | info "MAX_RESTARTS=${MAX_RESTARTS}" |
| | 79 | |
| | 80 | # Load site modules (available on Cypress workers) |
| | 81 | module load slurm 2>/dev/null || true # makes scontrol visible on worker |
| | 82 | module load "${MODULE_LIST}" || true |
| | 83 | |
| | 84 | # Tool paths |
| | 85 | SCTRL="$(command -v scontrol || true)" |
| | 86 | |
| | 87 | # Short diagnostics |
| | 88 | if [[ -n "${SCTRL}" ]]; then |
| | 89 | echo "=== BEGIN JOB SNAPSHOT (scontrol) ===" |
| | 90 | "${SCTRL}" show job "${SLURM_JOB_ID}" | \ |
| | 91 | grep -E "JobId=|Partition=|QOS=|TimeLimit=|StartTime=|EndTime=|RunTime=|State=|Restarts=" |
| | 92 | echo "=== END JOB SNAPSHOT (scontrol) ===" |
| 79 | | echo "ERROR: Job failed with unexpected exit code" |
| 80 | | exit ${EXIT_CODE} |
| | 94 | info "WARNING: scontrol not found on this node; in-place requeue will not be attempted." |
| | 95 | fi |
| | 96 | |
| | 97 | # Helpers |
get_ckpt_iter() {
  # Reads the first run of digits from CKPT_PATH; echoes 0 on any error.
  # (The previous `tr -cd '0-9'` concatenated EVERY digit in the file, so a
  # checkpoint like "iter 123 time 456" yielded 123456 instead of 123.)
  [[ -f "${CKPT_PATH}" ]] || { echo 0; return; }
  local v
  # `|| true` keeps a no-match grep from aborting the script under pipefail
  v=$(grep -oE '[0-9]+' "${CKPT_PATH}" 2>/dev/null | head -n 1 || true)
  echo "${v:-0}"
}
| | 105 | |
get_restarts() {
  # Best available restart count: Slurm's env var first, then the
  # scontrol job record, and finally 0 when neither is available.
  if [[ -n "${SLURM_RESTART_COUNT:-}" ]]; then
    echo "${SLURM_RESTART_COUNT}"
    return
  fi
  if [[ -n "${SCTRL}" ]]; then
    "${SCTRL}" show job "${SLURM_JOB_ID}" | tr ' ' '\n' | awk -F= '/^Restarts=/{print $2; exit}'
    return
  fi
  echo "0"
}
| | 115 | |
to_seconds() {
  # Converts a Slurm wall time (D-HH:MM:SS or HH:MM:SS) to whole seconds.
  local t="$1"
  if [[ "$t" == *-*:*:* ]]; then
    local d h m s; IFS='-:' read -r d h m s <<<"$t"
    # 10# forces base-10: without it bash parses "08"/"09" as invalid
    # octal and the arithmetic aborts (e.g. TIME_LIMIT=08:00:00).
    echo $(( 10#$d*86400 + 10#$h*3600 + 10#$m*60 + 10#$s ))
  else
    local h m s; IFS=':' read -r h m s <<<"$t"
    h=${h:-0}; m=${m:-0}; s=${s:-0}
    echo $(( 10#$h*3600 + 10#$m*60 + 10#$s ))
  fi
}
| | 127 | |
# Trap (batch shell): runs when Slurm (or the user) sends TERM/INT to the
# batch script. Forwards the signal to the running application, waits for it
# to exit, and requeues the job in place if the app reported "checkpoint
# written" (exit code 99) and the restart budget allows it.
signal_handler () {
  info "TERM/INT caught in batch shell"
  local rc_local=0

  if [[ -n "${child_pid:-}" ]]; then
    # srun or bash are group leaders; forward to their process group so
    # grandchildren (the actual app processes) also receive TERM
    local child_pgid
    child_pgid="$(ps -o pgid= -p "${child_pid}" 2>/dev/null | awk '{print $1}')"
    if [[ -n "${child_pgid}" ]]; then
      kill -TERM "-${child_pgid}" 2>/dev/null || true
    else
      # fall back to signalling just the child if its pgid can't be read
      kill -TERM "${child_pid}" 2>/dev/null || true
    fi
    # capture the child's real exit status; 99 means "checkpoint written"
    wait "${child_pid}" || rc_local=$?
  fi

  info "Program exit code (from trap): ${rc_local}"
  local restarts; restarts=$(get_restarts)
  # requeue only once (requeued flag), under MAX_RESTARTS, and only when
  # scontrol is actually available on this node
  if [[ ${rc_local} -eq 99 && ${requeued:-0} -eq 0 && ${restarts} -lt ${MAX_RESTARTS} && -n "${SCTRL}" ]]; then
    requeued=1
    info "Checkpoint written (trap path). Requeueing in-place (same JobID)..."
    "${SCTRL}" requeue "${SLURM_JOB_ID}" || true
    info "Requeued via scontrol."
  fi
  # exit 0 so Slurm does not mark the requeued job as failed
  exit 0
}
trap 'signal_handler' TERM INT
| | 156 | |
| | 157 | # Launch via timeout |
| | 158 | TOTAL_SEC=$(to_seconds "${TIME_LIMIT}") |
| | 159 | RUN_WINDOW_SEC=$(( TOTAL_SEC - MARGIN_SEC )) |
| | 160 | if (( RUN_WINDOW_SEC <= 0 )); then |
| | 161 | info "WARNING: RUN_WINDOW_SEC <= 0; using 1s." |
| | 162 | RUN_WINDOW_SEC=1 |
| | 163 | fi |
| | 164 | |
| | 165 | before_iter=$(get_ckpt_iter) |
| | 166 | |
| | 167 | set +e |
| | 168 | if [[ "${LAUNCH_MODE}" == "srun" ]]; then |
| | 169 | # timeout -> srun -> bash -lc "APP_CMD" |
| | 170 | # On expiry, timeout sends TERM to srun; srun forwards signals to its job step tasks. |
| | 171 | timeout "${RUN_WINDOW_SEC}s" srun ${SRUN_ARGS} bash -lc "${APP_CMD}" |
| | 172 | else |
| | 173 | # direct mode: run in the batch shell |
| | 174 | timeout "${RUN_WINDOW_SEC}s" bash -lc "${APP_CMD}" |
| | 175 | fi |
| | 176 | rc=$? |
| | 177 | set -e |
| | 178 | |
| | 179 | info "Program exit code (from timeout wrapper): ${rc}" |
| | 180 | |
| | 181 | # Interpret coreutils 8.4 returns: |
| | 182 | # 0 -> completed |
| | 183 | # 99 -> app exited 99 before timeout expiry (valid) |
| | 184 | # 124 -> timeout expired (TERM sent), treat as checkpoint cycle and requeue |
| | 185 | # (optionally confirm CKPT grew) |
| | 186 | # else-> unexpected -> propagate |
| | 187 | requeued=0 |
| | 188 | if [[ ${rc} -eq 0 ]]; then |
| | 189 | info "Completed." |
| | 190 | exit 0 |
| | 191 | elif [[ ${rc} -eq 99 ]]; then |
| | 192 | restarts=$(get_restarts) |
| | 193 | if (( restarts < MAX_RESTARTS )) && [[ -n "${SCTRL}" ]]; then |
| | 194 | requeued=1 |
| | 195 | info "Checkpoint written. Requeueing in-place (same JobID)..." |
| | 196 | "${SCTRL}" requeue "${SLURM_JOB_ID}" || true |
| | 197 | info "Requeued via scontrol." |
| | 198 | else |
| | 199 | info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)." |
| | 200 | fi |
| | 201 | exit 0 |
| | 202 | elif [[ ${rc} -eq 124 ]]; then |
| | 203 | after_iter=$(get_ckpt_iter) |
| | 204 | if (( after_iter > before_iter )); then |
| | 205 | restarts=$(get_restarts) |
| | 206 | if (( restarts < MAX_RESTARTS )) && [[ -n "${SCTRL}" ]]; then |
| | 207 | requeued=1 |
| | 208 | info "Timeout TERM observed; checkpoint advanced (${before_iter}->${after_iter}). Requeueing..." |
| | 209 | "${SCTRL}" requeue "${SLURM_JOB_ID}" || true |
| | 210 | info "Requeued via scontrol." |
| | 211 | exit 0 |
| | 212 | else |
| | 213 | info "WARNING: cannot requeue (scontrol unavailable or MAX_RESTARTS reached)." |
| | 214 | exit 0 |
| | 215 | fi |
| | 216 | else |
| | 217 | info "Timeout TERM observed but checkpoint did not advance; marking as failure." |
| | 218 | exit 1 |
| | 219 | fi |
| | 220 | else |
| | 221 | info "Unexpected exit code: ${rc}" |
| | 222 | exit "${rc}" |
| 84 | | === Checkpointed application in Python === |
| 85 | | |
| 86 | | Here is an accompanying, minimal working example of a checkpointed application for Python in file '''my_simulation.py'''. |
| 87 | | |
| 88 | | {{{ |
#!/usr/bin/env python3
"""Minimal checkpointed application for Slurm requeue demos.

Resumes from a JSON checkpoint if one exists, and on SIGTERM saves the
current step and exits with code 99 (the submit script's "requeue me" code).
"""
import signal
import sys
import time
import os
import json

CHECKPOINT_FILE = "checkpoints/state.chk"

def save_checkpoint(i):
    """Atomically persist step ``i``.

    Writes to a temp file and renames it into place, so a kill arriving
    mid-write cannot corrupt the previous checkpoint.
    """
    os.makedirs(os.path.dirname(CHECKPOINT_FILE), exist_ok=True)
    tmp = CHECKPOINT_FILE + ".tmp"
    with open(tmp, "w") as f:
        json.dump({"step": i}, f)
    os.replace(tmp, CHECKPOINT_FILE)  # atomic rename on POSIX

def load_checkpoint():
    """Return the last saved step, or 0 when no checkpoint exists."""
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            return json.load(f)["step"]
    return 0

current_step = 0  # updated by the main loop; read by the signal handler

def term_handler(signum, frame):
    """SIGTERM handler: checkpoint current progress, then exit 99."""
    print("SIGTERM received — saving checkpoint")
    save_checkpoint(current_step)
    sys.exit(99)  # <- special "requeue me" code

def main():
    """Register the handler, resume from checkpoint, and simulate work."""
    global current_step
    signal.signal(signal.SIGTERM, term_handler)
    current_step = load_checkpoint()
    print(f"Resuming from step {current_step}")
    for i in range(current_step, 1_000_000):
        current_step = i
        time.sleep(1)  # simulate work

if __name__ == "__main__":
    # Guard keeps the module importable (e.g. for tests) without running
    # the work loop; behavior as a script is unchanged.
    main()
| 122 | | }}} |
| 123 | | |
| 124 | | == R Example == |
| 125 | | |
| 126 | | === Checkpointed, self-restarting job ===
| 127 | | |
| 128 | | Here is a fully self-restarting job and, further below, the accompanying, minimal working example of a checkpointed application in R.
| 129 | | |
| 130 | | {{{ |
#!/bin/bash
#SBATCH --job-name=r_checkpoint_demo
#SBATCH --partition=centos7
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem=4G
#SBATCH --output=output.%j.out
#SBATCH --error=error.%j.err
#SBATCH --open-mode=append
#SBATCH --requeue
# The B: prefix delivers SIGTERM to the *batch shell* (where the trap below
# lives); plain TERM@120 signals only the job steps, so the trap would
# never fire and TERM.flag would never be created.
#SBATCH --signal=B:TERM@120   # send SIGTERM to the batch shell 120s before walltime

set -euo pipefail

mkdir -p logs checkpoints

# Trap SIGTERM from SLURM: create a file flag that R polls for
trap 'echo "SIGTERM received, creating TERM.flag"; touch TERM.flag' TERM

echo "Starting R checkpointing run at $(date)"

# load the R module
module load R/4.4.1

# Run the R script under srun in the BACKGROUND: bash defers trap execution
# until a foreground child exits, but the `wait` builtin is interruptible,
# so this lets the trap create TERM.flag while R is still running.
srun Rscript checkpoint.R &
app_pid=$!

rc=0
wait "${app_pid}" || rc=$?
if (( rc > 128 )); then
  # wait was interrupted by the TERM trap (status 128+signum);
  # wait again to collect the application's real exit code
  rc=0
  wait "${app_pid}" || rc=$?
fi

echo "R exited with code: $rc"

# ---------------------------------------------
# Restart logic: 0 = finished, 99 = checkpointed, anything else = failure
# ---------------------------------------------
if [[ $rc -eq 0 ]]; then
  echo "INFO: Finished successfully."
  exit 0
elif [[ $rc -eq 99 ]]; then
  echo "INFO: Checkpoint written (exit 99). Requeuing job..."
  rm -f TERM.flag
  scontrol requeue "$SLURM_JOB_ID"
  exit 0
else
  echo "ERROR: Unexpected failure (code $rc)."
  exit $rc
fi
| 176 | | }}} |
| 177 | | |
| 178 | | === Checkpointed application in R ===
| 179 | | |
| 180 | | Here is an accompanying, minimal working example of a checkpointed application for R in file '''checkpoint.R'''. |
| 181 | | |
| 182 | | {{{ |
#!/usr/bin/env Rscript

# Simple checkpointing/resume pattern for long runs in R.
# - Saves state as checkpoints/state.rds (list(step, results))
# - Auto-resumes if that file exists
# - Periodically checkpoints every N iterations
# - If a TERM flag (created by the SLURM trap) is detected, saves and
#   exits with status 99 so the submit script knows to requeue

checkpoint_file <- "checkpoints/state.rds"
term_flag <- "TERM.flag" # created by the shell trap
dir.create("checkpoints", showWarnings = FALSE, recursive = TRUE)

# --- Parameters you can tune ---
max_steps <- 1e6L
checkpoint_every_n <- 200L # save every N iterations
sleep_seconds <- 0.05 # simulate work
verbose <- TRUE

# --- Load or initialize state ---
state <- list(step = 0L, results = numeric())
if (file.exists(checkpoint_file)) {
  if (verbose) cat("Resuming from checkpoint:", checkpoint_file, "\n")
  state <- readRDS(checkpoint_file)
} else {
  if (verbose) cat("Starting fresh run\n")
}

# Preallocate the results buffer once: growing it with c() inside the loop
# copies the whole vector every iteration (O(n^2) over a 1e6-step run).
results <- numeric(max_steps)
n_done <- length(state$results)
if (n_done > 0L) results[seq_len(n_done)] <- state$results

# --- Utility: save checkpoint ---
# Writes list(step, results[1:step]) to a temp file and renames it into
# place, so a kill arriving mid-write cannot corrupt the old checkpoint.
save_checkpoint <- function(step, results) {
  tmp <- paste0(checkpoint_file, ".tmp")
  saveRDS(list(step = step, results = results[seq_len(step)]), tmp)
  file.rename(tmp, checkpoint_file)
  if (verbose) {
    cat(sprintf("Checkpoint saved at step %d -> %s\n", step, checkpoint_file))
  }
}

# --- Main work loop ---
# Guard the range: seq.int(a, b) counts DOWN when a > b, so resuming an
# already-finished run would silently re-execute steps without this check.
step <- state$step
if (step < max_steps) {
  for (i in seq.int(step + 1L, max_steps)) {
    step <- i

    # Simulate "work" (replace with your compute kernel)
    # e.g., update some running statistic
    results[i] <- sin(i * 0.001) + rnorm(1, sd = 0.01)
    if (sleep_seconds > 0) Sys.sleep(sleep_seconds)

    # Periodic checkpoint
    if ((i %% checkpoint_every_n) == 0L) {
      save_checkpoint(step, results)
    }

    # Respect pre-timeout signal from SLURM via a file flag
    if (file.exists(term_flag)) {
      cat("TERM flag detected. Saving final checkpoint and exiting with code 99.\n")
      save_checkpoint(step, results)
      quit(status = 99, save = "no")
    }
  }
}

# Finished normally
save_checkpoint(step, results)
cat("Completed all steps. Exiting with code 0.\n")
quit(status = 0, save = "no")
| 245 | | }}} |
| | 226 | == BASH Checkpointing Example == |
| | 227 | |
| | 228 | See [wiki:Workshops/JobCheckpointing/Examples/BASH BASH Checkpointing Example] |
| | 229 | |
| | 230 | == Python Checkpointing Example == |
| | 231 | |
| | 232 | See [wiki:Workshops/JobCheckpointing/Examples/Python Python Checkpointing Example] |
| | 233 | |
| | 234 | == R Checkpointing Example == |
| | 235 | |
| | 236 | See [wiki:Workshops/JobCheckpointing/Examples/R R Checkpointing Example] |
| | 237 | |