| | 1 | [[PageOutline]] |
| | 2 | = R Checkpointing Example = |
| | 3 | |
| | 4 | == Checkpoint Runner == |
| | 5 | |
| | 6 | See [wiki:Workshops/JobCheckpointing/Examples#CheckpointRunner Checkpoint Runner] for the contents of the job script file '''checkpoint_runner.sh'''. |
| | 7 | |
| | 8 | == R Checkpointing Application == |
| | 9 | |
| | 10 | === checkpoint_signal_iter.R === |
| | 11 | |
| | 12 | {{{ |
| | 13 | #!/usr/bin/env Rscript |
| | 14 | # checkpoint_signal_iter.R (requires 'sigterm' package) |
| | 15 | # Abort early (exit status 2) when 'sigterm' is missing. requireNamespace(quietly = TRUE) |
| | 16 | # is already silent; wrapping this in suppressMessages() would hide the error text below. |
| | 17 | if (!requireNamespace("sigterm", quietly = TRUE)) { |
| | 18 | message("ERROR: 'sigterm' package not installed; install via remotes/devtools.") |
| | 19 | quit(save="no", status=2L) |
| | 20 | } |
| | 21 | |
| | 22 | get_env <- function(k, d) { v <- Sys.getenv(k, unset = ""); if (nzchar(v)) v else d } # unset or empty -> default d |
| | 23 | CKPT <- get_env("CKPT_PATH", "state_iter.txt") |
| | 24 | EVERY <- as.integer(get_env("CHECKPOINT_EVERY", "20")); MAX_ITER <- as.integer(get_env("MAX_ITER", "500")) |
| | 25 | if (is.na(EVERY) || EVERY < 1L || is.na(MAX_ITER)) stop("CHECKPOINT_EVERY must be a positive integer and MAX_ITER an integer", call. = FALSE) # else i %% EVERY is NA |
| | 26 | |
| | 27 | atomic_save <- function(path, val) { # write val to a temp file beside path, then rename it over path; returns TRUE |
| | 28 | dir <- dirname(normalizePath(path, mustWork = FALSE)) |
| | 29 | if (!dir.exists(dir)) dir.create(dir, recursive = TRUE, showWarnings = FALSE) |
| | 30 | tmp <- tempfile(pattern = ".ckpt.", tmpdir = dir) # same directory as the target, so the rename stays on one filesystem |
| | 31 | writeLines(as.character(val), tmp) # given a path, writeLines opens and closes the file itself, even on error |
| | 32 | if (!file.rename(tmp, path)) { unlink(tmp); stop("could not move checkpoint into place: ", path, call. = FALSE) }; TRUE |
| | 33 | } |
| | 34 | load_ckpt <- function(path) { # saved iteration count, or 0L when the file is missing, unreadable, empty or non-numeric |
| | 35 | txt <- if (file.exists(path)) tryCatch(readLines(path, warn = FALSE), error = function(e) "0") else "0" |
| | 36 | n <- suppressWarnings(as.integer(gsub("[^0-9]", "", paste(txt, collapse = "")))) # digits only; NA if none or out of range |
| | 37 | if (is.na(n)) 0L else n # an NA counter would make the main loop's `if (i %% EVERY == 0L)` fail |
| | 38 | } |
| | 39 | |
| | 40 | library(sigterm) # installs a SIGTERM handler; poll has_sigterm_flag() |
| | 41 | |
| | 42 | i <- load_ckpt(CKPT) # iteration counter restored from the checkpoint file (0 when no file exists) |
| | 43 | cat(sprintf("R: Resuming from i=%d (every %d, MAX_ITER=%d)\n", i, EVERY, MAX_ITER)); flush(stdout()) |
| | 44 | # Main loop: each pass advances the counter, sleeps one second, then checks the three save/exit conditions below. |
| | 45 | repeat { |
| | 46 | i <- i + 1L |
| | 47 | Sys.sleep(1) # placeholder for one iteration of real work |
| | 48 | |
| | 49 | if (i %% EVERY == 0L) { # periodic checkpoint on every EVERY-th iteration |
| | 50 | atomic_save(CKPT, i) |
| | 51 | cat(sprintf("[periodic/iter] saved i=%d\n", i)); flush(stdout()) |
| | 52 | } |
| | 53 | |
| | 54 | if (sigterm::has_sigterm_flag()) { # the flag is polled once per pass, after the sleep |
| | 55 | cat(sprintf("R: SIGTERM detected — saving i=%d and exiting 99\n", i)); flush(stdout()) |
| | 56 | atomic_save(CKPT, i) |
| | 57 | quit(save="no", status=99L) # exit status 99 marks a SIGTERM-triggered exit |
| | 58 | } |
| | 59 | |
| | 60 | if (i > MAX_ITER) { # strict '>' means the final pass runs with i = MAX_ITER + 1 |
| | 61 | cat(sprintf("Reached i=%d > %d; exiting 0\n", i, MAX_ITER)); flush(stdout()) |
| | 62 | atomic_save(CKPT, i) # record the final counter before the normal exit |
| | 63 | quit(save="no", status=0L) |
| | 64 | } |
| | 65 | } |
| | 66 | }}} |
| | 67 | |
| | 68 | == Running R checkpointing example on Cypress == |
| | 69 | |
| | 70 | To run the R checkpointing example job, which by default checkpoints every 20 application iterations and runs for a total of 500 iterations, perform the following steps. |
| | 71 | |
| | 72 | 1. Using a text editor, create the files '''checkpoint_runner.sh''' and '''checkpoint_signal_iter.R''' in your current directory with the contents given above. |
| | 73 | For file editing with nano, etc., see [wiki:cypress/FileEditingSoftware/Example File Editing Example]. |
| | 74 | |
| | 75 | 2. Use the steps in the section [wiki:cypress/RunningRStudioWithSingularity#Alternativefortidyverse Alternative for tidyverse] for constructing a container image file '''tidyverse_latest.sif''' (or '''tidyverse_4.5.2.sif''' as in the following) for use with Singularity. |
| | 76 | |
| | 77 | 3. Install the required R package '''sigterm''' via the following, substituting your own writable R library directory for <R_lib_path>. See [wiki:cypress/InstallingRPackages Installing R Packages on Cypress] for more information on using a writable R library directory. |
| | 78 | |
| | 79 | {{{ |
| | 80 | [tulaneID@cypress1 ~]$idev --partition=centos7 |
| | 81 | [tulaneID@cypress01-XXX ~]$module load singularity/3.9.0 |
| | 82 | [tulaneID@cypress01-XXX ~]$singularity shell tidyverse_4.5.2.sif |
| | 83 | Singularity> Rscript --version # confirm Rscript is available |
| | 84 | Rscript (R) version 4.5.2 (2025-10-31) |
| | 85 | Singularity>Rscript -e "devtools::install_github('atheriel/sigterm', lib='<R_lib_path>')" |
| | 86 | Singularity>exit # exit the container |
| | 87 | [tulaneID@cypress01-XXX ~]$exit # exit the interactive session |
| | 88 | }}} |
| | 89 | |
| | 90 | 4. Submit the job via the following command. |
| | 91 | |
| | 92 | {{{ |
| | 93 | [tulaneID@cypress1 ~]$APP_CMD="singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R" MODULE_LIST="singularity/3.9.0" CKPT_PATH=state_iter_r.txt sbatch checkpoint_runner.sh |
| | 94 | }}} |
| | 95 | |
| | 96 | 5. Monitor the job's output via the following command, substituting the job ID for <jobID>. |
| | 97 | |
| | 98 | {{{ |
| | 99 | [tulaneID@cypress1 ~]$ tail -f log_<jobID>.* |
| | 100 | }}} |
| | 101 | |
| | 102 | 6. The following shows typical contents of the error and output files, '''log_<jobID>.err''' and '''log_<jobID>.out'''. Observe that the job timed out and requeued itself four times before completing. (Not all cancellations were captured in the error file.) |
| | 103 | |
| | 104 | {{{ |
| | 105 | [tulaneID@cypress1 ~]$cat log_3300700.err |
| | 106 | slurmstepd: *** JOB 3300700 CANCELLED AT 2026-03-13T22:20:20 *** |
| | 107 | slurmstepd: *** JOB 3300700 CANCELLED AT 2026-03-13T22:28:00 *** |
| | 108 | }}} |
| | 109 | |
| | 110 | {{{ |
| | 111 | [tulaneID@cypress1 ~]$cat log_3300700.out |
| | 112 | Info[20260313-22:18:18]: Start on cypress01-066; JOB_ID=3300700; RESTARTS=0 |
| | 113 | Info[20260313-22:18:18]: Settings: |
| | 114 | Info[20260313-22:18:18]: MODULE_LIST=singularity/3.9.0 |
| | 115 | Info[20260313-22:18:18]: APP_CMD=singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R |
| | 116 | Info[20260313-22:18:18]: LAUNCH_MODE=direct |
| | 117 | Info[20260313-22:18:18]: SRUN_ARGS=-n 1 |
| | 118 | Info[20260313-22:18:18]: TIME_LIMIT=00:03:00 |
| | 119 | Info[20260313-22:18:18]: MARGIN_SEC=60 |
| | 120 | Info[20260313-22:18:18]: CKPT_PATH=state_iter_r.txt |
| | 121 | Info[20260313-22:18:18]: CHECKPOINT_EVERY=20 |
| | 122 | Info[20260313-22:18:18]: MAX_ITER=500 |
| | 123 | Info[20260313-22:18:18]: MAX_RESTARTS=10 |
| | 124 | === BEGIN JOB SNAPSHOT (scontrol) === |
| | 125 | JobId=3300700 Name=ckpt_requeue_demo |
| | 126 | Priority=80808 Nice=0 Account=<groupID> QOS=normal |
| | 127 | JobState=RUNNING Reason=None Dependency=(null) |
| | 128 | Requeue=1 Restarts=0 BatchFlag=1 ExitCode=0:0 |
| | 129 | RunTime=00:00:01 TimeLimit=00:03:00 TimeMin=N/A |
| | 130 | StartTime=2026-03-13T22:18:18 EndTime=2026-03-13T22:21:18 |
| | 131 | Partition=centos7 AllocNode:Sid=cypress2:33768 |
| | 132 | === END JOB SNAPSHOT (scontrol) === |
| | 133 | R: Resuming from i=0 (every 20, MAX_ITER=500) |
| | 134 | [periodic/iter] saved i=20 |
| | 135 | [periodic/iter] saved i=40 |
| | 136 | [periodic/iter] saved i=60 |
| | 137 | [periodic/iter] saved i=80 |
| | 138 | [periodic/iter] saved i=100 |
| | 139 | R: SIGTERM detected — saving i=117 and exiting 99 |
| | 140 | Info[20260313-22:20:19]: Program exit code (from timeout wrapper): 124 |
| | 141 | Info[20260313-22:20:19]: Timeout TERM observed; checkpoint advanced (0->117). Requeueing... |
| | 142 | Info[20260313-22:20:19]: Requeued via scontrol. |
| | 143 | Info[20260313-22:20:58]: Start on cypress01-066; JOB_ID=3300700; RESTARTS=1 |
| | 144 | Info[20260313-22:20:58]: Settings: |
| | 145 | Info[20260313-22:20:58]: MODULE_LIST=singularity/3.9.0 |
| | 146 | Info[20260313-22:20:58]: APP_CMD=singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R |
| | 147 | Info[20260313-22:20:58]: LAUNCH_MODE=direct |
| | 148 | Info[20260313-22:20:58]: SRUN_ARGS=-n 1 |
| | 149 | Info[20260313-22:20:58]: TIME_LIMIT=00:03:00 |
| | 150 | Info[20260313-22:20:58]: MARGIN_SEC=60 |
| | 151 | Info[20260313-22:20:58]: CKPT_PATH=state_iter_r.txt |
| | 152 | Info[20260313-22:20:58]: CHECKPOINT_EVERY=20 |
| | 153 | Info[20260313-22:20:58]: MAX_ITER=500 |
| | 154 | Info[20260313-22:20:58]: MAX_RESTARTS=10 |
| | 155 | === BEGIN JOB SNAPSHOT (scontrol) === |
| | 156 | JobId=3300700 Name=ckpt_requeue_demo |
| | 157 | Priority=80808 Nice=0 Account=<groupID> QOS=normal |
| | 158 | JobState=RUNNING Reason=None Dependency=(null) |
| | 159 | Requeue=1 Restarts=1 BatchFlag=1 ExitCode=0:0 |
| | 160 | RunTime=00:00:01 TimeLimit=00:03:00 TimeMin=N/A |
| | 161 | StartTime=2026-03-13T22:20:57 EndTime=2026-03-13T22:23:57 |
| | 162 | Partition=centos7 AllocNode:Sid=cypress2:33768 |
| | 163 | === END JOB SNAPSHOT (scontrol) === |
| | 164 | R: Resuming from i=117 (every 20, MAX_ITER=500) |
| | 165 | [periodic/iter] saved i=120 |
| | 166 | [periodic/iter] saved i=140 |
| | 167 | [periodic/iter] saved i=160 |
| | 168 | [periodic/iter] saved i=180 |
| | 169 | [periodic/iter] saved i=200 |
| | 170 | [periodic/iter] saved i=220 |
| | 171 | R: SIGTERM detected — saving i=236 and exiting 99 |
| | 172 | Info[20260313-22:22:59]: Program exit code (from timeout wrapper): 124 |
| | 173 | Info[20260313-22:22:59]: Timeout TERM observed; checkpoint advanced (117->236). Requeueing... |
| | 174 | Info[20260313-22:22:59]: Requeued via scontrol. |
| | 175 | Info[20260313-22:23:29]: Start on cypress01-066; JOB_ID=3300700; RESTARTS=2 |
| | 176 | Info[20260313-22:23:29]: Settings: |
| | 177 | Info[20260313-22:23:29]: MODULE_LIST=singularity/3.9.0 |
| | 178 | Info[20260313-22:23:29]: APP_CMD=singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R |
| | 179 | Info[20260313-22:23:29]: LAUNCH_MODE=direct |
| | 180 | Info[20260313-22:23:29]: SRUN_ARGS=-n 1 |
| | 181 | Info[20260313-22:23:29]: TIME_LIMIT=00:03:00 |
| | 182 | Info[20260313-22:23:29]: MARGIN_SEC=60 |
| | 183 | Info[20260313-22:23:29]: CKPT_PATH=state_iter_r.txt |
| | 184 | Info[20260313-22:23:29]: CHECKPOINT_EVERY=20 |
| | 185 | Info[20260313-22:23:29]: MAX_ITER=500 |
| | 186 | Info[20260313-22:23:29]: MAX_RESTARTS=10 |
| | 187 | === BEGIN JOB SNAPSHOT (scontrol) === |
| | 188 | JobId=3300700 Name=ckpt_requeue_demo |
| | 189 | Priority=80808 Nice=0 Account=<groupID> QOS=normal |
| | 190 | JobState=RUNNING Reason=None Dependency=(null) |
| | 191 | Requeue=1 Restarts=2 BatchFlag=1 ExitCode=0:0 |
| | 192 | RunTime=00:00:02 TimeLimit=00:03:00 TimeMin=N/A |
| | 193 | StartTime=2026-03-13T22:23:27 EndTime=2026-03-13T22:26:27 |
| | 194 | Partition=centos7 AllocNode:Sid=cypress2:33768 |
| | 195 | === END JOB SNAPSHOT (scontrol) === |
| | 196 | R: Resuming from i=236 (every 20, MAX_ITER=500) |
| | 197 | [periodic/iter] saved i=240 |
| | 198 | [periodic/iter] saved i=260 |
| | 199 | [periodic/iter] saved i=280 |
| | 200 | [periodic/iter] saved i=300 |
| | 201 | [periodic/iter] saved i=320 |
| | 202 | [periodic/iter] saved i=340 |
| | 203 | R: SIGTERM detected — saving i=355 and exiting 99 |
| | 204 | Info[20260313-22:25:30]: Program exit code (from timeout wrapper): 124 |
| | 205 | Info[20260313-22:25:30]: Timeout TERM observed; checkpoint advanced (236->355). Requeueing... |
| | 206 | Info[20260313-22:25:30]: Requeued via scontrol. |
| | 207 | Info[20260313-22:25:59]: Start on cypress01-066; JOB_ID=3300700; RESTARTS=3 |
| | 208 | Info[20260313-22:25:59]: Settings: |
| | 209 | Info[20260313-22:25:59]: MODULE_LIST=singularity/3.9.0 |
| | 210 | Info[20260313-22:25:59]: APP_CMD=singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R |
| | 211 | Info[20260313-22:25:59]: LAUNCH_MODE=direct |
| | 212 | Info[20260313-22:25:59]: SRUN_ARGS=-n 1 |
| | 213 | Info[20260313-22:25:59]: TIME_LIMIT=00:03:00 |
| | 214 | Info[20260313-22:25:59]: MARGIN_SEC=60 |
| | 215 | Info[20260313-22:25:59]: CKPT_PATH=state_iter_r.txt |
| | 216 | Info[20260313-22:25:59]: CHECKPOINT_EVERY=20 |
| | 217 | Info[20260313-22:25:59]: MAX_ITER=500 |
| | 218 | Info[20260313-22:25:59]: MAX_RESTARTS=10 |
| | 219 | === BEGIN JOB SNAPSHOT (scontrol) === |
| | 220 | JobId=3300700 Name=ckpt_requeue_demo |
| | 221 | Priority=80808 Nice=0 Account=<groupID> QOS=normal |
| | 222 | JobState=RUNNING Reason=None Dependency=(null) |
| | 223 | Requeue=1 Restarts=3 BatchFlag=1 ExitCode=0:0 |
| | 224 | RunTime=00:00:02 TimeLimit=00:03:00 TimeMin=N/A |
| | 225 | StartTime=2026-03-13T22:25:57 EndTime=2026-03-13T22:28:57 |
| | 226 | Partition=centos7 AllocNode:Sid=cypress2:33768 |
| | 227 | === END JOB SNAPSHOT (scontrol) === |
| | 228 | R: Resuming from i=355 (every 20, MAX_ITER=500) |
| | 229 | [periodic/iter] saved i=360 |
| | 230 | [periodic/iter] saved i=380 |
| | 231 | [periodic/iter] saved i=400 |
| | 232 | [periodic/iter] saved i=420 |
| | 233 | [periodic/iter] saved i=440 |
| | 234 | [periodic/iter] saved i=460 |
| | 235 | R: SIGTERM detected — saving i=474 and exiting 99 |
| | 236 | Info[20260313-22:28:00]: Program exit code (from timeout wrapper): 124 |
| | 237 | Info[20260313-22:28:00]: Timeout TERM observed; checkpoint advanced (355->474). Requeueing... |
| | 238 | Info[20260313-22:28:00]: Requeued via scontrol. |
| | 239 | Info[20260313-22:28:19]: Start on cypress01-066; JOB_ID=3300700; RESTARTS=4 |
| | 240 | Info[20260313-22:28:19]: Settings: |
| | 241 | Info[20260313-22:28:19]: MODULE_LIST=singularity/3.9.0 |
| | 242 | Info[20260313-22:28:19]: APP_CMD=singularity exec tidyverse_4.5.2.sif Rscript checkpoint_signal_iter.R |
| | 243 | Info[20260313-22:28:19]: LAUNCH_MODE=direct |
| | 244 | Info[20260313-22:28:19]: SRUN_ARGS=-n 1 |
| | 245 | Info[20260313-22:28:19]: TIME_LIMIT=00:03:00 |
| | 246 | Info[20260313-22:28:19]: MARGIN_SEC=60 |
| | 247 | Info[20260313-22:28:19]: CKPT_PATH=state_iter_r.txt |
| | 248 | Info[20260313-22:28:19]: CHECKPOINT_EVERY=20 |
| | 249 | Info[20260313-22:28:19]: MAX_ITER=500 |
| | 250 | Info[20260313-22:28:19]: MAX_RESTARTS=10 |
| | 251 | === BEGIN JOB SNAPSHOT (scontrol) === |
| | 252 | JobId=3300700 Name=ckpt_requeue_demo |
| | 253 | Priority=80808 Nice=0 Account=<groupID> QOS=normal |
| | 254 | JobState=RUNNING Reason=None Dependency=(null) |
| | 255 | Requeue=1 Restarts=4 BatchFlag=1 ExitCode=0:0 |
| | 256 | RunTime=00:00:01 TimeLimit=00:03:00 TimeMin=N/A |
| | 257 | StartTime=2026-03-13T22:28:18 EndTime=2026-03-13T22:31:18 |
| | 258 | Partition=centos7 AllocNode:Sid=cypress2:33768 |
| | 259 | === END JOB SNAPSHOT (scontrol) === |
| | 260 | R: Resuming from i=474 (every 20, MAX_ITER=500) |
| | 261 | [periodic/iter] saved i=480 |
| | 262 | [periodic/iter] saved i=500 |
| | 263 | Reached i=501 > 500; exiting 0 |
| | 264 | Info[20260313-22:28:47]: Program exit code (from timeout wrapper): 0 |
| | 265 | Info[20260313-22:28:47]: Completed. |
| | 266 | }}} |