| 51 | ==== Eligibility ==== |
| 52 | As we saw yesterday during our Matlab tutorial, any program/code that makes use of the Intel MKL library may take advantage of Automatic Offloading (AO) to the MIC. However, not every MKL routine will automatically offload. The routines that are eligible for AO are: |
| 53 | * BLAS: |
| 54 | * BLAS level-3 subroutines - ?SYMM, ?TRMM, ?TRSM, ?GEMM |
| 55 | * LAPACK: |
| 56 | * LU (?GETRF), Cholesky ((S/D)POTRF), and QR (?GEQRF) factorization functions |
| 57 | |
| 58 | However, AO will only kick in if MKL deems the problem to be of sufficient size (i.e. the increase in parallelism will outweigh the increase in overhead). For instance, SGEMM will use AO only if the matrix size exceeds 2048x2048. For more information on which routines are eligible for AO see the white paper [https://software.intel.com/en-us/articles/intel-mkl-automatic-offload-enabled-functions-for-intel-xeon-phi-coprocessors|Intel® MKL Automatic Offload enabled functions for Intel Xeon Phi coprocessors] |
| 59 | |
| 60 | ==== Enabling Offloading ==== |
| 61 | To enable AO on Cypress you must |
| 62 | * Load the Intel Parallel Studio XE module |
| 63 | * Turn on MKL AO by setting the environment variable MKL_MIC_ENABLE to 1 (0 or nothing will turn off MKL AO) |
| 64 | * (OPTIONAL) Turn on offload reporting to track your use of the MIC by setting OFFLOAD_REPORT to either 1 or 2. Setting OFFLOAD_REPORT to 2 adds more detail than 1 and will give you information on data transfers. |
| 65 | {{{ |
| 66 | [tulaneID@cypress1]$ module load intel-psxe |
| 67 | [tulaneID@cypress1]$ export MKL_MIC_ENABLE=1 |
| 68 | [tulaneID@cypress1]$ export OFFLOAD_REPORT=2 |
| 69 | }}} |
| 70 | |
| 71 | ==== Example using SGEMM ==== |
| 72 | Let's do a small example using SGEMM to test the behavior of MKL AO. |
| 73 | |
| 74 | {{{#!c |
| 75 | [tuhpc002@cypress01-089 Day2]$ cat sgemm_example.c |
| 76 | /* System headers */ |
| 77 | #include <stdio.h> |
| 78 | #include <stdlib.h> |
| 79 | #include <malloc.h> |
| 80 | #include <stdint.h> |
| 81 | |
| 82 | #include "mkl.h" |
| 83 | |
| 84 | // dtime |
| 85 | // |
| 86 | // Returns the current wall-clock time in seconds, as a double built from the tv_sec and tv_usec fields filled in by gettimeofday(). |
| 87 | // NOTE(review): gettimeofday() and struct timeval are declared in <sys/time.h>, which is not in the include list above -- confirm another header provides them. |
| 88 | double dtime() |
| 89 | { |
| 90 | double tseconds = 0.0; |
| 91 | struct timeval mytime; |
| 92 | gettimeofday(&mytime,(struct timezone*)0); /* null pointer passed for the timezone argument */ |
| 93 | tseconds = (double)(mytime.tv_sec + |
| 94 | mytime.tv_usec*1.0e-6); /* whole seconds plus microseconds scaled to seconds */ |
| 95 | return( tseconds ); |
| 96 | } |
| 97 | |
| 98 | int main(int argc, char **argv) |
| 99 | { /* Fills three N x N float matrices, times a single sgemm() call with dtime(), and prints the elapsed time */ |
| 100 | float *A, *B, *C; /* Matrices */ |
| 101 | double workdivision; /* Declared but never used in this listing */ |
| 102 | double tstart, tstop, ttime; /* Timestamps around sgemm() and their difference */ |
| 103 | |
| 104 | MKL_INT N = 2560; /* Matrix dimensions */ |
| 105 | MKL_INT LD = N; /* Leading dimension (declared, but the sgemm call below passes &N instead) */ |
| 106 | int matrix_bytes; /* Matrix size in bytes */ |
| 107 | int matrix_elements; /* Matrix size in elements */ |
| 108 | |
| 109 | float alpha = 1.0, beta = 1.0; /* Scaling factors */ |
| 110 | char transa = 'N', transb = 'N'; /* Transposition options */ |
| 111 | |
| 112 | int i, j; /* Counters (j is never used in this listing) */ |
| 113 | |
| 114 | matrix_elements = N * N; |
| 115 | matrix_bytes = sizeof(float) * matrix_elements; |
| 116 | |
| 117 | /* Allocate the matrices (the malloc results are not checked for NULL before use) */ |
| 118 | A = malloc(matrix_bytes); |
| 119 | B = malloc(matrix_bytes); |
| 120 | C = malloc(matrix_bytes); |
| 121 | |
| 122 | /* Initialize the matrices: every element of A is 1.0, of B is 2.0, and of C is 0.0 */ |
| 123 | for (i = 0; i < matrix_elements; i++) { |
| 124 | A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; |
| 125 | } |
| 126 | /* Only the sgemm call is timed; allocation and initialization are outside the measured interval */ |
| 127 | tstart = dtime(); |
| 128 | sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, |
| 129 | &beta, C, &N); |
| 130 | tstop = dtime(); |
| 131 | /* Free the matrix memory */ |
| 132 | free(A); free(B); free(C); |
| 133 | |
| 134 | // elapsed time in seconds |
| 135 | ttime = tstop - tstart; |
| 136 | // |
| 137 | // Print the results |
| 138 | // (the timing line is printed only when the measured interval is positive) |
| 139 | if ((ttime) > 0.0) |
| 140 | { |
| 141 | printf("Time spent on SGEMM = %10.3lf\n",ttime); |
| 142 | } |
| 143 | printf("Done\n"); |
| 144 | |
| 145 | return 0; |
| 146 | } |
| 147 | }}} |
| 148 | |
| 149 | To test MKL AO |
| 150 | * Get onto a compute node using idev |
| 151 | {{{ |
| 152 | [tuhpc002@cypress1 Day2]$ export MY_PARTITION=workshop |
| 153 | [tuhpc002@cypress1 Day2]$ export MY_QUEUE=workshop |
| 154 | [tuhpc002@cypress1 Day2]$ idev -c 4 --gres=mic:0 |
| 155 | Requesting 1 node(s) task(s) to workshop queue of workshop partition |
| 156 | 1 task(s)/node, 4 cpu(s)/task, 2 MIC device(s)/node |
| 157 | Time: 0 (hr) 60 (min). |
| 158 | Submitted batch job 54982 |
| 159 | JOBID=54982 begin on cypress01-089 |
| 160 | --> Creating interactive terminal session (login) on node cypress01-089. |
| 161 | --> You have 0 (hr) 60 (min). |
| 162 | Last login: Fri Aug 21 07:16:58 2015 from cypress1.cm.cluster |
| 163 | [tuhpc002@cypress01-089 Day2]$ |
| 164 | }}} |
| 165 | |
| 166 | Note: We will be sharing MICs so expect some resource conflicts |
| 167 | |
| 168 | * Load the Intel module containing MKL and set your environment variables |
| 169 | {{{ |
| 170 | [tuhpc002@cypress01-089 Day2]$ module load intel-psxe |
| 171 | [tuhpc002@cypress01-089 Day2]$ export MKL_MIC_ENABLE=0 |
| 172 | [tuhpc002@cypress01-089 Day2]$ export OFFLOAD_REPORT=2 |
| 173 | }}} |
| 174 | |
| 175 | Notice that automatic offloading is turned OFF. This will set our baseline. |
| 176 | * Compile the example code being sure to link to the MKL library |
| 177 | * Run the executable |
| 178 | * Turn on MKL AO and run it again |
| 179 | {{{ |
| 180 | [tuhpc002@cypress01-089 Day2]$ icc -O3 -mkl -openmp sgemm_example.c -o AOtest |
| 181 | [tuhpc002@cypress01-089 Day2]$ ./AOtest |
| 182 | Time spent on SGEMM = 0.835 |
| 183 | Done |
| 184 | [tuhpc002@cypress01-089 Day2]$ export MKL_MIC_ENABLE=1 |
| 185 | [tuhpc002@cypress01-089 Day2]$ ./AOtest |
| 186 | [MKL] [MIC --] [AO Function] SGEMM |
| 187 | [MKL] [MIC --] [AO SGEMM Workdivision] 0.60 0.20 0.20 |
| 188 | [MKL] [MIC 00] [AO SGEMM CPU Time] 2.858848 seconds |
| 189 | [MKL] [MIC 00] [AO SGEMM MIC Time] 0.104307 seconds |
| 190 | [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 191 | [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 192 | [MKL] [MIC 01] [AO SGEMM CPU Time] 2.858848 seconds |
| 193 | [MKL] [MIC 01] [AO SGEMM MIC Time] 0.113478 seconds |
| 194 | [MKL] [MIC 01] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 195 | [MKL] [MIC 01] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 196 | Time spent on SGEMM = 3.436 |
| 197 | Done |
| 198 | [tuhpc002@cypress01-089 Day2]$ |
| 199 | }}} |
| 200 | |
| 201 | The Point: This example gets at some of the challenges of coding for the Xeon Phi. Utilization is simple, but optimization can be a real challenge. Let's look at a few more options we can manipulate through environment variables: |
| 202 | |
| 203 | * The work division among the Host and MICs can also be tuned by hand using MKL_MIC_<0,1>_WORKDIVISION |
| 204 | {{{ |
| 205 | [tuhpc002@cypress01-089 Day2]$ export MKL_MIC_0_WORKDIVISION=1.0 |
| 206 | [tuhpc002@cypress01-089 Day2]$ ./AOtest |
| 207 | [MKL] [MIC --] [AO Function] SGEMM |
| 208 | [MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00 0.00 |
| 209 | [MKL] [MIC 00] [AO SGEMM CPU Time] 2.831957 seconds |
| 210 | [MKL] [MIC 00] [AO SGEMM MIC Time] 0.141694 seconds |
| 211 | [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 52428800 bytes |
| 212 | [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 26214400 bytes |
| 213 | [MKL] [MIC 01] [AO SGEMM CPU Time] 2.831957 seconds |
| 214 | [MKL] [MIC 01] [AO SGEMM MIC Time] 0.000000 seconds |
| 215 | [MKL] [MIC 01] [AO SGEMM CPU->MIC Data] 0 bytes |
| 216 | [MKL] [MIC 01] [AO SGEMM MIC->CPU Data] 0 bytes |
| 217 | Time spent on SGEMM = 3.394 |
| 218 | }}} |
| 219 | |
| 220 | * The number of threads used on each MIC can be controlled using MIC_OMP_NUM_THREADS |
| 221 | {{{ |
| 222 | [tuhpc002@cypress01-089 Day2]$ export MIC_OMP_NUM_THREADS=122 |
| 223 | [tuhpc002@cypress01-089 Day2]$ ./AOtest |
| 224 | [MKL] [MIC --] [AO Function] SGEMM |
| 225 | [MKL] [MIC --] [AO SGEMM Workdivision] 0.60 0.20 0.20 |
| 226 | [MKL] [MIC 00] [AO SGEMM CPU Time] 1.625511 seconds |
| 227 | [MKL] [MIC 00] [AO SGEMM MIC Time] 0.102266 seconds |
| 228 | [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 229 | [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 230 | [MKL] [MIC 01] [AO SGEMM CPU Time] 1.625511 seconds |
| 231 | [MKL] [MIC 01] [AO SGEMM MIC Time] 0.089364 seconds |
| 232 | [MKL] [MIC 01] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 233 | [MKL] [MIC 01] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 234 | Time spent on SGEMM = 2.288 |
| 235 | Done |
| 236 | [tuhpc002@cypress01-089 Day2]$ |
| 237 | }}} |
| 238 | |
| 239 | * We can control the distribution of threads using MIC_KMP_AFFINITY |
| 240 | {{{ |
| 241 | [tuhpc002@cypress01-089 Day2]$ export MIC_KMP_AFFINITY=scatter |
| 242 | [tuhpc002@cypress01-089 Day2]$ ./AOtest |
| 243 | [MKL] [MIC --] [AO Function] SGEMM |
| 244 | [MKL] [MIC --] [AO SGEMM Workdivision] 0.60 0.20 0.20 |
| 245 | [MKL] [MIC 00] [AO SGEMM CPU Time] 1.631954 seconds |
| 246 | [MKL] [MIC 00] [AO SGEMM MIC Time] 0.101270 seconds |
| 247 | [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 248 | [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 249 | [MKL] [MIC 01] [AO SGEMM CPU Time] 1.631954 seconds |
| 250 | [MKL] [MIC 01] [AO SGEMM MIC Time] 0.105702 seconds |
| 251 | [MKL] [MIC 01] [AO SGEMM CPU->MIC Data] 31457280 bytes |
| 252 | [MKL] [MIC 01] [AO SGEMM MIC->CPU Data] 5242880 bytes |
| 253 | Time spent on SGEMM = 2.028 |
| 254 | Done |
| 255 | [tuhpc002@cypress01-089 Day2]$ |
| 256 | }}} |
| 259 | |
| 260 | The native model centers around the notion that each MIC is its own machine with its own architecture. The first challenge is to compile code to run specifically on the hardware of the MIC. |
| 261 | |
| 262 | |
| 263 | |
| 264 | An example SLURM job script: |
| 265 | |
| 266 | {{{#!bash |
| 267 | #!/bin/bash |
| 268 | #SBATCH --qos=normal # Quality of Service |
| 269 | #SBATCH --job-name=nativeTest # Job Name |
| 270 | #SBATCH --time=00:10:00 # WallTime |
| 271 | #SBATCH --nodes=1 # Number of Nodes |
| 272 | #SBATCH --ntasks-per-node=1 # Number of tasks (MPI processes) |
| 273 | #SBATCH --cpus-per-task=1 # Number of processors per task (OpenMP threads) |
| 274 | #SBATCH --gres=mic:1 # Number of Co-Processors |
| 275 | |
| 276 | micnativeloadex ./myNativeExecutable -e "OMP_NUM_THREADS=100" -d 0 -v |
| 277 | }}} |
| 278 | |
| 279 | |
| 280 | |
| 281 | In the script above we request one MIC device, which will be device number 0. |
| 282 | The "micnativeloadex" command launches the MIC native executable, and the -e "OMP_NUM_THREADS=100" option sets the number of threads on the MIC device to 100. |
| 283 | For more options, see below. |
| 284 | |
| 285 | {{{#!bash |
| 286 | [fuji@cypress01-090 nativeTest]$ micnativeloadex -h |
| 287 | |
| 288 | Usage: |
| 289 | micnativeloadex [ -h | -V ] AppName -l -t timeout -p -v -d coprocessor -a "args" -e "environment" |
| 290 | -a "args" An optional string of command line arguments to pass to |
| 291 | the remote app. |
| 292 | -d The (zero based) index of the Intel(R) Xeon Phi(TM) coprocessor to run the app on. |
| 293 | -e "environment" An optional environment string to pass to the remote app. |
| 294 | Multiple environment variable may be specified using spaces as separators: |
| 295 | -e "LD_LIBRARY_PATH=/lib64/ DEBUG=1" |
| 296 | -h Print this help message |
| 297 | -l Do not execute the binary on the coprocessor. Instead, list the shared library |
| 298 | dependency information. |
| 299 | -p Disable console proxy. |
| 300 | -t Time to wait for the remote app to finish (in seconds). After the timeout |
| 301 | is reached the remote app will be terminated. |
| 302 | -v Enable verbose mode. Note that verbose output will be displayed |
| 303 | if the remote app terminates abnormally. |
| 304 | -V Show version and build information |
| 305 | }}} |
| 306 | |
| 307 | |
| 308 | |