{{{#!bash
#!/bin/bash
#SBATCH --qos=normal            # Quality of Service
#SBATCH --job-name=nativeTest   # Job Name
#SBATCH --time=00:10:00         # WallTime
#SBATCH --nodes=1               # Number of Nodes
#SBATCH --ntasks-per-node=1     # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=1       # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:1            # Number of Co-Processors

micnativeloadex ./myNativeExecutable -e "OMP_NUM_THREADS=100" -d 0 -v
}}}

In the script above we request one MIC device, which will be device number 0. The "micnativeloadex" command launches a MIC-native executable on the coprocessor; its -e "OMP_NUM_THREADS=100" option sets the number of OpenMP threads on the MIC device to 100, and -d 0 selects the device. For more options, see the discussion of micnativeloadex below.

Let's walk through building and running a native executable interactively before putting it all together in a job script. Start an idev session on a compute node (here using the workshop partition and queue):

{{{#!bash
export MY_PARTITION=workshop
export MY_QUEUE=workshop
idev -c 4 --gres=mic:0
}}}
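
Once the idev session starts you are placed on a compute node (the prompts in the examples below show nodes such as cypress01-089 and cypress01-090). A trivial, purely illustrative way to confirm where you landed:

{{{#!bash
# Confirm that the interactive session is running on a compute node
# rather than the login node
hostname
}}}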

* Load the Intel modules (the corresponding module load commands are sketched below):
  * intel-psxe
  * intel/mic/sdk/3.3
  * intel/mic/runtime/3.3
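
A minimal sketch of loading these modules; the intel-psxe version string is taken from the job script at the end of this section and may differ from what module avail shows on your system:

{{{#!bash
# Load the Intel compiler suite plus the MIC SDK and runtime
# (intel-psxe version taken from the job script below; adjust to what
#  "module avail intel" lists on your system)
module load intel-psxe/2015-update1
module load intel/mic/sdk/3.3
module load intel/mic/runtime/3.3
}}}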

Now let's examine the code helloflops3.c from Jim Jeffers and James Reinders' seminal text
[[http://lotsofcores.com/|Intel Xeon Phi Coprocessor High-Performance Programming]].

{{{#!c
//
//
// helloflops3
//
// A simple example that gets lots of Flops (Floating Point Operations) on
// Intel(r) Xeon Phi(tm) co-processors using OpenMP to scale
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <sys/time.h>

// dtime
//
// returns the current wall clock time
//
double dtime()
{
    double tseconds = 0.0;
    struct timeval mytime;
    gettimeofday(&mytime,(struct timezone*)0);
    tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6);
    return( tseconds );
}

#define FLOPS_ARRAY_SIZE (1024*1024)
#define MAXFLOPS_ITERS 100000000
#define LOOP_COUNT 128

// number of float pt ops per calculation
#define FLOPSPERCALC 2
// define some arrays -
// make sure they are 64 byte aligned
// for best cache access
float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
//
// Main program - pedal to the metal...calculate using tons o'flops!
//
int main(int argc, char *argv[] )
{
    int i,j,k;
    int numthreads;
    double tstart, tstop, ttime;
    double gflops = 0.0;
    float a=1.1;

    //
    // initialize the compute arrays
    //
    //

    #pragma omp parallel
    #pragma omp master
    numthreads = omp_get_num_threads();

    printf("Initializing\r\n");
    #pragma omp parallel for
    for(i=0; i<FLOPS_ARRAY_SIZE; i++)
    {
        fa[i] = (float)i + 0.1;
        fb[i] = (float)i + 0.2;
    }
    printf("Starting Compute on %d threads\r\n",numthreads);

    tstart = dtime();

    // scale the calculation across threads requested
    // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY

    #pragma omp parallel for private(j,k)
    for (i=0; i<numthreads; i++)
    {
        // each thread will work its own array section
        // calc offset into the right section
        int offset = i*LOOP_COUNT;

        // loop many times to get lots of calculations
        for(j=0; j<MAXFLOPS_ITERS; j++)
        {
            // scale 1st array and add in the 2nd array
            for(k=0; k<LOOP_COUNT; k++)
            {
                fa[k+offset] = a * fa[k+offset] + fb[k+offset];
            }
        }
    }
    tstop = dtime();
    // # of gigaflops we just calculated
    gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT*
                       MAXFLOPS_ITERS*FLOPSPERCALC);

    // elapsed time
    ttime = tstop - tstart;
    //
    // Print the results
    //
    if ((ttime) > 0.0)
    {
        printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n", gflops, ttime, gflops/ttime);
    }
    return( 0 );
}
}}}

Let's begin by compiling the code for the host and running it on the host processor:
{{{
[tuhpc002@cypress01-089 Day2]$ icc -O3 -openmp helloflops3.c -o helloflops3_host
[tuhpc002@cypress01-089 Day2]$ ./helloflops3_host
Initializing
Starting Compute on 20 threads
GFlops = 512.000, Secs = 6.349, GFlops per sec = 80.645
[tuhpc002@cypress01-089 Day2]$
}}}
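
The "GFlops" figure reported here is not a measurement; it is the operation count the code computes from its own parameters (numthreads * LOOP_COUNT * MAXFLOPS_ITERS * FLOPSPERCALC, scaled to units of 10^9), and "GFlops per sec" is that count divided by the elapsed wall-clock time. A quick sanity check of the 20-thread run above:

{{{#!bash
# 20 threads x 128 elements x 100,000,000 iterations x 2 flops, in units of 1e9
echo "20 * 128 * 100000000 * 2 / 10^9" | bc    # prints 512
}}}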

Not bad. Now let's compile a native version by adding the architecture flag '''-mmic'''.
{{{
[tuhpc002@cypress01-089 Day2]$ icc -O3 -openmp -mmic helloflops3.c -o helloflops3_mic
[tuhpc002@cypress01-089 Day2]$ ./helloflops3_mic
-bash: ./helloflops3_mic: cannot execute binary file
[tuhpc002@cypress01-089 Day2]$
}}}
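
To see why this failed, the generic '''file''' utility (a standard Linux command, not part of the MIC toolchain) can compare the two binaries: the host build targets x86-64 while the -mmic build targets the Xeon Phi (k1om) architecture. The exact wording of the output depends on the file magic database installed.

{{{#!bash
# Compare the target architectures of the two builds (illustrative check)
file helloflops3_host
file helloflops3_mic
}}}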

We could not execute our binary because we tried to run it on the host architecture. We could ssh onto one of the MICs and manually set our library environment variables to run the code, but a cleaner method (and the recommended method) is to use the Intel program '''micnativeloadex'''.
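
For context only, that manual route would look roughly like the sketch below. The coprocessor hostname mic0 and the library path are placeholders that depend on the local MPSS configuration, so treat this as an illustration rather than a recipe.

{{{#!bash
# Hypothetical sketch of running the native binary by hand:
# copy it to the coprocessor, then run it over ssh with the MIC build
# of the OpenMP runtime on LD_LIBRARY_PATH (path is a placeholder)
scp helloflops3_mic mic0:/tmp/
ssh mic0 'export LD_LIBRARY_PATH=/path/to/mic/runtime/lib:$LD_LIBRARY_PATH; /tmp/helloflops3_mic'
}}}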

{{{
[tuhpc002@cypress01-089 Day2]$ micnativeloadex ./helloflops3_mic
Initializing
Starting Compute on 240 threads
GFlops = 6144.000, Secs = 2.630, GFlops per sec = 2335.925

[tuhpc002@cypress01-089 Day2]$
}}}

'''micnativeloadex''' has a number of options which can be seen using the '''-h''' or help flag.

{{{
[tulaneID@cypress01 ~]$ micnativeloadex -h
}}}
Notice that we can use the '''-d''' flag to select which MIC we want to run on and the '''-e''' flag to set environment variables on the MIC (separated by whitespace). For example, we can choose to run on MIC0 and set the number of threads and their affinity with

{{{
[tuhpc002@cypress01-089 Day2]$ micnativeloadex ./helloflops3_mic -e "OMP_NUM_THREADS=120 KMP_AFFINITY=scatter" -d 0
Initializing
Starting Compute on 120 threads
GFlops = 3072.000, Secs = 1.500, GFlops per sec = 2048.143

[tuhpc002@cypress01-089 Day2]$
}}}

We've been using idev as an instructional tool, but we won't normally be running our MIC native jobs interactively. Rather, we'll be submitting jobs that we want to run in native mode on a compute node. An example SLURM job script for our code would look like:

{{{#!bash
#!/bin/bash
#SBATCH --qos=workshop          # Quality of Service
#SBATCH --partition=workshop    # Partition
#SBATCH --job-name=nativeTest   # Job Name
#SBATCH --time=00:10:00         # WallTime
#SBATCH --nodes=1               # Number of Nodes
#SBATCH --ntasks-per-node=1     # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=1       # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:1            # Number of Co-Processors

module load intel-psxe/2015-update1
module load intel/mic/sdk/3.3
module load intel/mic/runtime/3.3

micnativeloadex ./helloflops3_mic -e "OMP_NUM_THREADS=120 KMP_AFFINITY=scatter" -d 0
}}}
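
The script can be saved under any name (nativeTest.srun below is just an example) and submitted like any other SLURM job; by default the program's output is written to a slurm-<jobid>.out file in the submission directory.

{{{#!bash
# Submit the job script (example filename) and check its state in the queue
sbatch nativeTest.srun
squeue -u $USER
}}}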