{{{#!c
//
// helloflops3offload
//
// A simple example that gets lots of Flops (Floating Point Operations) on
// Intel(r) Xeon Phi(tm) co-processors using offload plus OpenMP to scale
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <sys/time.h>

//
// dtime
//
// returns the current wall clock time
//
double dtime()
{
    double tseconds = 0.0;
    struct timeval mytime;
    gettimeofday(&mytime, (struct timezone *)0);
    tseconds = (double)(mytime.tv_sec + mytime.tv_usec * 1.0e-6);
    return tseconds;
}

#define FLOPS_ARRAY_SIZE (1024*512)
#define MAXFLOPS_ITERS 100000000
#define LOOP_COUNT 128

// number of floating point ops per calculation
#define FLOPSPERCALC 2

// define some arrays -
// make sure they are 64 byte aligned
// for best cache access
__declspec(target(mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
__declspec(target(mic)) float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));

//
// Main program - pedal to the metal...calculate using tons o'flops!
//
int main(int argc, char *argv[])
{
    int i, j, k;
    int numthreads;
    double tstart, tstop, ttime;
    double gflops = 0.0;
    float a = 1.1;

    // query the number of OpenMP threads available on the co-processor
    #pragma offload target(mic)
    #pragma omp parallel
    #pragma omp master
    numthreads = omp_get_num_threads();

    //
    // initialize the compute arrays
    //
    printf("Initializing\n");

    #pragma omp parallel for
    for (i = 0; i < FLOPS_ARRAY_SIZE; i++)
    {
        fa[i] = (float)i + 0.1;
        fb[i] = (float)i + 0.2;
    }
    printf("Starting Compute on %d threads\n", numthreads);

    tstart = dtime();

    // scale the calculation across the requested threads;
    // the environment variables OMP_NUM_THREADS and KMP_AFFINITY
    // control how many threads run and where they are placed
    #pragma offload target(mic)
    #pragma omp parallel for private(j,k)
    for (i = 0; i < numthreads; i++)
    {
        // each thread works on its own array section;
        // calculate the offset into the right section
        int offset = i * LOOP_COUNT;

        // loop many times to get lots of calculations
        for (j = 0; j < MAXFLOPS_ITERS; j++)
        {
            // scale the 1st array and add in the 2nd array
            #pragma vector aligned
            for (k = 0; k < LOOP_COUNT; k++)
            {
                fa[k + offset] = a * fa[k + offset] + fb[k + offset];
            }
        }
    }
    tstop = dtime();

    // number of gigaflops we just calculated
    gflops = (double)(1.0e-9 * numthreads * LOOP_COUNT *
                      MAXFLOPS_ITERS * FLOPSPERCALC);

    // elapsed time
    ttime = tstop - tstart;

    //
    // Print the results
    //
    if (ttime > 0.0)
    {
        printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\n",
               gflops, ttime, gflops / ttime);
    }
    return 0;
}
}}}

Changes to take note of:
* The addition of the offload directive before the section of code that we wish to run on the MIC:
{{{#!c
#pragma offload target(mic)
}}}
* The alteration of our array declarations to indicate that they will be used in offloaded code (see the sketch after this list), e.g.
{{{#!c
__declspec(target(mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
}}}

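In this example the arrays are statically sized globals marked with `__declspec(target(mic))`, so the offload runtime can copy them for us. For data that is not declared this way, the offload pragma also accepts explicit data-movement clauses. Below is a minimal sketch of that variant; the array names, size `N`, and the doubling computation are illustrative only, not part of the example above:
{{{#!c
#include <omp.h>

#define N 1024  /* illustrative size, not from the example above */

int main(void)
{
    float x[N], y[N];
    int i;

    for (i = 0; i < N; i++)
        x[i] = (float)i;

    // in() copies x to the co-processor before the region runs;
    // out() copies y back to the host when the region finishes
    #pragma offload target(mic) in(x : length(N)) out(y : length(N))
    #pragma omp parallel for
    for (i = 0; i < N; i++)
        y[i] = 2.0f * x[i];

    return 0;
}
}}}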
Let's take a look at a submission script for our offloading example:

{{{#!bash
#!/bin/bash
#SBATCH --qos=workshop          # Quality of Service
#SBATCH --partition=workshop    # Partition
#SBATCH --job-name=offloadTest  # Job Name
#SBATCH --time=00:10:00         # WallTime
#SBATCH --nodes=1               # Number of Nodes
#SBATCH --ntasks-per-node=1     # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=20      # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:1            # Number of Co-Processors

module load intel-psxe/2015-update1
module load intel/mic/sdk/3.3
module load intel/mic/runtime/3.3

# host-side OpenMP threads match the cores allocated by Slurm
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# variables prefixed with MIC_ configure the co-processor environment
export MIC_ENV_PREFIX=MIC
export MIC_OMP_NUM_THREADS=120
export MIC_KMP_AFFINITY=scatter

./helloflops3offload
}}}
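Note that the script assumes the example has already been built as `helloflops3offload`. A minimal sketch of the build step, assuming the Intel compiler from the `intel-psxe` module loaded above (the exact flag spelling may vary by compiler version):
{{{#!bash
# the compiler generates the offload code automatically when it
# sees the offload pragmas and __declspec(target(mic)) declarations
icc -O3 -openmp -o helloflops3offload helloflops3offload.c
}}}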