490 | | |
491 | | |
492 | | |
493 | | |
| 490 | {{{#!c |
| 491 | // |
| 492 | // |
| 493 | // helloflops3offload |
| 494 | // |
| 495 | // A simple example that gets lots of Flops (Floating Point Operations) on |
| 496 | // Intel(r) Xeon Phi(tm) co-processors using offload plus openmp to scale |
| 497 | // |
| 498 | |
| 499 | #include <stdio.h> |
| 500 | #include <stdlib.h> |
| 501 | #include <string.h> |
| 502 | #include <omp.h> |
| 503 | #include <sys/time.h> |
| 504 | |
| 505 | // dtime |
| 506 | // |
| 507 | // returns the current wall clock time |
| 508 | // |
| 509 | double dtime() |
| 510 | { |
| 511 | double tseconds = 0.0; |
| 512 | struct timeval mytime; |
| 513 | gettimeofday(&mytime,(struct timezone*)0); |
| 514 | tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6); |
| 515 | return( tseconds ); |
| 516 | } |
| 517 | |
| 518 | #define FLOPS_ARRAY_SIZE (1024*512) |
| 519 | #define MAXFLOPS_ITERS 100000000 |
| 520 | #define LOOP_COUNT 128 |
| 521 | |
| 522 | // number of float pt ops per calculation |
| 523 | #define FLOPSPERCALC 2 |
| 524 | // define some arrays - |
| 525 | // make sure they are 64 byte aligned |
| 526 | // for best cache access |
| 527 | __declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 528 | __declspec ( target (mic)) float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 529 | // |
| 530 | // Main program - pedal to the metal...calculate using tons o'flops! |
| 531 | // |
| 532 | int main(int argc, char *argv[] ) |
| 533 | { |
| 534 | int i,j,k; |
| 535 | int numthreads; |
| 536 | double tstart, tstop, ttime; |
| 537 | double gflops = 0.0; |
| 538 | float a=1.1; |
| 539 | |
| 540 | // |
| 541 | // initialize the compute arrays |
| 542 | // |
| 543 | // |
| 544 | |
| 545 | #pragma offload target (mic) |
| 546 | #pragma omp parallel |
| 547 | #pragma omp master |
| 548 | numthreads = omp_get_num_threads(); |
| 549 | |
| 550 | printf("Initializing\r\n"); |
| 551 | |
| 552 | #pragma omp parallel for |
| 553 | for(i=0; i<FLOPS_ARRAY_SIZE; i++) |
| 554 | { |
| 555 | fa[i] = (float)i + 0.1; |
| 556 | fb[i] = (float)i + 0.2; |
| 557 | } |
| 558 | printf("Starting Compute on %d threads\r\n",numthreads); |
| 559 | |
| 560 | tstart = dtime(); |
| 561 | |
| 562 | // scale the calculation across threads requested |
| 563 | // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY |
| 564 | |
| 565 | #pragma offload target (mic) |
| 566 | #pragma omp parallel for private(j,k) |
| 567 | for (i=0; i<numthreads; i++) |
| 568 | { |
| 569 | // each thread will work its own array section |
| 570 | // calc offset into the right section |
| 571 | int offset = i*LOOP_COUNT; |
| 572 | |
| 573 | // loop many times to get lots of calculations |
| 574 | for(j=0; j<MAXFLOPS_ITERS; j++) |
| 575 | { |
| 576 | // scale 1st array and add in the 2nd array |
| 577 | #pragma vector aligned |
| 578 | for(k=0; k<LOOP_COUNT; k++) |
| 579 | { |
| 580 | fa[k+offset] = a * fa[k+offset] + fb[k+offset]; |
| 581 | } |
| 582 | } |
| 583 | } |
| 584 | tstop = dtime(); |
| 585 | // # of gigaflops we just calculated |
| 586 | gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT* |
| 587 | MAXFLOPS_ITERS*FLOPSPERCALC); |
| 588 | |
| 589 | // elapsed time |
| 590 | ttime = tstop - tstart; |
| 591 | // |
| 592 | // Print the results |
| 593 | // |
| 594 | if ((ttime) > 0.0) |
| 595 | { |
| 596 | printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n", gflops, ttime, gflops/ttime); |
| 597 | } |
| 598 | return( 0 ); |
| 599 | } |
| 600 | }}} |
| 601 | |
| 602 | Changes to take note of: |
| 603 | * The addition of the directive before the section of code that we wish to run on the MIC |
| 604 | {{{#!c |
| 605 | #pragma offload target (mic) |
| 606 | }}} |
| 607 | * The alteration of our array declarations, indicating that they will be used in offloaded code, e.g. |
| 608 | {{{ |
| 609 | __declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 610 | }}} |
| 611 | |
| 612 | Let's take a look at a submission script for our offloading example |
| 613 | |
| 614 | {{{#!bash |
| 615 | #!/bin/bash |
| 616 | #SBATCH --qos=workshop # Quality of Service |
| 617 | #SBATCH --partition=workshop #Partition |
| 618 | #SBATCH --job-name=offloadTest # Job Name |
| 619 | #SBATCH --time=00:10:00 # WallTime |
| 620 | #SBATCH --nodes=1 # Number of Nodes |
| 621 | #SBATCH --ntasks-per-node=1 # Number of tasks (MPI processes) |
| 622 | #SBATCH --cpus-per-task=20 # Number of processors per task (OpenMP threads) |
| 623 | #SBATCH --gres=mic:1 # Number of Co-Processors |
| 624 | |
| 625 | module load intel-psxe/2015-update1 |
| 626 | module load intel/mic/sdk/3.3 |
| 627 | module load intel/mic/runtime/3.3 |
| 628 | |
| 629 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK |
| 630 | export MIC_ENV_PREFIX=MIC |
| 631 | export MIC_OMP_NUM_THREADS=120 |
| 632 | export MIC_KMP_AFFINITY=scatter |
| 633 | |
| 634 | ./helloflops3offload |
| 635 | }}} |