490 | | |
491 | | |
492 | | |
493 | | |
| 490 | {{{#!c |
| 491 | // |
| 492 | // |
| 493 | // helloflops3offload |
| 494 | // |
| 495 | // A simple example that gets lots of Flops (Floating Point Operations) on |
| 496 | // Intel(r) Xeon Phi(tm) co-processors using offload plus openmp to scale |
| 497 | // |
| 498 | |
| 499 | #include <stdio.h> |
| 500 | #include <stdlib.h> |
| 501 | #include <string.h> |
| 502 | #include <omp.h> |
| 503 | #include <sys/time.h> |
| 504 | |
| 505 | // dtime |
| 506 | // |
| 507 | // returns the current wall clock time |
| 508 | // |
| 509 | double dtime() |
| 510 | { |
| 511 | double tseconds = 0.0; |
| 512 | struct timeval mytime; |
| 513 | gettimeofday(&mytime,(struct timezone*)0); |
| 514 | tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6); |
| 515 | return( tseconds ); |
| 516 | } |
| 517 | |
| 518 | #define FLOPS_ARRAY_SIZE (1024*512) |
| 519 | #define MAXFLOPS_ITERS 100000000 |
| 520 | #define LOOP_COUNT 128 |
| 521 | |
| 522 | // number of float pt ops per calculation |
| 523 | #define FLOPSPERCALC 2 |
| 524 | // define some arrays - |
| 525 | // make sure they are 64 byte aligned |
| 526 | // for best cache access |
| 527 | __declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 528 | __declspec ( target (mic)) float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 529 | // |
| 530 | // Main program - pedal to the metal...calculate using tons o'flops! |
| 531 | // |
| 532 | int main(int argc, char *argv[] ) |
| 533 | { |
| 534 | int i,j,k; |
| 535 | int numthreads; |
| 536 | double tstart, tstop, ttime; |
| 537 | double gflops = 0.0; |
| 538 | float a=1.1; |
| 539 | |
| 540 | // |
| 541 | // initialize the compute arrays |
| 542 | // |
| 543 | // |
| 544 | |
| 545 | #pragma offload target (mic) |
| 546 | #pragma omp parallel |
| 547 | #pragma omp master |
| 548 | numthreads = omp_get_num_threads(); |
| 549 | |
| 550 | printf("Initializing\r\n"); |
| 551 | |
| 552 | #pragma omp parallel for |
| 553 | for(i=0; i<FLOPS_ARRAY_SIZE; i++) |
| 554 | { |
| 555 | fa[i] = (float)i + 0.1; |
| 556 | fb[i] = (float)i + 0.2; |
| 557 | } |
| 558 | printf("Starting Compute on %d threads\r\n",numthreads); |
| 559 | |
| 560 | tstart = dtime(); |
| 561 | |
| 562 | // scale the calculation across threads requested |
| 563 | // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY |
| 564 | |
| 565 | #pragma offload target (mic) |
| 566 | #pragma omp parallel for private(j,k) |
| 567 | for (i=0; i<numthreads; i++) |
| 568 | { |
| 569 | // each thread will work its own array section |
| 570 | // calc offset into the right section |
| 571 | int offset = i*LOOP_COUNT; |
| 572 | |
| 573 | // loop many times to get lots of calculations |
| 574 | for(j=0; j<MAXFLOPS_ITERS; j++) |
| 575 | { |
| 576 | // scale 1st array and add in the 2nd array |
| 577 | #pragma vector aligned |
| 578 | for(k=0; k<LOOP_COUNT; k++) |
| 579 | { |
| 580 | fa[k+offset] = a * fa[k+offset] + fb[k+offset]; |
| 581 | } |
| 582 | } |
| 583 | } |
| 584 | tstop = dtime(); |
| 585 | // # of gigaflops we just calculated |
| 586 | gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT* |
| 587 | MAXFLOPS_ITERS*FLOPSPERCALC); |
| 588 | |
| 589 | // elapsed time |
| 590 | ttime = tstop - tstart; |
| 591 | // |
| 592 | // Print the results |
| 593 | // |
| 594 | if ((ttime) > 0.0) |
| 595 | { |
| 596 | printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n", gflops, ttime, gflops/ttime); |
| 597 | } |
| 598 | return( 0 ); |
| 599 | } |
| 600 | }}} |
| 601 | |
| 602 | Changes to take note of: |
| 603 | * The addition of the directive before the section of code that we wish to run on the MIC |
| 604 | {{{#!c |
| 605 | #pragma offload target (mic) |
| 606 | }}} |
| 607 | * The alteration of our array declarations, indicating that they will be used in offloaded code, e.g. |
| 608 | {{{ |
| 609 | __declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64))); |
| 610 | }}} |
| 611 | |
| 612 | Let's take a look at a submission script for our offloading example |
| 613 | |
| 614 | {{{#!bash |
| 615 | #!/bin/bash |
| 616 | #SBATCH --qos=workshop # Quality of Service |
| 617 | #SBATCH --partition=workshop #Partition |
| 618 | #SBATCH --job-name=offloadTest # Job Name |
| 619 | #SBATCH --time=00:10:00 # WallTime |
| 620 | #SBATCH --nodes=1 # Number of Nodes |
| 621 | #SBATCH --ntasks-per-node=1 # Number of tasks (MPI processes) |
| 622 | #SBATCH --cpus-per-task=20 # Number of processors per task (OpenMP threads) |
| 623 | #SBATCH --gres=mic:1 # Number of Co-Processors |
| 624 | |
| 625 | module load intel-psxe/2015-update1 |
| 626 | module load intel/mic/sdk/3.3 |
| 627 | module load intel/mic/runtime/3.3 |
| 628 | |
| 629 | export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK |
| 630 | export MIC_ENV_PREFIX=MIC |
| 631 | export MIC_OMP_NUM_THREADS=120 |
| 632 | export MIC_KMP_AFFINITY=scatter |
| 633 | |
| 634 | ./helloflops3offload |
| 635 | }}} |