Changes between Version 14 and Version 15 of cypress/XeonPhi


Timestamp: 08/21/15 09:47:52
Author: cmaggio
=== Native Programming ===

The native model centers around the notion that each MIC is its own machine with its own architecture. The first challenge is to compile code to run specifically on the hardware of the MIC.

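As a preview of that compile step (the full walk-through with a real example follows below), here is a minimal sketch; mycode.c is just a placeholder file name:
{{{#!bash
# Cross-compile for the coprocessor with the Intel compiler:
# -mmic produces a binary that runs only on the MIC, not on the host.
icc -O3 -openmp -mmic mycode.c -o mycode_mic
}}}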
Let's start out in interactive mode and then we will look at job submission:
{{{#!bash
export MY_PARTITION=workshop
export MY_QUEUE=workshop
idev -c 4 --gres=mic:0
}}}

* Load the intel modules (the module load commands are shown below):
 * intel-psxe
 * intel/mic/sdk/3.3
 * intel/mic/runtime/3.3

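For reference, loading them in the interactive session looks like the following (the jobscript at the end of this section pins intel-psxe to a specific version; the unversioned name here assumes the default version is appropriate):
{{{#!bash
# Load the Intel compiler suite plus the MIC SDK and runtime modules
module load intel-psxe
module load intel/mic/sdk/3.3
module load intel/mic/runtime/3.3
}}}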
And let's examine the code helloflops3.c from Jim Jeffers and James Reinders' seminal text
[[http://lotsofcores.com/|Intel Xeon Phi Coprocessor High-Performance Programming]]

{{{#!c
//
//
// helloflops3
//
// A simple example that gets lots of Flops (Floating Point Operations) on
// Intel(r) Xeon Phi(tm) co-processors using openmp to scale
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <sys/time.h>

// dtime
//
// returns the current wall clock time
//
double dtime()
{
    double tseconds = 0.0;
    struct timeval mytime;
    gettimeofday(&mytime,(struct timezone*)0);
    tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6);
    return( tseconds );
}

#define FLOPS_ARRAY_SIZE (1024*1024)
#define MAXFLOPS_ITERS 100000000
#define LOOP_COUNT 128

// number of float pt ops per calculation
#define FLOPSPERCALC 2
// define some arrays -
// make sure they are 64 byte aligned
// for best cache access
float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
//
// Main program - pedal to the metal...calculate using tons o'flops!
//
int main(int argc, char *argv[] )
{
    int i,j,k;
    int numthreads;
    double tstart, tstop, ttime;
    double gflops = 0.0;
    float a=1.1;

    //
    // initialize the compute arrays
    //
    //

#pragma omp parallel
#pragma omp master
    numthreads = omp_get_num_threads();

    printf("Initializing\r\n");
#pragma omp parallel for
    for(i=0; i<FLOPS_ARRAY_SIZE; i++)
    {
        fa[i] = (float)i + 0.1;
        fb[i] = (float)i + 0.2;
    }
    printf("Starting Compute on %d threads\r\n",numthreads);

    tstart = dtime();

    // scale the calculation across threads requested
    // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY

#pragma omp parallel for private(j,k)
    for (i=0; i<numthreads; i++)
    {
        // each thread will work its own array section
        // calc offset into the right section
        int offset = i*LOOP_COUNT;

        // loop many times to get lots of calculations
        for(j=0; j<MAXFLOPS_ITERS; j++)
        {
            // scale 1st array and add in the 2nd array
            for(k=0; k<LOOP_COUNT; k++)
            {
                fa[k+offset] = a * fa[k+offset] + fb[k+offset];
            }
        }
    }
    tstop = dtime();
    // # of gigaflops we just calculated
    gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT*
                        MAXFLOPS_ITERS*FLOPSPERCALC);

    // elapsed time
    ttime = tstop - tstart;
    //
    // Print the results
    //
    if ((ttime) > 0.0)
    {
        printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n",
               gflops, ttime, gflops/ttime);
    }
    return( 0 );
}
}}}

Let's begin by compiling the code for the host and running it on the host processor:
{{{
[tuhpc002@cypress01-089 Day2]$ icc -O3 -openmp helloflops3.c -o helloflops3_host
[tuhpc002@cypress01-089 Day2]$ ./helloflops3_host
Initializing
Starting Compute on 20 threads
GFlops =    512.000, Secs =      6.349, GFlops per sec =     80.645
[tuhpc002@cypress01-089 Day2]$
}}}
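As a sanity check, the reported GFlops figure is just the formula from the code evaluated for this run: gflops = 1.0e-9 × numthreads × LOOP_COUNT × MAXFLOPS_ITERS × FLOPSPERCALC = 1.0e-9 × 20 × 128 × 100,000,000 × 2 = 512. The 240-thread native run below reports 6144 GFlops by the same arithmetic; only the elapsed time is actually measured.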

Not bad. Now let's compile a native version by adding the architecture flag '''-mmic'''.
{{{
[tuhpc002@cypress01-089 Day2]$ icc -O3 -openmp -mmic helloflops3.c -o helloflops3_mic
[tuhpc002@cypress01-089 Day2]$ ./helloflops3_mic
-bash: ./helloflops3_mic: cannot execute binary file
[tuhpc002@cypress01-089 Day2]$
}}}

We could not execute our binary because we tried to run it on the host architecture. We could ssh onto one of the MICs and manually set our library environment variables to run the code, but a cleaner method (and the recommended method) is to use the Intel program '''micnativeloadex'''.

{{{
[tuhpc002@cypress01-089 Day2]$ micnativeloadex ./helloflops3_mic
Initializing
Starting Compute on 240 threads
GFlops =   6144.000, Secs =      2.630, GFlops per sec =   2335.925

[tuhpc002@cypress01-089 Day2]$
}}}
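For completeness, the manual route mentioned above would look roughly like the sketch below. This is only an illustration: the coprocessor hostname (mic0) and the location of the MIC build of libiomp5.so under the Intel compiler installation are assumptions that depend on the local setup.
{{{#!bash
# Illustrative sketch only -- hostname and library path are assumptions.
# Copy the native binary and the MIC OpenMP runtime library to the coprocessor,
# then run the binary over ssh with the library path and thread count set.
scp helloflops3_mic mic0:~/
scp /opt/intel/composerxe/lib/mic/libiomp5.so mic0:~/
ssh mic0 'export LD_LIBRARY_PATH=$HOME OMP_NUM_THREADS=240; ./helloflops3_mic'
}}}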

'''micnativeloadex''' has a number of options which can be seen using the '''-h''' or help flag.

{{{
[tulaneID@cypress01 ]$ micnativeloadex -h

Usage:
...
}}}

Notice that we can use the '''-d''' flag to select which MIC we want to run on and the '''-e''' flag to set environment variables on the MIC (separated by whitespace). For example, we can choose to run on MIC0 and set the number of threads and their affinity with

{{{
[tuhpc002@cypress01-089 Day2]$ micnativeloadex ./helloflops3_mic -e "OMP_NUM_THREADS=120 KMP_AFFINITY=scatter" -d 0
Initializing
Starting Compute on 120 threads
GFlops =   3072.000, Secs =      1.500, GFlops per sec =   2048.143

[tuhpc002@cypress01-089 Day2]$
}}}

We've been using idev as an instructional tool, but we won't normally be running our MIC native jobs interactively. Rather, we'll be submitting jobs that we want to run in native mode on a compute node. An example SLURM jobscript for our code would look like

{{{#!bash
#!/bin/bash
#SBATCH --qos=workshop          # Quality of Service
#SBATCH --partition=workshop    # Partition
#SBATCH --job-name=nativeTest   # Job Name
#SBATCH --time=00:10:00         # WallTime
#SBATCH --nodes=1               # Number of Nodes
#SBATCH --ntasks-per-node=1     # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=1       # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:1            # Number of Co-Processors

module load intel-psxe/2015-update1
module load intel/mic/sdk/3.3
module load intel/mic/runtime/3.3

micnativeloadex ./helloflops3_mic -e "OMP_NUM_THREADS=120 KMP_AFFINITY=scatter" -d 0
}}}
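Submitting and monitoring the job then uses the usual SLURM commands; the jobscript filename below is just a placeholder:
{{{#!bash
sbatch nativeTest.srun      # submit the jobscript above (filename is a placeholder)
squeue -u $USER             # check the state of your jobs in the queue
# When the job completes, the program output is written to the SLURM output file
# (by default slurm-<jobid>.out in the submission directory).
}}}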

=== Offloading ===