Changes between Version 16 and Version 17 of cypress/XeonPhi


Ignore:
Timestamp:
Aug 21, 2015 10:38:38 AM (6 years ago)
Author:
cmaggio
Comment:

Legend:

Unmodified
Added
Removed
Modified
  • cypress/XeonPhi

    v16 v17  
    488488Offloading allows us to designate specific sections of our code that we wish to have executed on the MIC. Unlike the first two methods, this requires (minimal) alteration of the source code. Let's take our helloflops3.c example and modify it so that it offloads the area of heavy computation
    489489
    490 
    491 
    492 
    493 
     490{{{#!c
     491//
     492//
     493// helloflops3offl
     494//
     495// A simple example that gets lots of Flops (Floating Point Operations) on
     496// Intel(r) Xeon Phi(tm) co-processors using offload plus  openmp to scale
     497//
     498
     499#include <stdio.h>
     500#include <stdlib.h>
     501#include <string.h>
     502#include <omp.h>
     503#include <sys/time.h>
     504
     505// dtime
     506//
     507// returns the current wall clock time
     508//
     509double dtime()
     510{
     511    double tseconds = 0.0;
     512    struct timeval mytime;
     513    gettimeofday(&mytime,(struct timezone*)0);
     514    tseconds = (double)(mytime.tv_sec + mytime.tv_usec*1.0e-6);
     515    return( tseconds );
     516}
     517
     518#define FLOPS_ARRAY_SIZE (1024*512)
     519#define MAXFLOPS_ITERS 100000000
     520#define LOOP_COUNT 128
     521
     522// number of float pt ops per calculation
     523#define FLOPSPERCALC 2     
     524// define some arrays -
     525// make sure they are 64 byte aligned
     526// for best cache access
     527__declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
     528__declspec ( target (mic)) float fb[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
     529//
     530// Main program - pedal to the metal...calculate using tons o'flops!
     531//
     532int main(int argc, char *argv[] )
     533{
     534    int i,j,k;
     535    int numthreads;
     536    double tstart, tstop, ttime;
     537    double gflops = 0.0;
     538    float a=1.1;
     539
     540    //
     541    // initialize the compute arrays
     542    //
     543    //
     544
     545#pragma offload target (mic)
     546#pragma omp parallel
     547#pragma omp master
     548    numthreads = omp_get_num_threads();
     549
     550    printf("Initializing\r\n");
     551
     552#pragma omp parallel for
     553    for(i=0; i<FLOPS_ARRAY_SIZE; i++)
     554    {
     555        fa[i] = (float)i + 0.1;
     556        fb[i] = (float)i + 0.2;
     557    }   
     558    printf("Starting Compute on %d threads\r\n",numthreads);
     559
     560    tstart = dtime();
     561       
     562    // scale the calculation across threads requested
     563    // need to set environment variables OMP_NUM_THREADS and KMP_AFFINITY
     564
     565#pragma offload target (mic)
     566#pragma omp parallel for private(j,k)
     567    for (i=0; i<numthreads; i++)
     568    {
      569        // each thread will work its own array section
     570        // calc offset into the right section
     571        int offset = i*LOOP_COUNT;
     572
     573        // loop many times to get lots of calculations
     574        for(j=0; j<MAXFLOPS_ITERS; j++) 
     575        {
     576            // scale 1st array and add in the 2nd array
     577            #pragma vector aligned
     578            for(k=0; k<LOOP_COUNT; k++) 
     579            {
     580                fa[k+offset] = a * fa[k+offset] + fb[k+offset];
     581            }
     582        }
     583    }
     584    tstop = dtime();
     585    // # of gigaflops we just calculated 
     586    gflops = (double)( 1.0e-9*numthreads*LOOP_COUNT*
     587                        MAXFLOPS_ITERS*FLOPSPERCALC);   
     588
      589    //elapsed time
     590    ttime = tstop - tstart;
     591    //
     592    // Print the results
     593    //
     594    if ((ttime) > 0.0)
     595    {
     596        printf("GFlops = %10.3lf, Secs = %10.3lf, GFlops per sec = %10.3lf\r\n",                   gflops, ttime, gflops/ttime);
     597    }
     598    return( 0 );
     599}
     600}}}
     601
     602Changes to take note of:
     603* The addition of the directive before the section of code that we wish to run on the MIC
     604 {{{#!c
     605#pragma offload target (mic)
     606}}}
     607* The alteration of our array declarations indicating they would be part of offload use, eg
     608{{{
     609__declspec ( target (mic)) float fa[FLOPS_ARRAY_SIZE] __attribute__((aligned(64)));
     610}}}
     611
     612Let's take a look at a submission script for our offloading example
     613
     614{{{#!bash
     615#!/bin/bash
     616#SBATCH --qos=workshop          # Quality of Service
     617#SBATCH --partition=workshop    #Partition
     618#SBATCH --job-name=offloadTest   # Job Name
     619#SBATCH --time=00:10:00         # WallTime
     620#SBATCH --nodes=1               # Number of Nodes
      621#SBATCH --ntasks-per-node=1     # Number of tasks (MPI processes)
      622#SBATCH --cpus-per-task=20      # Number of processors per task (OpenMP threads)
     623#SBATCH --gres=mic:1            # Number of Co-Processors
     624
     625module load intel-psxe/2015-update1
     626module load intel/mic/sdk/3.3
     627module load intel/mic/runtime/3.3
     628
     629export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
     630export MIC_ENV_PREFIX=MIC
     631export MIC_OMP_NUM_THREADS=120
     632export MIC_KMP_AFFINITY=scatter
     633
     634./helloflops3offload
     635}}}
    494636
    495637== Programming Considerations ==