[[PageOutline]]
= Explicit Offloading with OpenMP =
Note that the "host" is the CPU, and the "device" is the MIC/GPU.

This is a simple OpenMP code:
{{{
#include <iostream>
#include <omp.h>

int main( void ) {
        int totalProcs;
        totalProcs = omp_get_num_procs();
        std::cout << "Number of Threads = " << totalProcs << std::endl;
        return 0;
}
}}}
If run on a Cypress compute node, the "Number of Threads" will be 20.

Add a one-line #pragma directive that offloads a single line of executable code to the device.
     20{{{
     21#include <iostream>
     22#include <omp.h>
     23
     24int main( void ) {
     25        int totalProcs;
     26#pragma omp target device(0)
     27        totalProcs = omp_get_num_procs();
     28        std::cout << "Number of Threads = " << totalProcs << std::endl;
     29        return 0;
     30}
     31}}}
The code now returns 240.
Note that the host pauses until the device code is finished.
This code offloads only the single line
{{{
         totalProcs = omp_get_num_procs();
}}}
to the device. Use { } to offload a block of code, as in the sketch below.
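A block enclosed in { } right after the directive is offloaded as a whole; a minimal sketch, relying on the automatic transfer of primitive variables noted below:
{{{
#include <iostream>
#include <omp.h>

int main( void ) {
        int totalProcs;
        int maxThreads;
#pragma omp target device(0)
        {       // the whole block runs on the device; the host waits for it to finish
                totalProcs = omp_get_num_procs();
                maxThreads = omp_get_max_threads();
        }
        // primitive variables are transferred back automatically
        std::cout << "Procs = " << totalProcs
                  << ", Max Threads = " << maxThreads << std::endl;
        return 0;
}
}}}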

What happens to 'totalProcs'?

Primitive variables are automatically transferred to/from the device.

[[Image(https://docs.google.com/drawings/d/e/2PACX-1vRSu_BN8fhGC6vUHyrWPmwFgM60MjQdt8xOJt3gLruenwkjfMtleZR7m7n8Zy6uSy2F9DFUAp03gdxN/pub?w=533&h=285)]]

== Parallel Loop ==
An OpenMP region is defined by an omp directive. The following for-loop runs on the device.
{{{
int main( void ) {
        double a[500000];
        // static arrays are allocated on the stack; literal here is important
        int i;
#pragma omp target device(0)
#pragma omp parallel for
        for ( i=0; i<500000; i++ ) {
                a[i] = (double)i;
        }
        ...
}}}
What happens to "a"?
1. Detect a device.
2. Allocate "a" in the device memory.
3. The static array "a" is transferred to the device memory.
4. Execute the device-side code.
5. The values of "a" in the device memory are transferred back to the host memory (equivalent to the explicit mapping sketched below).
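The implicit transfer described in steps 2-5 corresponds to an explicit map(tofrom:a) clause; a minimal sketch with the same array:
{{{
#include <iostream>
#include <omp.h>

int main( void ) {
        double a[500000];
        int i;
// map(tofrom:a) spells out the implicit copy-in/copy-out of the static array
#pragma omp target device(0) map(tofrom:a)
#pragma omp parallel for
        for ( i=0; i<500000; i++ ) {
                a[i] = (double)i;
        }
        // back on the host: values written on the device are visible here
        std::cout << "a[499999] = " << a[499999] << std::endl;
        return 0;
}
}}}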

== Controlling the Offload ==
Get the number of devices:
{{{
        const int num_dev = omp_get_num_devices();
        std::cout << "number of devices : " << num_dev << std::endl;
}}}
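To check at run time whether a region really executes on a device, omp_is_initial_device() can be queried inside the target region; a small self-contained sketch:
{{{
#include <iostream>
#include <omp.h>

int main( void ) {
        const int num_dev = omp_get_num_devices();
        std::cout << "number of devices : " << num_dev << std::endl;

        int on_host = 1;
#pragma omp target device(0) map(from:on_host)
        on_host = omp_is_initial_device();   // 0 when the region runs on the device

        std::cout << (on_host ? "ran on the host" : "ran on the device") << std::endl;
        return 0;
}
}}}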
Control data transfer

Transfer data from the device at the end of the offload section:
{{{
int main( void ) {
        double a[500000];
        // static arrays are allocated on the stack; literal here is important
        int i;
#pragma omp target device(0) map(from:a)
#pragma omp parallel for
        for ( i=0; i<500000; i++ ) {
                a[i] = (double)i;
        }
}
}}}

Transfer data to the device at the beginning of the offload section:
{{{
#pragma omp target device(0) map(to:a)
}}}
If no map clause is specified, the data are transferred in both directions (equivalent to map(tofrom:a)).
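As a sketch of combining the two clauses (array names and sizes here are only illustrative), the input array b is only copied to the device and the output array a is only copied back:
{{{
int main( void ) {
        double a[100000];
        double b[100000];
        int i;
        for ( i=0; i<100000; i++ ) b[i] = (double)i;   // initialized on the host

// b is only copied to the device, a is only copied back to the host
#pragma omp target device(0) map(to:b) map(from:a)
#pragma omp parallel for
        for ( i=0; i<100000; i++ ) {
                a[i] = 2.0 * b[i];
        }
}
}}}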

=== Transfer dynamic arrays ===

For dynamically allocated arrays, you have to specify the range of the array in the map clause:
{{{
#pragma omp target device(0)    map(from:phi[0:num * num])
}}}
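A minimal, self-contained sketch of the same idea (the names phi and num, and the size 100, are illustrative and match the clause above):
{{{
#include <iostream>
#include <omp.h>

int main( void ) {
        const int num = 100;
        double *phi = new double[num * num];   // dynamically allocated: size unknown to the compiler

// the section [0:num * num] tells the runtime how many elements to transfer back
#pragma omp target device(0) map(from:phi[0:num * num])
#pragma omp parallel for
        for ( int i = 0; i < num * num; i++ ) {
                phi[i] = (double)i;
        }

        std::cout << "phi[num*num-1] = " << phi[num * num - 1] << std::endl;
        delete[] phi;
        return 0;
}
}}}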

=== Keeping Data on Device Memory ===

This will allocate space for the array a in device memory.
{{{
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
}}}
The memory block for a on the device is freed when the offload section ends.

To keep data in device memory, we have to allocate the array on the device explicitly.
{{{
        void *data;
#pragma omp target device(0) map(from:data)
        {
                double *vdata = new double[100];   // allocated in device memory
#pragma omp parallel for
                for (int i = 0 ; i < 100 ; i++) vdata[i]= i;
                data = (void *)vdata;              // keep the device address on the host
        }

#pragma omp target device(0) map(to:data)
        {
                double *vdata = (double *)data;    // reuse the device allocation
                for (int i = 0 ; i < 100 ; i++){
                        std::cout << vdata[i] << std::endl;
                }
        }
}}}
Use a void * pointer variable to store the address of the array in device memory.
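The device-side array allocated with new above is never released; continuing the same pattern, a third offload region could free it through the stored pointer (a sketch, reusing the data pointer from the listing above):
{{{
#pragma omp target device(0) map(to:data)
        {
                delete[] (double *)data;   // release the device-side allocation
        }
}}}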

=== Controlling data transfer ===
{{{
#pragma omp target data map(to:aArray[0:num], bArray[0:num]) map(alloc:cArray[0:num])
  {  // aArray, bArray, cArray are allocated on Device memory; the elements of aArray & bArray are transferred from CPU to Device
#pragma omp target   // Use aArray, bArray, cArray on Device memory
#pragma omp parallel for  // Runs on Device
    for (int i = 0 ; i < num ; i++){
      double sum = 0.0;
      for (int j = 0 ; j < num ; j++){
        sum += aArray[i] * bArray[j];
      }
      cArray[i] = sum;
    }

    // Compute ||C||. Host gets the result.
    double cNorm = 0.0;
#pragma omp target  // Use aArray, bArray, cArray on Device memory
#pragma omp parallel for reduction(+:cNorm) // Runs on Device
    for (int i = 0 ; i < num ; i++){
      cNorm += cArray[i] * cArray[i];
    }
    cNorm = std::sqrt(cNorm); // Runs on CPU
    std::cout << "||C||=" << cNorm << std::endl; // Runs on CPU

    // Do the same on CPU
    cNorm = 0.0;
#pragma omp target update from(cArray[0:num]) // Transfer cArray from Device to CPU
#pragma omp parallel for reduction(+:cNorm)  // Runs on CPU
    for (int i = 0 ; i < num ; i++){
      cNorm += cArray[i] * cArray[i];
    }
    cNorm = std::sqrt(cNorm);
    std::cout << "||C||=" << cNorm << std::endl;
  } // aArray, bArray, cArray on Device memory are freed
}}}
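The excerpt above assumes that num, aArray, bArray and cArray already exist on the host; a minimal host-side setup it could be dropped into might look like this (size and values are illustrative):
{{{
#include <iostream>
#include <cmath>
#include <omp.h>

int main( void ) {
        const int num = 1000;
        double *aArray = new double[num];
        double *bArray = new double[num];
        double *cArray = new double[num];   // written only on the device, hence map(alloc:)
        for (int i = 0 ; i < num ; i++) {
                aArray[i] = 1.0;
                bArray[i] = 2.0;
        }

        // ... target data region from the listing above goes here ...

        delete[] aArray;
        delete[] bArray;
        delete[] cArray;
        return 0;
}
}}}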

=== Host-Device Parallelism ===
{{{
#include <iostream>
#include <cmath>
#include <omp.h>

int main(const int argc, const char** argv) {
        omp_set_nested(1);                 // allow nested parallelism: host threads each drive an offload
        int num_dev = omp_get_num_devices();
        std::cout << "number of devices " << num_dev << std::endl;
        int a[10] = { 0 };

// one host thread per device, plus one for the host itself
#pragma omp parallel firstprivate(num_dev) num_threads(num_dev + 1)
#pragma omp single
        {
                for (int dev = 0; dev < num_dev + 1; dev++) {
// each task drives one device (or the host when dev == num_dev)
#pragma omp task firstprivate(dev)
                        {
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
                                {
#pragma omp parallel
                                        {
#pragma omp master
                                                a[dev] = omp_get_num_threads();   // record the team size where this ran
                                        }
                                }
                        }
                }
        }
        for (int i = 0; i < num_dev + 1; i++) {
                std::cout << a[i] << std::endl;
        }
        return 0;
}
}}}
'if' in pragma directive
{{{
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
}}}
In this case, when dev is equal to num_dev, the if clause evaluates to false and the target directive is ignored, so the following block of code runs on the host (CPU).
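As a sketch of how this could be checked at run time, omp_is_initial_device() can be queried inside the conditionally offloaded block (the surrounding variables are those of the example above):
{{{
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
        {
                // omp_is_initial_device() returns non-zero when this block runs on the host,
                // i.e. when dev == num_dev and the if clause disables the offload
                a[dev] = omp_is_initial_device() ? -1 : omp_get_num_procs();
        }
}}}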