[[PageOutline]]

= Explicit Offloading with OpenMP =

Note that the "host" is the CPU and the "device" is the MIC/GPU.

This is a simple OpenMP code:

{{{
#include <iostream>
#include <omp.h>

int main( void )
{
    int totalProcs;
    totalProcs = omp_get_num_procs();
    std::cout << "Number of Threads = " << totalProcs << std::endl;
    return 0;
}
}}}

If run on a Cypress computing node, the "Number of Threads" will be 20.

Add a one-line #pragma directive that offloads a line of executable code to the device:

{{{
#include <iostream>
#include <omp.h>

int main( void )
{
    int totalProcs;
#pragma omp target device(0)
    totalProcs = omp_get_num_procs();
    std::cout << "Number of Threads = " << totalProcs << std::endl;
    return 0;
}
}}}

The code now returns "240". Note that the host pauses until the device code has finished.

This code offloads only one line,

{{{
totalProcs = omp_get_num_procs();
}}}

to the device. Use { } to offload a block of code.

What happens to 'totalProcs'? Primitive variables are automatically transferred to/from the device.

[[Image(https://docs.google.com/drawings/d/e/2PACX-1vRSu_BN8fhGC6vUHyrWPmwFgM60MjQdt8xOJt3gLruenwkjfMtleZR7m7n8Zy6uSy2F9DFUAp03gdxN/pub?w=533&h=285)]]

== Parallel Loop ==

An OpenMP region is defined by an omp directive. This for-loop runs on the device:

{{{
int main( void )
{
    double a[500000]; // static arrays are allocated on the stack; the literal size here is important
    int i;
#pragma omp target device(0)
#pragma omp parallel for
    for ( i=0; i<500000; i++ ) {
        a[i] = (double)i;
    }
    ...
}}}

What happens to "a"?

 1. Detect a device.
 2. Allocate 'a' in the device memory.
 3. The static array "a" is transferred to the device memory.
 4. Execute the device-side code.
 5. The values of "a" in the device memory are transferred back to the host memory.

== Controlling the Offload ==

Get the number of devices:

{{{
const int num_dev = omp_get_num_devices();
std::cout << "number of devices : " << num_dev << std::endl;
}}}

Control data transfer:

Transfer data from the device at the end of the offload section:

{{{
int main( void )
{
    double a[500000]; // static arrays are allocated on the stack; the literal size here is important
    int i;
#pragma omp target device(0) map(from:a)
#pragma omp parallel for
    for ( i=0; i<500000; i++ ) {
        a[i] = (double)i;
    }
}
}}}

Transfer data to the device at the beginning of the offload section:

{{{
#pragma omp target device(0) map(to:a)
}}}

If neither is specified, both transfers are performed.

=== Transfer dynamic arrays ===

You have to specify the range in the array:

{{{
#pragma omp target device(0) map(from:phi[0:num * num])
}}}

=== Keeping Data on Device Memory ===

This allocates space for the array a on device memory (the directive is taken from the Host-Device Parallelism example below):

{{{
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
}}}

The memory block for a on the device is freed when the offload section ends. To keep data on device memory, we have to allocate the array on device memory explicitly:

{{{
void *data;

#pragma omp target device(0) map(from:data)
  {
    double *vdata = new double[100];
#pragma omp parallel for
    for (int i = 0 ; i < 100 ; i++) vdata[i] = i;
    data = (void *)vdata;
  }

#pragma omp target device(0) map(to:data)
  {
    double *vdata = (double *)data;
    for (int i = 0 ; i < 100 ; i++){
      std::cout << vdata[i] << std::endl;
    }
  }
}}}

Use a void * pointer variable to store the address of the array on device memory.
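The array created with new is not released automatically while its address is kept in 'data'. As a minimal sketch (not part of the original example), a third offload section can map 'data' back to the device and free the allocation once it is no longer needed:

{{{
// Sketch only: free the device-side array that was kept alive through 'data'.
#pragma omp target device(0) map(to:data)
  {
    double *vdata = (double *)data;  // recover the typed pointer on the device
    delete [] vdata;                 // release the device-side allocation
  }
}}}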
=== Controlling data transfer ===

{{{
#pragma omp target data map(to:aArray[0:num], bArray[0:num]) map(alloc:cArray[0:num])
  { // aArray, bArray, cArray are allocated on Device memory, and the elements of aArray & bArray are transferred from CPU to Device

#pragma omp target                             // Use aArray, bArray, cArray on Device memory
#pragma omp parallel for                       // Runs on Device
    for (int i = 0 ; i < num ; i++){
      double sum = 0.0;
      for (int j = 0 ; j < num ; j++){
        sum += aArray[i] * bArray[j];
      }
      cArray[i] = sum;
    }

    // Compute ||C||. The host gets the result.
    double cNorm = 0.0;
#pragma omp target                             // Use aArray, bArray, cArray on Device memory
#pragma omp parallel for reduction(+:cNorm)    // Runs on Device
    for (int i = 0 ; i < num ; i++){
      cNorm += cArray[i] * cArray[i];
    }
    cNorm = std::sqrt(cNorm);                     // Runs on CPU
    std::cout << "||C||=" << cNorm << std::endl; // Runs on CPU

    // do the same on CPU
    cNorm = 0.0;
#pragma omp target update from(cArray[0:num])  // Transfer cArray from Device to CPU
#pragma omp parallel for reduction(+:cNorm)    // Runs on CPU
    for (int i = 0 ; i < num ; i++){
      cNorm += cArray[i] * cArray[i];
    }
    cNorm = std::sqrt(cNorm);
    std::cout << "||C||=" << cNorm << std::endl;
  } // aArray, bArray, cArray on Device memory are freed
}}}

=== Host-Device Parallelism ===

{{{
#include <iostream>
#include <omp.h>

int main(const int argc, const char** argv)
{
  omp_set_nested(1);

  int num_dev = omp_get_num_devices();
  std::cout << "number of devices " << num_dev << std::endl;

  int a[10] = { 0 };
#pragma omp parallel firstprivate(num_dev) num_threads(num_dev + 1)
#pragma omp single
  {
    for (int dev = 0; dev < num_dev + 1; dev++) {
#pragma omp task firstprivate(dev)
      {
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
        {
#pragma omp parallel
          {
#pragma omp master
            a[dev] = omp_get_num_threads();
          }
        }
      }
    }
  }

  for (int i = 0; i < num_dev + 1; i++) {
    std::cout << a[i] << std::endl;
  }
  return 0;
}
}}}

The 'if' clause in the pragma directive:

{{{
#pragma omp target if (dev != num_dev) device(dev) map(to:a) map(from:a[dev:dev+1])
}}}

In this case, when dev is equal to num_dev, the target directive is ignored, so the following scope of code runs on the host (CPU).
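Whether a target region actually ran on the host or on a device can be checked with omp_is_initial_device(). The sketch below is an assumption, not part of the original page (it requires a compiler with OpenMP 4.0 offload support); it loops over dev exactly as in the example above and reports where each region executed:

{{{
#include <iostream>
#include <omp.h>

int main( void )
{
  const int num_dev = omp_get_num_devices();

  for (int dev = 0; dev < num_dev + 1; dev++) {
    int on_host = -1;
    // When dev == num_dev the if clause is false, so the region runs on the host.
#pragma omp target if (dev != num_dev) device(dev) map(from:on_host)
    {
      on_host = omp_is_initial_device();   // nonzero on the host, 0 on a device
    }
    std::cout << "dev " << dev << " : "
              << (on_host ? "host" : "device") << std::endl;
  }
  return 0;
}
}}}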