= Blas Level 1 = To use MKL Blas on Cypress, you have to load the module, {{{#!bash $ module load intel-psxe }}} == C == {{{#!c /* Tulane HPC Workshop Blas Level 1 : norm2 test */ #include #include #include #include #ifdef __APPLE__ #include #else #ifdef MKL_ILP64 #include #include #else #include #endif #endif #ifdef _OPENMP #include #endif #define SIZE 50000000 /* my norm2 */ double mynorm2(int n, double *x) { int i; double nrm = 0.0; for (i = 0; i < n; i++) { nrm += x[i] * x[i]; } return sqrt(nrm); } int main(int argc, char **argv) { int i, n, m; double *xvec; double nrm; #ifdef _OPENMP double tsomp, teomp; #endif clock_t ts, te; /* alloc vector and setting */ #ifdef MKL_ILP64 xvec = (double *)mkl_malloc(SIZE * sizeof(double),64); #else xvec = (double *)calloc(SIZE, sizeof(double)); #endif if (xvec == NULL) exit(-1); for (i = 0; i < SIZE; i++) xvec[i] = (double) (i % 100) / 50.0; /* call my norm 2*/ ts = clock(); nrm = mynorm2(SIZE, xvec); te = clock() ; printf(" my norm |x|=%e time=%f (msec)\n", nrm, 1000.0 * (te - ts) / CLOCKS_PER_SEC); /* call blas norm2 */ #ifdef _OPENMP tsomp = omp_get_wtime(); #else ts = clock(); #endif nrm = cblas_dnrm2(SIZE, xvec, 1); #ifdef _OPENMP teomp = omp_get_wtime(); printf("blas norm |x|=%e time=%f (msec) threads=%d\n",nrm,1000.0 * (teomp - tsomp), omp_get_max_threads()); #else te = clock(); printf("blas norm |x|=%e time=%f (msec)\n",nrm,1000.0 * (te - ts) / CLOCKS_PER_SEC); #endif /* cleanup */ #ifdef MKL_ILP64 mkl_free(xvec); #else free(xvec); #endif } }}} Compile it as a sequential code with dynamic link libraries, {{{#!bash icc -DMKL_ILP64 -mkl=sequential blas1.c -lpthread -lm }}} Compile it as a parallel (OpenMP) code with dynamic link libraries, {{{#!bash icc -DMKL_ILP64 -openmp -mkl=parallel blas1.c -lpthread -lm }}} == C++ == {{{#!c++ // // Tulane HPC Workshop // // Blas Level 1 : norm2 test // #include #include #include #include #include #include #include #ifdef __APPLE__ #include #else #ifdef MKL_ILP64 #include #include #else extern "C" { #include } #endif #endif #ifdef _OPENMP #include #endif /* my norm2 */ double mynorm2(unsigned int n,double *x){ double nrm = 0.0; for (unsigned int i = 0 ; i < n ; i++){ nrm += x[i] * x[i]; } return std::sqrt(nrm); } int main(int argc, char **argv){ const unsigned int size= 50000000; double nrm; std::cout << std::setprecision(16); /* alloc vector and setting */ #ifdef MKL_ILP64 double *xvec = (double *)mkl_malloc(size * sizeof(double), 64); #else double *xvec = new double[size]; #endif for (int i = 0 ; i < size ; i++) xvec[i] = (i % 100) / 50.0; #ifdef _OPENMP double tsomp, teomp; #endif clock_t ts, te; /* call my norm 2*/ ts = std::clock(); nrm = mynorm2(size,xvec); te = std::clock(); std::cout << "MY NORM=" << nrm << std::endl; std::cout << "Time cost for my norm = " << 1000.0 * (te - ts) / CLOCKS_PER_SEC << "(msec)\n"; /* call blas norm2 */ #ifdef _OPENMP tsomp = omp_get_wtime(); #else ts = std::clock(); #endif nrm = cblas_dnrm2(size,xvec,1); std::cout << "BLAS NORM=" << nrm << std::endl; #ifdef _OPENMP teomp = omp_get_wtime(); std::cout << "Time cost for cblas norm = " << 1000.0 * (teomp - tsomp) << "(msec) Threads=" << omp_get_max_threads() << std::endl; #else te = std::clock(); std::cout << "Time cost for cblas norm = " << 1000.0 * (te - ts) / CLOCKS_PER_SEC << "(msec)\n"; #endif #ifdef MKL_ILP64 mkl_free(xvec); #else delete [] xvec; #endif } }}} Compile it as a sequential code with dynamic link libraries, {{{#!bash icpc -DMKL_ILP64 -mkl=sequential blas1.c -lpthread -lm }}} Compile it as a parallel (OpenMP) code with dynamic link libraries, {{{#!bash icpc -DMKL_ILP64 -openmp -mkl=parallel blas1.c -lpthread -lm }}} == Fortran == {{{#!fortran ! ! ! Blas Level 1 : norm2 test ! !------------------------------------- ! my norm2 double precision function mynorm2(n,x) implicit none integer :: i, n double precision :: x(n) double precision :: nrm nrm = 0.d0 do i = 1,n nrm = nrm + x(i) * x(i) end do mynorm2 = sqrt(nrm) end function mynorm2 ! ! program Blas1 use omp_lib !Provides OpenMP* specific APIs implicit none integer, parameter:: SIZE = 50000000 integer :: i, n, m double precision, allocatable, dimension(:) :: xvec double precision :: nrm, ts, te double precision :: dnrm2, mynorm2 ! alloc vector and setting allocate(xvec(SIZE)) do i = 1,SIZE xvec(i) = dble (mod(i,100)) / 50.0 end do ! call my norm ts = omp_get_wtime() nrm = mynorm2(SIZE, xvec) te = omp_get_wtime(); print*, "my norm |x|=",nrm, " time=", 1000.0 * (te - ts),"(msec)" ! call blas norm2 ts = omp_get_wtime() nrm = dnrm2(SIZE, xvec, 1) te = omp_get_wtime() print*, "blas norm |x|=",nrm, " time=", 1000.0 * (te - ts),"(msec)" deallocate(xvec) end program Blas1 }}} Compile it as a sequential code with dynamic link libraries, {{{#!bash ifort -i8 -mkl=sequential blas1.c -lpthread -lm }}} Compile it as a parallel (OpenMP) code with dynamic link libraries, {{{#!bash ifort -i8 -openmp -mkl=parallel blas1.c -lpthread -lm }}}