= Blas Level 3 = == C == {{{#!c /* Tulane HPC Workshop Blas Level 3 : dgemm test */ #include #include #include #include #ifdef __APPLE__ #include #else #ifdef MKL_ILP64 #include #include #else #include #endif #endif #ifdef _OPENMP #include #endif #define SIZE 2000 /* my simple dgemm */ void mydgemm(int m, int n, double al, double *a, double *b, double bt, double *c) { int i, j, jj; /* compute C := alpha * AB + beta * C */ for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { c[i + m * j] *= bt; for (jj = 0; jj < m; jj++) { c[i + m * j] += al * a[i + m * jj] * b[jj + m * j]; } } } } int main(void) { int i, j, n, m, inc; double *cmat; double *bmat; double *amat; double alpha, beta; #ifdef _OPENMP double tsomp, teomp; #endif clock_t ts, te; /* alloc vector and matrix */ #ifdef MKL_ILP64 cmat = (double *)mkl_malloc(SIZE * SIZE * sizeof(double), 64); bmat = (double *)mkl_malloc(SIZE * SIZE * sizeof(double), 64); amat = (double *)mkl_malloc(SIZE * SIZE * sizeof(double), 64); #else cmat = calloc(SIZE * SIZE, sizeof(double)); bmat = calloc(SIZE * SIZE, sizeof(double)); amat = calloc(SIZE * SIZE, sizeof(double)); #endif if ((cmat == NULL ) || (bmat == NULL ) || (amat == NULL )) { exit(-1); } /* setup matrix A B C*/ for (i = 0; i < SIZE; i++) { for (j = 0; j < SIZE; j++) { amat[i + SIZE * j] = i + 100.0 / (1.0 + fabs((double) i - (double) j)); bmat[i + SIZE * j] = i + 100.0 / (1.0 + fabs((double) i - (double) j)); cmat[i + SIZE * j] = 0.0; } } /* calculate bvec = alpha*amat*bmat + beta*cmat */ alpha = 1.0; beta = 0.0; /* call my dgemv */ ts = clock(); mydgemm(SIZE, SIZE, alpha, amat, bmat, beta, cmat); te = clock(); printf("|C|=%f\n", cblas_dnrm2(SIZE * SIZE, cmat, 1)); printf("Time : my dgemv = %f (sec)\n", 1.0 * (te - ts) / CLOCKS_PER_SEC); /* call blas dgemm */ #ifdef _OPENMP tsomp = omp_get_wtime(); #else ts = clock(); #endif cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, SIZE, SIZE, SIZE, 1.0, amat, SIZE, bmat,SIZE, 0.0, cmat, SIZE); printf("|C|=%f\n", cblas_dnrm2(SIZE * SIZE, cmat, 1)); #ifdef _OPENMP teomp = omp_get_wtime(); printf("Time : cblas dgemv = %f (sec) Threads=%d\n", (teomp - tsomp), omp_get_max_threads()); #else te = clock(); printf("Time : cblas dgemv = %f (sec)\n", 1.0 * (te - ts) / CLOCKS_PER_SEC); #endif #ifdef MKL_ILP64 mkl_free(cmat); mkl_free(bmat); mkl_free(amat); #else free(cmat); free(bmat); free(amat); #endif } }}} == C++ == {{{#!c++ // // Tulane HPC Workshop // // Blas Level 3 : dgemm test // #include #include #include #include #include #include #include #ifdef __APPLE__ #include #else #ifdef MKL_ILP64 #include #include #else extern "C" { #include } #endif #endif #ifdef _OPENMP #include #endif /* my simple dgemm */ void mydgemm(int m, int n, double al, double *a, double *b, double bt, double *c) { /* compute C := alpha * AB + beta * C */ for (int i = 0; i < n; i++) { for (int j = 0; j < m; j++) { c[i + m * j] *= bt; for (int jj = 0; jj < m; jj++) { c[i + m * j] += al * a[i + m * jj] * b[jj + m * j]; } } } } int main(void) { const unsigned int size = 2000; /* alloc vector and matrix */ #ifdef MKL_ILP64 double *cmat = (double *)mkl_malloc(size * size * sizeof(double), 64); double *bmat = (double *)mkl_malloc(size * size * sizeof(double), 64); double *amat = (double *)mkl_malloc(size * size * sizeof(double), 64); #else double *cmat = new double[size * size]; double *bmat = new double[size * size]; double *amat = new double[size * size]; #endif double alpha, beta; /* setup matrix A B C*/ for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { amat[i + size * j] = i + 100.0 / (1.0 + fabs((double) i - (double) j)); bmat[i + size * j] = i + 100.0 / (1.0 + fabs((double) i - (double) j)); cmat[i + size * j] = 0.0; } } #ifdef _OPENMP double tsomp, teomp; #endif clock_t ts, te; /* calculate bvec = alpha*amat*bmat + beta*cmat */ alpha = 1.0; beta = 0.0; /* call my dgemm */ ts = clock(); mydgemm(size, size, alpha, amat, bmat, beta, cmat); te = clock(); std::cout << "|C|=" << cblas_dnrm2(size * size, cmat, 1) << std::endl; std::cout << "Time : my dgemm = " << 1.0 * (te - ts) / CLOCKS_PER_SEC << "(sec)\n"; /* call blas dgemm */ #ifdef _OPENMP tsomp = omp_get_wtime(); #else ts = clock(); #endif cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, amat, size, bmat,size, 0.0, cmat, size); std::cout << "|C|=" << cblas_dnrm2(size * size, cmat, 1) << std::endl; #ifdef _OPENMP teomp = omp_get_wtime(); std::cout << "Time : cblas dgemm = " << (teomp - tsomp) << "(sec) Threads=" << omp_get_max_threads() << std::endl; #else te = clock(); std::cout << "Time : cblas dgemm = " << 1.0 * (te - ts) / CLOCKS_PER_SEC << "(sec)\n"; #endif #ifdef MKL_ILP64 mkl_free(cmat); mkl_free(bmat); mkl_free(amat); #else delete [] cmat; delete [] bmat; delete [] amat; #endif } }}} == Fortran == {{{#!fortran ! ! Tulane HPC Workshop ! ! Blas Level 3 : dgemm test ! ! my simple dgemm subroutine mydgemm(m, n, al, a, b, bt, c) implicit none integer :: i, j, jj, n, m double precision :: a(n * m),b(n * m),c(n * m) double precision :: al, bt ! compute C := alpha * AB + beta * C do i = 1, n do j = 1, m c(i + m * (j - 1)) = bt * c(i + m * (j - 1)) do jj = 1,m c(i + m * (j - 1)) = c(i + m * (j - 1)) + al * a(i + m * (jj - 1)) * b(jj + m * (j - 1)) end do end do end do end subroutine mydgemm ! ! Program blas3 use omp_lib !Provides OpenMP* specific APIs implicit none integer, parameter:: SIZE = 2000 integer :: i, j double precision, allocatable, dimension(:) :: amat,bmat,cmat double precision :: ts, te double precision :: dnrm2 ! alloc vector and matrix allocate(amat(SIZE*SIZE),bmat(SIZE*SIZE),cmat(SIZE*SIZE)) ! setup matrix A B C do i = 1,SIZE do j = 1, SIZE amat(i + SIZE * (j - 1)) = i - 1 + 100.0 / (1.0 + abs(i - j)) bmat(i + SIZE * (j - 1)) = i - 1 + 100.0 / (1.0 + abs(i - j)) cmat(i + SIZE * (j - 1)) = 0.d0 end do end do ! call my dgemv ts = omp_get_wtime() call mydgemm(SIZE, SIZE, 1.d0, amat, bmat, 0.d0, cmat) te = omp_get_wtime() print*, "|C|=", dnrm2(SIZE*SIZE, cmat, 1) print*, "Time : my dgemv = ", (te - ts),"(sec)" ! call blas dgemm ts = omp_get_wtime(); call dgemm('N', 'N', SIZE, SIZE, SIZE, 1.d0, amat, SIZE, bmat,SIZE, 0.d0, cmat, SIZE) te = omp_get_wtime(); print*, "|C|=", dnrm2(SIZE*SIZE, cmat, 1) print*, "Time : blas dgemm = ", (te - ts),"(sec)" deallocate(amat,bmat,cmat) end Program blas3 }}}