Can I get faster MatrixMultiplication using CUDA than with Matlab internal GPU implementation??

5 views (last 30 days)
I have to Multiply a Matrix A (size about 400 x400 ) with M Matrices B, while m is also about 500.
N=400;M=500;
A=rand(N,N);
B=rand(N,N,M);
C=zeros(N,N,M);
Using Matlab GPU implementation:
gpu_A=gpuArray(A);
gpu_B=gpuArray(B);
zCell = arrayfun(@(iz) gpu_A * gpu_B(:,:,iz),1:size(gpu_B,3),'uniformOutput',false);
size(zCell)
C3 = gather(cat(3, zCell{:}));
My hope was that an implementation using CUDA code and a mex file would be faster than this, similar to the Mandelbrot example in the MatLab help.
My Cuda Code uses the CUBLAS function: cublasDgemmBatched
To save memory the matrix is stored only one time on the GPU, but I use 500 pointers showing to the same matrix A..
My complete CUDA code you find below. Any suggestion how to get it faster? Now MatLab is about 10% faster than my CUDA code..
#include "mex.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
void initializeCUDA(int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
if (error != cudaSuccess)
{
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
////////////////////////////////////////////////////////////////////////////////
//Make Matrix Multiplication using CUBLAS
////////////////////////////////////////////////////////////////////////////////
void matrixMultiply3D( int devID,double *h_A, double *h_B, double *h_C ,mwSize ncolsA , mwSize nrowsA,mwSize ncolsB,mwSize nrowsB,mwSize nB)
{
initializeCUDA( devID);
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
unsigned int size_A = ncolsA * nrowsA;
unsigned int mem_size_A = sizeof(double) * size_A;
unsigned int size_B = ncolsB * nrowsB * nB;
unsigned int mem_size_B = sizeof(double) * size_B;
const unsigned int nrowsC = nrowsA;
const unsigned int ncolsC = ncolsB;
// allocate device memory
double *d_A, *d_B, *d_C;
const double **dd_A, **dd_B;
double **dd_C;
double **point_temp =new double*[nB];
unsigned int size_C = nrowsA * ncolsB * nB;
unsigned int mem_size_C = sizeof(double) * size_C;
unsigned int mem_size_point = sizeof(double*) * nB;
error = cudaMalloc((void **) &d_A, mem_size_A);
if (error != cudaSuccess)
{
printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **) &dd_A, mem_size_point);
if (error != cudaSuccess)
{
printf("cudaMalloc dd_A returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
for ( int i=0; i<nB; i++) point_temp[i] = d_A;
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// copy host memory to device
error = cudaMemcpy(dd_A, point_temp, mem_size_point, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy dd_A d_A returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **) &d_B, mem_size_B);
if (error != cudaSuccess)
{
printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **) &dd_B, nB*sizeof(double*));
if (error != cudaSuccess)
{
printf("cudaMalloc dd_B returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
for ( int i=0; i<nB; i++) point_temp[i] = d_B + i*ncolsB * nrowsB;
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// copy host memory to device
error = cudaMemcpy(dd_B, point_temp, mem_size_point, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy dd_B d_B returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **) &d_C, mem_size_C);
if (error != cudaSuccess)
{
printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **) &dd_C, nB*sizeof(double*));
if (error != cudaSuccess)
{
printf("cudaMalloc dd_C returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
for ( int i=0; i<nB; i++) point_temp[i] = d_C + i*ncolsC * nrowsC;
// copy host memory to device
error = cudaMemcpy(dd_C, point_temp, mem_size_point, cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy dd_C d_C returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(ncolsB / threads.x, nrowsA / threads.y);
// execute the kernel
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const double alpha = 1.0f;
const double beta = 0.0f;
ret = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, nrowsA, ncolsB, ncolsA, &alpha, dd_A, nrowsA, dd_B, nrowsB, &beta, dd_C, nrowsC, nB);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
if (error != cudaSuccess)
{
printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFree(dd_A);
cudaFree(dd_B);
cudaFree(dd_C);
delete [] point_temp;
cudaDeviceReset();
}
/* The gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[]){
/*nlhs number of outputs
plhs pointer to outputs
nrhs number of inputs
prhs pointer to inputs */
double *inMatrixA; /* 1xN input matrix */
double *inMatrixB; /* 1xN input matrix */
size_t ncolsA; /* size of matrix */
size_t nrowsA; /* size of matrix */
size_t ncolsB; /* size of matrix */
size_t nrowsB; /* size of matrix */
size_t nB; /* number of matrices B */
const mwSize *dims;
double *outMatrix; /* output matrix */
/* check for proper number of arguments */
if(nrhs!=2) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nrhs","Two Matrix inputs required.");
}
if(nlhs!=1) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nlhs","One output required.");
}
/* get the value of the scalar input */
inMatrixA = (double*) mxGetData(prhs[0]);
/* create a pointer to the real data in the input matrix */
inMatrixB = (double*) mxGetData(prhs[1]);
/* get dimensions of the input matrix */
ncolsA = mxGetN(prhs[0]);
nrowsA = mxGetM(prhs[0]);
dims = mxGetDimensions(prhs[1]);
ncolsB = dims[1];
nrowsB = dims[0];
nB = dims[2];
/* create the output matrix */
plhs[0] = mxCreateNumericArray(3 , dims, mxDOUBLE_CLASS, mxREAL);
/* get a pointer to the real data in the output matrix */
outMatrix = (double*) mxGetData(plhs[0]);
/* call the computational routine */
matrixMultiply3D(0, inMatrixA,inMatrixB,outMatrix,(mwSize)ncolsA , (mwSize)nrowsA,(mwSize)ncolsB,(mwSize)nrowsB,(mwSize) nB);//, sMatrixSize &matrix_size)
}
Thanks in advance
Robert

Answers (4)

James Tursa
James Tursa on 1 Feb 2013
I'm curious how the times compare to this:
C = mtimesx(A,B);
MTIMESX can be found here:
href = ""<http://www.mathworks.com/matlabcentral/fileexchange/25977-mtimesx-fast-matrix-multiply-with-multi-dimensional-support</a>>

Francisco Ramírez
Francisco Ramírez on 18 Feb 2013
In which format do you saved the file (.cu, .c, .cpp)?
How was the compilation instruction (mex ...)?
Regards,
Francisco

Francisco Ramírez
Francisco Ramírez on 22 Feb 2013
Hi Robert,
I was looking into the CUBLAS documentation for more information about the function "cublasDgemmBatched" and I found the following:
cublasDgemmBatched. This function is intended to be used for matrices of small sizes where the launch overhead is a significant factor. For small sizes, typically smaller than 100x100, this function improves significantly performance compared to making calls to its corresponding cublas<t>gemm routine.

Zainub
Zainub on 9 Feb 2015
hi... I am using the same thing (cuda with matlab)... but I face a problem while adding path of cublas library... Error using loadlibrary (line 422) There was an error loading the library "libcublas" The specified module could not be found.
Caused by: Error using loaddefinedlibrary The specified module could not be found.

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!