CUDA kernel does not work when data is transferred to the GPU. Any problem with my gateway MEX code?

Hi,
I'm trying to transfer data to a CUDA kernel, do some processing, and get the output back to MATLAB. I have already evaluated the kernel in Visual Studio and it works as it should. However, when I build a MEX file from the code, it does not give me the output I expect. Here is my MEX gateway code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
using namespace std;
#include <mex.h>
__global__ void kernel_Reconstruction2(int* Dev_RfData, int* ReconstructedImage_GPU, int transmit, int NStart_Transmit) {
int TID = threadIdx.y * blockDim.x + threadIdx.x;
int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;
int RowOFFset = blockDim.x * blockDim.y * gridDim.x * blockIdx.y;
int GID = RowOFFset + BlockOFFset + TID;
}
void mexFunction(int nlhs, mxArray* plhs[],
int nrhs, const mxArray* prhs[]) {
int* RfData; // RF data; a pinned memory was dedicated to this
int* ReconstructedImage_GPU;
RfData = (int*)mxGetPr(prhs[0]);
plhs[0] = mxCreateNumericMatrix(1, 64 * 64, mxINT32_CLASS, mxREAL);
ReconstructedImage_GPU = (int*)mxGetData(plhs[0]);
int ArrayByteSize_RfData = sizeof(int) * (96 * 96 * 4096);
int BYTES_PER_STREAM = ArrayByteSize_RfData / 96;
//Memory allocation: RfData ; we send the RF data to the device with streaming
int* Device_RfData; // device pointer to the RF data.
(cudaMalloc((int**)&Device_RfData, ArrayByteSize_RfData));
int* Device_ReconstructedImage_GPU; // device pointer to the reconstructed image
int ArrayByteSize_ReconstructedImage_GPU = sizeof(int) * (96*96);
(cudaMalloc((int**)&Device_ReconstructedImage_GPU, ArrayByteSize_ReconstructedImage_GPU));
printf("The CUDA reconstruction started... \n");
dim3 block(1024, 1);
dim3 grid(64 * 64, 96);//SystemSetup.NumberOfTransmitter
cudaStream_t* streams = new cudaStream_t[96]; //SystemSetup.NumberOfTransmitter
int NStart_Transmit{};
for (int transmit = 0; transmit < 96; transmit++) {
cudaStreamCreate(&streams[transmit]);
NStart_Transmit = transmit * (96 * 4096);
cudaMemcpyAsync(&Device_RfData[NStart_Transmit], &RfData[NStart_Transmit], BYTES_PER_STREAM, cudaMemcpyHostToDevice, streams[transmit]);
kernel_Reconstruction2 << <grid, block, 0, streams[transmit] >> > (&Device_RfData[NStart_Transmit], Device_ReconstructedImage_GPU, transmit, NStart_Transmit);
(cudaPeekAtLastError());
}
for (int transmit = 0; transmit < 96; transmit++) { cudaStreamDestroy(streams[transmit]); } // destroy the streams
delete[] streams;
cudaDeviceSynchronize();
(cudaMemcpy(ReconstructedImage_GPU, Device_ReconstructedImage_GPU, ArrayByteSize_ReconstructedImage_GPU, cudaMemcpyDeviceToHost));
(cudaFree(Device_RfData));
(cudaFree(Device_ReconstructedImage_GPU));
}
I have also checked whether "RfData" contains the actual values on the host; all is good there as well. So I think there is something wrong with the "Device_RfData" values. Is there anything I'm missing?
Regards,
Moein.
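P.S. In the Visual Studio build I wrap the CUDA calls in a gpuErrchk-style check; the bare parentheses in the gateway above are where those checks would go. A rough sketch adapted for a MEX file (the macro name and error identifier are only illustrative):
// Checks the status of a CUDA call and reports it through the MEX API.
// Note: mexErrMsgIdAndTxt aborts the current MEX call.
#define CHECK_CUDA(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        mexErrMsgIdAndTxt("Reconstruction:cuda", "CUDA error '%s' at %s:%d", \
            cudaGetErrorString(err_), __FILE__, __LINE__); \
    } \
} while (0)

// Example usage inside mexFunction:
// CHECK_CUDA(cudaMalloc((void**)&Device_RfData, ArrayByteSize_RfData));
// CHECK_CUDA(cudaPeekAtLastError());
// CHECK_CUDA(cudaDeviceSynchronize());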
  2 Comments
Richard on 22 Mar 2021
Hi Moein, could you add a Matlab code example to show how you are calling this function, and what output you expect for that example? (I can't see where ReconstructedImage_GPU is ever altered on the device - are you just expecting a zeros matrix as output and not getting it?)
Moein Mozaffarzadeh on 22 Mar 2021
Hi Richard,
To briefly explain it: ReconstructedImage_GPU is an image. For each pixel of this image, I need to sum up some of the samples of Dev_RfData. This is what happens in my kernel; what I provided above is a simplified version of my code. Here is what exactly happens:
__global__ void kernel_Reconstruction2(Setup* SetupLoaded_p, float* MediumZ_p, float* MediumX_p,
    float* TRansducerCorrZ_p, float* TRansducerCorrX_p, int* RfData, int* Dir, int Dir_Size,
    int Reconstruct_SoundSpeed, int* ReconstructedImage_GPU, int transmit, int NStart_Transmit,
    int size, float* Device_ConvArrivalTime) {

    int TID = threadIdx.y * blockDim.x + threadIdx.x;
    int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;
    int RowOFFset = blockDim.x * blockDim.y * gridDim.x * blockIdx.y;
    int GID = RowOFFset + BlockOFFset + TID;
    int GID_RowBased = BlockOFFset + TID;

    int D1, D2, sam, Pz_man, Px_man, receive, RoundTripSample, IndexingReceive, IndexingTransmit;
    float ReceiveTime, RoundTripTime, TransmitTime;

    if (GID_RowBased < size) {
        Px_man = GID_RowBased % (SetupLoaded_p->Nx);
        Pz_man = GID_RowBased / (SetupLoaded_p->Nx);
        receive = blockIdx.y;
        IndexingReceive = receive * Dir_Size + GID_RowBased;
        IndexingTransmit = transmit * Dir_Size + GID_RowBased;

        TransmitTime = sqrtf((TRansducerCorrX_p[transmit] - MediumX_p[Px_man]) * (TRansducerCorrX_p[transmit] - MediumX_p[Px_man])
                           + (TRansducerCorrZ_p[transmit] - MediumZ_p[Pz_man]) * (TRansducerCorrZ_p[transmit] - MediumZ_p[Pz_man]))
                           / Reconstruct_SoundSpeed;
        ReceiveTime  = sqrtf((TRansducerCorrX_p[receive] - MediumX_p[Px_man]) * (TRansducerCorrX_p[receive] - MediumX_p[Px_man])
                           + (TRansducerCorrZ_p[receive] - MediumZ_p[Pz_man]) * (TRansducerCorrZ_p[receive] - MediumZ_p[Pz_man]))
                           / Reconstruct_SoundSpeed;
        RoundTripTime = TransmitTime + ReceiveTime;
        RoundTripTime += SetupLoaded_p->TransmissionOffset;
        RoundTripSample = lroundf(RoundTripTime * SetupLoaded_p->Fs) - 1;

        ReconstructedImage_GPU[GID_RowBased] += (RfData[RoundTripSample + (receive * SetupLoaded_p->NumberOfSamples)]
                                                 * (Dir[IndexingReceive] * Dir[IndexingTransmit]));
    }
}
Of course, I need to copy all the arrays used in the kernel to the device beforehand using cudaMemcpy. Sorry if this does not match my gateway MEX code exactly (I tried to provide compileable code in my first post).
I'm sure the kernel works fine, since it gives me the correct image in my Visual Studio project (sorry I do not put all the code here, as it involves a lot of variables and a large dataset).
I think MATLAB does not launch the kernel correctly. Do you have any idea what could be wrong?
Moein.


Accepted Answer

Richard on 24 Mar 2021
Edited: Richard on 24 Mar 2021
Hi Moein,
There is no way of allocating MATLAB arrays directly as pinned CUDA memory. Inputs to a mex function will always be allocated by MATLAB's memory manager.
You should be able to allocate your own pinned host memory and copy data from the mxArray into it before then using streams to do the async copy to device. I don't know whether this additional host-side copy will be more expensive than the savings from using streams though.
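As a rough sketch of that idea (buffer names follow the gateway above, error checking is omitted, and <cstring> is needed for memcpy), the extra staging step would look something like this:
// Stage the mxArray data in a pinned host buffer before issuing the async copies.
int* RfData_matlab = (int*)mxGetData(prhs[0]);                 // MATLAB-managed (pageable) memory
int* RfData_pinned = nullptr;
cudaMallocHost((void**)&RfData_pinned, ArrayByteSize_RfData);  // pinned host buffer
memcpy(RfData_pinned, RfData_matlab, ArrayByteSize_RfData);    // additional host-side copy

for (int transmit = 0; transmit < 96; transmit++) {
    cudaStreamCreate(&streams[transmit]);
    int NStart_Transmit = transmit * (96 * 4096);
    cudaMemcpyAsync(&Device_RfData[NStart_Transmit], &RfData_pinned[NStart_Transmit],
                    BYTES_PER_STREAM, cudaMemcpyHostToDevice, streams[transmit]);
    // ... kernel launch on streams[transmit] as before ...
}
// After all device work has finished:
cudaFreeHost(RfData_pinned);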
In my tests I was using a basic call to mexcuda:
mexcuda TUI_CUDA.cu MexFunctions.cu
You might also want to call mex -setup to ensure that you have the same compiler selected in MATLAB as you are using in VS.
I do not think it is likely that the differences you are seeing are due to compiler flags. I think it is better to concentrate on validating that the inputs, then the processing steps, and then the outputs match between your VS version and your MATLAB version.
As seen with the output visualization issue, the memory order of arrays is different between the two. There is clearly a 2D (or even 3D) structure to your input data, and you are processing it with kernels that are designed to work on a slice along one of those dimensions. But the input arrays are 1D. Is the input data that you pass in from MATLAB definitely imported correctly? If 2D matrices were involved at some stage then there may be a transpose missing there as well. Have you tried resizing the inputs to an (N-by-ntransmitters) image to check that both of the 1D streams look to have the expected structure?
When you have verified that the input data streams are identical, then you will need to look at exactly where in the kernel processing a deviation occurs. It will likely be very helpful if you can formulate an example input that is much smaller so that you have fewer data items to track and compare between the two systems while you do this. Progressively (temporarily) cutting code out of the kernel until outputs match is another approach to try - you could even go as far as having the kernel simply output thread indices as a basic test to prove each approach is executing the same kernel.
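For the last suggestion, a throwaway diagnostic kernel along these lines (purely illustrative) is usually enough - it just writes each thread's global index into the output, so the launch configuration and the copy back to MATLAB can be checked independently of the reconstruction maths:
// Diagnostic-only kernel: writes the global thread index into the output buffer.
__global__ void kernel_WriteIndices(int* out, int n) {
    int tid = threadIdx.y * blockDim.x + threadIdx.x;                                   // index within the block
    int gid = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y) + tid;  // global index
    if (gid < n) {
        out[gid] = gid;
    }
}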
Richard
  9 Comments
Richard on 24 May 2021
Hi Moein,
The direct problem here is that variables in C that you want to reference in future calls to a function need to be declared as static, and their declaration needs to be in an appropriate scope, i.e. not in the if() branch. Only the code that creates the contents of the variables needs to be guarded and run just once. They also should not be freed at the end of the mexfunction, and you should not use cudaResetDevice, because both of those will make the cuda pointers invalid in the next call. You will need to register a function to free the pointers when the mex file is cleared using mexAtExit - see https://www.mathworks.com/help/matlab/apiref/mexatexit.html.
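A minimal sketch of that pattern (buffer names and sizes are illustrative, error checking omitted):
#include "mex.h"
#include "cuda_runtime.h"

// Device buffers persist across mexFunction calls and are freed only when the MEX file is cleared.
static int* Device_RfData = nullptr;
static bool initialized = false;
static const size_t ArrayByteSize_RfData = sizeof(int) * (96 * 96 * 4096);

static void cleanupDeviceMemory(void) {            // registered with mexAtExit
    if (Device_RfData != nullptr) {
        cudaFree(Device_RfData);
        Device_RfData = nullptr;
    }
    initialized = false;
}

void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    if (!initialized) {
        cudaMalloc((void**)&Device_RfData, ArrayByteSize_RfData);  // one-time setup
        mexAtExit(cleanupDeviceMemory);            // runs when the MEX file is cleared
        initialized = true;
    }
    // ... per-call work that reuses Device_RfData; no cudaFree or device reset here ...
}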
However, I don't think that this approach is the best way to approach the problem. It does not mean that the mex function will work without any user intervention: you will need to clearly document to users that they must set the "TransferToHost" flag whenever certain other input variables have different data, and also that they should not set that flag if they want to see the best performance at other times. This interface is confusing, and it is likely that a user will make a mistake. It also means that the user has no real control over the lifetime of the persistent GPU data - there is no obvious way for them to say "ok, I'm done with your code, now I want to use the whole GPU for something else".
If instead you accept a gpuArray as input to the mex function, you will just need to document that you support gpuArrays as inputs, and that the user should type "otherData = gpuArray(otherData)" before calling your mex function. You can point to MathWorks documentation for how to create a gpuArray, and the lifetime of the persistent data will be controlled by standard Matlab rules for the life of data - it will persist for as long as the user has a variable that contains it, and it will be freed when the variable is cleared.
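For reference, reading a gpuArray input inside the mex function goes through the mxGPUArray C API (compile with mexcuda); a stripped-down sketch that assumes prhs[0] is an int32 gpuArray, with input checks omitted:
#include "mex.h"
#include "gpu/mxGPUArray.h"

void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    mxInitGPU();                                                   // initialize the GPU API

    mxGPUArray const* RfData = mxGPUCreateFromMxArray(prhs[0]);    // wraps the existing device data
    int const* Device_RfData = (int const*)mxGPUGetDataReadOnly(RfData);

    // ... launch kernels using Device_RfData; the data never leaves the GPU ...

    mxGPUDestroyGPUArray(RfData);   // releases the wrapper, not the user's gpuArray
}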
Looking at your StackOverflow question, it looks like you are making progress on the latter option. The easiest way of compiling these GPU mex functions for Matlab is to use the mexcuda command inside Matlab, but I do understand that this is part of a bigger project and you want VS to produce it - I would suggest using mexcuda as a starting point so that you can test and confirm that your code works as expected, and then work on configuring VS (I suspect that you may just need to point VS at the extern/lib/win64/microsoft directory so that it finds the gpu.lib file).
Richard
Moein Mozaffarzadeh on 24 May 2021
Hi Richard,
Thank you very much for your explanation. Yes, I think you are right about documenting that the function supports gpuArrays, so I'll switch to using this feature of MATLAB.
I have just figured out that there is a memory management problem with my MEX gateway code. The first step towards using gpuArrays is to remove the "cudaResetDevice" at the end of my code (but still free all the memory), to make sure that I am in full control of the memory I have defined throughout the code (and that I'm not missing something). However, this makes the output of my MexFunction incorrect (the output almost doubles each time I run the MexFunction in MATLAB); the first time I run the MexFunction, everything is fine and I get the output I expect, though. The problem and the code are provided in the following link:
Since you are already somewhat familiar with my code and project :), I would appreciate your comment and support on this. Thanks in advance.
Moein.


More Answers (1)

Richard on 22 Mar 2021
Edited: Richard on 22 Mar 2021
Moein, there are a couple of issues that I can see in the code.
The first issue is that it is not initializing the GPU device in any way. The easiest way to do this is to add a call to mxInitGPU as documented in https://www.mathworks.com/help/parallel-computing/mxinitgpu.html . This will use MATLAB's normal rules to select the default GPU. I added a simple assignment line in the kernel to produce dummy output values which I also observed were not returned, and then confirmed that adding a call to mxInitGPU at the start of mexFunction() caused the expected values to be returned (after restarting MATLAB to fix the GPU state).
The second issue is that the code sample looks to be over-running array bounds when copying back from Device_ReconstructedImage_GPU to ReconstructedImage_GPU. The output mxArray is created as a 1-by-(64x64) matrix but the array size used on the device is 1-by-(96x96). This is likely to cause MATLAB to crash - the output mxArray needs to match the GPU device array size.
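Concretely, the two fixes in the gateway would amount to something like this (sketch only; mxInitGPU needs the mxGPUArray header and a mexcuda build):
#include "mex.h"
#include "gpu/mxGPUArray.h"   // for mxInitGPU

void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[]) {
    mxInitGPU();   // select and initialize the GPU under MATLAB's normal rules

    // Output mxArray sized to match the 96*96 device buffer that is copied back.
    plhs[0] = mxCreateNumericMatrix(1, 96 * 96, mxINT32_CLASS, mxREAL);
    int* ReconstructedImage_GPU = (int*)mxGetData(plhs[0]);
    int ArrayByteSize_ReconstructedImage_GPU = sizeof(int) * (96 * 96);

    // ... allocations, streamed copies and kernel launches as in the original gateway ...
    // The final cudaMemcpy now copies exactly as many bytes as the output mxArray holds.
}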
  7 Comments
Moein Mozaffarzadeh on 23 Mar 2021
Hi Richard,
Thank you for your help. I think you are right about "DAS_Ccode=reshape(Image,[64, 256])';".
Yes, the first image you posted is what I also get here with my MEX function, which is not sensible and is not correct. The second image you posted seems more sensible to me as well, but it is still not the correct image. The correct image is the one I posted in my previous comment.
About the issues you found:
1- Yes, you are right. That was indeed a problem with my code. I have fixed it now, but I still cannot get the correct image. I guess it is because of the next issue.
2- In my CUDA project in Visual Studio I used:
int* RfData; // RF data; a pinned memory was dedicated to this
int* Device_RfData; // device pointer to the RF data.
int ArrayByteSize_RfData = sizeof(int) * (SystemSetup.NumberOfTransmitter * SystemSetup.NumberOfReceiver * SystemSetup.NumberOfSamples);
cudaMallocHost((int**)&RfData, ArrayByteSize_RfData); // pinned memory
gpuErrchk(cudaMalloc((int**)&Device_RfData, ArrayByteSize_RfData));
and using "cudaMemcpyAsync(&Device_RfData[NStart_Transmit], &RfData[NStart_Transmit], BYTES_PER_STREAM, cudaMemcpyHostToDevice, streams[transmit]);" inside the "for loop" containing my Kernel i transfer the "RfData" to the Kernel in an asynchronous way for the seek of time.
In my Mex code, as you can see, first i define a pointer "int* RfData;" and then use
RfData = (int *)mxGetPr(prhs[Counter]); Counter++;
to copy "data_Rearranged_saved" from MATLAB into "RfData". I presume that the reason my kernel does not work correctly in MEX is that "RfData" is not pinned memory in my MEX code (whereas it works well in the Visual Studio project, since I use pinned memory there and, as you also mentioned, the async API requires "pinned" memory).
So my question now is: how can I define pinned memory and transfer "data_Rearranged_saved" from MATLAB to it? Please let me know.
Moein.
Moein Mozaffarzadeh on 23 Mar 2021
Richard,
To add to my previous comment, I also checked the version of my code which does not need pinned memory. It works fine in my CUDA project in Visual Studio, but again does not work as a MEX file.
As a guess: I compile my MEX code in Visual Studio. Does it make any difference to compile it in MATLAB instead? If so, could you please provide the command I need to use in MATLAB?
I've already used this:
mexcuda COMPFLAGS='$COMPFLAGS --use-local-env -maxrregcount=0 --machine 64 -compatibleArrayDims --compile -cudart static -Xptxas -dlcm=cg -use_fast_math -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler' TUI_Main.cu MexFunctions.cu CudaKernels.cu
but the same wrong image is generated.
Moein.
