Why doesn't CudaFree seem to free memory?

Beau Bellamy picture Beau Bellamy · May 1, 2012 · Viewed 8k times · Source

I am trying to allocate device memory, copy to it, perform the calculations on the GPU, copy the results back and then free up the device memory I allocated. I wanted to make sure that I wasn't going over the limit and I wanted to see if I would have enough memory in the shared memory space to dump a few arrays.

When I allocate device memory, there are no errors being returned. When I use cudaMemGetInfo to check the amount of memory allocated, it looks like one cudaMalloc hasn't allocated any memory. Also when I try to free the memory, it looks like only one pointer is freed.

I am using the matlab Mexfunction interface to setup the GPU memory and launch the kernel. At this point, I'm not even calling into the kernel and just returning back a unit matrix for the results.

cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);  

/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}  
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));

/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);   
if (cudaErr != cudaSuccess)
{  
    mexPrintf("could not copy to deviceScattDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));  

/* call the kernel */
// launchKernel<<<1,512>>>(........);   

/* retireve the output */  
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);   
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not copy to receivedReal\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess)
{ 
    mexPrintf("could not copy to receivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));   

/* free the memory. */ 
mexPrintf("\nFree'ing memory.\n");   
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);  
cudaErr = cudaFree(devicePulseDelay); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free devicePulseDelay\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceTarDistance);   
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could free deviceTarDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScattDistance);   
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}   
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScatterers);  
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScatterers\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceReceivedReal);  
if (cudaErr != cudaSuccess) 
{  
    mexPrintf("could free deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceReceivedImag);   
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));

Here is the output from this:

 
Memory avaliable: Free: 2523959296, Total: 2818572288
 Allocating memory.
 devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
 Before freeing: Free 2513473536, Total: 2818572288
 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576

I feel like there is something obvious that i'm missing. Can anyone help explain what is going on?

EDIT: platform is windows 7 with a Tesla C2050 GPu card.

Answer

talonmies picture talonmies · May 2, 2012

It is a pretty common misconception that malloc directly gets memory allocations from the host operating system when called, and free directly releases them back to the host operating when called. But they almost always don't work like that, instead the standard library maintains a circular list of free'd and malloc'd memory which is opportunistically expanded and contracted by interacting with the host OS (see some of the answers on How do malloc() and free() work? for more details if you are interested). Irrespective of how it works, this leads to a number of non-intuitive results, including the fact that it is usually impossible to allocate as much memory as the OS says is free, that allocations sometimes appear to not change the amount of free memory, and that free sometimes has no effect on the amount of memory the OS says is free.

Although I have nothing but empirical evidence to support this, I believe CUDA works exactly the same way. The context maintains its own list of malloc'd and free'd memory, and will expand and contract the memory held in that list as host driver/window manager and the GPU itself allows. All hardware has a characteristic MMU page size, and there is evidence to suggest that the page size on NVIDIA GPUs is rather large. This implies there is rather coarse granularity in cudaMalloc calls, and means sometimes a malloc appears to not effect the amount of free memory or to consume much more memory than was requested, and sometimes free calls appear to have no effect (If you are interested, you can find a little tool which helps illustrate the page size behaviour of the CUDA driver here, although it was written for an early version of the CUDA API and might need a couple of changes to compile with modern versions). I believe this is the most likely explanation for the behaviour you are observing.

Incidentally, if I run a simplified version of the code you posted on MacOS 10.6 with a GT200 family device:

#include <cstdio>

#define mexPrintf printf

inline void gpuAssert(cudaError_t code, char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuMemReport(size_t * avail, size_t * total, 
        const char * title = 0, const size_t * free = 0, const bool sense = true) 
{
    char tstring[32] = { '\0' };
    gpuErrchk( cudaMemGetInfo(avail, total) );  

    if (free) {
        if (title) {
            strncpy(tstring, title, 31);
        }
        mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
                tstring, *avail, *total, (sense) ? "Allocated\0" : "Freed\0", 
                (sense) ? (*free - *avail) : (*avail - *free));
    } else {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);  
    }
}

int main()
{
    size_t freeMem = 0;
    size_t totalMem = 0;
    size_t allocMem = 0;

    gpuErrchk( cudaFree(0) );
    gpuMemReport(&freeMem, &totalMem);

    double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
    double *deviceReceivedReal, *deviceReceivedImag;

    mexPrintf("Allocating memory.\n");
    gpuErrchk( cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999) );
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem);

    mexPrintf("\nFree'ing memory.\n");   
    gpuMemReport(&freeMem, &totalMem);

    gpuErrchk( cudaFree(devicePulseDelay) ); 
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false);

    gpuErrchk( cudaFree(deviceTarDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScattDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScatterers) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedReal) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedImag) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false);

    return 0;
}

I get a different result, but also one showing the same phenomena:

Allocating memory.
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864

Free'ing memory.
Memory avaliable: Free: 189546496, Total: 265027584
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864

Which suggests that the behaviour is hardware/host OS dependent as well.