I have an OpenCL kernel that needs to process an array as multiple sub-arrays, where the sum of each sub-array is saved in a local cache array.
For example, imagine the following array:
[[1, 2, 3, 4], [10, 30, 1, 23]]
Each work-item processes two array indexes (for example, multiplying the value at each index by the local_id), and the work-item's result is saved in an array shared by the work-group.
// Kernel `test`: each work-item accumulates a value into its own slot of a
// work-group-local `cache`; work-item 0 of the group then adds every cache
// slot into result[group id].
__kernel void test(__global int **values, __global int *result, const int array_size){
// NOTE: the size here comes from a kernel argument rather than a compile-time
// constant; this is the declaration the question below is about.
__local int cache[array_size];
// Initialise: work-item 0 of the group zeroes all array_size cache slots.
if (get_local_id(0) == 0){
for (int i = 0; i < array_size; i++)
cache[i] = 0;
}
// Synchronise the work-group so the zeroed cache is in place before use.
barrier (CLK_LOCAL_MEM_FENCE);
// Only work-items with global id 0..3 accumulate; the bound 4 is hard-coded
// (presumably sized for the example data above -- confirm).
if(get_global_id(0) < 4){
// Adds values[group][0] and values[group][1], each multiplied by the local id,
// into this work-item's own cache slot.
// NOTE(review): cache is indexed by local id but sized by array_size, so this
// assumes the work-group size does not exceed array_size -- confirm.
for (int i = 0; i<2; i++)
cache[get_local_id(0)] += values[get_group_id(0)][i] *
get_local_id(0);
}
// Synchronise again so every work-item's cache slot is written before the sum.
barrier (CLK_LOCAL_MEM_FENCE);
// Work-item 0 of the group sums all array_size cache slots; `+=` adds to
// whatever result[group] already holds, so the caller is assumed to have
// initialised result -- confirm.
if(get_local_id(0) == 0){
for (int i = 0; i<array_size; i++)
result[get_group_id(0)] += cache[i];
}
}
The problem is that I cannot define the cache array size using a kernel parameter, but I need to in order to have a dynamic kernel.
How can I create it dynamically, like the malloc function in C?
Or is the only available solution to pass a temporary array to my kernel function?
This can be achieved by adding a __local array as a kernel parameter:
__kernel void test(__global int **values, __global int *result,
const int array_size, __local int * cache)
and providing the desired size for that kernel argument from the host:
// Argument index 3 is the __local cache parameter. Passing NULL as the value
// together with a byte size requests that much local memory for the kernel.
clSetKernelArg(kernel, 3, array_size*sizeof(int), NULL);
The local memory will be allocated upon kernel invocation. Note that extra checks may be necessary to ensure that the required local memory size does not exceed the device limit (CL_DEVICE_LOCAL_MEM_SIZE, which can be queried with clGetDeviceInfo).