Goal:
test
executable for the shared library.
Problem:
MYLIB.so
seems to compile fine (no problem), but linking the test executable fails with:
./libMYLIB.so: undefined reference to __cudaRegisterLinkedBinary_39_tmpxft_000018cf_00000000_6_MYLIB_cpp1_ii_74c599a1
simplified makefile:
libMYlib.so : MYLIB.o
g++ -shared -Wl,-soname,libMYLIB.so -o libMYLIB.so MYLIB.o -L/the/cuda/lib/dir -lcudart
MYLIB.o : MYLIB.cu MYLIB.h
nvcc -m64 -arch=sm_20 -dc -Xcompiler '-fPIC' MYLIB.cu -o MYLIB.o -L/the/cuda/lib/dir -lcudart
test : test.cpp libMYlib.so
g++ test.cpp -o test -L. -ldl -Wl,-rpath,. -lMYLIB -L/the/cuda/lib/dir -lcudart
indeed
nm libMYLIB.so
shows that all CUDA api functions are "undefined symbols":
U __cudaRegisterFunction
U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
U cudaEventRecord
U cudaFree
U cudaGetDevice
U cudaGetDeviceProperties
U cudaGetErrorString
U cudaLaunch
U cudaMalloc
U cudaMemcpy
So CUDA somehow did not get linked into the shared library MYLIB.so. What am I missing?
CUDA did not even get linked to the object file somehow:
nm MYLIB.o
U __cudaRegisterFunction
U __cudaRegisterLinkedBinary_39_tmpxft_0000598c_00000000_6_CUPA_cpp1_ii_74c599a1
U cudaEventRecord
U cudaFree
U cudaGetDevice
U cudaGetDeviceProperties
U cudaGetErrorString
U cudaLaunch
U cudaMalloc
U cudaMemcpy
(same as above)
Here's an example linux shared object creation along the lines you indicated:
First the shared library. The build commands for this are as follows:
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dc test1.cu test2.cu
nvcc -arch=sm_20 -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o
g++ -shared -o test.so test1.o test2.o link.o -L/usr/local/cuda/lib64 -lcudart
It seems you may be missing the second step above in your makefile, but I haven't analyzed if there are any other issues with your makefile.
Now, for the test executable, the build commands are as follows:
g++ -c main.cpp
g++ -o testmain main.o test.so
To run it, simply execute the testmain executable, but be sure the directory containing the test.so library is on your LD_LIBRARY_PATH.
These are the files I used for test purposes:
test1.h:
int my_test_func1();
test1.cu:
#include <stdio.h>
#include "test1.h"
#define DSIZE 1024
#define DVAL 10
#define nTPB 256
// cudaCheckErrors(msg): abort-on-error helper for CUDA runtime calls.
// Fetches the pending runtime error with cudaGetLastError() and, if it is not
// cudaSuccess, prints `msg`, the CUDA error string, and the file/line of the
// macro use to stderr, then terminates the process with exit(1).
// It does not take a call's return value: it reports whatever error is pending
// at the point of use, so place it directly after the call (or kernel launch
// plus synchronize) being checked.
// The do { ... } while (0) wrapper makes the expansion behave as one statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Adds DVAL to each of the first DSIZE elements of `data`.
// Expects a 1D grid of 1D blocks; each thread handles one element, and threads
// whose global index is >= DSIZE do nothing (the grid may overshoot DSIZE).
// `data` must point to at least DSIZE ints of device-accessible memory.
__global__ void my_kernel1(int *data){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  // Was written `=+`, which parses as `= (+DVAL)`: a plain assignment that only
  // looked like an accumulate. The caller zero-fills the buffer first, so the
  // result checked on the host is unchanged.
  if (idx < DSIZE) data[idx] += DVAL;
}
// Host-side self-test for my_kernel1.
// Allocates DSIZE ints on host and device, zero-fills the host buffer, copies
// it to the device, launches my_kernel1 over ceil(DSIZE / nTPB) blocks of nTPB
// threads, copies the result back and verifies every element equals DVAL.
// Returns 0 on success. Any allocation failure, CUDA error, or mismatch prints
// a diagnostic and terminates the process with exit(1).
int my_test_func1(){
  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  my_kernel1<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Synchronize so that errors raised while the kernel runs are visible to the
  // check below, not just launch-configuration errors.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers: this is a library entry point and may be called
  // repeatedly, so the success path must not leak device or host memory.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 1 passed!\n");
  return 0;
}
test2.h:
int my_test_func2();
test2.cu:
#include <stdio.h>
#include "test2.h"
#define DSIZE 1024
#define DVAL 20
#define nTPB 256
// cudaCheckErrors(msg): abort-on-error helper for CUDA runtime calls.
// Fetches the pending runtime error with cudaGetLastError() and, if it is not
// cudaSuccess, prints `msg`, the CUDA error string, and the file/line of the
// macro use to stderr, then terminates the process with exit(1).
// It does not take a call's return value: it reports whatever error is pending
// at the point of use, so place it directly after the call (or kernel launch
// plus synchronize) being checked.
// The do { ... } while (0) wrapper makes the expansion behave as one statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// Adds DVAL to each of the first DSIZE elements of `data`.
// Expects a 1D grid of 1D blocks; each thread handles one element, and threads
// whose global index is >= DSIZE do nothing (the grid may overshoot DSIZE).
// `data` must point to at least DSIZE ints of device-accessible memory.
__global__ void my_kernel2(int *data){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  // Was written `=+`, which parses as `= (+DVAL)`: a plain assignment that only
  // looked like an accumulate. The caller zero-fills the buffer first, so the
  // result checked on the host is unchanged.
  if (idx < DSIZE) data[idx] += DVAL;
}
// Host-side self-test for my_kernel2.
// Allocates DSIZE ints on host and device, zero-fills the host buffer, copies
// it to the device, launches my_kernel2 over ceil(DSIZE / nTPB) blocks of nTPB
// threads, copies the result back and verifies every element equals DVAL.
// Returns 0 on success. Any allocation failure, CUDA error, or mismatch prints
// a diagnostic and terminates the process with exit(1).
int my_test_func2(){
  int *d_data, *h_data;
  h_data = (int *) malloc(DSIZE * sizeof(int));
  if (h_data == 0) {printf("malloc fail\n"); exit(1);}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
  cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  my_kernel2<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
  // Synchronize so that errors raised while the kernel runs are visible to the
  // check below, not just launch-configuration errors.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel");
  cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2");
  for (int i = 0; i < DSIZE; i++)
    if (h_data[i] != DVAL) {printf("Results check failed at offset %d, data was: %d, should be %d\n", i, h_data[i], DVAL); exit(1);}
  // Release both buffers: this is a library entry point and may be called
  // repeatedly, so the success path must not leak device or host memory.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");
  free(h_data);
  printf("Results check 2 passed!\n");
  return 0;
}
main.cpp:
#include <stdio.h>
#include "test1.h"
#include "test2.h"
// Test driver: invokes the two library self-tests in order and exits with
// status 0 once both calls have returned.
int main(){
  (void) my_test_func1();  // result deliberately unused
  (void) my_test_func2();  // result deliberately unused
  return 0;
}
When I compile according to the commands given, and run ./testmain
I get:
$ ./testmain
Results check 1 passed!
Results check 2 passed!
Note that if you prefer, you may generate a libtest.so instead of test.so, and then you may use a modified build sequence for the test executable:
g++ -c main.cpp
g++ -o testmain main.o -L. -ltest
I don't believe it makes any difference, but it may be more familiar syntax.
I'm sure there is more than one way to accomplish this. This is just an example. You may wish to also review the relevant section of the nvcc manual and also review the examples.
EDIT: I tested this under CUDA 5.5 RC, and the final application link step complained about not finding the cudart lib (warning: libcudart.so.5.5, needed by ./libtest.so, not found). However, the following relatively simple modification (example Makefile) should work under either CUDA 5.0 or CUDA 5.5.
Makefile:
# Builds libtest.so (CUDA code with relocatable device code) and the plain C++
# test executable that links against it. Recipe lines must start with a TAB.

.PHONY : clean

# Test executable. Object files are listed BEFORE the libraries they use:
# linkers resolve symbols left to right, and toolchains that default to
# --as-needed drop a library named ahead of the object that references it.
# -rpath,. lets the executable find libtest.so in the current directory.
testmain : main.cpp test1.h test2.h libtest.so
	g++ -c main.cpp
	g++ -o testmain main.o -L. -Wl,-rpath,. -ltest -L/usr/local/cuda/lib64 -lcudart -ldl

# Shared library: host objects plus the device-link object, linked with cudart.
libtest.so : link.o
	g++ -shared -Wl,-soname,libtest.so -o libtest.so test1.o test2.o link.o -L/usr/local/cuda/lib64 -lcudart

# Step 1 (-dc): compile each .cu to an object with relocatable device code
#               (produces test1.o and test2.o).
# Step 2 (-dlink): device-link those objects into link.o; omitting this step
#               leaves __cudaRegisterLinkedBinary_* symbols undefined.
link.o : test1.cu test2.cu test1.h test2.h
	nvcc -m64 -arch=sm_20 -dc -Xcompiler '-fPIC' test1.cu test2.cu
	nvcc -m64 -arch=sm_20 -Xcompiler '-fPIC' -dlink test1.o test2.o -o link.o

clean :
	rm -f testmain test1.o test2.o link.o libtest.so main.o