I'm learning the basics of paralel execution of for loop using OpenMP.
Sadly, my paralel program runs 10x slower than serial version. What am I doing wrong? Am I missing some barriers?
double **basicMultiply(double **A, double **B, int size) {
int i, j, k;
double **res = createMatrix(size);
omp_set_num_threads(4);
#pragma omp parallel for private(k)
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
for (k = 0; k < size; k++) {
res[i][j] += A[i][k] * B[k][j];
}
}
}
return res;
}
Thank you very much!
Your problem is due to a race condition on the inner loop variable j
. It needs to be made private.
For C89 I would do something like this:
#pragma omp parallel
{
int i, j, k;
#pragma omp for
for(i=0; ...
For C++ or C99 use mixed declarations
#pragma omp parallel for
for(int i=0; ...
Doing this you don't have to explicitly declare anything shared or private.
Some further comments to your code. Your single threaded code is not cache friendly when you do B[k][j]
. This reads a cacheline then moves to the next cache line and so forth until the dot product is done by which time the other cachelines have been evicted. Instead you should take the transpose first and access as BT[j][k]
. Additionally, you have allocated arrays of arrays and not one contiguous 2D array. I fixed your code to use the transpose and a contiguous 2D array.
Here are the times I get for size=512.
no transpose no openmp 0.94s
no transpose, openmp 0.23s
tranpose, no openmp 0.27s
transpose, openmp 0.08s
Below is the code (also see http://coliru.stacked-crooked.com/a/ee174916fa035f97)
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
void transpose(double *A, double *B, int n) {
int i,j;
for(i=0; i<n; i++) {
for(j=0; j<n; j++) {
B[j*n+i] = A[i*n+j];
}
}
}
void gemm(double *A, double *B, double *C, int n)
{
int i, j, k;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
double dot = 0;
for (k = 0; k < n; k++) {
dot += A[i*n+k]*B[k*n+j];
}
C[i*n+j ] = dot;
}
}
}
void gemm_omp(double *A, double *B, double *C, int n)
{
#pragma omp parallel
{
int i, j, k;
#pragma omp for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
double dot = 0;
for (k = 0; k < n; k++) {
dot += A[i*n+k]*B[k*n+j];
}
C[i*n+j ] = dot;
}
}
}
}
void gemmT(double *A, double *B, double *C, int n)
{
int i, j, k;
double *B2;
B2 = (double*)malloc(sizeof(double)*n*n);
transpose(B,B2, n);
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
double dot = 0;
for (k = 0; k < n; k++) {
dot += A[i*n+k]*B2[j*n+k];
}
C[i*n+j ] = dot;
}
}
free(B2);
}
void gemmT_omp(double *A, double *B, double *C, int n)
{
double *B2;
B2 = (double*)malloc(sizeof(double)*n*n);
transpose(B,B2, n);
#pragma omp parallel
{
int i, j, k;
#pragma omp for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
double dot = 0;
for (k = 0; k < n; k++) {
dot += A[i*n+k]*B2[j*n+k];
}
C[i*n+j ] = dot;
}
}
}
free(B2);
}
int main() {
int i, n;
double *A, *B, *C, dtime;
n=512;
A = (double*)malloc(sizeof(double)*n*n);
B = (double*)malloc(sizeof(double)*n*n);
C = (double*)malloc(sizeof(double)*n*n);
for(i=0; i<n*n; i++) { A[i] = rand()/RAND_MAX; B[i] = rand()/RAND_MAX;}
dtime = omp_get_wtime();
gemm(A,B,C, n);
dtime = omp_get_wtime() - dtime;
printf("%f\n", dtime);
dtime = omp_get_wtime();
gemm_omp(A,B,C, n);
dtime = omp_get_wtime() - dtime;
printf("%f\n", dtime);
dtime = omp_get_wtime();
gemmT(A,B,C, n);
dtime = omp_get_wtime() - dtime;
printf("%f\n", dtime);
dtime = omp_get_wtime();
gemmT_omp(A,B,C, n);
dtime = omp_get_wtime() - dtime;
printf("%f\n", dtime);
return 0;
}