Speeding up element-wise array multiplication in python

JEquihua picture JEquihua · Oct 9, 2013 · Viewed 7.7k times · Source

I have been playing around with numba and numexpr trying to speed up a simple element-wise matrix multiplication. I have not been able to get better results, they both are basically (speedwise) equivalent to numpys multiply function. Has anyone had any luck in this area? Am I using numba and numexpr wrong (I'm quite new to this) or is this altogether a bad approach to try and speed this up. Here is a reproducible code, thank you in advanced:

import numpy as np
from numba import autojit
import numexpr as ne

a=np.random.rand(10,5000000)

# numpy
multiplication1 = np.multiply(a,a)

# numba
def multiplix(X,Y):
    M = X.shape[0]
    N = X.shape[1]
    D = np.empty((M, N), dtype=np.float)
    for i in range(M):
        for j in range(N):
            D[i,j] = X[i, j] * Y[i, j]
    return D

mul = autojit(multiplix)
multiplication2 = mul(a,a)

# numexpr
def numexprmult(X,Y):
    M = X.shape[0]
    N = X.shape[1]
    return ne.evaluate("X * Y")

multiplication3 = numexprmult(a,a) 

Answer

Alexander Vogt picture Alexander Vogt · Oct 18, 2013

What about using and ?

elementwise.F90:

subroutine elementwise( a, b, c, M, N ) bind(c, name='elementwise')
  use iso_c_binding, only: c_float, c_int

  integer(c_int),intent(in) :: M, N
  real(c_float), intent(in) :: a(M, N), b(M, N)
  real(c_float), intent(out):: c(M, N)

  integer :: i,j

  forall (i=1:M,j=1:N)
    c(i,j) = a(i,j) * b(i,j)
  end forall

end subroutine 

elementwise.py:

from ctypes import CDLL, POINTER, c_int, c_float
import numpy as np
import time

fortran = CDLL('./elementwise.so')
fortran.elementwise.argtypes = [ POINTER(c_float), 
                                 POINTER(c_float), 
                                 POINTER(c_float),
                                 POINTER(c_int),
                                 POINTER(c_int) ]

# Setup    
M=10
N=5000000

a = np.empty((M,N), dtype=c_float)
b = np.empty((M,N), dtype=c_float)
c = np.empty((M,N), dtype=c_float)

a[:] = np.random.rand(M,N)
b[:] = np.random.rand(M,N)


# Fortran call
start = time.time()
fortran.elementwise( a.ctypes.data_as(POINTER(c_float)), 
                     b.ctypes.data_as(POINTER(c_float)), 
                     c.ctypes.data_as(POINTER(c_float)), 
                     c_int(M), c_int(N) )
stop = time.time()
print 'Fortran took ',stop - start,'seconds'

# Numpy
start = time.time()
c = np.multiply(a,b)
stop = time.time()
print 'Numpy took ',stop - start,'seconds'

I compiled the Fortran file using

gfortran -O3 -funroll-loops -ffast-math -floop-strip-mine -shared -fPIC \
         -o elementwise.so elementwise.F90

The output yields a speed-up of ~10%:

 $ python elementwise.py 
Fortran took  0.213667869568 seconds
Numpy took  0.230120897293 seconds
 $ python elementwise.py 
Fortran took  0.209784984589 seconds
Numpy took  0.231616973877 seconds
 $ python elementwise.py 
Fortran took  0.214708089828 seconds
Numpy took  0.25369310379 seconds