When can I get better performance using memcpy
or how do I benefit from using it?
For example:
float a[3]; float b[3];
is code:
memcpy(a, b, 3*sizeof(float));
faster than this one?
a[0] = b[0];
a[1] = b[1];
a[2] = b[2];
Efficiency should not be your concern.
Write clean maintainable code.
It bothers me that so many answers indicate that the memcpy() is inefficient. It is designed to be the most efficient way of copy blocks of memory (for C programs).
So I wrote the following as a test:
#include <algorithm>
extern float a[3];
extern float b[3];
extern void base();
int main()
{
base();
#if defined(M1)
a[0] = b[0];
a[1] = b[1];
a[2] = b[2];
#elif defined(M2)
memcpy(a, b, 3*sizeof(float));
#elif defined(M3)
std::copy(&a[0], &a[3], &b[0]);
#endif
base();
}
Then to compare the code produces:
g++ -O3 -S xr.cpp -o s0.s
g++ -O3 -S xr.cpp -o s1.s -DM1
g++ -O3 -S xr.cpp -o s2.s -DM2
g++ -O3 -S xr.cpp -o s3.s -DM3
echo "=======" > D
diff s0.s s1.s >> D
echo "=======" >> D
diff s0.s s2.s >> D
echo "=======" >> D
diff s0.s s3.s >> D
This resulted in: (comments added by hand)
======= // Copy by hand
10a11,18
> movq _a@GOTPCREL(%rip), %rcx
> movq _b@GOTPCREL(%rip), %rdx
> movl (%rdx), %eax
> movl %eax, (%rcx)
> movl 4(%rdx), %eax
> movl %eax, 4(%rcx)
> movl 8(%rdx), %eax
> movl %eax, 8(%rcx)
======= // memcpy()
10a11,16
> movq _a@GOTPCREL(%rip), %rcx
> movq _b@GOTPCREL(%rip), %rdx
> movq (%rdx), %rax
> movq %rax, (%rcx)
> movl 8(%rdx), %eax
> movl %eax, 8(%rcx)
======= // std::copy()
10a11,14
> movq _a@GOTPCREL(%rip), %rsi
> movl $12, %edx
> movq _b@GOTPCREL(%rip), %rdi
> call _memmove
Added Timing results for running the above inside a loop of 1000000000
.
g++ -c -O3 -DM1 X.cpp
g++ -O3 X.o base.o -o m1
g++ -c -O3 -DM2 X.cpp
g++ -O3 X.o base.o -o m2
g++ -c -O3 -DM3 X.cpp
g++ -O3 X.o base.o -o m3
time ./m1
real 0m2.486s
user 0m2.478s
sys 0m0.005s
time ./m2
real 0m1.859s
user 0m1.853s
sys 0m0.004s
time ./m3
real 0m1.858s
user 0m1.851s
sys 0m0.006s