Consider the following code (p
is of type unsigned char*
and bitmap->width
is of some integer type, exactly which is unknown and depends on which version of some external library we're using):
for (unsigned x = 0; x < static_cast<unsigned>(bitmap->width); ++x)
{
*p++ = 0xAA;
*p++ = 0xBB;
*p++ = 0xCC;
}
Is it worth optimizing it [..]
Could there be a case where this could yield more efficient results by writing:
unsigned width(static_cast<unsigned>(bitmap->width));
for (unsigned x = 0; x < width; ++x)
{
*p++ = 0xAA;
*p++ = 0xBB;
*p++ = 0xCC;
}
... or is this trivial for the compiler to optimize?
What would you consider to be "better" code?
Note from editor (Ike): for those wondering about the strikeout text, the original question, as phrased, was dangerously close to off-topic territory and was very close to being closed in spite of positive feedback. These have been stricken out. Yet please do not punish the answerers who addressed these stricken sections of the question.
At first glance, I thought the compiler could generate equivalent assembly for both versions with optimization flags activated. When I checked it, I was surprised to see the result:
unoptimized.cpp
note: this code is not meant to be executed.
struct bitmap_t
{
long long width;
} bitmap;
int main(int argc, char** argv)
{
for (unsigned x = 0 ; x < static_cast<unsigned>(bitmap.width) ; ++x)
{
argv[x][0] = '\0';
}
return 0;
}
optimized.cpp
note: this code is not meant to be executed.
struct bitmap_t
{
long long width;
} bitmap;
int main(int argc, char** argv)
{
const unsigned width = static_cast<unsigned>(bitmap.width);
for (unsigned x = 0 ; x < width ; ++x)
{
argv[x][0] = '\0';
}
return 0;
}
$ g++ -s -O3 unoptimized.cpp
$ g++ -s -O3 optimized.cpp
.file "unoptimized.cpp"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
movl bitmap(%rip), %eax
testl %eax, %eax
je .L2
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
mov %eax, %edx
addl $1, %eax
movq (%rsi,%rdx,8), %rdx
movb $0, (%rdx)
cmpl bitmap(%rip), %eax
jb .L3
.L2:
xorl %eax, %eax
ret
.cfi_endproc
.LFE0:
.size main, .-main
.globl bitmap
.bss
.align 8
.type bitmap, @object
.size bitmap, 8
bitmap:
.zero 8
.ident "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
.section .note.GNU-stack,"",@progbits
.file "optimized.cpp"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
movl bitmap(%rip), %eax
testl %eax, %eax
je .L2
subl $1, %eax
leaq 8(,%rax,8), %rcx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
movq (%rsi,%rax), %rdx
addq $8, %rax
cmpq %rcx, %rax
movb $0, (%rdx)
jne .L3
.L2:
xorl %eax, %eax
ret
.cfi_endproc
.LFE0:
.size main, .-main
.globl bitmap
.bss
.align 8
.type bitmap, @object
.size bitmap, 8
bitmap:
.zero 8
.ident "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
.section .note.GNU-stack,"",@progbits
$ diff -uN unoptimized.s optimized.s
--- unoptimized.s 2015-11-24 16:11:55.837922223 +0000
+++ optimized.s 2015-11-24 16:12:02.628922941 +0000
@@ -1,4 +1,4 @@
- .file "unoptimized.cpp"
+ .file "optimized.cpp"
.text
.p2align 4,,15
.globl main
@@ -10,16 +10,17 @@
movl bitmap(%rip), %eax
testl %eax, %eax
je .L2
+ subl $1, %eax
+ leaq 8(,%rax,8), %rcx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
- mov %eax, %edx
- addl $1, %eax
- movq (%rsi,%rdx,8), %rdx
+ movq (%rsi,%rax), %rdx
+ addq $8, %rax
+ cmpq %rcx, %rax
movb $0, (%rdx)
- cmpl bitmap(%rip), %eax
- jb .L3
+ jne .L3
.L2:
xorl %eax, %eax
ret
The generated assembly for the optimized version does actually load (lea
) the width
constant unlike the unoptimized version which computes the width
offset at each iteration (movq
).
When I'll get time, I eventually post some benchmark on that. Good question.