In C++, should I bother to cache variables, or let the compiler do the optimization? (Aliasing)

Yaron Cohen-Tal · Nov 24, 2015 · Viewed 8.3k times

Consider the following code (p is of type unsigned char*, and bitmap->width is of some integer type whose exact identity is unknown and depends on which version of an external library we're using):

for (unsigned x = 0;  x < static_cast<unsigned>(bitmap->width);  ++x)
{
    *p++ = 0xAA;
    *p++ = 0xBB;
    *p++ = 0xCC;
}

Is it worth optimizing it [..]

Could there be a case where writing it as follows yields more efficient results:

unsigned width(static_cast<unsigned>(bitmap->width));
for (unsigned x = 0;  x < width;  ++x)
{
    *p++ = 0xAA;
    *p++ = 0xBB;
    *p++ = 0xCC;
}

... or is this trivial for the compiler to optimize?

What would you consider to be "better" code?

Note from editor (Ike): for those wondering about the strikeout text, the original question, as phrased, was dangerously close to off-topic territory and came very close to being closed despite positive feedback. The offending parts have been struck out. Please do not penalize the answerers who addressed those stricken sections of the question.

Answer

YSC · Nov 24, 2015

At first glance, I thought the compiler could generate equivalent assembly for both versions with optimization enabled. When I checked, I was surprised by the result:

Source unoptimized.cpp

note: this code is not meant to be executed.

struct bitmap_t
{
    long long width;
} bitmap;

int main(int argc, char** argv)
{
    for (unsigned x = 0 ; x < static_cast<unsigned>(bitmap.width) ; ++x)
    {
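        // Write through a char pointer: it may alias bitmap.width, so the
        // compiler must re-read the bound on every iteration.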
        argv[x][0] = '\0';
    }
    return 0;
}

Source optimized.cpp

note: this code is not meant to be executed.

struct bitmap_t
{
    long long width;
} bitmap;

int main(int argc, char** argv)
{
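    // Cache the bound in a local: its address is never taken, so the
    // char writes below cannot alias it.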
    const unsigned width = static_cast<unsigned>(bitmap.width);
    for (unsigned x = 0 ; x < width ; ++x)
    {
        argv[x][0] = '\0';
    }
    return 0;
}

Compilation

  • $ g++ -S -O3 unoptimized.cpp
  • $ g++ -S -O3 optimized.cpp

(The capital -S tells g++ to stop after emitting the .s assembly files; a lowercase -s would merely strip symbols from a linked executable.)

Assembly (unoptimized.s)

    .file   "unoptimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    mov %eax, %edx
    addl    $1, %eax
    movq    (%rsi,%rdx,8), %rdx
    movb    $0, (%rdx)
    cmpl    bitmap(%rip), %eax
    jb  .L3
.L2:
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits

Assembly (optimized.s)

    .file   "optimized.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
    subl    $1, %eax
    leaq    8(,%rax,8), %rcx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L3:
    movq    (%rsi,%rax), %rdx
    addq    $8, %rax
    cmpq    %rcx, %rax
    movb    $0, (%rdx)
    jne .L3
.L2:
    xorl    %eax, %eax
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
.globl bitmap
    .bss
    .align 8
    .type   bitmap, @object
    .size   bitmap, 8
bitmap:
    .zero   8
    .ident  "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-16)"
    .section    .note.GNU-stack,"",@progbits

diff

$ diff -uN unoptimized.s optimized.s
--- unoptimized.s   2015-11-24 16:11:55.837922223 +0000
+++ optimized.s 2015-11-24 16:12:02.628922941 +0000
@@ -1,4 +1,4 @@
-   .file   "unoptimized.cpp"
+   .file   "optimized.cpp"
    .text
    .p2align 4,,15
 .globl main
@@ -10,16 +10,17 @@
    movl    bitmap(%rip), %eax
    testl   %eax, %eax
    je  .L2
+   subl    $1, %eax
+   leaq    8(,%rax,8), %rcx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
 .L3:
-   mov %eax, %edx
-   addl    $1, %eax
-   movq    (%rsi,%rdx,8), %rdx
+   movq    (%rsi,%rax), %rdx
+   addq    $8, %rax
+   cmpq    %rcx, %rax
    movb    $0, (%rdx)
-   cmpl    bitmap(%rip), %eax
-   jb  .L3
+   jne .L3
 .L2:
    xorl    %eax, %eax
    ret

The generated assembly for the optimized version computes the loop bound once, before the loop (the subl/leaq pair that materializes the end offset in %rcx), whereas the unoptimized version re-reads bitmap.width from memory on every iteration (the cmpl bitmap(%rip), %eax inside the loop). The compiler is not allowed to hoist that load itself: the loop body writes through a char pointer, and a char write may alias any object, including bitmap.width.
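To see why the language forces this, here is a contrived but legal sketch of my own (the exact printed value assumes a little-endian machine) in which the char write really does change the loop bound:

#include <cstdio>

struct bitmap_t
{
    unsigned width;
};

int main()
{
    bitmap_t bitmap{3};
    // Legal: an unsigned char* may point into the bytes of any object.
    unsigned char* p = reinterpret_cast<unsigned char*>(&bitmap.width);
    for (unsigned x = 0; x < bitmap.width; ++x)
    {
        *p++ = 1;   // this write modifies bitmap.width itself
    }
    // On little-endian hardware the first write turns width from 3 into 1,
    // so the loop runs exactly once.
    std::printf("width is now %u\n", bitmap.width);
    return 0;
}

Caching the bound in a local, as in optimized.cpp, removes this possibility: a local whose address never escapes cannot be reached through p, so the compiler is free to keep it in a register.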

When I get some time, I'll post a benchmark on that. Good question.
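In the meantime, a minimal harness for such a benchmark could look like the sketch below. Everything here is my own scaffolding, not measured results: the function names, buffer size, and repetition count are arbitrary, and __attribute__((noinline)) assumes g++ or clang (it keeps the optimizer from specializing the two variants away).

#include <chrono>
#include <cstdio>
#include <vector>

struct bitmap_t
{
    long long width;
};

// Variant 1: the bound is re-read from *bitmap on every iteration.
__attribute__((noinline))
void fill_reload(bitmap_t* bitmap, unsigned char* p)
{
    for (unsigned x = 0; x < static_cast<unsigned>(bitmap->width); ++x)
    {
        *p++ = 0xAA; *p++ = 0xBB; *p++ = 0xCC;
    }
}

// Variant 2: the bound is cached in a local before the loop.
__attribute__((noinline))
void fill_cached(bitmap_t* bitmap, unsigned char* p)
{
    const unsigned width = static_cast<unsigned>(bitmap->width);
    for (unsigned x = 0; x < width; ++x)
    {
        *p++ = 0xAA; *p++ = 0xBB; *p++ = 0xCC;
    }
}

template <typename F>
double seconds_for(F f, bitmap_t* bm, unsigned char* buf, int reps)
{
    const auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < reps; ++i)
        f(bm, buf);
    const auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(t1 - t0).count();
}

int main()
{
    bitmap_t bm{1 << 20};                          // 1M "pixels"
    std::vector<unsigned char> buf(3 * bm.width);  // 3 bytes per pixel
    std::printf("reload: %.3fs\n", seconds_for(fill_reload, &bm, buf.data(), 200));
    std::printf("cached: %.3fs\n", seconds_for(fill_cached, &bm, buf.data(), 200));
    return 0;
}

Compile with g++ -O3 and compare; on hardware where the loop is memory-bound the difference may well vanish, which is exactly what a benchmark would settle.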