GCC is a fine compiler, but... once more

By: Michael S (already5chosen.delete@this.yahoo.com), December 4, 2018 5:54 am
Room: Moderated Discussions
Semiannual gcc bashing was long overdue.

Today's theme is gcc8 is fine compiler, but not quite as good as gcc3.

So, let's start with source code:

#if __GNUC__ > 3
#include <stdint.h>
#else
typedef unsigned uint32_t;
#endif

void memcopy32(uint32_t* restrict dst, const uint32_t* src, unsigned count)
{
unsigned count1 = count & 3;
if (count1 != 0) {
count -= count1;
const uint32_t* lastSrc = &src[count1];
do {
uint32_t w0 = src[0];
src += 1;
dst += 1;
dst[-1] = w0;
} while (src != lastSrc);
}
if (count != 0) {
const uint32_t* lastSrc = &src[count];
do {
uint32_t w0 = src[0];
uint32_t w1 = src[1];
uint32_t w2 = src[2];
uint32_t w3 = src[3];
dst[0] = w0;
dst[1] = w1;
dst[2] = w2;
dst[3] = w3;
src += 4;
dst += 4;
} while (src < lastSrc);
}
}


And here is gcc3 output:
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r3, r6, 3
mov r8, r4
mov r7, r5
beq r3, zero, .L2
slli r2,r3,2
sub r6, r6, r3
add r3, r2, r5
.L3:
ldw r2, 0(r7)
addi r8, r8, 4
addi r7, r7, 4
stw r2, -4(r8)
bne r7, r3, .L3
.L2:
beq r6, zero, .L1
slli r2,r6,2
add r6, r2, r7
.L7:
ldw r2, 0(r7)
ldw r3, 4(r7)
ldw r4, 8(r7)
ldw r5, 12(r7)
addi r7, r7, 16
stw r2, 0(r8)
stw r3, 4(r8)
stw r4, 8(r8)
stw r5, 12(r8)
addi r8, r8, 16
bltu r7, r6, .L7
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (GNU) 3.4.6 (Altera Nios II 9.1 b190)"


As you can see, it literally translates the source. And result is pretty good - small, fast, predictable. As it should be.

Then, starting from gcc4, they decided to become modern.

Output of gcc.4.1.2
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r9, r6, 3
mov r8, r4
mov r7, r5
beq r9, zero, .L2
add r2, r9, r9
add r2, r2, r2
add r3, r5, r2
.L4:
ldw r2, 0(r7)
addi r8, r8, 4
addi r7, r7, 4
stw r2, -4(r8)
bne r3, r7, .L4
sub r6, r6, r9
.L2:
beq r6, zero, .L9
add r2, r6, r6
add r2, r2, r2
add r6, r7, r2
.L8:
ldw r3, 4(r7)
ldw r4, 8(r7)
ldw r5, 12(r7)
ldw r2, 0(r7)
addi r7, r7, 16
stw r3, 4(r8)
stw r2, 0(r8)
stw r4, 8(r8)
stw r5, 12(r8)
addi r8, r8, 16
bltu r7, r6, .L8
.L9:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 11.1sp2 Build 259) 4.1.2"


Still, nearly identical to gcc3, but you can see the first signs of modernity.
The code is little longer. It's probably not measurably slower. Longer, but not slower, is not it beautiful?
Also memory in the second loop accessed in the different pattern from what specified in the source code. Is not it great, to access the memory differently? Not faster, not slower, just differently!

Output of gcc.4.7.3
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
add r9, r2, r2
add r9, r9, r9
sub r6, r6, r2
add r9, r5, r9
mov r3, r4
mov r2, r5
.L3:
ldw r7, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r7, -4(r3)
bne r2, r9, .L3
addi r3, r5, 4
sub r2, r2, r3
srli r2, r2, 2
addi r2, r2, 1
add r2, r2, r2
add r2, r2, r2
add r5, r5, r2
add r4, r4, r2
.L2:
beq r6, zero, .L1
add r6, r6, r6
add r6, r6, r6
add r8, r5, r6
.L5:
ldw r7, 0(r5)
ldw r6, 4(r5)
ldw r3, 8(r5)
ldw r2, 12(r5)
stw r7, 0(r4)
stw r6, 4(r4)
stw r3, 8(r4)
stw r2, 12(r4)
addi r5, r5, 16
addi r4, r4, 16
bltu r5, r8, .L5
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 13.1 Build 162) 4.7.3"


Still not much slower than gcc3, but now significantly longer. And it's became quite difficult to figure out what's exactly goes on right after the end of the first loop. The old code was boring and now observer has a puzzle to solve. That's what we call progress!

Output of gcc.4.8.3
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
add r9, r2, r2
add r9, r9, r9
sub r6, r6, r2
add r9, r5, r9
mov r3, r4
mov r2, r5
.L4:
ldw r7, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r7, -4(r3)
bne r2, r9, .L4
addi r3, r5, 4
sub r2, r2, r3
srli r2, r2, 2
addi r2, r2, 1
add r2, r2, r2
add r2, r2, r2
add r5, r5, r2
add r4, r4, r2
.L2:
beq r6, zero, .L1
add r6, r6, r6
add r6, r6, r6
add r8, r5, r6
.L7:
ldw r7, 0(r5)
ldw r6, 4(r5)
ldw r3, 8(r5)
ldw r2, 12(r5)
stw r7, 0(r4)
stw r6, 4(r4)
stw r3, 8(r4)
stw r2, 12(r4)
addi r5, r5, 16
addi r4, r4, 16
bltu r5, r8, .L7
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 14.1 Build 186) 4.8.3 20140320 (prerelease)"


Virtually the same as the previous one. Seems that in 2013 the progress ran out of steam. But wait...


Output of gcc.4.9.2
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
add r8, r2, r2
add r8, r8, r8
sub r6, r6, r2
add r8, r5, r8
mov r3, r4
mov r2, r5
.L3:
ldw r7, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r7, -4(r3)
bne r2, r8, .L3
addi r3, r5, 4
sub r3, r2, r3
srli r3, r3, 2
mov r5, r2
addi r2, r3, 1
add r2, r2, r2
add r2, r2, r2
add r4, r4, r2
.L2:
beq r6, zero, .L1
add r6, r6, r6
add r6, r6, r6
add r10, r5, r6
addi r9, r5, 4
addi r8, r5, 8
addi r7, r5, 12
addi r6, r4, 4
addi r3, r4, 8
addi r2, r4, 12
.L5:
ldw r14, 0(r5)
ldw r13, 0(r9)
ldw r12, 0(r8)
ldw r11, 0(r7)
stw r14, 0(r4)
stw r13, 0(r6)
stw r12, 0(r3)
stw r11, 0(r2)
addi r5, r5, 16
addi r4, r4, 16
addi r9, r9, 16
addi r8, r8, 16
addi r7, r7, 16
addi r6, r6, 16
addi r3, r3, 16
addi r2, r2, 16
bltu r5, r10, .L5
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 15.1 Build 185) 4.9.2"


Now gcc developers showed us that they are really progressive!
The second loop uses and updates 8 address registers instead of 2. And the inner loop is almost twice slower than before. A real masterpiece.

Output of gcc.5.3.0
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
add r7, r2, r2
add r7, r7, r7
sub r6, r6, r2
add r7, r5, r7
mov r3, r4
mov r2, r5
.L3:
ldw r8, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r8, -4(r3)
bne r7, r2, .L3
addi r2, r5, 4
sub r2, r7, r2
srli r2, r2, 2
addi r2, r2, 1
add r2, r2, r2
add r2, r2, r2
add r5, r5, r2
add r4, r4, r2
.L2:
beq r6, zero, .L1
add r6, r6, r6
add r6, r6, r6
add r10, r5, r6
addi r9, r5, 4
addi r8, r5, 8
addi r7, r5, 12
addi r6, r4, 4
addi r3, r4, 8
addi r2, r4, 12
.L5:
ldw r14, 0(r5)
ldw r13, 0(r9)
ldw r12, 0(r8)
ldw r11, 0(r7)
stw r14, 0(r4)
stw r13, 0(r6)
stw r12, 0(r3)
stw r11, 0(r2)
addi r5, r5, 16
addi r4, r4, 16
addi r9, r9, 16
addi r8, r8, 16
addi r7, r7, 16
addi r6, r6, 16
addi r3, r3, 16
addi r2, r2, 16
bltu r5, r10, .L5
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 16.1 Build 196) 5.3.0"


Not much of excitement here. The code is rather similar to the previous one. Shuffle there, swizzle here...
What is interesting is not the code itself, but the fact that 5.3.0 is still the version shipped by Altera/Intel as official compiler with Nios II EDS 18.1.0.625, i.e. the latest non-premium version of the tools.

Output of gcc.7.2.1
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
slli r7, r2, 2
sub r6, r6, r2
mov r3, r4
mov r2, r5
add r7, r5, r7
.L3:
ldw r8, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r8, -4(r3)
bne r7, r2, .L3
addi r2, r5, 4
sub r2, r7, r2
srli r2, r2, 2
addi r2, r2, 1
slli r2, r2, 2
add r4, r4, r2
add r5, r5, r2
.L2:
beq r6, zero, .L1
slli r6, r6, 2
add r6, r5, r6
.L5:
ldw r8, 0(r5)
ldw r7, 4(r5)
ldw r3, 8(r5)
ldw r2, 12(r5)
stw r8, 0(r4)
stw r7, 4(r4)
stw r3, 8(r4)
stw r2, 12(r4)
addi r5, r5, 16
addi r4, r4, 16
bltu r5, r6, .L5
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (Altera 18.0 Build 219) 7.2.1 20171018"


Very similar to 4.7.3. Only multiplication by 4 is now implemented by left shift instead of couple of adds, as it was back in 2009. Shorter. Same speed.


Output of gcc.8.2.0
	.file	"memcopy32.c"
.section .text
.align 2
.global memcopy32
.type memcopy32, @function
memcopy32:
andi r2, r6, 3
beq r2, zero, .L2
slli r7, r2, 2
sub r6, r6, r2
mov r3, r4
mov r2, r5
add r7, r5, r7
.L3:
ldw r8, 0(r2)
addi r3, r3, 4
addi r2, r2, 4
stw r8, -4(r3)
bne r7, r2, .L3
sub r2, r7, r5
addi r2, r2, -4
srli r2, r2, 2
addi r2, r2, 1
slli r2, r2, 2
add r4, r4, r2
add r5, r5, r2
.L2:
beq r6, zero, .L1
slli r6, r6, 2
add r6, r5, r6
.L5:
ldw r8, 0(r5)
ldw r7, 4(r5)
ldw r3, 8(r5)
ldw r2, 12(r5)
stw r8, 0(r4)
stw r7, 4(r4)
stw r3, 8(r4)
stw r2, 12(r4)
addi r5, r5, 16
addi r4, r4, 16
bltu r5, r6, .L5
.L1:
ret
.size memcopy32, .-memcopy32
.ident "GCC: (GNU) 8.2.0"


Nearly identical to 7.2.1. Just two instructions exchanged places.

So, after 9 years of changes "modern" gcc is almost capable of [after lengthy chain of transoformations] transforming the source code to the pretty much the same source code.
Except of "almost". The code, generated by gcc8 is only marginally slower than the code, generated by gcc3. However it is 26% longer and significantly harder to follow.






 Next Post in Thread >
TopicPosted ByDate
GCC is a fine compiler, but... once moreMichael S2018/12/04 05:54 AM
  GCC is a fine compiler, but... once morerwessel2018/12/04 12:23 PM
    GCC is a fine compiler, but... once moreMichael S2018/12/04 01:50 PM
      GCC is a fine compiler, but... once moreLinus Torvalds2018/12/04 03:32 PM
        GCC is a fine compiler, but... once moreMichael S2018/12/04 04:10 PM
          GCC is a fine compiler, but... once moreLinus2018/12/04 07:07 PM
            GCC is a fine compiler, but... once moreMaynard Handley2018/12/04 07:40 PM
  GCC is a fine compiler, but... once moreTim McCaffrey2018/12/04 02:37 PM
    GCC is a fine compiler, but... once moreWilco2018/12/04 03:52 PM
Reply to this Topic
Name:
Email:
Topic:
Body: No Text
How do you spell green?