clang/LLVM project level optimization - optimization

So periodically I try LLVM as I have this theory it should outperform GNU. And then it sadly doesn't.
Part of the theory has to do with its ability to link modules/objects together and THEN optimize, where normally optimization happens on a per file/object basis.
Instead of using a generic one, I see how to build for a specific default target
rm -rf llvm-project
git clone https://github.com/llvm/llvm-project.git
cd llvm-project
git checkout llvmorg-10.0.0
mkdir build
cd build
cmake -DLLVM_ENABLE_PROJECTS='clang;lld' -DCMAKE_CROSSCOMPILING=True -DCMAKE_INSTALL_PREFIX=/opt/llvm/llvm10armv6m -DLLVM_DEFAULT_TARGET_TRIPLE=armv6m-none-eabi -DLLVM_TARGET_ARCH=ARM -DLLVM_TARGETS_TO_BUILD=ARM -G "Unix Makefiles" ../llvm
make -j 8
make -j 4
make
sudo make install
And the test files
test.c
unsigned int one ( void )
{
return(1);
}
unsigned int two ( void );
unsigned int testone ( void )
{
return(one());
}
unsigned int testtwo ( void )
{
return(two());
}
two.c
unsigned int two ( void )
{
return(2);
}
basic run
clang -O2 -fomit-frame-pointer -c test.c -o test.o
llvm-objdump -D test.o
00000000 one:
0: 01 20 movs r0, #1
2: 70 47 bx lr
00000004 testone:
4: 01 20 movs r0, #1
6: 70 47 bx lr
00000008 testtwo:
8: 80 b5 push {r7, lr}
a: ff f7 fe ff bl #-4
e: 80 bd pop {r7, pc}
as one would expect, one() has been inlined into testone().
The desire is to get testwo() inlined as well.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llc both.bc -o both.s
cat both.s
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
gives
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
and
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
that is worse.
opt -std-link-opts both.bc -o both.opt.bc
same, no better
Now this works
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
movs r0, #1
bx lr
testtwo:
.fnstart
# %bb.0: # %entry
movs r0, #2
bx lr
One would think that not optimizing the parts would give more meat for the optimization of the whole to chew on. Yes? Although this indicates otherwise.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O3 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
movs r0, #1
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
movs r0, #2
pop {r7, pc}
-O3 doesn't help either, and this output is as pretty bad it calls the function AND inlines it. What is going on there?!
llvm-dis both.opt.bc
cat both.opt.ll
; ModuleID = 'both.opt.bc'
source_filename = "llvm-link"
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv6m-none-unknown-eabi"
; Function Attrs: noinline nounwind optnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
%call = call i32 #one()
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #testtwo() local_unnamed_addr #0 {
entry:
%call = call i32 #two()
ret i32 2
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
How does one undo that?
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llvm-dis both.bc
cat both.ll
opt -O3 both.bc -o both.opt.bc
llvm-dis both.opt.bc
cat both.opt.ll
gives
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 #testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 #two() #2
ret i32 %call
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
and
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testtwo() local_unnamed_addr #0 {
entry:
ret i32 2
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
So is it correct that you have to apply the optimizations everywhere, at the file/object level in order to get the project level to optimize?
And then there is the question of tail call or leaf, etc optimization, if nothing else testtwo: even in the first case
clang -O2 -fomit-frame-pointer -c test.c -o test.o
could simply branch to two() and not setup a stack frame not do any of that. Or is this a thumb thing? b cant reach?
one:
0: b8 01 00 00 00 movl $1, %eax
5: c3 retq
testone:
10: b8 01 00 00 00 movl $1, %eax
15: c3 retq
testtwo:
20: e9 00 00 00 00 jmp 0 <testtwo+5>
In gnu the linker patches up any branch reaching or mode issues with trampolines
arm-none-eabi-gcc -c -O2 -mcpu=cortex-m0 test.c -o test.o
arm-none-eabi-objdump -D test.o
00000000 <one>:
0: 2001 movs r0, #1
2: 4770 bx lr
00000004 <testone>:
4: 2001 movs r0, #1
6: 4770 bx lr
00000008 <testtwo>:
8: b510 push {r4, lr}
a: f7ff fffe bl 0 <two>
e: bd10 pop {r4, pc}
Okay I stand corrected...
clang --version
clang version 10.0.0 (https://github.com/llvm/llvm-project.git d32170dbd5b0d54436537b6b75beaf44324e0c28)
Target: armv6m-none-unknown-eabi
Thread model: posix
InstalledDir: /opt/llvm/llvm10armv6m/bin
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I guess the question is if one wants to do a project level optimization using llvm-link and opt, is optimization of each of the individual items required or is there a command line option I am missing. Not interested in compiler specific attributes that go into the source code itself, want the code infected with neither gcc nor llvm specifics.
After gcc 5.x.x the code got more bloated was hoping that llvm would have a chance but whenever I try this (on projects not just 10 lines of code) gcc ends up with fewer executed instructions, and/or fewer memory accesses, etc, etc. For simple demonstration functions like the ones above, with some exceptions they produce the same/equivalent output.
Is there something, another one of the tools, or command line options, that I am missing in order to get more out of clang/llvm?
Is it that this is too trivial of an example for the tool to shine?
EDIT based on answer
clang -c start.s -o start.o
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
clang -O2 -flto=thin -fomit-frame-pointer -c two.c
ld.lld start.o test.o two.o -o test.elf
llvm-objdump -D test.elf
000110fc testtwo:
110fc: 02 20 movs r0, #2
110fe: 70 47 bx lr
00011100 two:
11100: 02 20 movs r0, #2
11102: 70 47 bx lr
so getting rid of the -emit-llvm and using lto basically gives the desired result.
Looking at the bc disassembly
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
llvm-dis test.o
cat test.o.ll
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 #testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 #two() #3
ret i32 %call
}
enables/adds the tail call. I really dislike using the compiler/shell as a linker (for embedded projects that have their own bootstrap and linker script), llvm-ldd usage wasn't easy to figure out or basically couldn't figure out, but ld.lld also supports the tlo stuff, so that worked out.

The answer is pretty easy actually: one should never want to use llc / opt / llvm-link for performing "end-user" project level kind of optimizations. These are developer-side tools with different defaults, thresholds, etc. Basically, they are just simple command-line frontends to various pieces of LLVM toolbox.
In order to perform the proper link-time-optimization you'd need to use the pipelines that were intended for such task. Basically, compiling everything using "clang -flto" and then linking everything again via "clang -flto" would work. Using LTO-aware linker like lld is a prerequisite as well.
Some further information about ThinLTO could also be found here: https://clang.llvm.org/docs/ThinLTO.html and http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html

Related

What is the format of special section in ELF

I try to write a program to generate ELF(based on Arm and execute through qemu-arm). Most format in ELF has been well illustrated
on wiki. But I can't find any spec describe the format of special section(e.g. .text .data(especially what I want to know)).
I tried to put some initialized global variable in .data section. What format should I write in ELF(.data section) if I have global statement like: int a = 10;
There is not special format for .text and .data.
When the static linker links several .o file,
it simply concatenates the .text and .data segments (while resolving relocations)
and places them in the final .so or executable file according to the linker script (see gcc -Wl,-verbose /dev/null).
The .data segment simply contains the initial values of the instanciated global variables.
The .text segment simply contains the machine code of the routines/functions.
Let's take this simple C file:
char x[5] = {0xba, 0xbb, 0xbc, 0xbd, 0xbe};
char f(int i) {
return x[i];
}
Let's compile it:
$ gcc -c -o test.o test.c
Let's dump the .data section, using elfcat:
$ elfcat test.o --section-name .data | xxd
00000000: babb bcbd be .....
We can clearly explain the content of .data section.
Let's dump the .text section:
$ elfcat test.o --section-name .text | xxd
00000000: 5548 89e5 897d fc8b 45fc 4898 488d 1500 UH...}..E.H.H...
00000010: 0000 000f b604 105d c3
Let's decompile this:
$ elfcat test.o --section-name .text > test.text
$ r2 -a x86 -b 64 -qc pd test.text
0x00000000 55 push rbp
0x00000001 4889e5 mov rbp, rsp
0x00000004 897dfc mov dword [rbp - 4], edi
0x00000007 8b45fc mov eax, dword [rbp - 4]
0x0000000a 4898 cdqe
0x0000000c 488d15000000. lea rdx, qword [0x00000013] ; 19
0x00000013 0fb60410 movzx eax, byte [rax + rdx]
0x00000017 5d pop rbp
0x00000018 c3 ret
Again, there is nothing special in the text segment: it only contains the machine code of the routines/functions of my program.
Notice however the relocation and symbol informations in other segments:
$ readelf -a test.o
[ ... ]
Relocation section '.rela.text' at offset 0x1b8 contains 1 entry:
Offset Info Type Sym. Value Sym. Name + Addend
00000000000f 000800000002 R_X86_64_PC32 0000000000000000 x - 4
Relocation section '.rela.eh_frame' at offset 0x1d0 contains 1 entry:
Offset Info Type Sym. Value Sym. Name + Addend
000000000020 000200000002 R_X86_64_PC32 0000000000000000 .text + 0
[...]
Symbol table '.symtab' contains 10 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
1: 0000000000000000 0 FILE LOCAL DEFAULT ABS test.c
2: 0000000000000000 0 SECTION LOCAL DEFAULT 1
3: 0000000000000000 0 SECTION LOCAL DEFAULT 3
4: 0000000000000000 0 SECTION LOCAL DEFAULT 4
5: 0000000000000000 0 SECTION LOCAL DEFAULT 6
6: 0000000000000000 0 SECTION LOCAL DEFAULT 7
7: 0000000000000000 0 SECTION LOCAL DEFAULT 5
8: 0000000000000000 5 OBJECT GLOBAL DEFAULT 3 x
9: 0000000000000000 25 FUNC GLOBAL DEFAULT 1 f

STM32F4-Disc1: user defined software delay in keil MDK version 5 not working

I am getting into learning embedded systems and I tried to implement blinky but the software delay gets skipped for some reason. I was expecting it to blink when I am pushing the button but instead the LEDs kept on.
Code I have used is shown below,
#include Board_LED.h
#include Board_Buttons.h
#include <stdint.h>
void delay(void);
void delay(void) {
int i;
for (i = 0; i < 5000000; i++)
;
}
int main(void) {
LED_Initialize();
Buttons_Initialize();
while (1) {
if (Buttons_GetState() == 1) {
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay();
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay();
}
}
return 0;
}
I'm using board support LED and button APIs.
How do I fix this?
My debugger starts as follows:
The problem here is this is dead code, it does nothing interacts with nothing so can/should be optimized out. And an optimizer will often do this.
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++);
}
optimized output:
00000000 <delay>:
0: 4770 bx lr
One way is to not optimize
00000000 <delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a04 ldr r2, [pc, #16] ; (28 <delay+0x28>)
16: 4293 cmp r3, r2
18: ddf8 ble.n c <delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46c0 nop ; (mov r8, r8)
1e: 46bd mov sp, r7
20: b002 add sp, #8
22: bc80 pop {r7}
24: bc01 pop {r0}
26: 4700 bx r0
But that's a bit brutal for an embedded platform so another is to beg the compiler to do something with the variable, keep it in memory and up to date:
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
It's still a bit ugly but that will burn some time:
00000000 <delay>:
0: 2300 movs r3, #0
2: b082 sub sp, #8
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 4a05 ldr r2, [pc, #20] ; (20 <delay+0x20>)
a: 4293 cmp r3, r2
c: dc05 bgt.n 1a <delay+0x1a>
e: 9b01 ldr r3, [sp, #4]
10: 3301 adds r3, #1
12: 9301 str r3, [sp, #4]
14: 9b01 ldr r3, [sp, #4]
16: 4293 cmp r3, r2
18: ddf9 ble.n e <delay+0xe>
1a: b002 add sp, #8
1c: 4770 bx lr
1e: 46c0 nop ; (mov r8, r8)
20: 004c4b3f .word 0x004c4b3f
The win-win way is to have another function outside the compile domain and let the optimizer work.
void dummy ( int );
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++) dummy(i);
}
00000000 <delay>:
0: b570 push {r4, r5, r6, lr}
2: 2400 movs r4, #0
4: 4d04 ldr r5, [pc, #16] ; (18 <delay+0x18>)
6: 0020 movs r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bc70 pop {r4, r5, r6}
14: bc01 pop {r0}
16: 4700 bx r0
18: 004c4b40 .word 0x004c4b40
A little cleaner, burns some time but isn't excessive, yes note this is all-thumb-variants code. The called function can simply be a bx lr since you don't care what it does with the call.
00000000 <delay>:
0: b538 push {r3, r4, r5, lr}
2: 2400 movs r4, #0
4: 4d03 ldr r5, [pc, #12] ; (14 <delay+0x14>)
6: 4620 mov r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bd38 pop {r3, r4, r5, pc}
14: 004c4b40 .word 0x004c4b40
Building for the mcu cleans up the pop as after armv4t or 5t you could pop the pc to return to either mode, even though this is thumb mode only you still deal with that with these tools.
Now as shown by others, since you don't care about order just want to count you can, depending on architecture (often this is supported) count down. We are asking the compiler to not make this dead code so it has to do it in the order we asked, to be a functional representation of the C code.
void dummy ( int );
void delay(void)
{
int i=5000000;
while(--i) dummy(i);
}
00000000 <delay>:
0: b510 push {r4, lr}
2: 4c03 ldr r4, [pc, #12] ; (10 <delay+0x10>)
4: 4620 mov r0, r4
6: f7ff fffe bl 0 <dummy>
a: 3c01 subs r4, #1
c: d1fa bne.n 4 <delay+0x4>
e: bd10 pop {r4, pc}
10: 004c4b3f .word 0x004c4b3f
And now the compare went away (i-- vs --i made a difference i-- makes for more code)
With volatile:
void delay(void)
{
volatile int i=5000000;
while(--i) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 3b01 subs r3, #1
a: 9301 str r3, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
void delay(void)
{
volatile int i=5000000;
while(i--) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 1e5a subs r2, r3, #1
a: 9201 str r2, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
And that doesn't take advantage of the instruction set, oh well. (Being higher or lower one count doesn't matter as this really can't/won't be a tuned loop, to tune it on a platform like this you really need to use asm and even there it is difficult to tune).
Even cleaner just do it in assembly
.globl delay
delay:
ldr r0,=5000000
dinner:
sub r0,#1
bne dinner
bx lr
00000000 <delay>:
0: 4801 ldr r0, [pc, #4] ; (8 <dinner+0x6>)
00000002 <dinner>:
2: 3801 subs r0, #1
4: d1fd bne.n 2 <dinner>
6: 4770 bx lr
8: 004c4b40 .word 0x004c4b40
or make it generic
.globl delay
delay:
sub r0,#1
bne delay
bx lr
00000000 <delay>:
0: 3801 subs r0, #1
2: d1fe bne.n 0 <delay>
4: 4770 bx lr
and then call it from C with
delay(5000000);
Lots of options, but what others didn't show is the code being optimized away and what the choices do to the code. It is quite easy to see in the compiler output using the tools what is going on and why this happened.
And there are various ways to make it or request it to not be dead code. Most people just toss in a volatile and move on. Nothing wrong with that, usually.
Either specify -O0 as optimization flag in the compiler settings to avoid that the useless loop (from the compiler point of view) is optimized away.
Alternatively check the MDK or BSP for a provided delay() function known to work.
How did you discover that the loop was skipped (maybe your button function is not working)
Test it with:
void delay(volatile uint32_t del)
{
while(del--);
}
int main(void)
{
LED_Initialize();
Buttons_Initialize();
while(1){
if( 1 || Buttons_GetState() == 1){ //it skips the if checks
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay(500000);
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay(500000);
}
}
}
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
This should work provided that Buttons_GetState() is working fine. Declared variable 'i' as volatile so that no optimization happens by compiler.

llvm code optimization options do not work

I am reading about LLVM Code optimization.I tried to apply opt command options on a number of examples but they do not have any effect. For example.Here is a c++ code called deadCode.cpp:
#include<stdio.h>
int square(int x){
return x*x;
}
int main(){
int a=2;
int b=3;
int c=4;
int result =square(a);
printf("%d\n",b);
}
I generated the LLVM IR with clang like this:
clang -emit-llvm -S deadCode.cpp -o deadCodeBefore
and the result file deadCodeBefore content is :
; ModuleID = 'deadCode.cpp'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
#.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 #_Z6squarei(i32 %x) #0 {
%1 = alloca i32, align 4
store i32 %x, i32* %1, align 4
%2 = load i32, i32* %1, align 4
%3 = load i32, i32* %1, align 4
%4 = mul nsw i32 %2, %3
ret i32 %4
}
; Function Attrs: norecurse uwtable
define i32 #main() #1 {
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%result = alloca i32, align 4
store i32 2, i32* %a, align 4
store i32 3, i32* %b, align 4
store i32 4, i32* %c, align 4
%1 = load i32, i32* %a, align 4
%2 = call i32 #_Z6squarei(i32 %1)
store i32 %2, i32* %result, align 4
%3 = load i32, i32* %b, align 4
%4 = call i32 (i8*, ...) #printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* #.str, i32 0, i32 0), i32 %3)
ret i32 0
}
declare i32 #printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
the optimization command I used:
opt -S -adce deadCodeBefore -o deadCodeAfter1
As I read it should remove the call to square function and also the declaration of c variable because they have no effect. But the result is the same. Here is deadCodeAfter1 which is the same as deadCodeBefore:
; ModuleID = 'deadCodeBefore'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
#.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 #_Z6squarei(i32 %x) #0 {
%1 = alloca i32, align 4
store i32 %x, i32* %1, align 4
%2 = load i32, i32* %1, align 4
%3 = load i32, i32* %1, align 4
%4 = mul nsw i32 %2, %3
ret i32 %4
}
; Function Attrs: norecurse uwtable
define i32 #main() #1 {
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%result = alloca i32, align 4
store i32 2, i32* %a, align 4
store i32 3, i32* %b, align 4
store i32 4, i32* %c, align 4
%1 = load i32, i32* %a, align 4
%2 = call i32 #_Z6squarei(i32 %1)
store i32 %2, i32* %result, align 4
%3 = load i32, i32* %b, align 4
%4 = call i32 (i8*, ...) #printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* #.str, i32 0, i32 0), i32 %3)
ret i32 0
}
declare i32 #printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
Because it's doing exactly as it is supposed to do. It checks in the IR if an instruction is being used by some other instruction or not. If not only then it removes it. For example in your code, declaration of variable %a (%a = alloca i32, align 4) is being used in the store instruction store i32 2, i32* %a, align 4
If you had just declare a variable and not assigned any value to it, then adce pass would have eliminated it. You can see that by just declaring a variable like int e; and run the optimization on it.
Usually passes in LLVM are dependent on the output of some other pass in order to be effective. An individual pass on itself might not give you the result that you might have expected it to provide.

CUDA compiler produce unoptimal assembler

I have compiled flowing simple test kernel (CUDA5, sm2.0):
__device__ void TestKernel(int *pdata)
{
int a0,b0,c0;
a0 = pdata[0];
b0 = pdata[1];
c0 = a0 + b0;
pdata[2] = c0;
}
and expect something like flowing assembler:
LD R3,[R0]
LD R4,[R0+4]
IADD R4,R4,R3
ST [R0+8],R4
but from cuobjdump --dump-sass I see flowing much longer result:
/*0000*/ /*0x10001de428000000*/ MOV R0, R4;
/*0008*/ /*0x00001de428000000*/ MOV R0, R0;
/*0010*/ /*0x00001de428000000*/ MOV R0, R0;
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x0000dc8580000000*/ LD R3, [R0];
/*0028*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0030*/ /*0x10011c034800c000*/ IADD R4, R0, 0x4;
/*0038*/ /*0x10011de428000000*/ MOV R4, R4;
/*0040*/ /*0x00411c8580000000*/ LD R4, [R4];
/*0048*/ /*0x10011de428000000*/ MOV R4, R4;
/*0050*/ /*0x1030dc0348000000*/ IADD R3, R3, R4;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x0000dc8590000000*/ ST [R0], R3;
/*0070*/ /*0x00001de790000000*/ RET;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
Very strange to me MOVs instruction in addresses 8,10,18,28,38,60
also the immediate offset in load/store instruction doesn't used.
So instead expected 4 (actually 6 including RET,EXIT) instruction I get 15
What is possible reason?
What you are seeing is almost certainly because you are compiling with debugging turned on. If I build your kernel I get this:
$ nvcc -arch=sm_30 -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0010*/ /*0x00009de428004005*/ MOV R2, c [0x0] [0x140];
/*0018*/ /*0x10211c034800c000*/ IADD R4, R2, 0x4;
/*0020*/ /*0x20209c034800c000*/ IADD R2, R2, 0x8;
/*0028*/ /*0x0040dc8580000000*/ LD R3, [R4];
/*0030*/ /*0xf0401c8583ffffff*/ LD R0, [R4+-0x4];
/*0038*/ /*0x00301c0348000000*/ IADD R0, R3, R0;
/*0048*/ /*0x00201c8590000000*/ ST [R2], R0;
/*0050*/ /*0x00001de780000000*/ EXIT;
/*0058*/ /*0xe0001de74003ffff*/ BRA 0x58;
/*0060*/ /*0x00001de440000000*/ NOP CC.T;
/*0068*/ /*0x00001de440000000*/ NOP CC.T;
/*0070*/ /*0x00001de440000000*/ NOP CC.T;
/*0078*/ /*0x00001de440000000*/ NOP CC.T;
.................................
on the other hand, if I build it with debug settings, I get code just like you show:
$ nvcc -arch=sm_30 -G -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
has debug info
compressed
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0000*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0008*/ /*0x00001de218000005*/ MOV32I R0, 0x140;
/*0010*/ /*0x00001c8614000000*/ LDC R0, c [0x0] [R0];
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x00009c8580000000*/ LD R2, [R0];
/*0028*/ /*0x08009de428000000*/ MOV R2, R2;
/*0030*/ /*0x1000dc034800c000*/ IADD R3, R0, 0x4;
/*0038*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0040*/ /*0x0030dc8580000000*/ LD R3, [R3];
/*0048*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0050*/ /*0x0c209c0348000000*/ IADD R2, R2, R3;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x00009c8590000000*/ ST [R0], R2;
/*0070*/ /*0x40001de740000000*/ BRA 0x88;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0x00001de780000000*/ EXIT;
/*0098*/ /*0xe0001de74003ffff*/ BRA 0x98;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
.................................
Which makes me think that your question is "why doesn't the compiler produce optimal code when I disable optimisations and compile for the debugger?", which is something of a rhetorical question, methinks....
EDIT:
And lest there be any doubts that enabling GPU debugging disables compiler optimisation, consider the following output from ´nvcc´:
$ nvcc -arch=sm_30 -G -c --dryrun asmprob.cu
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda/bin
#$ _THERE_=/usr/local/cuda/bin
#$ _TARGET_SIZE_=
#$ TOP=/usr/local/cuda/bin/..
#$ PATH=/usr/local/cuda/bin/../open64/bin:/usr/local/cuda/bin/../nvvm:/usr/local/cuda/bin:/opt/local/bin:/opt/local/sbin:/Library/Frameworks/Python.framework/Versions/Current/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/usr/local/git/bin:/usr/texbin:/usr/X11/bin:/usr/NX/bin:/usr/local/bin:/Users/talonmies/bin:/usr/local/cuda/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../lib" -lcudart
#$ CUDAFE_FLAGS=
#$ OPENCC_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii" "asmprob.cu"
#$ cudafe --m32 --gnu_version=40201 -tused --no_remove_unneeded_entities --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu" --nv_arch "compute_30" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu"
#$ cudafe -w --m32 --gnu_version=40201 --c --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu" --nv_arch "compute_30" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDABE__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu"
#$ filehash -s " -g --dont-merge-basicblocks --return-at-end " "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" > "/tmp/tmpxft_00005ceb_00000000-10_asmprob.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii" "asmprob.cu"
#$ cudafe++ --m32 --gnu_version=40201 --parse_templates --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp" --stub_file_name "tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii"
#$ cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 "/tmp/tmpxft_00005ceb_00000000-11_asmprob" "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" -o "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx"
#$ ptxas -arch=sm_30 -m32 -g --dont-merge-basicblocks --return-at-end "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" -o "/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin"
#$ fatbinary --create="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin" -32 --key="xxxxxxxxxx" --ident="asmprob.cu" --cmdline=" -g --dont-merge-basicblocks --return-at-end " -g "--image=profile=sm_30,file=/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin" "--image=profile=compute_30,file=/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" --embedded-fatbin="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" --cuda
#$ rm /tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp"
#$ gcc -c -x c++ "-I/usr/local/cuda/bin/../include" -fpreprocessed -m32 -malign-double -o "asmprob.o" "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii"
Note the device code compilation phase command:
cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 <----
ie. debugging builds compile with optimisation set to 0.

Why do these simple methods compile differently?

I'm slightly confused as to why clang is emitting different code for the following two method:
#interface ClassA : NSObject
#end
#implementation ClassA
+ (ClassA*)giveMeAnObject1 {
return [[ClassA alloc] init];
}
+ (id)giveMeAnObject2 {
return [[ClassA alloc] init];
}
#end
If we look at the ARMv7 emitted then we see this, at O3, with ARC enabled:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject1]"
"+[ClassA giveMeAnObject1]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
LPC0_0:
add r1, pc
LPC0_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
LPC0_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autorelease
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject2]"
"+[ClassA giveMeAnObject2]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autoreleaseReturnValue
The only difference is the tail call to objc_autoreleaseReturnValue vs objc_autorelease. I would expect both to call objc_autoreleaseReturnValue to be honest. In-fact the first method not using objc_autoreleaseReturnValue means that it will potentially be slower than the second because there will definitely be an autorelease then a retain by the caller, rather than the faster bypass of this redundant call that ARC can do if it's supported in the runtime.
The LLVM which is emitted gives some kind of reason why it's like that:
define internal %1* #"\01+[ClassA giveMeAnObject1]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** #"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* #objc_autorelease(i8* %6) nounwind
%8 = bitcast i8* %6 to %1*
ret %1* %8
}
define internal i8* #"\01+[ClassA giveMeAnObject2]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** #"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* #objc_autoreleaseReturnValue(i8* %6) nounwind
ret i8* %6
}
But I'm struggling to see why it's decided to compile these two method differently. Can anyone shed some light onto it?
Update:
Even weirder is these other methods:
+ (ClassA*)giveMeAnObject3 {
ClassA *a = [[ClassA alloc] init];
return a;
}
+ (id)giveMeAnObject4 {
ClassA *a = [[ClassA alloc] init];
return a;
}
These compile to:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject3]"
"+[ClassA giveMeAnObject3]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject4]"
"+[ClassA giveMeAnObject4]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
LPC3_0:
add r1, pc
LPC3_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
This time, they are identical however there's a few things which could be optimised even more here:
There's a redundant mov r4, r0 followed by mov r0, r4.
There's a retain followed by a release.
Surely, the bottom bit of both of those methods can turn into:
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
Obviously we could then also omit popping r4 because we don't actually clobber it any more. Then the method would turn into the exact same as giveMeAnObject2 which is exactly what we'd expect.
Why is clang not being clever and doing this?!
This appears to be a bug in the optimizer and is being tracked as rdar://problem/10813093.