CUDA compiler produces suboptimal assembly - optimization

I have compiled the following simple test kernel (CUDA 5, sm_20):
__device__ void TestKernel(int *pdata)
{
int a0,b0,c0;
a0 = pdata[0];
b0 = pdata[1];
c0 = a0 + b0;
pdata[2] = c0;
}
and expected something like the following assembly:
LD R3,[R0]
LD R4,[R0+4]
IADD R4,R4,R3
ST [R0+8],R4
but from cuobjdump --dump-sass I see the following, much longer result:
/*0000*/ /*0x10001de428000000*/ MOV R0, R4;
/*0008*/ /*0x00001de428000000*/ MOV R0, R0;
/*0010*/ /*0x00001de428000000*/ MOV R0, R0;
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x0000dc8580000000*/ LD R3, [R0];
/*0028*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0030*/ /*0x10011c034800c000*/ IADD R4, R0, 0x4;
/*0038*/ /*0x10011de428000000*/ MOV R4, R4;
/*0040*/ /*0x00411c8580000000*/ LD R4, [R4];
/*0048*/ /*0x10011de428000000*/ MOV R4, R4;
/*0050*/ /*0x1030dc0348000000*/ IADD R3, R3, R4;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x0000dc8590000000*/ ST [R0], R3;
/*0070*/ /*0x00001de790000000*/ RET;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
The MOV instructions at addresses 8, 10, 18, 28, 38 and 60 look very strange to me;
also, the immediate offset in the load/store instructions isn't used.
So instead of the expected 4 instructions (actually 6, including RET and EXIT) I get 15.
What is the possible reason?

What you are seeing is almost certainly because you are compiling with debugging turned on. If I build your kernel I get this:
$ nvcc -arch=sm_30 -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0010*/ /*0x00009de428004005*/ MOV R2, c [0x0] [0x140];
/*0018*/ /*0x10211c034800c000*/ IADD R4, R2, 0x4;
/*0020*/ /*0x20209c034800c000*/ IADD R2, R2, 0x8;
/*0028*/ /*0x0040dc8580000000*/ LD R3, [R4];
/*0030*/ /*0xf0401c8583ffffff*/ LD R0, [R4+-0x4];
/*0038*/ /*0x00301c0348000000*/ IADD R0, R3, R0;
/*0048*/ /*0x00201c8590000000*/ ST [R2], R0;
/*0050*/ /*0x00001de780000000*/ EXIT;
/*0058*/ /*0xe0001de74003ffff*/ BRA 0x58;
/*0060*/ /*0x00001de440000000*/ NOP CC.T;
/*0068*/ /*0x00001de440000000*/ NOP CC.T;
/*0070*/ /*0x00001de440000000*/ NOP CC.T;
/*0078*/ /*0x00001de440000000*/ NOP CC.T;
.................................
on the other hand, if I build it with debug settings, I get code just like you show:
$ nvcc -arch=sm_30 -G -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
has debug info
compressed
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0000*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0008*/ /*0x00001de218000005*/ MOV32I R0, 0x140;
/*0010*/ /*0x00001c8614000000*/ LDC R0, c [0x0] [R0];
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x00009c8580000000*/ LD R2, [R0];
/*0028*/ /*0x08009de428000000*/ MOV R2, R2;
/*0030*/ /*0x1000dc034800c000*/ IADD R3, R0, 0x4;
/*0038*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0040*/ /*0x0030dc8580000000*/ LD R3, [R3];
/*0048*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0050*/ /*0x0c209c0348000000*/ IADD R2, R2, R3;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x00009c8590000000*/ ST [R0], R2;
/*0070*/ /*0x40001de740000000*/ BRA 0x88;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0x00001de780000000*/ EXIT;
/*0098*/ /*0xe0001de74003ffff*/ BRA 0x98;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
.................................
Which makes me think that your question is "why doesn't the compiler produce optimal code when I disable optimisations and compile for the debugger?", which is something of a rhetorical question, methinks....
EDIT:
And lest there be any doubts that enabling GPU debugging disables compiler optimisation, consider the following output from `nvcc`:
$ nvcc -arch=sm_30 -G -c --dryrun asmprob.cu
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda/bin
#$ _THERE_=/usr/local/cuda/bin
#$ _TARGET_SIZE_=
#$ TOP=/usr/local/cuda/bin/..
#$ PATH=/usr/local/cuda/bin/../open64/bin:/usr/local/cuda/bin/../nvvm:/usr/local/cuda/bin:/opt/local/bin:/opt/local/sbin:/Library/Frameworks/Python.framework/Versions/Current/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/usr/local/git/bin:/usr/texbin:/usr/X11/bin:/usr/NX/bin:/usr/local/bin:/Users/talonmies/bin:/usr/local/cuda/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../lib" -lcudart
#$ CUDAFE_FLAGS=
#$ OPENCC_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii" "asmprob.cu"
#$ cudafe --m32 --gnu_version=40201 -tused --no_remove_unneeded_entities --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu" --nv_arch "compute_30" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu"
#$ cudafe -w --m32 --gnu_version=40201 --c --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu" --nv_arch "compute_30" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDABE__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu"
#$ filehash -s " -g --dont-merge-basicblocks --return-at-end " "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" > "/tmp/tmpxft_00005ceb_00000000-10_asmprob.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii" "asmprob.cu"
#$ cudafe++ --m32 --gnu_version=40201 --parse_templates --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp" --stub_file_name "tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii"
#$ cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 "/tmp/tmpxft_00005ceb_00000000-11_asmprob" "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" -o "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx"
#$ ptxas -arch=sm_30 -m32 -g --dont-merge-basicblocks --return-at-end "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" -o "/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin"
#$ fatbinary --create="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin" -32 --key="xxxxxxxxxx" --ident="asmprob.cu" --cmdline=" -g --dont-merge-basicblocks --return-at-end " -g "--image=profile=sm_30,file=/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin" "--image=profile=compute_30,file=/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" --embedded-fatbin="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" --cuda
#$ rm /tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp"
#$ gcc -c -x c++ "-I/usr/local/cuda/bin/../include" -fpreprocessed -m32 -malign-double -o "asmprob.o" "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii"
Note the device code compilation phase command:
cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 <----
i.e. debugging builds compile with the optimisation level set to 0 (-O0).

Related

STM32F4-Disc1: user defined software delay in keil MDK version 5 not working

I am getting into learning embedded systems and I tried to implement blinky but the software delay gets skipped for some reason. I was expecting it to blink when I am pushing the button but instead the LEDs kept on.
Code I have used is shown below,
#include "Board_LED.h"
#include "Board_Buttons.h"
#include <stdint.h>
void delay(void);
void delay(void) {
int i;
for (i = 0; i < 5000000; i++)
;
}
int main(void) {
LED_Initialize();
Buttons_Initialize();
while (1) {
if (Buttons_GetState() == 1) {
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay();
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay();
}
}
return 0;
}
I'm using board support LED and button APIs.
How do I fix this?
My debugger starts as follows:
The problem here is that this is dead code: it does nothing and interacts with nothing, so it can/should be optimized out. And an optimizer will often do this.
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++);
}
optimized output:
00000000 <delay>:
0: 4770 bx lr
One way is to not optimize
00000000 <delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a04 ldr r2, [pc, #16] ; (28 <delay+0x28>)
16: 4293 cmp r3, r2
18: ddf8 ble.n c <delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46c0 nop ; (mov r8, r8)
1e: 46bd mov sp, r7
20: b002 add sp, #8
22: bc80 pop {r7}
24: bc01 pop {r0}
26: 4700 bx r0
But that's a bit brutal for an embedded platform so another is to beg the compiler to do something with the variable, keep it in memory and up to date:
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
It's still a bit ugly but that will burn some time:
00000000 <delay>:
0: 2300 movs r3, #0
2: b082 sub sp, #8
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 4a05 ldr r2, [pc, #20] ; (20 <delay+0x20>)
a: 4293 cmp r3, r2
c: dc05 bgt.n 1a <delay+0x1a>
e: 9b01 ldr r3, [sp, #4]
10: 3301 adds r3, #1
12: 9301 str r3, [sp, #4]
14: 9b01 ldr r3, [sp, #4]
16: 4293 cmp r3, r2
18: ddf9 ble.n e <delay+0xe>
1a: b002 add sp, #8
1c: 4770 bx lr
1e: 46c0 nop ; (mov r8, r8)
20: 004c4b3f .word 0x004c4b3f
The win-win way is to have another function outside the compile domain and let the optimizer work.
void dummy ( int );
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++) dummy(i);
}
00000000 <delay>:
0: b570 push {r4, r5, r6, lr}
2: 2400 movs r4, #0
4: 4d04 ldr r5, [pc, #16] ; (18 <delay+0x18>)
6: 0020 movs r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bc70 pop {r4, r5, r6}
14: bc01 pop {r0}
16: 4700 bx r0
18: 004c4b40 .word 0x004c4b40
A little cleaner, burns some time but isn't excessive, yes note this is all-thumb-variants code. The called function can simply be a bx lr since you don't care what it does with the call.
00000000 <delay>:
0: b538 push {r3, r4, r5, lr}
2: 2400 movs r4, #0
4: 4d03 ldr r5, [pc, #12] ; (14 <delay+0x14>)
6: 4620 mov r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bd38 pop {r3, r4, r5, pc}
14: 004c4b40 .word 0x004c4b40
Building for the mcu cleans up the pop as after armv4t or 5t you could pop the pc to return to either mode, even though this is thumb mode only you still deal with that with these tools.
Now as shown by others, since you don't care about order just want to count you can, depending on architecture (often this is supported) count down. We are asking the compiler to not make this dead code so it has to do it in the order we asked, to be a functional representation of the C code.
void dummy ( int );
void delay(void)
{
int i=5000000;
while(--i) dummy(i);
}
00000000 <delay>:
0: b510 push {r4, lr}
2: 4c03 ldr r4, [pc, #12] ; (10 <delay+0x10>)
4: 4620 mov r0, r4
6: f7ff fffe bl 0 <dummy>
a: 3c01 subs r4, #1
c: d1fa bne.n 4 <delay+0x4>
e: bd10 pop {r4, pc}
10: 004c4b3f .word 0x004c4b3f
And now the compare went away (i-- vs --i made a difference; i-- makes for more code).
With volatile:
void delay(void)
{
volatile int i=5000000;
while(--i) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 3b01 subs r3, #1
a: 9301 str r3, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
void delay(void)
{
volatile int i=5000000;
while(i--) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 1e5a subs r2, r3, #1
a: 9201 str r2, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
And that doesn't take advantage of the instruction set, oh well. (Being higher or lower one count doesn't matter as this really can't/won't be a tuned loop, to tune it on a platform like this you really need to use asm and even there it is difficult to tune).
Even cleaner just do it in assembly
.globl delay
delay:
ldr r0,=5000000
dinner:
sub r0,#1
bne dinner
bx lr
00000000 <delay>:
0: 4801 ldr r0, [pc, #4] ; (8 <dinner+0x6>)
00000002 <dinner>:
2: 3801 subs r0, #1
4: d1fd bne.n 2 <dinner>
6: 4770 bx lr
8: 004c4b40 .word 0x004c4b40
or make it generic
.globl delay
delay:
sub r0,#1
bne delay
bx lr
00000000 <delay>:
0: 3801 subs r0, #1
2: d1fe bne.n 0 <delay>
4: 4770 bx lr
and then call it from C with
delay(5000000);
Lots of options, but what others didn't show is the code being optimized away and what the choices do to the code. It is quite easy to see in the compiler output using the tools what is going on and why this happened.
And there are various ways to make it or request it to not be dead code. Most people just toss in a volatile and move on. Nothing wrong with that, usually.
Either specify -O0 as the optimization flag in the compiler settings to prevent the useless loop (from the compiler's point of view) from being optimized away.
Alternatively check the MDK or BSP for a provided delay() function known to work.
How did you discover that the loop was skipped (maybe your button function is not working)?
Test it with:
void delay(volatile uint32_t del)
{
while(del--);
}
int main(void)
{
LED_Initialize();
Buttons_Initialize();
while(1){
if( 1 || Buttons_GetState() == 1){ //it skips the if checks
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay(500000);
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay(500000);
}
}
}
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
This should work provided that Buttons_GetState() is working fine. Declared variable 'i' as volatile so that no optimization happens by compiler.

clang/LLVM project level optimization

So periodically I try LLVM as I have this theory it should outperform GNU. And then it sadly doesn't.
Part of the theory has to do with its ability to link modules/objects together and THEN optimize, where normally optimization happens on a per file/object basis.
Instead of using a generic one, I see how to build for a specific default target
rm -rf llvm-project
git clone https://github.com/llvm/llvm-project.git
cd llvm-project
git checkout llvmorg-10.0.0
mkdir build
cd build
cmake -DLLVM_ENABLE_PROJECTS='clang;lld' -DCMAKE_CROSSCOMPILING=True -DCMAKE_INSTALL_PREFIX=/opt/llvm/llvm10armv6m -DLLVM_DEFAULT_TARGET_TRIPLE=armv6m-none-eabi -DLLVM_TARGET_ARCH=ARM -DLLVM_TARGETS_TO_BUILD=ARM -G "Unix Makefiles" ../llvm
make -j 8
make -j 4
make
sudo make install
And the test files
test.c
unsigned int one ( void )
{
return(1);
}
unsigned int two ( void );
unsigned int testone ( void )
{
return(one());
}
unsigned int testtwo ( void )
{
return(two());
}
two.c
unsigned int two ( void )
{
return(2);
}
basic run
clang -O2 -fomit-frame-pointer -c test.c -o test.o
llvm-objdump -D test.o
00000000 one:
0: 01 20 movs r0, #1
2: 70 47 bx lr
00000004 testone:
4: 01 20 movs r0, #1
6: 70 47 bx lr
00000008 testtwo:
8: 80 b5 push {r7, lr}
a: ff f7 fe ff bl #-4
e: 80 bd pop {r7, pc}
as one would expect, one() has been inlined into testone().
The desire is to get testwo() inlined as well.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llc both.bc -o both.s
cat both.s
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
gives
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
and
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
that is worse.
opt -std-link-opts both.bc -o both.opt.bc
same, no better
Now this works
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
movs r0, #1
bx lr
testtwo:
.fnstart
# %bb.0: # %entry
movs r0, #2
bx lr
One would think that not optimizing the parts would give more meat for the optimization of the whole to chew on. Yes? Although this indicates otherwise.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O3 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
movs r0, #1
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
movs r0, #2
pop {r7, pc}
-O3 doesn't help either, and this output is pretty bad: it both calls the function AND inlines its result. What is going on there?!
llvm-dis both.opt.bc
cat both.opt.ll
; ModuleID = 'both.opt.bc'
source_filename = "llvm-link"
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv6m-none-unknown-eabi"
; Function Attrs: noinline nounwind optnone
define dso_local i32 @one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 @testone() local_unnamed_addr #0 {
entry:
%call = call i32 @one()
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 @testtwo() local_unnamed_addr #0 {
entry:
%call = call i32 @two()
ret i32 2
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 @two() local_unnamed_addr #0 {
entry:
ret i32 2
}
How does one undo that?
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llvm-dis both.bc
cat both.ll
opt -O3 both.bc -o both.opt.bc
llvm-dis both.opt.bc
cat both.opt.ll
gives
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 @testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 @two() #2
ret i32 %call
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @two() local_unnamed_addr #0 {
entry:
ret i32 2
}
and
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @testtwo() local_unnamed_addr #0 {
entry:
ret i32 2
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @two() local_unnamed_addr #0 {
entry:
ret i32 2
}
So is it correct that you have to apply the optimizations everywhere, at the file/object level in order to get the project level to optimize?
And then there is the question of tail call or leaf, etc optimization, if nothing else testtwo: even in the first case
clang -O2 -fomit-frame-pointer -c test.c -o test.o
could simply branch to two() and not set up a stack frame nor do any of that. Or is this a thumb thing? Can't the b instruction reach?
one:
0: b8 01 00 00 00 movl $1, %eax
5: c3 retq
testone:
10: b8 01 00 00 00 movl $1, %eax
15: c3 retq
testtwo:
20: e9 00 00 00 00 jmp 0 <testtwo+5>
In gnu the linker patches up any branch reaching or mode issues with trampolines
arm-none-eabi-gcc -c -O2 -mcpu=cortex-m0 test.c -o test.o
arm-none-eabi-objdump -D test.o
00000000 <one>:
0: 2001 movs r0, #1
2: 4770 bx lr
00000004 <testone>:
4: 2001 movs r0, #1
6: 4770 bx lr
00000008 <testtwo>:
8: b510 push {r4, lr}
a: f7ff fffe bl 0 <two>
e: bd10 pop {r4, pc}
Okay I stand corrected...
clang --version
clang version 10.0.0 (https://github.com/llvm/llvm-project.git d32170dbd5b0d54436537b6b75beaf44324e0c28)
Target: armv6m-none-unknown-eabi
Thread model: posix
InstalledDir: /opt/llvm/llvm10armv6m/bin
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I guess the question is if one wants to do a project level optimization using llvm-link and opt, is optimization of each of the individual items required or is there a command line option I am missing. Not interested in compiler specific attributes that go into the source code itself, want the code infected with neither gcc nor llvm specifics.
After gcc 5.x.x the code got more bloated; I was hoping that llvm would have a chance, but whenever I try this (on real projects, not just 10 lines of code) gcc ends up with fewer executed instructions, and/or fewer memory accesses, etc. For simple demonstration functions like the ones above, with some exceptions, they produce the same/equivalent output.
Is there something, another one of the tools, or command line options, that I am missing in order to get more out of clang/llvm?
Is it that this is too trivial of an example for the tool to shine?
EDIT based on answer
clang -c start.s -o start.o
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
clang -O2 -flto=thin -fomit-frame-pointer -c two.c
ld.lld start.o test.o two.o -o test.elf
llvm-objdump -D test.elf
000110fc testtwo:
110fc: 02 20 movs r0, #2
110fe: 70 47 bx lr
00011100 two:
11100: 02 20 movs r0, #2
11102: 70 47 bx lr
so getting rid of the -emit-llvm and using lto basically gives the desired result.
Looking at the bc disassembly
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
llvm-dis test.o
cat test.o.ll
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 @testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 @testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 @two() #3
ret i32 %call
}
enables/adds the tail call. I really dislike using the compiler/shell as a linker (for embedded projects that have their own bootstrap and linker script); lld usage wasn't easy to figure out — I basically couldn't figure it out — but ld.lld also supports the LTO stuff, so that worked out.
The answer is pretty easy actually: one should never want to use llc / opt / llvm-link for performing "end-user" project level kind of optimizations. These are developer-side tools with different defaults, thresholds, etc. Basically, they are just simple command-line frontends to various pieces of LLVM toolbox.
In order to perform the proper link-time-optimization you'd need to use the pipelines that were intended for such task. Basically, compiling everything using "clang -flto" and then linking everything again via "clang -flto" would work. Using LTO-aware linker like lld is a prerequisite as well.
Some further information about ThinLTO could also be found here: https://clang.llvm.org/docs/ThinLTO.html and http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html

ADICUP360 and CrossCore Embedded Studio - error: No source available for "_start() at 0x150"

The demo projects of the ADuCM360 / ADICUP360 from https://wiki.analog.com/resources/eval/user-guides/eval-adicup360 are working fine.
But if I try to create (build project -> debug as) my own project, for example just something like this:
#include <sys/platform.h>
int main (int argc, char *argv[])
{
/* Begin adding your custom code here */
return 0;
}
Then I receive the error message:
No source available for "_start() at 0x150"
When I then press the button View Disassembly, then i get the following result:
00000150: bkpt 0x00ab
00000152: ldr r0, [pc, #192] ; (0x214 <_start+200>)
00000154: ldr r1, [r0, #4]
00000156: cmp r1, #0
00000158: beq.n 0x15e <_start+18>
0000015a: ldr r2, [pc, #212] ; (0x230 <_start+228>)
0000015c: str r1, [r2, #0]
0000015e: ldr r1, [r0, #0]
00000160: cmp r1, #0
00000162: bne.n 0x168 <_start+28>
00000164: ldr r1, [pc, #196] ; (0x22c <_start+224>)
00000166: str r1, [r0, #0]
00000168: ldr r1, [r0, #8]
0000016a: ldr r2, [r0, #12]
0000016c: cmp r1, #0
0000016e: beq.n 0x172 <_start+38>
00000170: mov sp, r1
00000172: cmp r2, #0
00000174: beq.n 0x17a <_start+46>
00000176: add.w r10, r2, #256 ; 0x100
0000017a: movs r1, #0
0000017c: mov r11, r1
0000017e: mov r7, r1
00000180: ldr r0, [pc, #148] ; (0x218 <_start+204>)
00000182: ldr r2, [pc, #152] ; (0x21c <_start+208>)
00000184: subs r2, r2, r0
00000186: bl 0x41c <memset>
0000018a: bl 0x6c0 <initialise_monitor_handles>
0000018e: movs r0, #21
00000190: ldr r1, [pc, #148] ; (0x228 <_start+220>)
00000192: bkpt 0x00ab
00000194: ldr r1, [pc, #144] ; (0x228 <_start+220>)
00000196: ldr r1, [r1, #0]
00000198: movs r0, #0
0000019a: push {r0}
0000019c: ldrb r3, [r1, #0]
0000019e: adds r1, #1
000001a0: cmp r3, #0
000001a2: beq.n 0x1d0 <_start+132>
000001a4: cmp r3, #32
000001a6: beq.n 0x19c <_start+80>
000001a8: cmp r3, #34 ; 0x22
000001aa: beq.n 0x1b0 <_start+100>
000001ac: cmp r3, #39 ; 0x27
000001ae: bne.n 0x1b4 <_start+104>
000001b0: movs r2, r3
000001b2: b.n 0x1b8 <_start+108>
000001b4: movs r2, #32
000001b6: subs r1, #1
000001b8: push {r1}
000001ba: adds r0, #1
000001bc: ldrb r3, [r1, #0]
000001be: adds r1, #1
000001c0: cmp r3, #0
000001c2: beq.n 0x1d0 <_start+132>
......... ...
How can I fix this?
That is the C runtime start-up - possibly provided to you project as object code, hence no source.
Do you need source-level debugging of the start-up code? If so you need to include the source (which is quite possibly in assembly code in any case if it is available to you at all).
Place a break point in or on main() and continue until you are in code for which you can provide the source.
The question is why you have a breakpoint instruction (soft breakpoint) at _start()? Did you set that, or did your debugger automatically insert it?
The CrossCore compiler manual refers to the "System Run-Time Documentation" for details of the runtime start-up, but one of the FAQ's is "Where is the System Run-Time Documentation?".
The better and easy way to start is take working example ( for example blink project),remove contents of main function and add your custom code there.
I understand from your comment that you are debugging the code.
May I know
1. How the behavior is if step over from breakpoint is done?
2. What is the behavior in free run?

ARM disassemble+ Crash at ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}

I found a crash in memcpy() function, which gets called from one of 802.11n specific aggregation function in wifi driver. From the core analysis, the crash point is mentioned below,
0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}
Why do we see a crash after the ldmge instruction execution?
I would like to know which parameter of memcpy() is corrupted - src_addr,dest_addr or length? Could you please provide your inputs by looking into the disassemble code.
Please find the disassemble code of memcpy() from gdb and backtrace from core file.
disas *
Dump of assembler code for function memcpy:
0x01201494 <memcpy+0>: cmp r2, #0 ; 0x0
0x01201498 <memcpy+4>: moveq pc, lr
0x0120149c <memcpy+8>: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
0x012014a0 <memcpy+12>: mov r3, r0
0x012014a4 <memcpy+16>: cmp r2, #16 ; 0x10
0x012014a8 <memcpy+20>: blt 0x12016b8 <mc_bytes>
0x012014ac <memcpy+24>: ands r12, r3, #3 ; 0x3
0x012014b0 <memcpy+28>: beq 0x12014d8 <memcpy+68>
0x012014b4 <memcpy+32>: rsb r12, r12, #4 ; 0x4
0x012014b8 <memcpy+36>: cmp r12, #2 ; 0x2
0x012014bc <memcpy+40>: ldrb r4, [r1], #1
0x012014c0 <memcpy+44>: ldrbge r5, [r1], #1
0x012014c4 <memcpy+48>: ldrbgt r6, [r1], #1
0x012014c8 <memcpy+52>: strb r4, [r3], #1
0x012014cc <memcpy+56>: strbge r5, [r3], #1
0x012014d0 <memcpy+60>: strbgt r6, [r3], #1
0x012014d4 <memcpy+64>: sub r2, r2, r12
0x012014d8 <memcpy+68>: ands r12, r1, #3 ; 0x3
0x012014dc <memcpy+72>: bne 0x120156c <mc_unaligned>
0x012014e0 <memcpy+76>: tst r3, #15 ; 0xf
0x012014e4 <memcpy+80>: ldrne r4, [r1], #4
0x012014e8 <memcpy+84>: subne r2, r2, #4 ; 0x4
0x012014ec <memcpy+88>: strne r4, [r3], #4
0x012014f0 <memcpy+92>: bne 0x12014e0 <memcpy+76>
0x012014f4 <memcpy+96>: cmp r2, #32 ; 0x20
**0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}****
0x012014fc <memcpy+104>: subge r2, r2, #32 ; 0x20
0x01201500 <memcpy+108>: stmiage r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
0x01201504 <memcpy+112>: bge 0x12014f4 <memcpy+96>
0x01201508 <memcpy+116>: cmp r2, #16 ; 0x10
0x0120150c <memcpy+120>: ldmge r1!, {r4, r5, r6, r7}
0x01201510 <memcpy+124>: subge r2, r2, #16 ; 0x10
0x01201514 <memcpy+128>: stmiage r3!, {r4, r5, r6, r7}
0x01201518 <memcpy+132>: tst r2, #8 ; 0x8
0x0120151c <memcpy+136>: beq 0x1201534 <memcpy+160>
0x01201520 <memcpy+140>: ldr r4, [r1], #4
0x01201524 <memcpy+144>: ldr r5, [r1], #4
0x01201528 <memcpy+148>: sub r2, r2, #8 ; 0x8
0x0120152c <memcpy+152>: str r4, [r3], #4
0x01201530 <memcpy+156>: str r5, [r3], #4
0x01201534 <memcpy+160>: tst r2, #4 ; 0x4
0x01201538 <memcpy+164>: ldrne r4, [r1], #4
0x0120153c <memcpy+168>: subne r2, r2, #4 ; 0x4
0x01201540 <memcpy+172>: strne r4, [r3], #4
0x01201544 <memcpy+176>: cmp r2, #0 ; 0x0
0x01201548 <memcpy+180>: beq 0x1201568 <memcpy+212>
0x0120154c <memcpy+184>: cmp r2, #2 ; 0x2
0x01201550 <memcpy+188>: ldrb r4, [r1], #1
0x01201554 <memcpy+192>: ldrbge r5, [r1], #1
0x01201558 <memcpy+196>: ldrbgt r6, [r1], #1
0x0120155c <memcpy+200>: strb r4, [r3], #1
0x01201560 <memcpy+204>: strbge r5, [r3], #1
0x01201564 <memcpy+208>: strbgt r6, [r3], #1
0x01201568 <memcpy+212>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
0x0120156c <mc_unaligned+0>: bic r1, r1, #3 ; 0x3
0x01201570 <mc_unaligned+4>: teq r12, #1 ; 0x1
0x01201574 <mc_unaligned+8>: beq 0x12015e8 <mc_1>
0x01201578 <mc_unaligned+12>: teq r12, #2 ; 0x2
0x0120157c <mc_unaligned+16>: beq 0x1201650 <mc_2>
0x01201580 <mc_3+0>: ldr r8, [r1], #4
0x01201584 <mc_3+4>: cmp r2, #16 ; 0x10
0x01201588 <mc_3+8>: blt 0x12015d8 <mc_3+88>
0x0120158c <mc_3+12>: lsr r4, r8, #24
0x01201590 <mc_3+16>: ldm r1!, {r5, r6, r7, r8}
0x01201594 <mc_3+20>: orr r4, r4, r5, lsl #8
0x01201598 <mc_3+24>: lsr r5, r5, #24
0x0120159c <mc_3+28>: orr r5, r5, r6, lsl #8
0x012015a0 <mc_3+32>: lsr r6, r6, #24
0x012015a4 <mc_3+36>: orr r6, r6, r7, lsl #8
0x012015a8 <mc_3+40>: lsr r7, r7, #24
0x012015ac <mc_3+44>: orr r7, r7, r8, lsl #8
0x012015b0 <mc_3+48>: stmia r3!, {r4, r5, r6, r7}
0x012015b4 <mc_3+52>: sub r2, r2, #16 ; 0x10
0x012015b8 <mc_3+56>: cmp r2, #32 ; 0x20
0x012015bc <mc_3+60>: bge 0x120158c <mc_3+12>
0x012015c0 <mc_3+64>: b 0x12015d8 <mc_3+88>
0x012015c4 <mc_3+68>: lsr r4, r8, #24
0x012015c8 <mc_3+72>: ldr r8, [r1], #4
0x012015cc <mc_3+76>: orr r4, r4, r8, lsl #8
0x012015d0 <mc_3+80>: str r4, [r3], #4
0x012015d4 <mc_3+84>: sub r2, r2, #4 ; 0x4
0x012015d8 <mc_3+88>: cmp r2, #4 ; 0x4
0x012015dc <mc_3+92>: bge 0x12015c4 <mc_3+68>
0x012015e0 <mc_3+96>: sub r1, r1, #1 ; 0x1
0x012015e4 <mc_3+100>: b 0x1201544 <memcpy+176>
0x012015e8 <mc_1+0>: ldr r8, [r1], #4
0x012015ec <mc_1+4>: cmp r2, #16 ; 0x10
0x012015f0 <mc_1+8>: blt 0x1201640 <mc_1+88>
0x012015f4 <mc_1+12>: lsr r4, r8, #8
0x012015f8 <mc_1+16>: ldm r1!, {r5, r6, r7, r8}
0x012015fc <mc_1+20>: orr r4, r4, r5, lsl #24
0x01201600 <mc_1+24>: lsr r5, r5, #8
0x01201604 <mc_1+28>: orr r5, r5, r6, lsl #24
0x01201608 <mc_1+32>: lsr r6, r6, #8
0x0120160c <mc_1+36>: orr r6, r6, r7, lsl #24
0x01201610 <mc_1+40>: lsr r7, r7, #8
0x01201614 <mc_1+44>: orr r7, r7, r8, lsl #24
0x01201618 <mc_1+48>: stmia r3!, {r4, r5, r6, r7}
0x0120161c <mc_1+52>: sub r2, r2, #16 ; 0x10
0x01201620 <mc_1+56>: cmp r2, #32 ; 0x20
0x01201624 <mc_1+60>: bge 0x12015f4 <mc_1+12>
0x01201628 <mc_1+64>: b 0x1201640 <mc_1+88>
0x0120162c <mc_1+68>: lsr r4, r8, #8
0x01201630 <mc_1+72>: ldr r8, [r1], #4
0x01201634 <mc_1+76>: orr r4, r4, r8, lsl #24
0x01201638 <mc_1+80>: str r4, [r3], #4
0x0120163c <mc_1+84>: sub r2, r2, #4 ; 0x4
0x01201640 <mc_1+88>: cmp r2, #4 ; 0x4
0x01201644 <mc_1+92>: bge 0x120162c <mc_1+68>
0x01201648 <mc_1+96>: sub r1, r1, #3 ; 0x3
0x0120164c <mc_1+100>: b 0x1201544 <memcpy+176>
0x01201650 <mc_2+0>: ldr r8, [r1], #4
0x01201654 <mc_2+4>: cmp r2, #16 ; 0x10
0x01201658 <mc_2+8>: blt 0x12016a8 <mc_2+88>
0x0120165c <mc_2+12>: lsr r4, r8, #16
0x01201660 <mc_2+16>: ldm r1!, {r5, r6, r7, r8}
0x01201664 <mc_2+20>: orr r4, r4, r5, lsl #16
0x01201668 <mc_2+24>: lsr r5, r5, #16
0x0120166c <mc_2+28>: orr r5, r5, r6, lsl #16
0x01201670 <mc_2+32>: lsr r6, r6, #16
0x01201674 <mc_2+36>: orr r6, r6, r7, lsl #16
0x01201678 <mc_2+40>: lsr r7, r7, #16
0x0120167c <mc_2+44>: orr r7, r7, r8, lsl #16
0x01201680 <mc_2+48>: stmia r3!, {r4, r5, r6, r7}
0x01201684 <mc_2+52>: sub r2, r2, #16 ; 0x10
0x01201688 <mc_2+56>: cmp r2, #32 ; 0x20
0x0120168c <mc_2+60>: bge 0x120165c <mc_2+12>
0x01201690 <mc_2+64>: b 0x12016a8 <mc_2+88>
0x01201694 <mc_2+68>: lsr r4, r8, #16
0x01201698 <mc_2+72>: ldr r8, [r1], #4
0x0120169c <mc_2+76>: orr r4, r4, r8, lsl #16
0x012016a0 <mc_2+80>: str r4, [r3], #4
0x012016a4 <mc_2+84>: sub r2, r2, #4 ; 0x4
0x012016a8 <mc_2+88>: cmp r2, #4 ; 0x4
0x012016ac <mc_2+92>: bge 0x1201694 <mc_2+68>
0x012016b0 <mc_2+96>: sub r1, r1, #2 ; 0x2
0x012016b4 <mc_2+100>: b 0x1201544 <memcpy+176>
0x012016b8 <mc_bytes+0>: teq r2, #0 ; 0x0
0x012016bc <mc_bytes+4>: ldrbne r12, [r1], #1
0x012016c0 <mc_bytes+8>: strbne r12, [r3], #1
0x012016c4 <mc_bytes+12>: subsne r2, r2, #1 ; 0x1
0x012016c8 <mc_bytes+16>: bne 0x12016bc <mc_bytes+4>
0x012016cc <mc_bytes+20>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
End of assembler dump.
Crash Point:
backtrace *
#0 0x012014f8 in memcpy () from libc.so.3
#1 0x78276c04 in wlan_11n_aggregate_pkt (priv=0x0, pra_list=0x3f5898,
headroom=<value optimized out>, ptrindex=0)
at \mlan\mlan_11n_aggr.c:98
#2 0x002ff4e8 in ?? ()
Please let me know if you need further information.

Why do these simple methods compile differently?

I'm slightly confused as to why clang is emitting different code for the following two methods:
@interface ClassA : NSObject
@end
@implementation ClassA
+ (ClassA*)giveMeAnObject1 {
return [[ClassA alloc] init];
}
+ (id)giveMeAnObject2 {
return [[ClassA alloc] init];
}
@end
If we look at the ARMv7 emitted then we see this, at O3, with ARC enabled:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject1]"
"+[ClassA giveMeAnObject1]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
LPC0_0:
add r1, pc
LPC0_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
LPC0_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autorelease
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject2]"
"+[ClassA giveMeAnObject2]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autoreleaseReturnValue
The only difference is the tail call to objc_autoreleaseReturnValue vs objc_autorelease. To be honest, I would expect both to call objc_autoreleaseReturnValue. In fact, the first method's failure to use objc_autoreleaseReturnValue means it will potentially be slower than the second, because there will definitely be an autorelease followed by a retain in the caller, rather than the faster bypass of this redundant pair that ARC can perform when the runtime supports it.
The LLVM which is emitted gives some kind of reason why it's like that:
define internal %1* @"\01+[ClassA giveMeAnObject1]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* @objc_autorelease(i8* %6) nounwind
%8 = bitcast i8* %6 to %1*
ret %1* %8
}
define internal i8* @"\01+[ClassA giveMeAnObject2]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* @objc_autoreleaseReturnValue(i8* %6) nounwind
ret i8* %6
}
But I'm struggling to see why it's decided to compile these two methods differently. Can anyone shed some light on it?
Update:
Even weirder are these other methods:
+ (ClassA*)giveMeAnObject3 {
ClassA *a = [[ClassA alloc] init];
return a;
}
+ (id)giveMeAnObject4 {
ClassA *a = [[ClassA alloc] init];
return a;
}
These compile to:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject3]"
"+[ClassA giveMeAnObject3]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject4]"
"+[ClassA giveMeAnObject4]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
LPC3_0:
add r1, pc
LPC3_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
This time, they are identical; however, there are a few things which could be optimized even more here:
There's a redundant mov r4, r0 followed by mov r0, r4.
There's a retain followed by a release.
Surely, the bottom bit of both of those methods can turn into:
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
Obviously we could then also omit popping r4 because we don't actually clobber it any more. Then the method would turn into the exact same as giveMeAnObject2 which is exactly what we'd expect.
Why is clang not being clever and doing this?!
This appears to be a bug in the optimizer and is being tracked as rdar://problem/10813093.