Why do these simple methods compile differently? - objective-c

I'm slightly confused as to why clang is emitting different code for the following two methods:
// Minimal class used to compare the code clang emits for two class methods
// whose bodies are identical and which differ only in declared return type.
@interface ClassA : NSObject
@end

@implementation ClassA

// Returns a newly allocated and initialized ClassA; the return type is the
// concrete class pointer.
+ (ClassA*)giveMeAnObject1 {
    return [[ClassA alloc] init];
}

// Same body as giveMeAnObject1, but the return type is declared as id.
+ (id)giveMeAnObject2 {
    return [[ClassA alloc] init];
}

@end
If we look at the ARMv7 emitted then we see this, at O3, with ARC enabled:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject1]"
"+[ClassA giveMeAnObject1]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
LPC0_0:
add r1, pc
LPC0_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
LPC0_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autorelease
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject2]"
"+[ClassA giveMeAnObject2]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autoreleaseReturnValue
The only difference is the tail call to objc_autoreleaseReturnValue vs objc_autorelease. I would expect both to call objc_autoreleaseReturnValue, to be honest. In fact, because the first method does not use objc_autoreleaseReturnValue, it will potentially be slower than the second: there will definitely be an autorelease followed by a retain in the caller, rather than the faster path in which ARC bypasses this redundant pair of calls when the runtime supports it.
The LLVM IR that is emitted gives some indication of why it's like that:
; IR for +[ClassA giveMeAnObject1]. The function returns %1* (a typed pointer),
; so the i8* result of the second objc_msgSend is bitcast before being returned.
; The value is handed to objc_autorelease (not objc_autoreleaseReturnValue).
define internal %1* @"\01+[ClassA giveMeAnObject1]"(i8* nocapture %self, i8* nocapture %_cmd) {
  ; load the class reference and the first selector
  %1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
  %2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
  %3 = bitcast %struct._class_t* %1 to i8*
  ; first message send: receiver is the class, result is %4
  %4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
  %5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
  ; second message send: receiver is the result of the first send
  %6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
  %7 = tail call i8* @objc_autorelease(i8* %6) nounwind
  ; the bitcast of %6 sits between the autorelease call and the ret
  %8 = bitcast i8* %6 to %1*
  ret %1* %8
}
; IR for +[ClassA giveMeAnObject2]. The function returns i8* directly, so no
; bitcast is needed on the result; the value is handed to
; objc_autoreleaseReturnValue immediately before the ret.
define internal i8* @"\01+[ClassA giveMeAnObject2]"(i8* nocapture %self, i8* nocapture %_cmd) {
  ; load the class reference and the first selector
  %1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
  %2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
  %3 = bitcast %struct._class_t* %1 to i8*
  ; first message send: receiver is the class, result is %4
  %4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
  %5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
  ; second message send: receiver is the result of the first send
  %6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
  %7 = tail call i8* @objc_autoreleaseReturnValue(i8* %6) nounwind
  ret i8* %6
}
But I'm struggling to see why it has decided to compile these two methods differently. Can anyone shed some light on it?
Update:
Even weirder is these other methods:
// Variant of giveMeAnObject1: the new ClassA instance is first stored in a
// local variable and then returned. Return type is the concrete class pointer.
+ (ClassA*)giveMeAnObject3 {
// Local that holds the freshly allocated and initialized instance.
ClassA *a = [[ClassA alloc] init];
return a;
}
// Variant of giveMeAnObject2: the new ClassA instance is first stored in a
// local variable and then returned. Return type is declared as id.
+ (id)giveMeAnObject4 {
// Local that holds the freshly allocated and initialized instance.
ClassA *a = [[ClassA alloc] init];
return a;
}
These compile to:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject3]"
"+[ClassA giveMeAnObject3]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject4]"
"+[ClassA giveMeAnObject4]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
LPC3_0:
add r1, pc
LPC3_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
This time they are identical; however, there are a few things which could be optimised even more here:
There's a redundant mov r4, r0 followed by mov r0, r4.
There's a retain followed by a release.
Surely, the bottom bit of both of those methods can turn into:
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
Obviously we could then also omit popping r4 because we don't actually clobber it any more. Then the method would turn into the exact same as giveMeAnObject2 which is exactly what we'd expect.
Why is clang not being clever and doing this?!

This appears to be a bug in the optimizer and is being tracked as rdar://problem/10813093.

Related

STM32F4-Disc1: user defined software delay in keil MDK version 5 not working

I am getting into learning embedded systems and I tried to implement a blinky, but the software delay gets skipped for some reason. I was expecting the LEDs to blink while I am pushing the button, but instead they stayed on.
Code I have used is shown below,
#include Board_LED.h
#include Board_Buttons.h
#include <stdint.h>
void delay(void);
void delay(void) {
int i;
for (i = 0; i < 5000000; i++)
;
}
int main(void) {
LED_Initialize();
Buttons_Initialize();
while (1) {
if (Buttons_GetState() == 1) {
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay();
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay();
}
}
return 0;
}
I'm using board support LED and button APIs.
How do I fix this?
My debugger starts as follows:
The problem here is that this is dead code: it does nothing and interacts with nothing, so it can (and should) be optimized out. And an optimizer will often do this.
/* Delay attempt under discussion: counts i from 0 to 4,999,999 with an empty
 * loop body. Nothing outside the function can observe i. */
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++);
}
optimized output:
00000000 <delay>:
0: 4770 bx lr
One way is to not optimize
00000000 <delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a04 ldr r2, [pc, #16] ; (28 <delay+0x28>)
16: 4293 cmp r3, r2
18: ddf8 ble.n c <delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46c0 nop ; (mov r8, r8)
1e: 46bd mov sp, r7
20: b002 add sp, #8
22: bc80 pop {r7}
24: bc01 pop {r0}
26: 4700 bx r0
But that's a bit brutal for an embedded platform so another is to beg the compiler to do something with the variable, keep it in memory and up to date:
/* Same counting loop, but the counter is declared volatile, so each read and
 * write of i in the loop has to be performed by the generated code. */
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
It's still a bit ugly but that will burn some time:
00000000 <delay>:
0: 2300 movs r3, #0
2: b082 sub sp, #8
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 4a05 ldr r2, [pc, #20] ; (20 <delay+0x20>)
a: 4293 cmp r3, r2
c: dc05 bgt.n 1a <delay+0x1a>
e: 9b01 ldr r3, [sp, #4]
10: 3301 adds r3, #1
12: 9301 str r3, [sp, #4]
14: 9b01 ldr r3, [sp, #4]
16: 4293 cmp r3, r2
18: ddf9 ble.n e <delay+0xe>
1a: b002 add sp, #8
1c: 4770 bx lr
1e: 46c0 nop ; (mov r8, r8)
20: 004c4b3f .word 0x004c4b3f
The win-win way is to have another function outside the compile domain and let the optimizer work.
/* dummy is only declared here; its definition is not part of this snippet. */
void dummy ( int );
/* Counts i from 0 to 4,999,999 and passes each value to dummy(), so the loop
 * body is a call the compiler cannot see into from this snippet. */
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++) dummy(i);
}
00000000 <delay>:
0: b570 push {r4, r5, r6, lr}
2: 2400 movs r4, #0
4: 4d04 ldr r5, [pc, #16] ; (18 <delay+0x18>)
6: 0020 movs r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bc70 pop {r4, r5, r6}
14: bc01 pop {r0}
16: 4700 bx r0
18: 004c4b40 .word 0x004c4b40
A little cleaner, burns some time but isn't excessive, yes note this is all-thumb-variants code. The called function can simply be a bx lr since you don't care what it does with the call.
00000000 <delay>:
0: b538 push {r3, r4, r5, lr}
2: 2400 movs r4, #0
4: 4d03 ldr r5, [pc, #12] ; (14 <delay+0x14>)
6: 4620 mov r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bd38 pop {r3, r4, r5, pc}
14: 004c4b40 .word 0x004c4b40
Building for the mcu cleans up the pop as after armv4t or 5t you could pop the pc to return to either mode, even though this is thumb mode only you still deal with that with these tools.
Now as shown by others, since you don't care about order just want to count you can, depending on architecture (often this is supported) count down. We are asking the compiler to not make this dead code so it has to do it in the order we asked, to be a functional representation of the C code.
/* dummy is only declared here; its definition is not part of this snippet. */
void dummy ( int );
/* Count-down form: i starts at 5,000,000 and is pre-decremented before each
 * test, so dummy() is called with 4,999,999 down to 1 and the loop stops when
 * i reaches 0. */
void delay(void)
{
int i=5000000;
while(--i) dummy(i);
}
00000000 <delay>:
0: b510 push {r4, lr}
2: 4c03 ldr r4, [pc, #12] ; (10 <delay+0x10>)
4: 4620 mov r0, r4
6: f7ff fffe bl 0 <dummy>
a: 3c01 subs r4, #1
c: d1fa bne.n 4 <delay+0x4>
e: bd10 pop {r4, pc}
10: 004c4b3f .word 0x004c4b3f
And now the compare went away (i-- vs --i made a difference i-- makes for more code)
With volatile:
/* Count-down loop on a volatile counter using pre-decrement: i is decremented
 * first and the loop continues while the decremented value is non-zero. */
void delay(void)
{
volatile int i=5000000;
while(--i) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 3b01 subs r3, #1
a: 9301 str r3, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
/* Count-down loop on a volatile counter using post-decrement: the value read
 * before the decrement is tested, so the loop ends on the pass that reads 0
 * (leaving i at -1). */
void delay(void)
{
volatile int i=5000000;
while(i--) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 1e5a subs r2, r3, #1
a: 9201 str r2, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
And that doesn't take advantage of the instruction set, oh well. (Being higher or lower one count doesn't matter as this really can't/won't be a tuned loop, to tune it on a platform like this you really need to use asm and even there it is difficult to tune).
Even cleaner just do it in assembly
/* Hand-written delay: loads a fixed count of 5000000 into r0 and loops,
   subtracting 1 each pass, until the branch condition fails. */
.globl delay
delay:
ldr r0,=5000000
/* loop label */
dinner:
/* the disassembly that follows shows this assembled as the flag-setting subs */
sub r0,#1
bne dinner
bx lr
00000000 <delay>:
0: 4801 ldr r0, [pc, #4] ; (8 <dinner+0x6>)
00000002 <dinner>:
2: 3801 subs r0, #1
4: d1fd bne.n 2 <dinner>
6: 4770 bx lr
8: 004c4b40 .word 0x004c4b40
or make it generic
/* Generic delay: no count is loaded here; r0 is decremented as-is, so the
   caller supplies the count (presumably as the first argument in r0 --
   the C call shown afterwards passes 5000000; confirm against the ABI in use). */
.globl delay
delay:
/* the disassembly that follows shows this assembled as the flag-setting subs */
sub r0,#1
bne delay
bx lr
00000000 <delay>:
0: 3801 subs r0, #1
2: d1fe bne.n 0 <delay>
4: 4770 bx lr
and then call it from C with
delay(5000000);
Lots of options, but what others didn't show is the code being optimized away and what the choices do to the code. It is quite easy to see in the compiler output using the tools what is going on and why this happened.
And there are various ways to make it or request it to not be dead code. Most people just toss in a volatile and move on. Nothing wrong with that, usually.
Either specify -O0 as the optimization flag in the compiler settings, so that the loop (useless from the compiler's point of view) is not optimized away,
or alternatively check the MDK or BSP for a provided delay() function known to work.
How did you discover that the loop was skipped? (Maybe your button function is not working.)
Test it with:
/* Busy-wait that counts the volatile parameter del down with post-decrement;
 * the loop ends on the pass that reads 0. */
void delay(volatile uint32_t del)
{
while(del--);
}
/* Test harness: initializes the LEDs and buttons, then forever switches
 * LEDs 0-3 on, waits delay(500000), switches them off and waits again. */
int main(void)
{
LED_Initialize();
Buttons_Initialize();
while(1){
if( 1 || Buttons_GetState() == 1){ // "1 ||" short-circuits, so the button state is never consulted
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay(500000);
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay(500000);
}
}
}
/* Counting delay with a volatile counter: i runs from 0 to 4,999,999 with an
 * empty loop body. */
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
This should work provided that Buttons_GetState() is working fine. The variable 'i' is declared volatile so that the compiler does not optimize the loop away.

ADICUP360 and CrossCore Embedded Studio - error: No source available for "_start() at 0x150"

The demo projects of the ADuCM360 / ADICUP360 from https://wiki.analog.com/resources/eval/user-guides/eval-adicup360 are working fine.
But if I try to create (build project -> debug as) my own project, for example just something like this:
#include <sys/platform.h>
/* Empty project skeleton: main does nothing except return 0. */
int main (int argc, char *argv[])
{
/* Begin adding your custom code here */
return 0;
}
Then I receive the error message:
No source available for "_start() at 0x150"
When I then press the button View Disassembly, then i get the following result:
00000150: bkpt 0x00ab
00000152: ldr r0, [pc, #192] ; (0x214 <_start+200>)
00000154: ldr r1, [r0, #4]
00000156: cmp r1, #0
00000158: beq.n 0x15e <_start+18>
0000015a: ldr r2, [pc, #212] ; (0x230 <_start+228>)
0000015c: str r1, [r2, #0]
0000015e: ldr r1, [r0, #0]
00000160: cmp r1, #0
00000162: bne.n 0x168 <_start+28>
00000164: ldr r1, [pc, #196] ; (0x22c <_start+224>)
00000166: str r1, [r0, #0]
00000168: ldr r1, [r0, #8]
0000016a: ldr r2, [r0, #12]
0000016c: cmp r1, #0
0000016e: beq.n 0x172 <_start+38>
00000170: mov sp, r1
00000172: cmp r2, #0
00000174: beq.n 0x17a <_start+46>
00000176: add.w r10, r2, #256 ; 0x100
0000017a: movs r1, #0
0000017c: mov r11, r1
0000017e: mov r7, r1
00000180: ldr r0, [pc, #148] ; (0x218 <_start+204>)
00000182: ldr r2, [pc, #152] ; (0x21c <_start+208>)
00000184: subs r2, r2, r0
00000186: bl 0x41c <memset>
0000018a: bl 0x6c0 <initialise_monitor_handles>
0000018e: movs r0, #21
00000190: ldr r1, [pc, #148] ; (0x228 <_start+220>)
00000192: bkpt 0x00ab
00000194: ldr r1, [pc, #144] ; (0x228 <_start+220>)
00000196: ldr r1, [r1, #0]
00000198: movs r0, #0
0000019a: push {r0}
0000019c: ldrb r3, [r1, #0]
0000019e: adds r1, #1
000001a0: cmp r3, #0
000001a2: beq.n 0x1d0 <_start+132>
000001a4: cmp r3, #32
000001a6: beq.n 0x19c <_start+80>
000001a8: cmp r3, #34 ; 0x22
000001aa: beq.n 0x1b0 <_start+100>
000001ac: cmp r3, #39 ; 0x27
000001ae: bne.n 0x1b4 <_start+104>
000001b0: movs r2, r3
000001b2: b.n 0x1b8 <_start+108>
000001b4: movs r2, #32
000001b6: subs r1, #1
000001b8: push {r1}
000001ba: adds r0, #1
000001bc: ldrb r3, [r1, #0]
000001be: adds r1, #1
000001c0: cmp r3, #0
000001c2: beq.n 0x1d0 <_start+132>
......... ...
How can I fix this?
That is the C runtime start-up code - possibly provided to your project as object code, hence no source.
Do you need source-level debugging of the start-up code? If so you need to include the source (which is quite possibly in assembly code in any case if it is available to you at all).
Place a break point in or on main() and continue until you are in code for which you can provide the source.
The question is why you have a breakpoint instruction (soft breakpoint) at _start()? Did you set that, or did your debugger automatically insert it?
The CrossCore compiler manual refers to the "System Run-Time Documentation" for details of the runtime start-up, but one of the FAQ's is "Where is the System Run-Time Documentation?".
The better and easier way to start is to take a working example (for example the blink project), remove the contents of the main function and add your custom code there.
I understand from your comment that you are debugging the code.
May I know
1. What is the behavior if you step over from the breakpoint?
2. What is the behavior in free run?

Why does setting LOCAL_ARM_NEON double the speed without special code-paths?

I'm using the NDK on Android for some heavy numerical code, using the LLVM toolchain.
I've found that if I set LOCAL_ARM_NEON := true in my Android.mk, I get almost a 50% speedup in my code. I have not written any NEON specific source-files, and have no NEON intrinsics in my code. Does this mean that the compiler is automatically injecting NEON instructions into its code?
If this is the case, then because this is all generated in the compiler, I can't wrap the NEON code-paths with a check for hardware support. Is there a best-practice here? Or is LOCAL_ARM_NEON := true fundamentally unsafe?
Further details: (caveat, I'm not very experienced reading ARM assembly)
Comparison of generated assembly.
I find the slower assembly here to be quite readable. The faster assembly I find very challenging to read. I also can't tell if it has NEON-specific instructions in it, since both generated files seem to have vmul instructions, which this page claims are NEON-specific. EDIT: apparently vmul is not NEON-specific.
Slower code (not setting the LOCAL_ARM_NEON flag)
00000000 <_dotProduct>:
0: ed9f 0a08 vldr s0, [pc, #32] ; 24 <_dotProduct+0x24>
4: 2a01 cmp r2, #1
6: bfb8 it lt
8: 4770 bxlt lr
a: ed90 1a00 vldr s2, [r0]
e: 3004 adds r0, #4
10: ed91 2a00 vldr s4, [r1]
14: 3104 adds r1, #4
16: 3a01 subs r2, #1
18: ee22 1a01 vmul.f32 s2, s4, s2
1c: ee31 0a00 vadd.f32 s0, s2, s0
20: d1f3 bne.n a <_dotProduct+0xa>
22: 4770 bx lr
24: 00000000 .word 0x00000000
Faster code (with LOCAL_ARM_NEON := true)
00000000 <_dotProduct>:
0: b510 push {r4, lr}
2: 2a01 cmp r2, #1
4: db1b blt.n 3e <_dotProduct+0x3e>
6: 2a00 cmp r2, #0
8: d01c beq.n 44 <_dotProduct+0x44>
a: efc0 0050 vmov.i32 q8, #0 ; 0x00000000
e: f022 0c03 bic.w ip, r2, #3
12: f1bc 0f00 cmp.w ip, #0
16: d01a beq.n 4e <_dotProduct+0x4e>
18: 46e6 mov lr, ip
1a: 460b mov r3, r1
1c: 4604 mov r4, r0
1e: f964 2a8f vld1.32 {d18-d19}, [r4]
22: f1be 0e04 subs.w lr, lr, #4
26: f104 0410 add.w r4, r4, #16
2a: f963 4a8f vld1.32 {d20-d21}, [r3]
2e: f103 0310 add.w r3, r3, #16
32: ff44 2df2 vmul.f32 q9, q10, q9
36: ef42 0de0 vadd.f32 q8, q9, q8
3a: d1f0 bne.n 1e <_dotProduct+0x1e>
3c: e009 b.n 52 <_dotProduct+0x52>
3e: ef80 0010 vmov.i32 d0, #0 ; 0x00000000
42: bd10 pop {r4, pc}
44: ef80 0010 vmov.i32 d0, #0 ; 0x00000000
48: f04f 0c00 mov.w ip, #0
4c: e00b b.n 66 <_dotProduct+0x66>
4e: f04f 0c00 mov.w ip, #0
52: eff0 28e0 vext.8 q9, q8, q8, #8
56: 4594 cmp ip, r2
58: ef40 0de2 vadd.f32 q8, q8, q9
5c: fffc 2c60 vdup.32 q9, d16[1]
60: ef00 0de2 vadd.f32 q0, q8, q9
64: d011 beq.n 8a <_dotProduct+0x8a>
66: eb01 018c add.w r1, r1, ip, lsl #2
6a: eb00 008c add.w r0, r0, ip, lsl #2
6e: eba2 020c sub.w r2, r2, ip
72: ed90 2a00 vldr s4, [r0]
76: 3004 adds r0, #4
78: ed91 3a00 vldr s6, [r1]
7c: 3104 adds r1, #4
7e: 3a01 subs r2, #1
80: ff43 0d12 vmul.f32 d16, d3, d2
84: ef00 0d80 vadd.f32 d0, d16, d0
88: d1f3 bne.n 72 <_dotProduct+0x72>
8a: bd10 pop {r4, pc}
OK, I'm going to answer this myself based on the helpful comments from @Michael and @Notlikethat. My speedup, then, is because of the NEON instructions (of course).
It appears that setting LOCAL_ARM_NEON := true allows the compiler to generate NEON instructions, even for non .neon files. This will make the code unportable to ARMv7 that does not support NEON.
I think this gives me two choices, one: compile a version of my lib with and without LOCAL_ARM_NEON := true and decide in Java which one to load based on whether the CPU supports NEON.
The second would be to not set LOCAL_ARM_NEON := true, but instead to duplicate my performance-sensitive code-paths into a .c.neon file (which will allow only that file to be compiled with NEON support). Then, in the main file, use the cpufeatures lib to detect NEON support and switch to that file if available.

ARM disassemble+ Crash at ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}

I found a crash in the memcpy() function, which gets called from one of the 802.11n-specific aggregation functions in the Wi-Fi driver. From the core analysis, the crash point is shown below:
0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}
Why do we see a crash after the ldmge instruction execution?
I would like to know which parameter of memcpy() is corrupted - src_addr, dest_addr or length? Could you please provide your inputs after looking at the disassembly?
Please find below the disassembly of memcpy() from gdb and the backtrace from the core file.
disas *
Dump of assembler code for function memcpy:
0x01201494 <memcpy+0>: cmp r2, #0 ; 0x0
0x01201498 <memcpy+4>: moveq pc, lr
0x0120149c <memcpy+8>: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
0x012014a0 <memcpy+12>: mov r3, r0
0x012014a4 <memcpy+16>: cmp r2, #16 ; 0x10
0x012014a8 <memcpy+20>: blt 0x12016b8 <mc_bytes>
0x012014ac <memcpy+24>: ands r12, r3, #3 ; 0x3
0x012014b0 <memcpy+28>: beq 0x12014d8 <memcpy+68>
0x012014b4 <memcpy+32>: rsb r12, r12, #4 ; 0x4
0x012014b8 <memcpy+36>: cmp r12, #2 ; 0x2
0x012014bc <memcpy+40>: ldrb r4, [r1], #1
0x012014c0 <memcpy+44>: ldrbge r5, [r1], #1
0x012014c4 <memcpy+48>: ldrbgt r6, [r1], #1
0x012014c8 <memcpy+52>: strb r4, [r3], #1
0x012014cc <memcpy+56>: strbge r5, [r3], #1
0x012014d0 <memcpy+60>: strbgt r6, [r3], #1
0x012014d4 <memcpy+64>: sub r2, r2, r12
0x012014d8 <memcpy+68>: ands r12, r1, #3 ; 0x3
0x012014dc <memcpy+72>: bne 0x120156c <mc_unaligned>
0x012014e0 <memcpy+76>: tst r3, #15 ; 0xf
0x012014e4 <memcpy+80>: ldrne r4, [r1], #4
0x012014e8 <memcpy+84>: subne r2, r2, #4 ; 0x4
0x012014ec <memcpy+88>: strne r4, [r3], #4
0x012014f0 <memcpy+92>: bne 0x12014e0 <memcpy+76>
0x012014f4 <memcpy+96>: cmp r2, #32 ; 0x20
**0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}**
0x012014fc <memcpy+104>: subge r2, r2, #32 ; 0x20
0x01201500 <memcpy+108>: stmiage r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
0x01201504 <memcpy+112>: bge 0x12014f4 <memcpy+96>
0x01201508 <memcpy+116>: cmp r2, #16 ; 0x10
0x0120150c <memcpy+120>: ldmge r1!, {r4, r5, r6, r7}
0x01201510 <memcpy+124>: subge r2, r2, #16 ; 0x10
0x01201514 <memcpy+128>: stmiage r3!, {r4, r5, r6, r7}
0x01201518 <memcpy+132>: tst r2, #8 ; 0x8
0x0120151c <memcpy+136>: beq 0x1201534 <memcpy+160>
0x01201520 <memcpy+140>: ldr r4, [r1], #4
0x01201524 <memcpy+144>: ldr r5, [r1], #4
0x01201528 <memcpy+148>: sub r2, r2, #8 ; 0x8
0x0120152c <memcpy+152>: str r4, [r3], #4
0x01201530 <memcpy+156>: str r5, [r3], #4
0x01201534 <memcpy+160>: tst r2, #4 ; 0x4
0x01201538 <memcpy+164>: ldrne r4, [r1], #4
0x0120153c <memcpy+168>: subne r2, r2, #4 ; 0x4
0x01201540 <memcpy+172>: strne r4, [r3], #4
0x01201544 <memcpy+176>: cmp r2, #0 ; 0x0
0x01201548 <memcpy+180>: beq 0x1201568 <memcpy+212>
0x0120154c <memcpy+184>: cmp r2, #2 ; 0x2
0x01201550 <memcpy+188>: ldrb r4, [r1], #1
0x01201554 <memcpy+192>: ldrbge r5, [r1], #1
0x01201558 <memcpy+196>: ldrbgt r6, [r1], #1
0x0120155c <memcpy+200>: strb r4, [r3], #1
0x01201560 <memcpy+204>: strbge r5, [r3], #1
0x01201564 <memcpy+208>: strbgt r6, [r3], #1
0x01201568 <memcpy+212>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
0x0120156c <mc_unaligned+0>: bic r1, r1, #3 ; 0x3
0x01201570 <mc_unaligned+4>: teq r12, #1 ; 0x1
0x01201574 <mc_unaligned+8>: beq 0x12015e8 <mc_1>
0x01201578 <mc_unaligned+12>: teq r12, #2 ; 0x2
0x0120157c <mc_unaligned+16>: beq 0x1201650 <mc_2>
0x01201580 <mc_3+0>: ldr r8, [r1], #4
0x01201584 <mc_3+4>: cmp r2, #16 ; 0x10
0x01201588 <mc_3+8>: blt 0x12015d8 <mc_3+88>
0x0120158c <mc_3+12>: lsr r4, r8, #24
0x01201590 <mc_3+16>: ldm r1!, {r5, r6, r7, r8}
0x01201594 <mc_3+20>: orr r4, r4, r5, lsl #8
0x01201598 <mc_3+24>: lsr r5, r5, #24
0x0120159c <mc_3+28>: orr r5, r5, r6, lsl #8
0x012015a0 <mc_3+32>: lsr r6, r6, #24
0x012015a4 <mc_3+36>: orr r6, r6, r7, lsl #8
0x012015a8 <mc_3+40>: lsr r7, r7, #24
0x012015ac <mc_3+44>: orr r7, r7, r8, lsl #8
0x012015b0 <mc_3+48>: stmia r3!, {r4, r5, r6, r7}
0x012015b4 <mc_3+52>: sub r2, r2, #16 ; 0x10
0x012015b8 <mc_3+56>: cmp r2, #32 ; 0x20
0x012015bc <mc_3+60>: bge 0x120158c <mc_3+12>
0x012015c0 <mc_3+64>: b 0x12015d8 <mc_3+88>
0x012015c4 <mc_3+68>: lsr r4, r8, #24
0x012015c8 <mc_3+72>: ldr r8, [r1], #4
0x012015cc <mc_3+76>: orr r4, r4, r8, lsl #8
0x012015d0 <mc_3+80>: str r4, [r3], #4
0x012015d4 <mc_3+84>: sub r2, r2, #4 ; 0x4
0x012015d8 <mc_3+88>: cmp r2, #4 ; 0x4
0x012015dc <mc_3+92>: bge 0x12015c4 <mc_3+68>
0x012015e0 <mc_3+96>: sub r1, r1, #1 ; 0x1
0x012015e4 <mc_3+100>: b 0x1201544 <memcpy+176>
0x012015e8 <mc_1+0>: ldr r8, [r1], #4
0x012015ec <mc_1+4>: cmp r2, #16 ; 0x10
0x012015f0 <mc_1+8>: blt 0x1201640 <mc_1+88>
0x012015f4 <mc_1+12>: lsr r4, r8, #8
0x012015f8 <mc_1+16>: ldm r1!, {r5, r6, r7, r8}
0x012015fc <mc_1+20>: orr r4, r4, r5, lsl #24
0x01201600 <mc_1+24>: lsr r5, r5, #8
0x01201604 <mc_1+28>: orr r5, r5, r6, lsl #24
0x01201608 <mc_1+32>: lsr r6, r6, #8
0x0120160c <mc_1+36>: orr r6, r6, r7, lsl #24
0x01201610 <mc_1+40>: lsr r7, r7, #8
0x01201614 <mc_1+44>: orr r7, r7, r8, lsl #24
0x01201618 <mc_1+48>: stmia r3!, {r4, r5, r6, r7}
0x0120161c <mc_1+52>: sub r2, r2, #16 ; 0x10
0x01201620 <mc_1+56>: cmp r2, #32 ; 0x20
0x01201624 <mc_1+60>: bge 0x12015f4 <mc_1+12>
0x01201628 <mc_1+64>: b 0x1201640 <mc_1+88>
0x0120162c <mc_1+68>: lsr r4, r8, #8
0x01201630 <mc_1+72>: ldr r8, [r1], #4
0x01201634 <mc_1+76>: orr r4, r4, r8, lsl #24
0x01201638 <mc_1+80>: str r4, [r3], #4
0x0120163c <mc_1+84>: sub r2, r2, #4 ; 0x4
0x01201640 <mc_1+88>: cmp r2, #4 ; 0x4
0x01201644 <mc_1+92>: bge 0x120162c <mc_1+68>
0x01201648 <mc_1+96>: sub r1, r1, #3 ; 0x3
0x0120164c <mc_1+100>: b 0x1201544 <memcpy+176>
0x01201650 <mc_2+0>: ldr r8, [r1], #4
0x01201654 <mc_2+4>: cmp r2, #16 ; 0x10
0x01201658 <mc_2+8>: blt 0x12016a8 <mc_2+88>
0x0120165c <mc_2+12>: lsr r4, r8, #16
0x01201660 <mc_2+16>: ldm r1!, {r5, r6, r7, r8}
0x01201664 <mc_2+20>: orr r4, r4, r5, lsl #16
0x01201668 <mc_2+24>: lsr r5, r5, #16
0x0120166c <mc_2+28>: orr r5, r5, r6, lsl #16
0x01201670 <mc_2+32>: lsr r6, r6, #16
0x01201674 <mc_2+36>: orr r6, r6, r7, lsl #16
0x01201678 <mc_2+40>: lsr r7, r7, #16
0x0120167c <mc_2+44>: orr r7, r7, r8, lsl #16
0x01201680 <mc_2+48>: stmia r3!, {r4, r5, r6, r7}
0x01201684 <mc_2+52>: sub r2, r2, #16 ; 0x10
0x01201688 <mc_2+56>: cmp r2, #32 ; 0x20
0x0120168c <mc_2+60>: bge 0x120165c <mc_2+12>
0x01201690 <mc_2+64>: b 0x12016a8 <mc_2+88>
0x01201694 <mc_2+68>: lsr r4, r8, #16
0x01201698 <mc_2+72>: ldr r8, [r1], #4
0x0120169c <mc_2+76>: orr r4, r4, r8, lsl #16
0x012016a0 <mc_2+80>: str r4, [r3], #4
0x012016a4 <mc_2+84>: sub r2, r2, #4 ; 0x4
0x012016a8 <mc_2+88>: cmp r2, #4 ; 0x4
0x012016ac <mc_2+92>: bge 0x1201694 <mc_2+68>
0x012016b0 <mc_2+96>: sub r1, r1, #2 ; 0x2
0x012016b4 <mc_2+100>: b 0x1201544 <memcpy+176>
0x012016b8 <mc_bytes+0>: teq r2, #0 ; 0x0
0x012016bc <mc_bytes+4>: ldrbne r12, [r1], #1
0x012016c0 <mc_bytes+8>: strbne r12, [r3], #1
0x012016c4 <mc_bytes+12>: subsne r2, r2, #1 ; 0x1
0x012016c8 <mc_bytes+16>: bne 0x12016bc <mc_bytes+4>
0x012016cc <mc_bytes+20>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
End of assembler dump.
Crash Point:
backtrace *
#0 0x012014f8 in memcpy () from libc.so.3
#1 0x78276c04 in wlan_11n_aggregate_pkt (priv=0x0, pra_list=0x3f5898,
headroom=<value optimized out>, ptrindex=0)
at \mlan\mlan_11n_aggr.c:98
#2 0x002ff4e8 in ?? ()
Please let me know if you need further information.

why is clang optimization breaking my inline assembly code?

In an attempt to learn something about ARM assembly, I have written a simple test project to perform image downscaling using inline assembly and NEON instructions. You can see it here:
https://github.com/rmaz/NEON-Image-Downscaling
After some effort I managed to get it working, happy days. Except that it only works for optimization levels less than -O2. I have taken a look at the generated ASM, but I cannot see any obvious reason why this should occur. Can anyone offer any insight? Here is the function responsible for the inline assembly part:
// Downscales two vertically adjacent source rows into one destination row using
// NEON: each loop pass loads 8 pixels from each row, averages them vertically and
// then horizontally with vhadd.u8, and stores d0-d1 (16 bytes) to dst.
//   dst          - destination pointer, post-incremented by the store
//   src          - top source row; the bottom row is taken to start at src + pixelsPerRow
//   pixelsPerRow - pixel count for the row, rounded down to a multiple of 8 before the loop
// NOTE(review): "Lresizeloop" is a fixed label inside an inline function; if this asm is
//   emitted more than once in one translation unit the label would be defined twice --
//   a numeric local label may be needed. Confirm.
// NOTE(review): subs updates the condition flags and vst1 writes memory, but the clobber
//   list names only q0-q3 (no "cc" / "memory"). Confirm whether that is intended.
// NOTE(review): if pixelsPerRow is below 8 it rounds to 0 and the first subs wraps, so the
//   loop would keep running well past the row. Confirm callers never pass such a width.
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
// bottom row pointer: pixelsPerRow pixels after the top row (uses the unrounded count)
const uint32_t * rowB = src + pixelsPerRow;
// force the number of pixels per row to a multiple of 8
pixelsPerRow = 8 * (pixelsPerRow / 8);
__asm__ volatile("Lresizeloop: \n" // start loop
"vld1.32 {d0-d3}, [%1]! \n" // load 8 pixels from the top row
"vld1.32 {d4-d7}, [%2]! \n" // load 8 pixels from the bottom row
"vhadd.u8 q0, q0, q2 \n" // average the pixels vertically
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 q0, q2 \n" // transpose to put the horizontally adjacent pixels in different registers
"vtrn.32 q1, q3 \n"
"vhadd.u8 q0, q0, q2 \n" // average the pixels horizontally
"vhadd.u8 q1, q1, q3 \n"
"vtrn.32 d0, d1 \n" // fill the registers with pixels
"vtrn.32 d2, d3 \n"
"vswp d1, d2 \n"
"vst1.64 {d0-d1}, [%0]! \n" // store the result
"subs %3, %3, #8 \n" // subtract 8 from the pixel count
"bne Lresizeloop \n" // repeat until the row is complete
// outputs: %0 = dst, %1 = src, %2 = rowB, %3 = pixelsPerRow
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
// inputs: tied to the same four operands via matching constraints
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
// clobbers: the four NEON q registers used as scratch
: "q0", "q1", "q2", "q3"
);
}
the functioning generated output at O1 for the surrounding function and loop is as follows:
.align 2
.code 16 # #"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
# BB#0:
.loc 1 86 1 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
mov r6, r2
Ltmp43:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
ldr.w r11, [r0]
mov r0, r6
blx _objc_retain
mov r4, r0
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
#DEBUG_VALUE: width <- R5+0
.loc 1 89 21 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
mov r0, r6
mov r1, r11
str r5, [sp, #16] # 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
#DEBUG_VALUE: height <- R10+0
.loc 1 90 26 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] # 4-byte Spill
Ltmp47:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] # 4-byte Spill
Ltmp48:
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
mul r8, r10, r5
Ltmp50:
#DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
#DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] # 4-byte Spill
Ltmp52:
#DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
#DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
#DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
cmp r0, #0
Ltmp56:
#DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
# BB#1: # %.lr.ph
Ltmp57:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: buffer <- R4+0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
ldr r1, [sp, #12] # 4-byte Reload
Ltmp58:
#DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
lsr.w r1, r8, #1
Ltmp61:
mov r6, r0
Ltmp62:
#DEBUG_VALUE: height <- R6+0
mla r0, r1, r10, r5
Ltmp63:
#DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
ldr r2, [sp, #16] # 4-byte Reload
mov r1, r4
Ltmp64:
bl _resizeRow
mov r0, r6
Ltmp65:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 50 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
add.w r8, r8, #2
Ltmp66:
#DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: # %._crit_edge
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
ldr r1, [sp, #4] # 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
mov r6, r5
Ltmp69:
#DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] # 4-byte Reload
ldr r0, [sp, #12] # 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
#DEBUG_VALUE: context <- R4+0
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
blx _CGBitmapContextCreateImage
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
#DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
#DEBUG_VALUE: returnImage <- R0+0
# InlineAsm Start
mov r7, r7 # marker for objc_retainAutoreleaseReturnValue
# InlineAsm End
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
mov r8, r0
.loc 1 112 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] # 4-byte Reload
blx _CFRelease
.loc 1 115 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
# resizeRow(dst, src, pixelsPerRow): compiler output for the C helper whose body is
# the hand-written NEON loop between the InlineAsm markers below.
# On entry r0 = dst, r1 = src, r2 = pixelsPerRow (see the DEBUG_VALUE notes).
# Each loop iteration reads 32 bytes from each of two input pointers (r1 and r3)
# and writes 16 bytes to r0.
.align 2
.code 16 # #resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
# BB#0:
#DEBUG_VALUE: resizeRow:dst <- R0+0
#DEBUG_VALUE: resizeRow:src <- R1+0
#DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
# r3 = src + pixelsPerRow * 4 (rowB). Presumably the start of the next input row,
# assuming 4-byte pixels and no row padding - TODO confirm against the C source.
add.w r3, r1, r2, lsl #2
Ltmp78:
#DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
# Clear the low three bits of the count: round pixelsPerRow down to a multiple of 8.
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
# InlineAsm Start
# NOTE(review): the loop body runs before the counter is tested, so if r2 is 0 here
# the subs below wraps around instead of reaching zero - confirm callers never pass
# fewer than 8 pixels.
Lresizeloop:
# Load 32 bytes from src into q0/q1 and 32 bytes from rowB into q2/q3,
# post-incrementing both pointers.
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
# Per-byte halving add: q0/q1 = (src + rowB) >> 1, i.e. the vertical average.
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
# Interleave 32-bit lanes so that horizontally adjacent values end up in the same
# lane position of two different registers.
vtrn.32 q0, q2
vtrn.32 q1, q3
# Second halving add: averages the horizontally adjacent values paired up above.
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
# Regroup the 32-bit results so that d0 and d2 each hold two finished values,
# then move d2's pair into d1 so d0-d1 form one contiguous 16-byte result.
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
# Store 16 bytes to dst and post-increment it.
vst1.64 {d0-d1}, [r0]!
# Count down by the 8 input pixels consumed; subs also rewrites the condition flags.
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp80:
.loc 1 51 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
And the non-functioning output at -O2 is as follows:
# -[BDPViewController downscaleImageNeon:]: compiler output for the method that
# reads an image's dimensions and pixel data, calls resizeRow once per pair of
# input rows to fill a freshly calloc'd output buffer, wraps that buffer in a
# bitmap context / CGImage, and returns an autoreleased object built from it.
# On entry r0 = self, r1 = _cmd, r2 = image (see the DEBUG_VALUE notes).
.align 2
.code 16 # #"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
# BB#0:
.loc 1 86 1 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
# Prologue: save callee-saved registers, point r7 at the saved r7/lr pair,
# and reserve 20 bytes of stack for spills and the outgoing stack argument.
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
# Load the selector stored at L_OBJC_SELECTOR_REFERENCES_2 into r11 (PC-relative);
# it is reused for every message sent to image below. Presumably -CGImage - the
# selector name itself is not visible in this listing.
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
# Keep image in callee-saved r6 across the calls that follow.
mov r6, r2
Ltmp43:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
ldr.w r11, [r0]
# Retain image for the duration of its use; the retained pointer is kept in r4
# and released after the last message send.
mov r0, r6
blx _objc_retain
mov r4, r0
# width = CGImageGetWidth([image <sel>]) -> r5, also spilled to [sp, #16].
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
#DEBUG_VALUE: width <- R5+0
.loc 1 89 21 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
# height = CGImageGetHeight([image <sel>]) -> r10.
mov r0, r6
mov r1, r11
str r5, [sp, #16] # 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
#DEBUG_VALUE: height <- R10+0
.loc 1 90 26 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
# bytesPerRow = CGImageGetBytesPerRow([image <sel>]) -> [sp, #12].
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] # 4-byte Spill
Ltmp47:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
# imageAlpha = CGImageGetAlphaInfo([image <sel>]) -> [sp, #4].
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] # 4-byte Spill
Ltmp48:
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
# Last message to image; its result replaces image in r6, then the retain taken
# at the top is balanced.
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
# r8 = height * width, used below to size the output buffer.
mul r8, r10, r5
Ltmp50:
#DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
# data = CGDataProviderCopyData(CGImageGetDataProvider(<result in r0>)) -> [sp, #8].
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
#DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] # 4-byte Spill
Ltmp52:
#DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
# buffer = CFDataGetBytePtr(data) -> r4 (the source read pointer for the loop).
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
#DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
# outputBuffer = calloc((width * height) >> 2, 4) -> r5.
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
#DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
# Skip the row loop entirely when height is zero.
cmp r0, #0
Ltmp56:
#DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
# BB#1: # %.lr.ph
Ltmp57:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: buffer <- R4+0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
# Loop setup: r8 = rowIndex = 0, r11 = bytesPerRow * 2 (source advance per
# iteration), r10 = bytesPerRow / 2 (byte stride between destination rows).
ldr r1, [sp, #12] # 4-byte Reload
Ltmp58:
#DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
# destRow = rowIndex >> 1.
lsr.w r1, r8, #1
Ltmp61:
# Park height in callee-saved r6 because r0 is about to carry an argument.
mov r6, r0
Ltmp62:
#DEBUG_VALUE: height <- R6+0
# r0 = outputBuffer + destRow * (bytesPerRow / 2): destination for this row.
mla r0, r1, r10, r5
Ltmp63:
#DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
# resizeRow(dst = r0, src = r4, pixelsPerRow = width). In this listing the helper
# is reached through a real call rather than being inlined into the loop.
ldr r2, [sp, #16] # 4-byte Reload
mov r1, r4
Ltmp64:
bl _resizeRow
mov r0, r6
Ltmp65:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 50 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
# rowIndex += 2; source pointer advances by two input rows.
add.w r8, r8, #2
Ltmp66:
#DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
# Loop while rowIndex < height (unsigned). The compare is issued after the call
# has returned, so flags written inside resizeRow cannot reach this branch.
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: # %._crit_edge
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
# Build the call createBitmapContext(outputBuffer, width / 2, height / 2,
# bytesPerRow / 2, imageAlpha): the first four values go in r0-r3 and imageAlpha
# is written to [sp], presumably as the fifth, stack-passed argument.
ldr r1, [sp, #4] # 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
# Keep outputBuffer in r6 so it can be freed at the end; r5 is reused below.
mov r6, r5
Ltmp69:
#DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] # 4-byte Reload
ldr r0, [sp, #12] # 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
#DEBUG_VALUE: context <- R4+0
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
# scaledImage = CGBitmapContextCreateImage(context) -> r5 (r0 still holds context).
blx _CGBitmapContextCreateImage
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
#DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
# Class message send: receiver loaded from L_OBJC_CLASSLIST_REFERENCES_$_, selector
# from L_OBJC_SELECTOR_REFERENCES_4, with scaledImage as the single argument.
# Neither the class nor the selector name is visible in this listing.
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
#DEBUG_VALUE: returnImage <- R0+0
# InlineAsm Start
mov r7, r7 # marker for objc_retainAutoreleaseReturnValue
# InlineAsm End
# Take ownership of the returned object; the result (returnImage) is kept in r8.
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
mov r8, r0
.loc 1 112 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
# Cleanup: release scaledImage, the bitmap context and the copied data, then free
# the output buffer.
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] # 4-byte Reload
blx _CFRelease
.loc 1 115 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
# Epilogue: put returnImage in r0, unwind the frame, and tail-call
# objc_autoreleaseReturnValue so it returns directly to this method's caller.
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
# resizeRow(dst, src, pixelsPerRow): compiler output for the C helper whose body is
# the hand-written NEON loop between the InlineAsm markers below.
# On entry r0 = dst, r1 = src, r2 = pixelsPerRow (see the DEBUG_VALUE notes).
# Each loop iteration reads 32 bytes from each of two input pointers (r1 and r3)
# and writes 16 bytes to r0.
.align 2
.code 16 # #resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
# BB#0:
#DEBUG_VALUE: resizeRow:dst <- R0+0
#DEBUG_VALUE: resizeRow:src <- R1+0
#DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
# r3 = src + pixelsPerRow * 4 (rowB). Presumably the start of the next input row,
# assuming 4-byte pixels and no row padding - TODO confirm against the C source.
add.w r3, r1, r2, lsl #2
Ltmp78:
#DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
# Clear the low three bits of the count: round pixelsPerRow down to a multiple of 8.
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
# InlineAsm Start
# NOTE(review): the loop body runs before the counter is tested, so if r2 is 0 here
# the subs below wraps around instead of reaching zero - confirm callers never pass
# fewer than 8 pixels.
Lresizeloop:
# Load 32 bytes from src into q0/q1 and 32 bytes from rowB into q2/q3,
# post-incrementing both pointers.
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
# Per-byte halving add: q0/q1 = (src + rowB) >> 1, i.e. the vertical average.
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
# Interleave 32-bit lanes so that horizontally adjacent values end up in the same
# lane position of two different registers.
vtrn.32 q0, q2
vtrn.32 q1, q3
# Second halving add: averages the horizontally adjacent values paired up above.
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
# Regroup the 32-bit results so that d0 and d2 each hold two finished values,
# then move d2's pair into d1 so d0-d1 form one contiguous 16-byte result.
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
# Store 16 bytes to dst and post-increment it.
vst1.64 {d0-d1}, [r0]!
# Count down by the 8 input pixels consumed; subs also rewrites the condition flags.
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp80:
.loc 1 51 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
Here's a snippet of the assembly code I get from your Xcode project with -O2. (Building with -O1 doesn't bother to inline the function, so I'm not surprised it works fine.)
# Excerpt of the caller's row loop with the NEON inline asm placed directly
# inside it (no call to _resizeRow). Watch where the condition flags are written
# and where they are read.
Ltmp55:
#DEBUG_VALUE: rowIndex <- R3+0
.loc 1 101 29 # /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r8, r12
# Loop test for the outer row loop: compares rowIndex (r3) with r11 and sets the
# condition flags. Presumably r11 holds height - not shown in this excerpt.
cmp r3, r11
.loc 1 32 5 # /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
Ltmp56:
# InlineAsm Start
Lresizeloop:
vld1.32 {d0-d3}, [r4]!
vld1.32 {d4-d7}, [r5]!
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r6]!
# subs rewrites the condition flags on every iteration, discarding the result of
# the cmp above.
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp57:
# Branch-if-lower on whatever flags are current: by now those come from the
# final subs inside the inline asm, not from cmp r3, r11.
blo LBB2_2
See that blo (branch-if-lower) instruction on the final line? It is meant to use the condition codes set by the cmp r3, r11 at the top of the assembly block. But your inline assembly has overwritten the condition flags by then: the subs r2, r2, #8 at the bottom of the NEON loop sets them on every iteration, so blo ends up testing the result of the subs instead of the cmp. So is this a compiler bug?... Nope! You just forgot to tell the compiler that your inline assembly code trashes the condition codes. Replace
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3"
);
with
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3", "cc"
);
and the assembly output fixes itself. I haven't run the app, but I bet you'll find it's all better now. :)