How to load a vector from memory with up-sample in Neon with C API - neon

i'm new about Neon. I try find some instructions to do following operation:
// Scalar reference: every element of b is used for two consecutive elements
// of a, i.e. b is "up-sampled" by a factor of two before the subtraction.
int a[8] = {1,2,3,4,5,6,7,8};
int b[4] = {1,2,3,4};
int c[8] = {0};
for (int i = 0; i < 8; i++)
    c[i] = a[i] - b[i/2];
How can i Do that with arm neon, actually how can I load the array with upsample to Neon like {b[0],b[0],b[1],b[1],b[2],b[2],b[3],b[3]}

you can do this by extending b[] vector:
@ Up-sample b by duplicating every 32-bit element inside a 64-bit lane, then
@ subtract from a. ptrA/ptrB/ptrC are placeholders for the registers holding
@ the addresses of a[], b[] and c[]. One pass handles 16 elements of a/c and
@ 8 elements of b.
@ Load 8 elements of b into q10-q11 (d20-d23).
vld1.32 {q10, q11}, [ptrB]!
@ Load 16 elements of a into q12-q15.
vld1.32 {q12, q13}, [ptrA]!
vld1.32 {q14, q15}, [ptrA]!
@ Widen each 32-bit element to 64 bits and shift it left by 32: the value ends
@ up in the upper half of its 64-bit lane, the lower half is zero.
@ q10/q11 are overwritten only after (or while) their own halves are consumed.
vshll.s32 q8, d20, #32
vshll.s32 q9, d21, #32
vshll.s32 q10, d22, #32
vshll.s32 q11, d23, #32
@ lane += lane >> 32: copies the upper half into the empty lower half, so
@ every 64-bit lane now holds {b[k], b[k]}.
vsra.u64 q8, q8, #32
vsra.u64 q9, q9, #32
vsra.u64 q10, q10, #32
vsra.u64 q11, q11, #32
@ c = a - upsampled(b), computed in place in the registers holding a.
vsub.s32 q12, q12, q8
vsub.s32 q13, q13, q9
vsub.s32 q14, q14, q10
vsub.s32 q15, q15, q11
@ Store 16 results.
vst1.32 {q12, q13}, [ptrC]!
vst1.32 {q14, q15}, [ptrC]!
However, it's so much efficient when done with vld2 and vst2 when loading/storing a[] vector:
@ Same computation without duplicating b: de-interleave a instead.
@ ptrA/ptrB/ptrC are placeholders for the registers holding the addresses.
@ Load 8 elements of b: q10 = b[0..3], q11 = b[4..7].
vld1.32 {q10, q11}, [ptrB]!
@ vld2 de-interleaves: the first register of each pair receives the
@ even-indexed elements of a, the second the odd-indexed ones.
vld2.32 {q12, q13}, [ptrA]!
vld2.32 {q14, q15}, [ptrA]!
@ a[2k] and a[2k+1] both need b[k], so the same b register is subtracted from
@ the even and the odd register of each pair.
vsub.s32 q12, q12, q10
vsub.s32 q13, q13, q10
vsub.s32 q14, q14, q11
vsub.s32 q15, q15, q11
@ vst2 re-interleaves even/odd results back into natural order.
vst2.32 {q12, q13}, [ptrC]!
vst2.32 {q14, q15}, [ptrC]!

Related

Compile tensorflow on arm, error: 'asm' operand has impossible constraints

I'm trying to compile tensorflow on my device(directly compile on the board, not cross-cpmpile), but a strange problem happens when compiling gemmlowp.
This is the error log:
In file included from external/gemmlowp/meta/streams.h:293:0,
from external/gemmlowp/meta/quantized_mul_kernels.h:22,
from ./tensorflow/core/kernels/meta_support.h:21,
from tensorflow/core/kernels/meta_support.cc:18:
external/gemmlowp/meta/streams_arm_32.h: In static member function 'static void gemmlowp::meta::GemmExecutorPackLHS::ExecuteDispatch3D(const P&) [with P = gemmlowp::meta::GemmParams<unsigned char, int, gemmlowp::meta::ColumnMajorWithSum, gemmlowp::meta::RowMajorWithSum, gemmlowp::meta::QuantizedStaticPreprocessedAsInt32, gemmlowp::meta::RowMajor>; int m = 1; int n = 8; int k = 8; int m_leftovers = 0; int n_leftovers = 7; int k_leftovers = 4]':
external/gemmlowp/meta/streams_arm_32.h:4211:59: error: can't find a register in class 'LO_REGS' while reloading 'asm'
"d25", "d26", "d27", "d28", "d29", "cc", "memory");
^
external/gemmlowp/meta/streams_arm_32.h:4211:59: error: 'asm' operand has impossible constraints
Target //tensorflow/tools/pip_package:build_pip_package failed to build
INFO: Elapsed time: 183.585s, Critical Path: 179.60s
My compile option is:
bazel build -c opt --copt="-mfpu=neon-vfpv4" --copt="-funsafe-math-optimizations" --copt="-ftree-vectorize" --copt="-fomit-frame-pointer" --local_resources 1536,1.0,1.0 --verbose_failures tensorflow/tools/pip_package:build_pip_package
chip info: Cortex-A17
processor : 3
model name : ARMv7 Processor rev 1 (v7l)
BogoMIPS : 48.00
Features : swp half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 evtstrm
CPU implementer : 0x41
CPU architecture: 7
CPU variant : 0x0
CPU part : 0xc0d
CPU revision : 1
OS:Ubuntu 14.04 32bit
streams_arm_32.h [line 4101 - 4212]
// Specialization of Stream::Pack for <uint8_t, 7, 8, 4, RowMajorWithSum>.
//
// Reads seven input rows (`in` plus six further rows, each `params.stride`
// bytes after the previous one), copies them to `out` in chunks of 8 bytes per
// row, and keeps a running per-row sum of all bytes read. After the data, it
// appends eight 32-bit values computed as
//   row_sum * params.multiplicative_sum_offset + params.additive_sum_offset
// (the eighth value repeats the seventh row's sum).
//
// in:     first input row; advanced by the asm.
// params: supplies count, stride, multiplicative_sum_offset and
//         additive_sum_offset.
// out:    destination; the stores carry a 64-bit alignment qualifier (`:64`).
//
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so params.count - 4 is presumably a positive multiple of 8 - confirm
// against the caller.
// NOTE(review): the asm names r0-r5 as clobbers and additionally needs six
// general-purpose operand registers (count, in, out, stride and the two
// offsets).
template <>
inline void Stream<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack(
const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
std::cout << __FILE__ << "(" << __LINE__
<< ") RowMajorWithSum<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack()"
<< std::endl
<< std::flush;
#endif
#endif
// The asm decrements the counter, so work on a local copy of params.count.
int params_count_copy = params.count;
asm volatile(
// r0..r5 = addresses of rows 1..6 (row 0 is %[in]).
"add r0, %[in], %[stride]\n"
"add r1, r0, %[stride]\n"
"add r2, r1, %[stride]\n"
"add r3, r2, %[stride]\n"
"add r4, r3, %[stride]\n"
"add r5, r4, %[stride]\n"
// Reduce count by leftovers.
"sub %[count], %[count], #4\n"
// q8..q14: one vector of 16-bit sum accumulators per row, cleared to zero.
"vmov.i16 q8, #0\n"
"vmov.i16 q9, #0\n"
"vmov.i16 q10, #0\n"
"vmov.i16 q11, #0\n"
"vmov.i16 q12, #0\n"
"vmov.i16 q13, #0\n"
"vmov.i16 q14, #0\n"
// Main loop: 8 bytes per row per iteration; the flags set by `subs` are
// consumed by the `bne` at the bottom.
"1:"
"subs %[count], %[count], #8\n"
// Load Aggregate Store: 7x8.
"vld1.32 {d0}, [%[in]]!\n"
"vld1.32 {d1}, [r0]!\n"
"vld1.32 {d2}, [r1]!\n"
"vld1.32 {d3}, [r2]!\n"
"vld1.32 {d4}, [r3]!\n"
"vld1.32 {d5}, [r4]!\n"
"vld1.32 {d6}, [r5]!\n"
// Widening add: each byte is added to the matching 16-bit accumulator.
"vaddw.u8 q8, q8, d0\n"
"vaddw.u8 q9, q9, d1\n"
"vaddw.u8 q10, q10, d2\n"
"vaddw.u8 q11, q11, d3\n"
"vaddw.u8 q12, q12, d4\n"
"vaddw.u8 q13, q13, d5\n"
"vaddw.u8 q14, q14, d6\n"
// Write the 7 x 8 bytes just read (56 bytes) to the output.
"vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n"
"vst1.32 {d4, d5, d6}, [%[out]:64]!\n"
"bne 1b\n"
// Load Aggregate Store: 7x4.
// Leftover step: clear d0..d6 first so that the bytes not loaded below are
// stored (and summed) as zeros.
"vmov.i8 d0, #0\n"
"vmov.i8 d1, #0\n"
"vmov.i8 d2, #0\n"
"vmov.i8 d3, #0\n"
"vmov.i8 d4, #0\n"
"vmov.i8 d5, #0\n"
"vmov.i8 d6, #0\n"
// Load the remaining 4 bytes of each row into lane 0.
"vld1.32 {d0[0]}, [%[in]]!\n"
"vld1.32 {d1[0]}, [r0]!\n"
"vld1.32 {d2[0]}, [r1]!\n"
"vld1.32 {d3[0]}, [r2]!\n"
"vld1.32 {d4[0]}, [r3]!\n"
"vld1.32 {d5[0]}, [r4]!\n"
"vld1.32 {d6[0]}, [r5]!\n"
"vaddw.u8 q8, q8, d0\n"
"vaddw.u8 q9, q9, d1\n"
"vaddw.u8 q10, q10, d2\n"
"vaddw.u8 q11, q11, d3\n"
"vaddw.u8 q12, q12, d4\n"
"vaddw.u8 q13, q13, d5\n"
"vaddw.u8 q14, q14, d6\n"
"vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n"
"vst1.32 {d4, d5, d6}, [%[out]:64]!\n"
// Aggregator Reduction.
// d0[0] = multiplier, q1 = additive offset broadcast to all four lanes.
"vmov.32 d0[0], %[multiplicative_sum_offset]\n"
"vdup.32 q1, %[additive_sum_offset]\n"
// Pairwise-add the 16-bit partial sums into 32-bit partial sums.
"vpaddl.u16 q8, q8\n"
"vpaddl.u16 q9, q9\n"
"vpaddl.u16 q10, q10\n"
"vpaddl.u16 q11, q11\n"
"vpaddl.u16 q12, q12\n"
"vpaddl.u16 q13, q13\n"
"vpaddl.u16 q14, q14\n"
// Reduce each row's four 32-bit partial sums to two ...
"vpadd.u32 d16, d16, d17\n"
"vpadd.u32 d18, d18, d19\n"
"vpadd.u32 d20, d20, d21\n"
"vpadd.u32 d22, d22, d23\n"
"vpadd.u32 d24, d24, d25\n"
"vpadd.u32 d26, d26, d27\n"
"vpadd.u32 d28, d28, d29\n"
// ... and then to one per row, packed as q8 = rows 0..3 and
// q9 = rows 4, 5, 6, 6 (the last row's sum fills both lanes of d19).
"vpadd.u32 d16, d16, d18\n"
"vpadd.u32 d17, d20, d22\n"
"vpadd.u32 d18, d24, d26\n"
"vpadd.u32 d19, d28, d28\n"
// sum * multiplicative_sum_offset + additive_sum_offset
"vmul.i32 q8, q8, d0[0]\n"
"vmul.i32 q9, q9, d0[0]\n"
"vadd.i32 q8, q8, q1\n"
"vadd.i32 q9, q9, q1\n"
// Append the eight 32-bit results after the packed data (no write-back).
"vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n"
: [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
: [stride] "r"(params.stride),
[multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
[additive_sum_offset] "r"(params.additive_sum_offset)
: "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5",
"d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24",
"d25", "d26", "d27", "d28", "d29", "cc", "memory");
}
I followed the setup from https://github.com/samjabrahams/tensorflow-on-raspberry-pi/blob/master/GUIDE.md
I would greatly appreciated for any suggestions.

ARM disassemble+ Crash at ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}

I found a crash in memcpy() function, which gets called from one of 802.11n specific aggregation function in wifi driver. From the core analysis, the crash point is mentioned below,
0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}
Why do we see a crash after the ldmge instruction execution?
I would like to know which parameter of memcpy() is corrupted - src_addr,dest_addr or length? Could you please provide your inputs by looking into the disassemble code.
Please find the disassemble code of memcpy() from gdb and backtrace from core file.
disas *
Dump of assembler code for function memcpy:
0x01201494 <memcpy+0>: cmp r2, #0 ; 0x0
0x01201498 <memcpy+4>: moveq pc, lr
0x0120149c <memcpy+8>: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
0x012014a0 <memcpy+12>: mov r3, r0
0x012014a4 <memcpy+16>: cmp r2, #16 ; 0x10
0x012014a8 <memcpy+20>: blt 0x12016b8 <mc_bytes>
0x012014ac <memcpy+24>: ands r12, r3, #3 ; 0x3
0x012014b0 <memcpy+28>: beq 0x12014d8 <memcpy+68>
0x012014b4 <memcpy+32>: rsb r12, r12, #4 ; 0x4
0x012014b8 <memcpy+36>: cmp r12, #2 ; 0x2
0x012014bc <memcpy+40>: ldrb r4, [r1], #1
0x012014c0 <memcpy+44>: ldrbge r5, [r1], #1
0x012014c4 <memcpy+48>: ldrbgt r6, [r1], #1
0x012014c8 <memcpy+52>: strb r4, [r3], #1
0x012014cc <memcpy+56>: strbge r5, [r3], #1
0x012014d0 <memcpy+60>: strbgt r6, [r3], #1
0x012014d4 <memcpy+64>: sub r2, r2, r12
0x012014d8 <memcpy+68>: ands r12, r1, #3 ; 0x3
0x012014dc <memcpy+72>: bne 0x120156c <mc_unaligned>
0x012014e0 <memcpy+76>: tst r3, #15 ; 0xf
0x012014e4 <memcpy+80>: ldrne r4, [r1], #4
0x012014e8 <memcpy+84>: subne r2, r2, #4 ; 0x4
0x012014ec <memcpy+88>: strne r4, [r3], #4
0x012014f0 <memcpy+92>: bne 0x12014e0 <memcpy+76>
0x012014f4 <memcpy+96>: cmp r2, #32 ; 0x20
**0x012014f8 <memcpy+100>: ldmge r1!, {r4, r5, r6, r7, r8, r9, r10, r11}****
0x012014fc <memcpy+104>: subge r2, r2, #32 ; 0x20
0x01201500 <memcpy+108>: stmiage r3!, {r4, r5, r6, r7, r8, r9, r10, r11}
0x01201504 <memcpy+112>: bge 0x12014f4 <memcpy+96>
0x01201508 <memcpy+116>: cmp r2, #16 ; 0x10
0x0120150c <memcpy+120>: ldmge r1!, {r4, r5, r6, r7}
0x01201510 <memcpy+124>: subge r2, r2, #16 ; 0x10
0x01201514 <memcpy+128>: stmiage r3!, {r4, r5, r6, r7}
0x01201518 <memcpy+132>: tst r2, #8 ; 0x8
0x0120151c <memcpy+136>: beq 0x1201534 <memcpy+160>
0x01201520 <memcpy+140>: ldr r4, [r1], #4
0x01201524 <memcpy+144>: ldr r5, [r1], #4
0x01201528 <memcpy+148>: sub r2, r2, #8 ; 0x8
0x0120152c <memcpy+152>: str r4, [r3], #4
0x01201530 <memcpy+156>: str r5, [r3], #4
0x01201534 <memcpy+160>: tst r2, #4 ; 0x4
0x01201538 <memcpy+164>: ldrne r4, [r1], #4
0x0120153c <memcpy+168>: subne r2, r2, #4 ; 0x4
0x01201540 <memcpy+172>: strne r4, [r3], #4
0x01201544 <memcpy+176>: cmp r2, #0 ; 0x0
0x01201548 <memcpy+180>: beq 0x1201568 <memcpy+212>
0x0120154c <memcpy+184>: cmp r2, #2 ; 0x2
0x01201550 <memcpy+188>: ldrb r4, [r1], #1
0x01201554 <memcpy+192>: ldrbge r5, [r1], #1
0x01201558 <memcpy+196>: ldrbgt r6, [r1], #1
0x0120155c <memcpy+200>: strb r4, [r3], #1
0x01201560 <memcpy+204>: strbge r5, [r3], #1
0x01201564 <memcpy+208>: strbgt r6, [r3], #1
0x01201568 <memcpy+212>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
0x0120156c <mc_unaligned+0>: bic r1, r1, #3 ; 0x3
0x01201570 <mc_unaligned+4>: teq r12, #1 ; 0x1
0x01201574 <mc_unaligned+8>: beq 0x12015e8 <mc_1>
0x01201578 <mc_unaligned+12>: teq r12, #2 ; 0x2
0x0120157c <mc_unaligned+16>: beq 0x1201650 <mc_2>
0x01201580 <mc_3+0>: ldr r8, [r1], #4
0x01201584 <mc_3+4>: cmp r2, #16 ; 0x10
0x01201588 <mc_3+8>: blt 0x12015d8 <mc_3+88>
0x0120158c <mc_3+12>: lsr r4, r8, #24
0x01201590 <mc_3+16>: ldm r1!, {r5, r6, r7, r8}
0x01201594 <mc_3+20>: orr r4, r4, r5, lsl #8
0x01201598 <mc_3+24>: lsr r5, r5, #24
0x0120159c <mc_3+28>: orr r5, r5, r6, lsl #8
0x012015a0 <mc_3+32>: lsr r6, r6, #24
0x012015a4 <mc_3+36>: orr r6, r6, r7, lsl #8
0x012015a8 <mc_3+40>: lsr r7, r7, #24
0x012015ac <mc_3+44>: orr r7, r7, r8, lsl #8
0x012015b0 <mc_3+48>: stmia r3!, {r4, r5, r6, r7}
0x012015b4 <mc_3+52>: sub r2, r2, #16 ; 0x10
0x012015b8 <mc_3+56>: cmp r2, #32 ; 0x20
0x012015bc <mc_3+60>: bge 0x120158c <mc_3+12>
0x012015c0 <mc_3+64>: b 0x12015d8 <mc_3+88>
0x012015c4 <mc_3+68>: lsr r4, r8, #24
0x012015c8 <mc_3+72>: ldr r8, [r1], #4
0x012015cc <mc_3+76>: orr r4, r4, r8, lsl #8
0x012015d0 <mc_3+80>: str r4, [r3], #4
0x012015d4 <mc_3+84>: sub r2, r2, #4 ; 0x4
0x012015d8 <mc_3+88>: cmp r2, #4 ; 0x4
0x012015dc <mc_3+92>: bge 0x12015c4 <mc_3+68>
0x012015e0 <mc_3+96>: sub r1, r1, #1 ; 0x1
0x012015e4 <mc_3+100>: b 0x1201544 <memcpy+176>
0x012015e8 <mc_1+0>: ldr r8, [r1], #4
0x012015ec <mc_1+4>: cmp r2, #16 ; 0x10
0x012015f0 <mc_1+8>: blt 0x1201640 <mc_1+88>
0x012015f4 <mc_1+12>: lsr r4, r8, #8
0x012015f8 <mc_1+16>: ldm r1!, {r5, r6, r7, r8}
0x012015fc <mc_1+20>: orr r4, r4, r5, lsl #24
0x01201600 <mc_1+24>: lsr r5, r5, #8
0x01201604 <mc_1+28>: orr r5, r5, r6, lsl #24
0x01201608 <mc_1+32>: lsr r6, r6, #8
0x0120160c <mc_1+36>: orr r6, r6, r7, lsl #24
0x01201610 <mc_1+40>: lsr r7, r7, #8
0x01201614 <mc_1+44>: orr r7, r7, r8, lsl #24
0x01201618 <mc_1+48>: stmia r3!, {r4, r5, r6, r7}
0x0120161c <mc_1+52>: sub r2, r2, #16 ; 0x10
0x01201620 <mc_1+56>: cmp r2, #32 ; 0x20
0x01201624 <mc_1+60>: bge 0x12015f4 <mc_1+12>
0x01201628 <mc_1+64>: b 0x1201640 <mc_1+88>
0x0120162c <mc_1+68>: lsr r4, r8, #8
0x01201630 <mc_1+72>: ldr r8, [r1], #4
0x01201634 <mc_1+76>: orr r4, r4, r8, lsl #24
0x01201638 <mc_1+80>: str r4, [r3], #4
0x0120163c <mc_1+84>: sub r2, r2, #4 ; 0x4
0x01201640 <mc_1+88>: cmp r2, #4 ; 0x4
0x01201644 <mc_1+92>: bge 0x120162c <mc_1+68>
0x01201648 <mc_1+96>: sub r1, r1, #3 ; 0x3
0x0120164c <mc_1+100>: b 0x1201544 <memcpy+176>
0x01201650 <mc_2+0>: ldr r8, [r1], #4
0x01201654 <mc_2+4>: cmp r2, #16 ; 0x10
0x01201658 <mc_2+8>: blt 0x12016a8 <mc_2+88>
0x0120165c <mc_2+12>: lsr r4, r8, #16
0x01201660 <mc_2+16>: ldm r1!, {r5, r6, r7, r8}
0x01201664 <mc_2+20>: orr r4, r4, r5, lsl #16
0x01201668 <mc_2+24>: lsr r5, r5, #16
0x0120166c <mc_2+28>: orr r5, r5, r6, lsl #16
0x01201670 <mc_2+32>: lsr r6, r6, #16
0x01201674 <mc_2+36>: orr r6, r6, r7, lsl #16
0x01201678 <mc_2+40>: lsr r7, r7, #16
0x0120167c <mc_2+44>: orr r7, r7, r8, lsl #16
0x01201680 <mc_2+48>: stmia r3!, {r4, r5, r6, r7}
0x01201684 <mc_2+52>: sub r2, r2, #16 ; 0x10
0x01201688 <mc_2+56>: cmp r2, #32 ; 0x20
0x0120168c <mc_2+60>: bge 0x120165c <mc_2+12>
0x01201690 <mc_2+64>: b 0x12016a8 <mc_2+88>
0x01201694 <mc_2+68>: lsr r4, r8, #16
0x01201698 <mc_2+72>: ldr r8, [r1], #4
0x0120169c <mc_2+76>: orr r4, r4, r8, lsl #16
0x012016a0 <mc_2+80>: str r4, [r3], #4
0x012016a4 <mc_2+84>: sub r2, r2, #4 ; 0x4
0x012016a8 <mc_2+88>: cmp r2, #4 ; 0x4
0x012016ac <mc_2+92>: bge 0x1201694 <mc_2+68>
0x012016b0 <mc_2+96>: sub r1, r1, #2 ; 0x2
0x012016b4 <mc_2+100>: b 0x1201544 <memcpy+176>
0x012016b8 <mc_bytes+0>: teq r2, #0 ; 0x0
0x012016bc <mc_bytes+4>: ldrbne r12, [r1], #1
0x012016c0 <mc_bytes+8>: strbne r12, [r3], #1
0x012016c4 <mc_bytes+12>: subsne r2, r2, #1 ; 0x1
0x012016c8 <mc_bytes+16>: bne 0x12016bc <mc_bytes+4>
0x012016cc <mc_bytes+20>: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
End of assembler dump.
Crash Point:
backtrace *
#0 0x012014f8 in memcpy () from libc.so.3
#1 0x78276c04 in wlan_11n_aggregate_pkt (priv=0x0, pra_list=0x3f5898,
headroom=<value optimized out>, ptrindex=0)
at \mlan\mlan_11n_aggr.c:98
#2 0x002ff4e8 in ?? ()
Please let me know if you need further information.

How can I optimize this code with ARM NEON?

I'm trying to optimize some code in order to reduce as much as possible execution times.
This is the code:
// Bit position inside the current 32-bit output word.
int shifter = 0;
// now iterate through all the pairings
UINT32_ALIAS* ptr2 = (UINT32_ALIAS*)ptr;
const BriskShortPair* max = shortPairs_ + noShortPairs_;
for (BriskShortPair* iter = shortPairs_; iter < max; ++iter) {
    t1 = *(_values + iter->i);
    t2 = *(_values + iter->j);
    if (t1 > t2) {
        // Use an unsigned constant: shifting a signed 1 left by 31 overflows
        // int, whereas 1u << 31 is well defined.
        *ptr2 |= (1u << shifter);
    } // else already initialized with zero
    // take care of the iterators:
    ++shifter;
    if (shifter == 32) {
        // Current word is full: continue with bit 0 of the next word.
        shifter = 0;
        ++ptr2;
    }
}
I was wondering if it's possible to somehow parallelize this using NEON.
Is it possible?
Thank you
EDIT: The context of this code is the BRISK features detector (http://www.asl.ethz.ch/people/lestefan/personal/BRISK)
I'm trying to optimize this code for an ARM architecture.
The piece of code I'm referring to has the following structure:
-an external for cycle that scans a certain number of points
-for each one of these points there a certain number of other points around it (a fixed number) and each one of these has an intensity value associated.
-in an internal for cycle fixed pairs of points are compared on the basis of their intensity value and the result of this comparison can be 0 or 1 and this value is put in a vector.
The code I posted here is the internal for cycle.
EDIT : I initially misunderstood the original source code.
Here is the correct version, completely rewritten. (55 cycles / iteration)
Although it's not as easy as assumed with the initial version below, NEON can handle this extremely well, resulting in an eye-popping performance boost compared to the original C implementation.
With the right tweaks, you might get additional gain in performance (less than 50 cycles / iteration). The readability will suffer heavily then though.
Have fun!
; BRISK short-pair comparison, NEON version (armasm syntax: labels start in
; column 1, directives and instructions are indented).
;
; Register use, as read from the code below:
;   r0 - read pointer to 32-bit values laid out as consecutive
;        (first, second) pairs; 32 pairs (256 bytes) are consumed per pass
;   r1 - write pointer; one 32-bit result word is stored per pass
;   r2 - number of pairs left; decremented by 32 per pass
;
; Bit n of each stored word is set when first > second (unsigned compare) for
; the n-th pair of that pass.
; The loop body always runs at least once and handles 32 pairs at a time, so
; callers presumably pass a non-zero multiple of 32 - TODO confirm.
        AREA    BRISK_ASM_NEON, CODE, READONLY
        EXPORT  yourFunction
        CODE32

yourFunction FUNCTION
loop
        ; Pairs 0-15. vld2 de-interleaves: the first register of each pair
        ; receives the "first" values, the second the "second" values.
        pld     [r0, #192]
        vld2.32 {q8, q9}, [r0]!
        vld2.32 {q10, q11}, [r0]!
        pld     [r0, #192]
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        ; q8..q11 = all-ones / all-zero lane masks for pair groups 0..3.
        vcgt.u32 q8, q8, q9
        vcgt.u32 q9, q10, q11
        vcgt.u32 q10, q12, q13
        vcgt.u32 q11, q14, q15

        ; Pairs 16-31 -> masks for groups 4..7 in q12..q15.
        pld     [r0, #192]
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        pld     [r0, #192]
        vld2.32 {q0, q1}, [r0]!
        vld2.32 {q2, q3}, [r0]!
        vcgt.u32 q12, q12, q13
        vcgt.u32 q13, q14, q15
        vcgt.u32 q14, q0, q1
        vcgt.u32 q15, q2, q3

        ; Byte k of every 32-bit lane: q8 <- group 2k, q9 <- group 2k+1.
        vsli.32 q8, q10, #8
        vsli.32 q9, q11, #8
        vsli.32 q8, q12, #16
        vsli.32 q9, q13, #16
        vsli.32 q8, q14, #24
        vsli.32 q9, q15, #24

        ; Fold lanes 2,3 onto lanes 0,1 (two bits each), then merge the odd
        ; groups into the upper nibble of every byte.
        vsli.8  d16, d17, #2
        vsli.8  d18, d19, #2
        vsli.8  d16, d18, #4
        ; Keep only the even bit of each two-bit field.
        ; NOTE(review): VBIC (immediate) is documented for .i16/.i32 operands;
        ; confirm that the assembler in use accepts the .i8 form.
        vbic.i8 d16, #0xaa
        ; Move the upper word's even bits into the lower word's odd bits.
        vshr.u64 d17, d16, #31
        vorr    d16, d16, d17
        ; Store the packed 32 comparison bits.
        vst1.32 {d16[0]}, [r1]!

        subs    r2, r2, #32
        bgt     loop
        bx      lr
        ENDFUNC
        END
=============================================================================
!!!!!!! The code below is INVALID !!!!!!!!
=============================================================================
It's a piece of cake with NEON.
Here's your "miracle" :
prototype :
void yourFunc(unsigned int * pPair, unsigned int * ptr2, unsigned int count);
; First attempt, kept for reference only: the surrounding text marks this
; version as INVALID because it does not reproduce the original C loop.
; (armasm syntax: labels start in column 1, instructions are indented.)
;
; The symbol is exported as yourFunction, so a C declaration must use that
; name. Register use, as read from the code below:
;   r0 - read pointer to interleaved 32-bit (first, second) pairs
;   r1 - pointer to 32-bit words that are read, conditionally OR-ed with a
;        mask from shifter_table, and written back (32 words per pass)
;   r2 - number of pairs left; decremented by 32 per pass
        AREA    BRISK_ASM_NEON, CODE, READONLY
        EXPORT  yourFunction
        CODE32

yourFunction FUNCTION
        adr     r12, shifter_table
        ; q4-q7 are saved here and restored before returning.
        vpush   {q4-q7}
        ; q0-q7 = the 32 single-bit masks (1<<0 .. 1<<31).
        vldmia  r12, {q0-q7}
loop
        ; Words 0-7: OR in masks q0/q1 where first > second, else keep as is.
        vld1.32 {q8, q9}, [r1]
        vorr    q10, q8, q0
        vorr    q11, q9, q1
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        vcgt.u32 q12, q12, q13
        vcgt.u32 q13, q14, q15
        vbsl    q12, q10, q8
        vbsl    q13, q11, q9
        vst1.32 {q12, q13}, [r1]!

        ; Words 8-15 with masks q2/q3.
        vld1.32 {q8, q9}, [r1]
        vorr    q10, q8, q2
        vorr    q11, q9, q3
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        vcgt.u32 q12, q12, q13
        vcgt.u32 q13, q14, q15
        vbsl    q12, q10, q8
        vbsl    q13, q11, q9
        vst1.32 {q12, q13}, [r1]!

        ; Words 16-23 with masks q4/q5.
        vld1.32 {q8, q9}, [r1]
        vorr    q10, q8, q4
        vorr    q11, q9, q5
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        vcgt.u32 q12, q12, q13
        vcgt.u32 q13, q14, q15
        vbsl    q12, q10, q8
        vbsl    q13, q11, q9
        vst1.32 {q12, q13}, [r1]!

        ; Words 24-31 with masks q6/q7.
        vld1.32 {q8, q9}, [r1]
        vorr    q10, q8, q6
        vorr    q11, q9, q7
        vld2.32 {q12, q13}, [r0]!
        vld2.32 {q14, q15}, [r0]!
        vcgt.u32 q12, q12, q13
        vcgt.u32 q13, q14, q15
        vbsl    q12, q10, q8
        vbsl    q13, q11, q9
        vst1.32 {q12, q13}, [r1]!

        subs    r2, #32
        bgt     loop
        vpop    {q4-q7}
        bx      lr
        ENDFUNC

; 32 words, one bit set in each: entry n is 1<<n.
shifter_table
        DCD (1<<00), (1<<01), (1<<02), (1<<03), (1<<04), (1<<05), (1<<06), (1<<07), (1<<08), (1<<09), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15)
        DCD (1<<16), (1<<17), (1<<18), (1<<19), (1<<20), (1<<21), (1<<22), (1<<23), (1<<24), (1<<25), (1<<26), (1<<27), (1<<28), (1<<29), (1<<30), (1<<31)
        END
The code above is just moderately optimized (interlocks here and there), and works only if count is a multiple of 32.
That's as far as I go managing readability and when working "unprofessionally".
47 cycles / iteration isn't bad. The rest is up to you.
Good luck!

why is clang optimization breaking my inline assembly code?

in an attempt to learn something about ARM assembly, i have written a simple test project to perform image downscaling using inline assembly and NEON instructions. you can see it here:
https://github.com/rmaz/NEON-Image-Downscaling
after some effort i managed to get it working, happy days. except that it only works for optimization levels less than -O2. i have taken a look at the generated ASM, but i cannot see any obvious reason why this should occur. can anyone offer any insight? here is the function responsible for the inline assembly part:
/*
 * Downscales two vertically adjacent rows of 32-bit pixels into one output
 * row by averaging every 2x2 pixel block byte-wise (two halving adds, first
 * vertical, then horizontal).
 *
 * dst:          receives pixelsPerRow / 2 pixels (4 per loop pass).
 * src:          top source row; the bottom row is taken to start
 *               pixelsPerRow pixels after it.
 * pixelsPerRow: pixels per source row; rounded down to a multiple of 8.
 *               Rows narrower than 8 pixels produce no output.
 */
static inline void resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
    const uint32_t *rowB = src + pixelsPerRow;

    // force the number of pixels per row to a multiple of 8
    pixelsPerRow = 8 * (pixelsPerRow / 8);

    // The asm loop tests its counter only after the first pass, so entering
    // it with 0 would wrap the counter and run far past the buffers.
    if (pixelsPerRow == 0) {
        return;
    }

    // Notes on the constraints:
    //  - all four operands are read AND modified by the asm ("+r");
    //  - "memory" tells the compiler that the asm reads and writes memory it
    //    cannot see through the register operands, so loads/stores around it
    //    must not be cached or reordered across it (needed once this function
    //    is inlined at higher optimization levels);
    //  - "cc" because `subs` updates the condition flags;
    //  - a numeric local label (1: / 1b) stays unique when the asm is
    //    emitted more than once through inlining.
    __asm__ volatile(
        "1:                               \n" // start loop
        "vld1.32   {d0-d3}, [%[src]]!     \n" // load 8 pixels from the top row
        "vld1.32   {d4-d7}, [%[rowB]]!    \n" // load 8 pixels from the bottom row
        "vhadd.u8  q0, q0, q2             \n" // average the pixels vertically
        "vhadd.u8  q1, q1, q3             \n"
        "vtrn.32   q0, q2                 \n" // transpose to put the horizontally adjacent pixels in different registers
        "vtrn.32   q1, q3                 \n"
        "vhadd.u8  q0, q0, q2             \n" // average the pixels horizontally
        "vhadd.u8  q1, q1, q3             \n"
        "vtrn.32   d0, d1                 \n" // fill the registers with pixels
        "vtrn.32   d2, d3                 \n"
        "vswp      d1, d2                 \n"
        "vst1.64   {d0-d1}, [%[dst]]!     \n" // store the result
        "subs      %[count], %[count], #8 \n" // subtract 8 from the pixel count
        "bne       1b                     \n" // repeat until the row is complete
        : [dst] "+r"(dst), [src] "+r"(src), [rowB] "+r"(rowB), [count] "+r"(pixelsPerRow)
        :
        : "q0", "q1", "q2", "q3", "cc", "memory"
    );
}
the functioning generated output at O1 for the surrounding function and loop is as follows:
.align 2
.code 16 # #"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
# BB#0:
.loc 1 86 1 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
mov r6, r2
Ltmp43:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
ldr.w r11, [r0]
mov r0, r6
blx _objc_retain
mov r4, r0
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
#DEBUG_VALUE: width <- R5+0
.loc 1 89 21 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
mov r0, r6
mov r1, r11
str r5, [sp, #16] # 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
#DEBUG_VALUE: height <- R10+0
.loc 1 90 26 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] # 4-byte Spill
Ltmp47:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] # 4-byte Spill
Ltmp48:
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
mul r8, r10, r5
Ltmp50:
#DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
#DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] # 4-byte Spill
Ltmp52:
#DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
#DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
#DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
cmp r0, #0
Ltmp56:
#DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
# BB#1: # %.lr.ph
Ltmp57:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: buffer <- R4+0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
ldr r1, [sp, #12] # 4-byte Reload
Ltmp58:
#DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
lsr.w r1, r8, #1
Ltmp61:
mov r6, r0
Ltmp62:
#DEBUG_VALUE: height <- R6+0
mla r0, r1, r10, r5
Ltmp63:
#DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
ldr r2, [sp, #16] # 4-byte Reload
mov r1, r4
Ltmp64:
bl _resizeRow
mov r0, r6
Ltmp65:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 50 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
add.w r8, r8, #2
Ltmp66:
#DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: # %._crit_edge
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
ldr r1, [sp, #4] # 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
mov r6, r5
Ltmp69:
#DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] # 4-byte Reload
ldr r0, [sp, #12] # 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
#DEBUG_VALUE: context <- R4+0
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
blx _CGBitmapContextCreateImage
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
#DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
#DEBUG_VALUE: returnImage <- R0+0
# InlineAsm Start
mov r7, r7 # marker for objc_retainAutoreleaseReturnValue
# InlineAsm End
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
mov r8, r0
.loc 1 112 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] # 4-byte Reload
blx _CFRelease
.loc 1 115 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
.align 2
.code 16 # #resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
# BB#0:
#DEBUG_VALUE: resizeRow:dst <- R0+0
#DEBUG_VALUE: resizeRow:src <- R1+0
#DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
add.w r3, r1, r2, lsl #2
Ltmp78:
#DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
# InlineAsm Start
Lresizeloop:
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r0]!
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp80:
.loc 1 51 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
and the non functioning output at O2 is as follows:
.align 2
.code 16 # #"\01-[BDPViewController downscaleImageNeon:]"
.thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
.cfi_startproc
Lfunc_begin4:
.loc 1 86 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
# BB#0:
.loc 1 86 1 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
push {r4, r5, r6, r7, lr}
add r7, sp, #12
push.w {r8, r10, r11}
sub sp, #20
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
.loc 1 88 20 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
mov r6, r2
Ltmp43:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
add r0, pc
ldr.w r11, [r0]
mov r0, r6
blx _objc_retain
mov r4, r0
mov r0, r6
mov r1, r11
Ltmp44:
blx _objc_msgSend
blx _CGImageGetWidth
mov r5, r0
Ltmp45:
#DEBUG_VALUE: width <- R5+0
.loc 1 89 21 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
mov r0, r6
mov r1, r11
str r5, [sp, #16] # 4-byte Spill
blx _objc_msgSend
blx _CGImageGetHeight
mov r10, r0
Ltmp46:
#DEBUG_VALUE: height <- R10+0
.loc 1 90 26 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetBytesPerRow
str r0, [sp, #12] # 4-byte Spill
Ltmp47:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
.loc 1 91 35 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
mov r0, r6
mov r1, r11
blx _objc_msgSend
blx _CGImageGetAlphaInfo
str r0, [sp, #4] # 4-byte Spill
Ltmp48:
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
mov r0, r6
mov r1, r11
blx _objc_msgSend
mov r6, r0
Ltmp49:
mov r0, r4
blx _objc_release
mov r0, r6
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
mul r8, r10, r5
Ltmp50:
#DEBUG_VALUE: width <- [sp+#16]+#0
.loc 1 94 45 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
blx _CGImageGetDataProvider
blx _CGDataProviderCopyData
Ltmp51:
#DEBUG_VALUE: data <- R0+0
str r0, [sp, #8] # 4-byte Spill
Ltmp52:
#DEBUG_VALUE: data <- [sp+#8]+#0
.loc 1 95 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
blx _CFDataGetBytePtr
mov r4, r0
Ltmp53:
#DEBUG_VALUE: buffer <- R4+0
.loc 1 98 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
lsr.w r0, r8, #2
movs r1, #4
blx _calloc
mov r5, r0
Ltmp54:
#DEBUG_VALUE: outputBuffer <- R5+0
mov r0, r10
Ltmp55:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
cmp r0, #0
Ltmp56:
#DEBUG_VALUE: rowIndex <- 0+0
beq LBB4_3
# BB#1: # %.lr.ph
Ltmp57:
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: buffer <- R4+0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
ldr r1, [sp, #12] # 4-byte Reload
Ltmp58:
#DEBUG_VALUE: bytesPerRow <- R1+0
mov.w r8, #0
lsl.w r11, r1, #1
.loc 1 104 74 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
lsr.w r10, r1, #1
Ltmp60:
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
#DEBUG_VALUE: rowIndex <- 0+0
lsr.w r1, r8, #1
Ltmp61:
mov r6, r0
Ltmp62:
#DEBUG_VALUE: height <- R6+0
mla r0, r1, r10, r5
Ltmp63:
#DEBUG_VALUE: destRow <- R1+0
.loc 1 105 9 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
ldr r2, [sp, #16] # 4-byte Reload
mov r1, r4
Ltmp64:
bl _resizeRow
mov r0, r6
Ltmp65:
#DEBUG_VALUE: height <- R0+0
.loc 1 101 50 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
add.w r8, r8, #2
Ltmp66:
#DEBUG_VALUE: rowIndex <- R8+0
.loc 1 101 29 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r4, r11
cmp r8, r0
blo LBB4_2
Ltmp67:
LBB4_3: # %._crit_edge
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
#DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
#DEBUG_VALUE: width <- [sp+#16]+#0
#DEBUG_VALUE: height <- R0+0
#DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
#DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
#DEBUG_VALUE: data <- [sp+#8]+#0
#DEBUG_VALUE: outputBuffer <- R5+0
.loc 1 109 28 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
ldr r1, [sp, #4] # 4-byte Reload
Ltmp68:
lsrs r2, r0, #1
str r1, [sp]
mov r6, r5
Ltmp69:
#DEBUG_VALUE: outputBuffer <- R6+0
ldr r1, [sp, #16] # 4-byte Reload
ldr r0, [sp, #12] # 4-byte Reload
Ltmp70:
lsrs r1, r1, #1
lsrs r3, r0, #1
mov r0, r5
bl _createBitmapContext
mov r4, r0
Ltmp71:
#DEBUG_VALUE: context <- R4+0
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
blx _CGBitmapContextCreateImage
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
.loc 1 110 30 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
mov r5, r0
Ltmp72:
#DEBUG_VALUE: scaledImage <- R5+0
.loc 1 111 66 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
add r1, pc
LPC4_2:
add r0, pc
mov r2, r5
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
Ltmp73:
#DEBUG_VALUE: returnImage <- R0+0
# InlineAsm Start
mov r7, r7 # marker for objc_retainAutoreleaseReturnValue
# InlineAsm End
blx _objc_retainAutoreleasedReturnValue
Ltmp74:
mov r8, r0
.loc 1 112 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
mov r0, r5
blx _CGImageRelease
.loc 1 113 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
mov r0, r4
blx _CGContextRelease
.loc 1 114 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
ldr r0, [sp, #8] # 4-byte Reload
blx _CFRelease
.loc 1 115 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
mov r0, r6
blx _free
Ltmp75:
.loc 1 118 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
mov r0, r8
add sp, #20
pop.w {r8, r10, r11}
pop.w {r4, r5, r6, r7, lr}
Ltmp76:
b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
.cfi_endproc
.align 2
.code 16 # #resizeRow
.thumb_func _resizeRow
_resizeRow:
.cfi_startproc
Lfunc_begin5:
.loc 1 26 0 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
# BB#0:
#DEBUG_VALUE: resizeRow:dst <- R0+0
#DEBUG_VALUE: resizeRow:src <- R1+0
#DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
.loc 1 27 47 prologue_end # NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
add.w r3, r1, r2, lsl #2
Ltmp78:
#DEBUG_VALUE: rowB <- R3+0
.loc 1 30 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
bic r2, r2, #7
Ltmp79:
.loc 1 32 5 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
# InlineAsm Start
Lresizeloop:
vld1.32 {d0-d3}, [r1]!
vld1.32 {d4-d7}, [r3]!
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r0]!
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp80:
.loc 1 51 1 # NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
bx lr
Ltmp81:
Lfunc_end5:
.cfi_endproc
Here's a snippet of the assembly code I get from your Xcode project with -O2. (Building with -O1 doesn't bother to inline the function, so I'm not surprised it works fine.)
Ltmp55:
#DEBUG_VALUE: rowIndex <- R3+0
.loc 1 101 29 # /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
add r8, r12
cmp r3, r11
.loc 1 32 5 # /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
Ltmp56:
# InlineAsm Start
Lresizeloop:
vld1.32 {d0-d3}, [r4]!
vld1.32 {d4-d7}, [r5]!
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 q0, q2
vtrn.32 q1, q3
vhadd.u8 q0, q0, q2
vhadd.u8 q1, q1, q3
vtrn.32 d0, d1
vtrn.32 d2, d3
vswp d1, d2
vst1.64 {d0-d1}, [r6]!
subs r2, r2, #8
bne Lresizeloop
# InlineAsm End
Ltmp57:
blo LBB2_2
See that blo (branch-if-lower) instruction on the final line? It relies on the condition codes set by the cmp r3, r11 at the top of the listing. But by the time it executes, the subs r2, r2, #8 inside your inline assembly loop has overwritten the condition flags, so the branch tests the wrong result. So is this a compiler bug? Nope! You just forgot to tell the compiler that your inline assembly code clobbers the condition codes. Replace
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3"
);
with
: "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
: "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
: "q0", "q1", "q2", "q3", "cc"
);
and the assembly output fixes itself. I haven't run the app, but I bet you'll find it's all better now. :)

Why do these simple methods compile differently?

I'm slightly confused as to why clang is emitting different code for the following two methods:
@interface ClassA : NSObject
@end
@implementation ClassA
+ (ClassA*)giveMeAnObject1 {
return [[ClassA alloc] init];
}
+ (id)giveMeAnObject2 {
return [[ClassA alloc] init];
}
@end
If we look at the ARMv7 assembly emitted at -O3 with ARC enabled, we see this:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject1]"
"+[ClassA giveMeAnObject1]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
LPC0_0:
add r1, pc
LPC0_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
LPC0_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autorelease
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject2]"
"+[ClassA giveMeAnObject2]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autoreleaseReturnValue
The only difference is the tail call to objc_autoreleaseReturnValue vs objc_autorelease. To be honest, I would expect both to call objc_autoreleaseReturnValue. In fact, because the first method does not use objc_autoreleaseReturnValue, it will potentially be slower than the second: there will definitely be an autorelease followed by a retain in the caller, rather than the faster path in which ARC skips this redundant pair when the runtime supports it.
The LLVM IR that is emitted gives some indication of why it's like that:
define internal %1* @"\01+[ClassA giveMeAnObject1]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* @objc_autorelease(i8* %6) nounwind
%8 = bitcast i8* %6 to %1*
ret %1* %8
}
define internal i8* @"\01+[ClassA giveMeAnObject2]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* @objc_autoreleaseReturnValue(i8* %6) nounwind
ret i8* %6
}
But I'm struggling to see why it has decided to compile these two methods differently. Can anyone shed some light on it?
Update:
Even weirder are these other methods:
+ (ClassA*)giveMeAnObject3 {
ClassA *a = [[ClassA alloc] init];
return a;
}
+ (id)giveMeAnObject4 {
ClassA *a = [[ClassA alloc] init];
return a;
}
These compile to:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject3]"
"+[ClassA giveMeAnObject3]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject4]"
"+[ClassA giveMeAnObject4]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
LPC3_0:
add r1, pc
LPC3_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
This time they are identical; however, there are a few things that could be optimised even further here:
There's a redundant mov r4, r0 followed by mov r0, r4.
There's a retain followed by a release.
Surely, the bottom bit of both of those methods can turn into:
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
Obviously we could then also omit pushing and popping r4, because we would no longer clobber it. The method would then become exactly the same as giveMeAnObject2, which is exactly what we'd expect.
Why is clang not being clever and doing this?!
This appears to be a bug in the optimizer and is being tracked as rdar://problem/10813093.