AND instruction causes Segmentation fault in x86-64 - g++

I am studying GCC inline assembly (for x86-64) in c++, and analyzing "Optimizing subroutines in assembly language An optimization guide for x86 platforms" by Agner Fog. For practice, I put an Agner Fog's asmlib code (strlen64.asm) to AT&T inline assembly. but, my program receives signal SIGSEGV. the following code ran on "https://www.onlinegdb.com/online_c++_compiler" (g++), and debugged in gdb.
this is my code (In comments, There are the ASM to c++ codes for my study):
# include<iostream>
__attribute__((naked))
size_t strlen_sse(const char* const str) {
// rdi : str
// rax : aligned_str, diff
// rcx : misalign
// edx : mask, bsf_result
// xmm0 : zero
// xmm1 : c16, cmp_result
__asm__ __volatile__(
"movq %%rdi, %%rax \n" // const char* aligned_str = str;
"movl %%edi, %%ecx \n" // intptr_t misalign = reinterpret_cast<intptr_t>(str);
"pxor %%xmm0, %%xmm0 \n" // const __m128i zero = _mm_setzero_si128();
"and -0x10, %%rax \n" // aligned_str = reinterpret_cast<const char*>(
// reinterpret_cast<intptr_t>(aligned_str) & -16
// );
"and 0xf, %%ecx \n" // misalign = misalign & 15;
"movdqa (%%rax), %%xmm1 \n" // __m128i c16 = _mm_load_si128(reinterpret_cast<const __m128i*>(aligned_str) ),
"pcmpeqb %%xmm0, %%xmm1 \n" // cmp_result = _mm_cmpeq_epi8(c16, zero);
"pmovmskb %%xmm1, %%edx \n" // int mask = _mm_movemask_epi8(cmp_result);
"shr %%cl, %%edx \n" // mask >>= misalign;
"shl %%cl, %%edx \n" // mask <<= misalign;
"bsfl %%edx, %%edx \n" // int bsf_result = __builtin_ctz(mask);
"jnz 2f \n" // if(bsf_result!=32) goto STRLEN_SSE_EXIT;
"1: \n" // do {
"addq 0x10, %%rax \n" // aligned_str += 16;
"movdqa (%%rax), %%xmm1 \n" // c16 = _mm_load_si128(reinterpret_cast<const __m128i*>(aligned_str) );
"pcmpeqb %%xmm0, %%xmm1 \n" // cmp_result = _mm_cmpeq_epi8(c16, zero);
"pmovmskb %%xmm1, %%edx \n" // mask = _mm_movemask_epi8(cmp_result);
"bsfl %%edx, %%edx \n" // bsf_result = __builtin_ctz(mask);
"jz 1b \n" // } while(bsf_result==32);
"2: \n" // STRLEN_SSE_EXIT:
"subq %%rdi, %%rax \n" // size_t diff = aligned_str - str;
"addq %%rdx, %%rax \n" // diff += bsf_result;
"ret \n" // return diff;
: /* Output */
: /* Input */
: "%rdi", "%edi", "%rax", "%rcx", "%edx", "%xmm0", "%xmm1"
);
}
int main()
{
std::cout << strlen_sse("online compiler and debugger for c/c++");
}
this is what I see in gdb:
0x0000555555555190 45 );
(gdb) stepi
0x0000555555555192 45 );
(gdb) stepi
0x0000555555555196 45 );
(gdb) disass strlen_sse
Dump of assembler code for function strlen_sse(char const*):
0x0000555555555189 <+0>: endbr64
0x000055555555518d <+4>: mov %rdi,%rax
0x0000555555555190 <+7>: mov %edi,%ecx
0x0000555555555192 <+9>: pxor %xmm0,%xmm0
=> 0x0000555555555196 <+13>: and 0xfffffffffffffff0,%rax
0x000055555555519e <+21>: and 0xf,%ecx
0x00005555555551a5 <+28>: movdqa (%rax),%xmm1
0x00005555555551a9 <+32>: pcmpeqb %xmm0,%xmm1
0x00005555555551ad <+36>: pmovmskb %xmm1,%edx
0x00005555555551b1 <+40>: shr %cl,%edx
0x00005555555551b3 <+42>: shl %cl,%edx
0x00005555555551b5 <+44>: bsf %edx,%edx
0x00005555555551b8 <+47>: jne 0x5555555551d3 <strlen_sse(char const*)+74>
0x00005555555551ba <+49>: add 0x10,%rax
0x00005555555551c2 <+57>: movdqa (%rax),%xmm1
0x00005555555551c6 <+61>: pcmpeqb %xmm0,%xmm1
0x00005555555551ca <+65>: pmovmskb %xmm1,%edx
0x00005555555551ce <+69>: bsf %edx,%edx
0x00005555555551d1 <+72>: je 0x5555555551ba <strlen_sse(char const*)+49>
0x00005555555551d3 <+74>: sub %rdi,%rax
--Type <RET> for more, q to quit, c to continue without paging--
0x00005555555551d6 <+77>: add %rdx,%rax
0x00005555555551d9 <+80>: retq
0x00005555555551da <+81>: nop
0x00005555555551db <+82>: ud2
End of assembler dump.
(gdb) stepi
Program received signal SIGSEGV, Segmentation fault.
0x0000555555555196 in strlen_sse (
str=0x555555556010 "online compiler and debugger for c/c++") at main.cpp:45
45 );
(gdb) Quit
I think that the "pxor %xmm0,%xmm0" worked fine, because "print $xmm0" command showed:
(gdb) print $xmm0
$1 = {
v4_float = {0, 0, 0, 0}, v2_double = {0, 0},
v16_int8 = {0 <repeats 16 times>}, v8_int16 = {0, 0, 0, 0, 0, 0, 0, 0},
v4_int32 = {0, 0, 0, 0}, v2_int64 = {0, 0},
uint128 = 0
}
But, the following "and 0xfffffffffffffff0,%rax" causes Segmentation fault, and the program is terminated. I didn't understand the result, so instead I thought a possible problem that the signal SIGSEGV came from "movdqa (%rax), %xmm1". but, "info registers" command showed:
rax 0x555555556010 93824992239632
rbx 0x555555555270 93824992236144
rcx 0x55556010 1431658512
rdx 0x7fffffffecd8 140737488350424
rsi 0x7fffffffecc8 140737488350408
rdi 0x555555556010 93824992239632
rbp 0x7fffffffebd0 0x7fffffffebd0
rsp 0x7fffffffebc8 0x7fffffffebc8
r8 0x0 0
r9 0x7ffff7fabec0 140737353793216
r10 0x7ffff7e2e1d5 140737352229333
r11 0x7ffff7f225a0 140737353229728
r12 0x5555555550a0 93824992235680
r13 0x7fffffffecc0 140737488350400
r14 0x0 0
r15 0x0 0
rip 0x555555555196 0x555555555196 <strlen_sse(char const*)+13>
eflags 0x10246 [ PF ZF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
"rax" is aligned by 16. How do I solve this problem? on the other hand, when executed the c++ code, worked well. (I am not used to English, so I am sorry if there is an awkward expression in the question). Thanks.

Related

Parameter passing to subroutine x64 inline assembly(Vc++ 2015 with Intel c++ compiler 2017)

I coded
main()
{
unsigned char *memory;
unsigned int a=15;
float sigma=5.0f;
gaussian_filter();
}
gaussian_filter()
{
unsigned __int64 evacuate_rbp;
unsigned __int64 evacuate_rsp;
__asm
{
mov eax, a
...
mov evacuate_rbp, rbp
mov evacuate_rsp, rsp
...
mov rsp, memory
...
movss xmm0, sigma
....
mov rbp, evacuate_rbp
mov rsp, evacuate_rsp
}
}
I want to non-parameter passing to use rbp and rsp registers as an address index of memory.
In Build with Intel C++ compiler, error is occured. Why?.

Speeding up the loop

I have the following piece of code:
for chunk in imagebuf.chunks_mut(4) {
let temp = chunk[0];
chunk[0] = chunk[2];
chunk[2] = temp;
}
For an array of 40000 u8s, it takes about 2.5 ms on my machine, compiled using cargo build --release.
The following C++ code takes about 100 us for the exact same data (verified by implementing it and using FFI to call it from rust):
for(;imagebuf!=endbuf;imagebuf+=4) {
char c=imagebuf[0];
imagebuf[0]=imagebuf[2];
imagebuf[2]=c;
}
I'm thinking it should be possible to speed up the Rust implementation to perform as fast as the C++ version.
The Rust program was built using cargo --release, the C++ program was built without any optimization flags.
Any hints?
I cannot reproduce the timings you are getting. You probably have an error in how you measure (or I have 😉). On my machine both versions run in exactly the same time.
In this answer, I will first compare the assembly output of both, the C++ and the Rust version. Afterwards I will describe how to reproduce my timings.
Assembly comparison
I generated the assembly code with the amazing Compiler Explorer (Rust code, C++ Code). I compiled the C++ code with optimizations activated (-O3), too, to make it a fair game (C++ compiler optimizations had no impact on the measured timings though). Here is the resulting assembly (Rust left, C++ right):
example::foo_rust: | foo_cpp(char*, char*):
test rsi, rsi | cmp rdi, rsi
je .LBB0_5 | je .L3
mov r8d, 4 |
.LBB0_2: | .L5:
cmp rsi, 4 |
mov rdx, rsi |
cmova rdx, r8 |
test rdi, rdi |
je .LBB0_5 |
cmp rdx, 3 |
jb .LBB0_6 |
movzx ecx, byte ptr [rdi] | movzx edx, BYTE PTR [rdi]
movzx eax, byte ptr [rdi + 2] | movzx eax, BYTE PTR [rdi+2]
| add rdi, 4
mov byte ptr [rdi], al | mov BYTE PTR [rdi-2], al
mov byte ptr [rdi + 2], cl | mov BYTE PTR [rdi-4], dl
lea rdi, [rdi + rdx] |
sub rsi, rdx | cmp rsi, rdi
jne .LBB0_2 | jne .L5
.LBB0_5: | .L3:
| xor eax, eax
ret | ret
.LBB0_6: |
push rbp +-----------------+
mov rbp, rsp |
lea rdi, [rip + panic_bounds_check_loc.3] |
mov esi, 2 |
call core::panicking::panic_bounds_check#PLT |
You can immediately see that C++ does in fact produce a lot less assembly (without optimization C++ produced nearly as many instruction as Rust does). I am not sure about all of the additional instructions Rust produces, but at least half of them are for bound checking. But this bound checking is, as far as I understand, not for the actual accesses via [] but just once every loop iteration. This is just for the case that the slice's length is not divisible by 4. But I guess the Rust assembly could be better still (even with bound checks).
As mentioned in the comments, you can remove bound checking by using get_unchecked() and get_unchecked_mut(). Note however, that this did not influence the performance in my measurements!
Lastly: you should use [&]::swap(i, j) here.
for chunk in imagebuf.chunks_mut(4) {
chunk.swap(0, 2);
}
This, again, did not notably influence performance. But it's shorter and better code.
Measuring
I used this C++ code (in foocpp.cpp):
extern "C" void foo_cpp(char *imagebuf, char *endbuf);
void foo_cpp(char* imagebuf, char* endbuf) {
for(;imagebuf!=endbuf;imagebuf+=4) {
char c=imagebuf[0];
imagebuf[0]=imagebuf[2];
imagebuf[2]=c;
}
}
I compiled it with:
gcc -c -O3 foocpp.cpp && ar rvs libfoocpp.a foocpp.o
Then I used this Rust code to measure everything:
#![feature(test)]
extern crate libc;
extern crate test;
use test::black_box;
use std::time::Instant;
#[link(name = "foocpp")]
extern {
fn foo_cpp(start: *mut libc::c_char, end: *const libc::c_char);
}
pub fn foo_rust(imagebuf: &mut [u8]) {
for chunk in imagebuf.chunks_mut(4) {
let temp = chunk[0];
chunk[0] = chunk[2];
chunk[2] = temp;
}
}
fn main() {
let mut buf = [0u8; 40_000];
let before = Instant::now();
foo_rust(black_box(&mut buf));
black_box(buf);
println!("rust: {:?}", Instant::now() - before);
// ----------------------------------
let mut buf = [0u8 as libc::c_char; 40_000];
let before = Instant::now();
let ptr = buf.as_mut_ptr();
let end = unsafe { ptr.offset(buf.len() as isize) };
unsafe { foo_cpp(black_box(ptr), black_box(end)); }
black_box(buf);
println!("cpp: {:?}", Instant::now() - before);
}
The black_box() all over the place prevents the compiler from optimizing where it isn't supposed to. I executed it with (nightly compiler):
LIBRARY_PATH=.:$LIBRARY_PATH cargo run --release
Giving me (i7-6700HQ) values like these:
rust: Duration { secs: 0, nanos: 30583 }
cpp: Duration { secs: 0, nanos: 30810 }
The times fluctuate a lot (way more than the difference between both versions). I am not exactly sure why the additional assembly generated by Rust does not result in a slower execution, though.

How unwind ARM Cortex M3 stack

The ARM Coretex STM32's HardFault_Handler can only get several registers values, r0, r1,r2, r3, lr, pc, xPSR, when crash happened. But there is no FP and SP in the stack. Thus I could not unwind the stack.
Is there any solution for this? Thanks a lot.
[update]
Following a web instruction to let ARMGCC(Keil uvision IDE) generate FP by adding a compiling option "--use_frame_pointer", but I could not find the FP in the stack. I am a real newbie here. Below is my demo code:
int test2(int i, int j)
{
return i/j;
}
int main()
{
SCB->CCR |= 0x10;
int a = 10;
int b = 0;
int c;
c = test2(a,b);
}
enum { r0 = 0, r1, r2, r3, r11, r12, lr, pc, psr};
void Hard_Fault_Handler(uint32_t *faultStackAddress)
{
uint32_t r0_val = faultStackAddress[r0];
uint32_t r1_val = faultStackAddress[r1];
uint32_t r2_val = faultStackAddress[r2];
uint32_t r3_val = faultStackAddress[r3];
uint32_t r12_val = faultStackAddress[r12];
uint32_t r11_val = faultStackAddress[r11];
uint32_t lr_val = faultStackAddress[lr];
uint32_t pc_val = faultStackAddress[pc];
uint32_t psr_val = faultStackAddress[psr];
}
I have two questions here:
1. I am not sure where the index of FP(r11) in the stack, or whether it is pushed into stack or not. I assume it is before r12, because I compared the assemble source before and after adding the option "--use_frame_pointer". I also compared the values read from Hard_Fault_Handler, seems like r11 is not in the stack. Because r11 address I read points to a place where the code is not my code.
[update] I have confirmed that FP is pushed into the stack. The second question still needs to be answered.
See below snippet code:
Without the option "--use_frame_pointer"
test2 PROC
MOVS r0,#3
BX lr
ENDP
main PROC
PUSH {lr}
MOVS r0,#0
BL test2
MOVS r0,#0
POP {pc}
ENDP
with the option "--use_frame_pointer"
test2 PROC
PUSH {r11,lr}
ADD r11,sp,#4
MOVS r0,#3
MOV sp,r11
SUB sp,sp,#4
POP {r11,pc}
ENDP
main PROC
PUSH {r11,lr}
ADD r11,sp,#4
MOVS r0,#0
BL test2
MOVS r0,#0
MOV sp,r11
SUB sp,sp,#4
POP {r11,pc}
ENDP
2. Seems like FP is not in the input parameter faultStackAddress of Hard_Fault_Handler(), where can I get the caller's FP to unwind the stack?
[update again]
Now I understood the last FP(r11) is not stored in the stack. All I need to do is to read the value of r11 register, then I can unwind the whole stack.
So now my final question is how to read it using inline assembler of C. I tried below code, but failed to read the correct value from r11 following the reference of http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0472f/Cihfhjhg.html
volatile int top_fp;
__asm
{
mov top_fp, r11
}
r11's value is 0x20009DCC
top_fp's value is 0x00000004
[update 3] Below is my whole code.
int test5(int i, int j, int k)
{
char a[128] = {0} ;
a[0] = 'a';
return i/j;
}
int test2(int i, int j)
{
char a[18] = {0} ;
a[0] = 'a';
return test5(i, j, 0);
}
int main()
{
SCB->CCR |= 0x10;
int a = 10;
int b = 0;
int c;
c = test2(a,b); //create a divide by zero crash
}
/* The fault handler implementation calls a function called Hard_Fault_Handler(). */
#if defined(__CC_ARM)
__asm void HardFault_Handler(void)
{
TST lr, #4
ITE EQ
MRSEQ r0, MSP
MRSNE r0, PSP
B __cpp(Hard_Fault_Handler)
}
#else
void HardFault_Handler(void)
{
__asm("TST lr, #4");
__asm("ITE EQ");
__asm("MRSEQ r0, MSP");
__asm("MRSNE r0, PSP");
__asm("B Hard_Fault_Handler");
}
#endif
void Hard_Fault_Handler(uint32_t *faultStackAddress)
{
volatile int top_fp;
__asm
{
mov top_fp, r11
}
//TODO: use top_fp to unwind the whole stack.
}
[update 4] Finally, I made it out. My solution:
Note: To access r11, we have to use embedded assembler, see here, which costs me much time to figure it out.
//we have to use embedded assembler.
__asm int getRegisterR11()
{
mov r0,r11
BX LR
}
//call it from Hard_Fault_Handler function.
/*
Function call stack frame:
FP1(r11) -> | lr |(High Address)
| FP2|(prev FP)
| ...|
Current FP(r11) ->| lr |
| FP1|(prev FP)
| ...|(Low Address)
With FP, we can access lr(link register) which is the address to return when the current functions returns(where you were).
Then (current FP - 1) points to prev FP.
Thus we can unwind the stack.
*/
void unwindBacktrace(uint32_t topFp, uint16_t* backtrace)
{
uint32_t nextFp = topFp;
int j = 0;
//#define BACK_TRACE_DEPTH 5
//loop backtrace using FP(r11), save lr into an uint16_t array.
for(int i = 0; i < BACK_TRACE_DEPTH; i++)
{
uint32_t lr = *((uint32_t*)nextFp);
if ((lr >= 0x08000000) && (lr <= 0x08FFFFFF))
{
backtrace[j*2] = LOW_16_BITS(lr);
backtrace[j*2 + 1] = HIGH_16_BITS(lr);
j += 1;
}
nextFp = *((uint32_t*)nextFp - 1);
if (nextFp == 0)
{
break;
}
}
}
#if defined(__CC_ARM)
__asm void HardFault_Handler(void)
{
TST lr, #4
ITE EQ
MRSEQ r0, MSP
MRSNE r0, PSP
B __cpp(Hard_Fault_Handler)
}
#else
void HardFault_Handler(void)
{
__asm("TST lr, #4");
__asm("ITE EQ");
__asm("MRSEQ r0, MSP");
__asm("MRSNE r0, PSP");
__asm("B Hard_Fault_Handler");
}
#endif
void Hard_Fault_Handler(uint32_t *faultStackAddress)
{
//get back trace
int topFp = getRegisterR11();
unwindBacktrace(topFp, persistentData.faultStack.back_trace);
}
Very primitive method to unwind the stack in such case is to read all stack memory above SP seen at the time of HardFault_Handler and process it using arm-none-eabi-addr2line. All link register entries saved on stack will be transformed into source line (remember that actual code path goes the line before LR points to). Note, if functions in between were called using branch instruction (b) instead of branch and link (bl) you'll not see them using this method.
(I don't have enough reputation points to write comments, so I'm editing my answer):
UPDATE for question 2:
Why do you expect that Hard_Fault_Handler has any arguments? Hard_Fault_Handler is usally a function to which address is stored in vector (exception) table. When the processor exception happens then Hard_Fault_Handler will be executed. There is no arguments passing involved doing this. But still, all registers at the time the fault happens are preserved. Specifically, if you compiled without omit-frame-pointer you can just read value of R11 (or R7 in Thumb-2 mode). However, to be sure that in your code Hard_Fault_Handler is actually a real hard fault handler, look into startup.s code and see if Hard_Fault_Handler is at the third entry in vector table. If there is an other function, it means Hard_Fault_Handler is just called from that function explicitly. See this article for details. You can also read my blog :) There is a chapter about stack which is based on Android example, but a lot of things are the same in general.
Also note, most probably in faultStackAddress should be stored a stack pointer, not a frame pointer.
UPDATE 2
Ok, lets clarify some things. Firstly, please paste the code from which you call Hard_Fault_Handler. Secondly, I guess you call it from within real HardFault exception handler. In that case you cannot expect that R11 will be at faultStackAddress[r11]. You've already mentioned it at the first sentence in your question. There will be only r0-r3, r12, lr, pc and psr.
You've also written:
But there is no FP and SP in the stack. Thus I could not unwind the
stack. Is there any solution for this?
The SP is not "in the stack" because you have it already in one of the stack registers (msp or psp). See again THIS ARTICLE. Also, FP is not crucial to unwind stack because you can do it without it (by "navigating" through saved Link Registers). Other thing is that if you dump memory below your SP you can expect FP to be just next to saved LR if you really need it.
Answering your last question: I don't now how you're verifying this code and how you're calling it (you need to paste full code). You can look into assembly of that function and see what's happening under the hood. Other thing you can do is to follow this post as a template.

MASM ReadFile failed because of a bad handle value in EAX register

I want to convert the following C++ program in MASM (The goal is to open an existing file, write a string into it and at the end read the file) :
void __cdecl _tmain(int argc, TCHAR *argv[])
{
HANDLE hFile;
/////////////////////////////////////////////////////////////////////////////////
if ((hFile = CreateFile(TEXT("C:\\Users\\Bloodsucker94\\Desktop\\TestFile.txt"),
GENERIC_WRITE | GENERIC_READ,
FILE_SHARE_READ,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
NULL)) == INVALID_HANDLE_VALUE)
_tprintf(TEXT("CreateFile() failed code %d\n"), GetLastError());
/////////////////////////////////////////////////////////////////////////////////
char DataBuffer[1024] = "aaa";
DWORD dwBytesToWrite = (DWORD)strlen(DataBuffer);
DWORD dwBytesWritten = 0;
//_tprintf(TEXT("CreateFile() HFILE=%d\n"), hFile);
if (WriteFile(hFile, DataBuffer, dwBytesToWrite, &dwBytesWritten, NULL) == FALSE)
_tprintf(TEXT("WriteFile() failed code %d\n"), GetLastError());
//_tprintf(TEXT("WriteFile() HFILE=%d\n"), hFile);
/////////////////////////////////////////////////////////////////////////////////
SetFilePointer(hFile, 0, NULL, FILE_BEGIN);
char ReadBuffer[4096] = {0};
DWORD dwBytesRead;
if (FALSE == ReadFile(hFile, ReadBuffer, 4096, &dwBytesRead, NULL))
_tprintf(TEXT("ReadFile() failed code %d\n"), GetLastError());
printf("|%s|", ReadBuffer);
getchar();
CloseHandle(hFile);
}
The ASM code :
.386
.model flat, stdcall
option casemap :none
include \masm32\include\windows.inc
include \masm32\include\kernel32.inc
include \masm32\include\masm32.inc
includelib \masm32\lib\kernel32.lib
includelib \masm32\lib\masm32.lib
include \masm32\include\msvcrt.inc
includelib \masm32\lib\msvcrt.lib
.data
FileName BYTE "HelloWorld.txt", 0
BufferToWrite BYTE "Hell yeaahhhhhh!!!!", 0
ErrorCreateMsgFormat BYTE "CreateFile() failed with code %d", 0
ErrorReadMsgFormat BYTE "ReadFile() failed with code %d", 0
ErrorWriteMsgFormat BYTE "WriteFile() failed with code %d", 0
CheckFormat BYTE "hFile=%d", 0
CheckSize BYTE "size=%d", 0
CheckPtr BYTE "EAX_ADDR=Ox%08X", 0
OutputFormat BYTE "output=%s", 0
.data?
hFile HANDLE ?
hFileCopy HANDLE ?
FileSize DWORD ?
hMem LPVOID ?
BytesRead LPDWORD ?
ErrorCode DWORD ?
RetRead BOOL ?
RetWrite BOOL ?
NumberOfBytesToWrite DWORD ?
NumberOfBytesWritten DWORD ?
BufferToWriteSize DWORD ?
.code
start:
invoke lstrlen, ADDR BufferToWrite
mov BufferToWriteSize, eax
;-----------------------------CREATE-------------------------------
invoke CreateFile, ADDR FileName, \
GENERIC_WRITE + GENERIC_READ, \
0, \
NULL, \
OPEN_EXISTING, \
FILE_ATTRIBUTE_NORMAL, \
NULL
mov hFile, eax
.IF hFile == INVALID_HANDLE_VALUE
invoke GetLastError
mov ErrorCode, eax
invoke crt_printf, ADDR ErrorCreateMsgFormat, \
ErrorCode
jmp _quit
.ENDIF
invoke crt_printf, ADDR CheckFormat, \
hFile
;---------------------------WRITE---------------------------------
invoke WriteFile, hFile, \
ADDR BufferToWrite, \
BufferToWriteSize, \
ADDR NumberOfBytesWritten, \
NULL
mov RetWrite, eax
.IF RetWrite == FALSE
invoke GetLastError
mov ErrorCode, eax
invoke crt_printf, ADDR ErrorWriteMsgFormat, \
ErrorCode
jmp _quit
.ENDIF
invoke crt_printf, ADDR CheckFormat, \
hFile
;--------------------------READ----------------------------------
invoke GetFileSize, eax, \ ;problem start here
NULL
mov FileSize, eax
inc eax
invoke crt_printf, ADDR CheckSize, \
FileSize
invoke GlobalAlloc, GMEM_FIXED, \
eax
mov hMem, eax
add eax, FileSize
mov BYTE PTR [eax], 0
invoke ReadFile, hFile, \
hMem, \
FileSize, \
ADDR BytesRead, \
NULL
mov RetRead, eax
.IF RetRead == FALSE
invoke GetLastError
mov ErrorCode, eax
invoke crt_printf, ADDR ErrorReadMsgFormat, \
ErrorCode
jmp _quit
.ENDIF
invoke crt_printf, ADDR CheckFormat, \
hFile
invoke crt_printf, ADDR OutputFormat, \
hMem
invoke CloseHandle, hFile
invoke GlobalFree, hMem
_quit:
invoke ExitProcess, 0
end start
The problem is EAX register not contain the CreateFile return value (hFile).
It's normal because it contains at the point of the execution the value of the WriteFile function. I didn't find any solution to conserve the initial value of eax returned by CreatefILE function and use it again after the WriteFile function call.
I can't do this :
mov FileSize, hFile
I just want to save the first value of eax. I tried to save it into another register but it does not work. Does anyone can help me ?
Eiher:
.data
savedValue DWORD ?
.code
…
// save to a variable
mov savedValue, eax
…
// restore from a variable
mov eax, savedValue
…
or:
.code
…
// save to stack
push eax
…
// restore from stack
pop eax
…
Sorry for any syntax errors. It's been a long a time since the last common use case for direct assembly use.

Arm loop code optimization for calculating mean standard deviation

I want to improve some code which is using 25% of my app CPU, the code is the next:
for (int i=0; i<8; i++) {
unsigned f = *p++;
sum += f;
sqsum += f*f;
}
I made some arm code but it is not working, even not compiling, which is the next:
void loop(uint8_t * p , int * sum ,int * qsum)
{
__asm__ volatile("vld4.8 {d0}, [%0]! \n"
"mov r4, #0 \n"
"vmlal.u8 [%1]!, [%1]!, d0 \n"
"vmull.u8 r4, d0 , d0 \n"
"vmlal.u8 [%2]!, [%2]!, r4\n"
:
: "r"(p), "r"(sum), "r"(qsum)
: "r4"
);
}
Any help?
Here is the my function to improve:
void calculateMeanStDev8x8(cv::Mat* patch, int sx, int sy, int& mean, float& stdev)
{
unsigned sum=0;
unsigned sqsum=0;
for (int j=0; j< 8; j++) {
const unsigned char* p = (const unsigned char*)(patch->data + (j+sy)*patch->step + sx); //Apuntador al inicio de la matrix
//The code to improve
for (int i=0; i<8; i++) {
unsigned f = *p++;
sum += f;
sqsum += f*f;
}
}
mean = sum >> 6;
int r = (sum*sum) >> 6;
stdev = sqrtf(sqsum - r);
if (stdev < .1) {
stdev=0;
}
}
That loop is a perfect candidate for NEON optimization. You can fit your 8 unsigned integers into a single NEON register. There is no "sum all elements of a vector" instruction, but you can use the pairwise add to compute the sum of the 8 elements in 3 steps. Since we can't see the rest of your application, it's hard to know what the big picture is, but NEON is your best bet for improving the speed. All recent Apple products support NEON instructions and in XCode you can use the NEON intrinsics mixed with your C++ code.