Why is this a tail call?

Here is a simple hello world:
#include <stdio.h>

int main() {
    printf("hello world\n");
    return 0;
}
Here it is compiled to LLVM IR:
will@ox:~$ clang -S -O3 -emit-llvm ~/test_apps/hello1.c -o -
; ModuleID = '/home/will/test_apps/hello1.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

@str = private unnamed_addr constant [12 x i8] c"hello world\00"

; Function Attrs: nounwind uwtable
define i32 @main() #0 {
  %puts = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @str, i64 0, i64 0))
  ret i32 0
}

; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #1

attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }

!llvm.ident = !{!0}
!0 = !{!"Ubuntu clang version 3.6.0-2ubuntu1 (tags/RELEASE_360/final) (based on LLVM 3.6.0)"}
The description of tail-call optimisation says that the following conditions must be met:
The call is a tail call - in tail position (ret immediately follows call and ret uses value of call or is void).
Yet in this example the value returned by puts() is not used as the return value of the function.
Is this a legal tail-call optimisation? What does main() return?

The tail flag in LLVM is a bit strange. It just means that the call to puts is a candidate for tail call optimization; in particular, the callee is not allowed to access any variable on the stack of the caller. The code generator still has to make sure that the call is in a position suitable for tail call optimization before it actually turns the call into a jump, and that's not the case here.
If you look at the assembly emitted by LLVM you'll see that there is no tail call optimization happening:
$ clang -O -S -o - bug.c
[...]
main: # @main
.cfi_startproc
# BB#0: # %entry
pushq %rax
.Ltmp0:
.cfi_def_cfa_offset 16
movl $.Lstr, %edi
callq puts
xorl %eax, %eax
popq %rdx
retq
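For contrast, consider a variant where the call really is in tail position because the result of puts() is returned directly. This is only a sketch: whether clang actually emits the jump depends on the target and compiler version, but at -O2 on x86-64 it typically becomes a plain jmp to puts.

#include <stdio.h>

int main(void) {
    /* The call's result is returned unchanged, so the call is in tail
       position and the code generator may turn call+ret into a jump. */
    return puts("hello world");
}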


Why does u64::trailing_zeros() generate branched assembly when branchless works?

This function:
pub fn g(n: u64) -> u32 {
    n.trailing_zeros()
}
generates assembly with a branch:
playground::g:
testq %rdi, %rdi
je .LBB0_1
bsfq %rdi, %rax
retq
.LBB0_1:
movl $64, %eax
retq
This alternative function:
pub fn g(n: u64) -> u32 {
    if n == 0 { u32::MAX } else { n.trailing_zeros() }
}
generates assembly without a branch:
playground::g:
bsfq %rdi, %rcx
xorl %eax, %eax
cmpq $1, %rdi
sbbl %eax, %eax
orl %ecx, %eax
retq
It turns out that the branch gets created only when the constant returned is 64. Returning 0, or u32::MAX, or any other number generates branchless assembly.
Why is this? Is it just a quirk of the optimizer, or is there a reason?
I'm trying to create performant, branchless code. (Using Rust 1.65, release profile.)
trailing_zeros corresponds to the cttz LLVM intrinsic.
That intrinsic just so happens to compile to the following instructions on x86-64:
g: # @g
test rdi, rdi
je .LBB0_1
bsf rax, rdi
ret
.LBB0_1:
mov eax, 64
ret
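For reference, the IR behind trailing_zeros looks roughly like the following (a sketch; exact types and attributes vary by compiler version):

declare i64 @llvm.cttz.i64(i64, i1)

define i32 @g(i64 %n) {
  ; the "i1 false" argument asks for the well-defined result (64) on zero input
  %ctz = call i64 @llvm.cttz.i64(i64 %n, i1 false)
  %ret = trunc i64 %ctz to i32
  ret i32 %ret
}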
The output of that intrinsic is the bit width of the integer when the input value is 0. LLVM is able to recognize the redundant operation and remove it, which is why u64::BITS or just 64 in your conditional result in the same machine code as just the intrinsic.
It appears that using any other number results in the compiler recognizing the intrinsic branch as dead code, which is therefore removed:
e: # @e
xor ecx, ecx
bsf rax, rdi
cmove eax, ecx
ret
Instead, a single conditional move is generated. I believe this variance in output is just a quirk of the LLVM x86-64 backend when certain intrinsics are involved.
You can reproduce the same discrepancy with C using clang (godbolt); a sketch follows below.
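A minimal C sketch of that reproduction, assuming clang and the __builtin_ctzll intrinsic (the function names here are made up):

#include <stdint.h>

/* Returning 64 for n == 0 matches cttz's own zero semantics, so clang
   typically keeps the intrinsic's branchy expansion (test/je/bsf). */
uint32_t ctz_branchy(uint64_t n) {
    return n == 0 ? 64 : (uint32_t)__builtin_ctzll(n);
}

/* Returning any other constant makes the intrinsic's zero path dead,
   so clang typically emits a branchless bsf + cmov sequence instead. */
uint32_t ctz_branchless(uint64_t n) {
    return n == 0 ? UINT32_MAX : (uint32_t)__builtin_ctzll(n);
}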
It might be worth opening an LLVM issue for this, but only if the branchless version is actually better.
This LLVM issue may be related.

LLVM optimization passes break recursive code

I have a problem with some LLVM optimization passes, which modify the compilation output so that it no longer works.
This is the input source code of a Fibonacci algorithm:
f<int> fib(int n) {
    if n <= 1 { return 1; }
    return fib(n - 1) + fib(n - 2);
}

f<int> main() {
    printf("Result: %d", fib(46));
    return 0;
}
Without optimization, my compiler emits the following IR code, which works perfectly fine:
; ModuleID = 'Module'
source_filename = "Module"

@0 = private unnamed_addr constant [11 x i8] c"Result: %d\00", align 1

declare i32 @printf(i8*, ...)

define i32 @"fib(int)"(i32 %0) {
entry:
  %n = alloca i32, align 4
  store i32 %0, i32* %n, align 4
  %result = alloca i32, align 4
  %1 = load i32, i32* %n, align 4
  %le = icmp sle i32 %1, 1
  br i1 %le, label %then, label %end

then:                                             ; preds = %entry
  ret i32 1
  br label %end

end:                                              ; preds = %then, %entry
  %2 = load i32, i32* %n, align 4
  %sub = sub i32 %2, 1
  %3 = call i32 @"fib(int)"(i32 %sub)
  %4 = load i32, i32* %n, align 4
  %sub1 = sub i32 %4, 2
  %5 = call i32 @"fib(int)"(i32 %sub1)
  %add = add i32 %3, %5
  ret i32 %add
}

define i32 @main() {
main_entry:
  %result = alloca i32, align 4
  %0 = call i32 @"fib(int)"(i32 46)
  %1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @0, i32 0, i32 0), i32 %0)
  ret i32 0
}
Then I applied a few optimization passes to it. Here is my chain of passes:
fpm->add(llvm::createDeadCodeEliminationPass());
fpm->add(llvm::createLoopDeletionPass());
fpm->add(llvm::createDeadStoreEliminationPass());
fpm->add(llvm::createGVNPass());
fpm->add(llvm::createPromoteMemoryToRegisterPass());
fpm->add(llvm::createInstructionCombiningPass());
fpm->add(llvm::createReassociatePass());
fpm->add(llvm::createCFGSimplificationPass()); // Breaks recursion
fpm->add(llvm::createCorrelatedValuePropagationPass());
fpm->add(llvm::createLoopSimplifyPass());
With these optimization passes enabled, I get the following IR code:
; ModuleID = 'Module'
source_filename = "Module"

@0 = private unnamed_addr constant [11 x i8] c"Result: %d\00", align 1

declare i32 @printf(i8*, ...)

define i32 @"fib(int)"(i32 %0) {
entry:
  %le = icmp slt i32 %0, 2
  %sub = add i32 %0, -1
  %1 = call i32 @"fib(int)"(i32 %sub)
  %sub1 = add i32 %0, -2
  %2 = call i32 @"fib(int)"(i32 %sub1)
  %add = add i32 %2, %1
  ret i32 %add
}

define i32 @main() {
main_entry:
  %0 = call i32 @"fib(int)"(i32 46)
  %1 = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @0, i64 0, i64 0), i32 %0)
  ret i32 0
}
Obviously, this code produces a stack overflow, because the recursion anchor is gone. It seems like the CFGSimplificationPass merges blocks in the wrong way / eliminates the if body even though it is relevant. When I remove the createCFGSimplificationPass line, the optimizations work and the resulting executable runs fine.
Now my question: what am I doing wrong? Or is this maybe a bug in LLVM?
Thanks for your help!
then:                                             ; preds = %entry
  ret i32 1
  br label %end
A block can't have two terminators, so this IR is invalid. This causes the optimizations to misbehave as they can't tell which one is the intended terminator.
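The fix is to emit exactly one terminator per block; for example, the corrected then block would look roughly like this (sketch):

then:                                             ; preds = %entry
  ret i32 1                                       ; single terminator, no trailing br

end:                                              ; preds = %entry
  ...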
To more easily catch errors like this in the future, you should use llvm::verifyModule to verify the IR you generate before you run additional passes on it.
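A minimal sketch of such a check, run after IR generation and before any passes (the wrapper function name is made up; llvm::verifyModule returns true when the module is broken):

#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

// Returns false and prints the verifier's diagnostics if the IR is invalid.
bool moduleIsValid(llvm::Module &module) {
    std::string errors;
    llvm::raw_string_ostream stream(errors);
    if (llvm::verifyModule(module, &stream)) { // true == broken IR
        llvm::errs() << "Invalid IR:\n" << stream.str() << "\n";
        return false;
    }
    return true;
}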

strange optimize in LLVM IR

Found the problem: it turned out I forgot to set the initial value of %vari in set_array,
which is
store i32 0, i32* %vari
in the code.
However, it is strange that opt optimized it out. (Presumably, because loading from an uninitialized alloca yields undef, the optimizer was free to assume anything about the loop and delete it.)
// ----- original ---------
Below is the code my compiler generated in LLVM IR. Notice that I call malloc in the main function.
When I use opt to optimize the code, the malloc is gone and the optimized code doesn't match what I expect (even if I use printf to print some values from the allocated array to prevent it from getting optimized out).
The same thing happens using llc.
How should I create a "complete" program entirely in LLVM IR (one that is able to call malloc) and compile it to an executable binary?
; ModuleID = 'my_module'
source_filename = "my_module"
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

define i32 @main(i32 %0, i8** %1) {
main_basicblock:
  %malloc_ret = call i8* @malloc(i64 30)
  %temp = bitcast i8* %malloc_ret to i32*
  call void @set_array(i32* %temp, i32 30, i32 2)
  %temp1 = getelementptr i32, i32* %temp, i32 2
  %f2 = load i32, i32* %temp1
  %temp2 = getelementptr i32, i32* %temp, i32 10
  %f23 = load i32, i32* %temp2
  ret i32 0
}

define void @set_array(i32* %0, i32 %1, i32 %2) {
set_array_basicblock:
  %vari = alloca i32
  store i32 0, i32* %vari                ; <-- the initially missing store
  br label %loop_basicblock

loop_basicblock:                         ; preds = %set_array_basicblock, %set_array_basicblock2
  %temp = load i32, i32* %vari
  %temp1 = icmp slt i32 %temp, %1
  br i1 %temp1, label %set_array_basicblock2, label %set_array_basicblock4

set_array_basicblock2:                   ; preds = %loop_basicblock
  %i_value = load i32, i32* %vari
  %ptr = getelementptr i32, i32* %0, i32 %i_value
  store i32 %1, i32* %ptr
  %temp3 = add i32 %i_value, 1
  store i32 %temp3, i32* %vari
  br label %loop_basicblock

set_array_basicblock4:                   ; preds = %loop_basicblock
  ret void
}

declare i8* @malloc(i64)
The allocated memory is not used for anything observable, so the optimizer can remove it completely.
You can avoid that by disabling optimizations or by doing something observable with it. For instance, you can print it, call an external function, or perform a volatile store:
store volatile i32 0, i32* %temp
Even if I use printf to print some values in the allocated array to prevent it from getting optimized out
This may happen if the optimizer can compute everything at compile time. You can add unknown values from some input source, increase the complexity of the code, or use one of the other solutions.
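For instance, a sketch of printing one of the loaded values in @main so the allocation becomes observable (the @fmt global is made up; the other names reuse the module above):

@fmt = private unnamed_addr constant [4 x i8] c"%d\0A\00"

declare i32 @printf(i8*, ...)

; inside @main, after %f2 is loaded:
%print = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @fmt, i32 0, i32 0), i32 %f2)

The resulting module can then be compiled straight to a binary with clang, e.g. clang my_module.ll -o my_program.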

How to get the dynamically assigned heap address and malloc size with LLVM pass instrumentation at runtime?

Traverse the basic blocks to get the malloc size argument and the returned address at runtime.
I insert a call to printf() at every malloc() call site in the IR, hoping it can print the malloc size at runtime.
In the example, the size is the call's first operand (call_inst->getArgOperand(0)), a value obtained from scanf().
for (auto &BB : F) {
  for (auto Inst = BB.begin(); Inst != BB.end(); Inst++) {
    Instruction &inst = *Inst;
    if (CallInst *call_inst = dyn_cast<CallInst>(&inst)) {
      Function *fn = call_inst->getCalledFunction();
      // compare the callee's name, not the Function pointer itself
      if (fn && fn->getName() == "malloc") {
        /* do something to get heap address and malloc size */
        // for example:
        /* declare printf: i64 (i8*, ...) */
        IRBuilder<> builder(call_inst);
        std::vector<llvm::Type *> putsArgs;
        putsArgs.push_back(builder.getInt8Ty()->getPointerTo());
        llvm::ArrayRef<llvm::Type *> argsRef(putsArgs);
        llvm::FunctionType *putsType =
            llvm::FunctionType::get(builder.getInt64Ty(), argsRef, true);
        llvm::Constant *putsFunc = M.getOrInsertFunction("printf", putsType);
        /* the first argument of the call is the requested size (an i64);
           it can be passed to printf directly, no temporary alloca needed */
        Value *size = call_inst->getArgOperand(0);
        Value *intFormat = builder.CreateGlobalStringPtr("%lld"); // i64, so not "%d"
        std::vector<llvm::Value *> values;
        values.push_back(intFormat);
        values.push_back(size);
        // print the size just before the call to malloc
        builder.CreateCall(putsFunc, values);
      }
    }
  }
}
My test.c file contains:
int a = 0;
scanf("%d", &a);
p1 = (char *)malloc(a * sizeof(char));
The relevant IR:
%conv = sext i32 %29 to i64, !dbg !81
%a.size = alloca i32, !dbg !82
store i32 10, i32* %a.size, !dbg !82
%30 = load i32, i32* %a.size, !dbg !82
%31 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @0, i32 0, i32 0), i32 %30), !dbg !82
%32 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @1, i32 0, i32 0)), !dbg !82
%call1 = call i8* @malloc(i64 %conv), !dbg !82
Can I get the assigned size and heap address at runtime?
malloc() itself selects its address at runtime (and some implementations guarantee that the return value will vary each time the program is run), so if you want to get the heap address, you have to replace it with your own implementation of malloc.
Getting at the malloc size is easier: if callInst->getArgOperand(0) is a ConstantInt, you have the size. If not, you might be able to fold it, but that's perhaps beyond your interest?
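A short sketch of that check (assuming a CallInst *callInst already known to call malloc; the function name is made up):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"

void reportMallocSize(llvm::CallInst *callInst) {
    // the first operand of malloc is the requested size
    if (auto *size = llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(0))) {
        llvm::errs() << "malloc size known at compile time: "
                     << size->getZExtValue() << "\n";
    } else {
        // e.g. the scanf case: the size only exists at runtime, so it has
        // to be captured by instrumentation (like the inserted printf)
    }
}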

llvm opt -O3 fail (?)

I need to identify integer variables which behave like boolean variables, that is, they can only have the values 0 or 1.
For that purpose, I modified the LLVM bitcode to add an instruction equivalent to:
int tmp = someVar * (someVar - 1);
hoping that aggressive -O3 optimizations will identify tmp as the constant value 0. Here is a C version of the code I used:
int should_expand(char *s)
{
    int tmp = 0;
    int ret = 0;
    char *p = s;
    if (p && *p == '&')
    {
        ret = 1;
    }
    tmp = ret * (ret - 1);
    return tmp;
}
When I examine the *.ll file, I see that almighty clang 6.0.0 failed to realize tmp is actually 0:
define i32 @should_expand(i8* readonly %s) local_unnamed_addr #0 {
entry:
  %tobool = icmp eq i8* %s, null
  br i1 %tobool, label %if.end, label %land.lhs.true

land.lhs.true:                                    ; preds = %entry
  %tmp = load i8, i8* %s, align 1, !tbaa !2
  %cmp = icmp eq i8 %tmp, 38
  %spec.select = zext i1 %cmp to i32
  br label %if.end

if.end:                                           ; preds = %land.lhs.true, %entry
  %ret.0 = phi i32 [ 0, %entry ], [ %spec.select, %land.lhs.true ]
  %sub = add nsw i32 %ret.0, -1
  %tmp1 = sub nsw i32 0, %ret.0
  %mul = and i32 %sub, %tmp1
  ret i32 %mul
}
Does that make sense? Are there any external static analyzers I can use that interoperate smoothly with clang, or any other trick I can use?
Thanks a lot!