llvm code optimization options do not work - optimization

I am reading about LLVM code optimization. I tried applying opt command-line options to a number of examples, but they do not have any effect. For example, here is a C++ file called deadCode.cpp:
#include<stdio.h>
// Returns x multiplied by itself.
int square(int x){
return x*x;
}
// Driver: of the locals below, only b reaches the printed output.
int main(){
// a is only used as the argument to square().
int a=2;
// b is the value printed by printf below.
int b=3;
// c is initialized but never read.
int c=4;
// result stores the return value of square(a) and is never read afterwards.
int result =square(a);
printf("%d\n",b);
}
I generated the LLVM IR with clang like this:
clang -emit-llvm -S deadCode.cpp -o deadCodeBefore
and the result file deadCodeBefore content is :
; ModuleID = 'deadCode.cpp'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 @_Z6squarei(i32 %x) #0 {
%1 = alloca i32, align 4
store i32 %x, i32* %1, align 4
%2 = load i32, i32* %1, align 4
%3 = load i32, i32* %1, align 4
%4 = mul nsw i32 %2, %3
ret i32 %4
}
; Function Attrs: norecurse uwtable
define i32 @main() #1 {
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%result = alloca i32, align 4
store i32 2, i32* %a, align 4
store i32 3, i32* %b, align 4
store i32 4, i32* %c, align 4
%1 = load i32, i32* %a, align 4
%2 = call i32 @_Z6squarei(i32 %1)
store i32 %2, i32* %result, align 4
%3 = load i32, i32* %b, align 4
%4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %3)
ret i32 0
}
declare i32 @printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
the optimization command I used:
opt -S -adce deadCodeBefore -o deadCodeAfter1
From what I read, this should remove the call to the square function and also the declaration of the variable c, because they have no effect. But the result is unchanged. Here is deadCodeAfter1, which is the same as deadCodeBefore:
; ModuleID = 'deadCodeBefore'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 @_Z6squarei(i32 %x) #0 {
%1 = alloca i32, align 4
store i32 %x, i32* %1, align 4
%2 = load i32, i32* %1, align 4
%3 = load i32, i32* %1, align 4
%4 = mul nsw i32 %2, %3
ret i32 %4
}
; Function Attrs: norecurse uwtable
define i32 @main() #1 {
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
%result = alloca i32, align 4
store i32 2, i32* %a, align 4
store i32 3, i32* %b, align 4
store i32 4, i32* %c, align 4
%1 = load i32, i32* %a, align 4
%2 = call i32 @_Z6squarei(i32 %1)
store i32 %2, i32* %result, align 4
%3 = load i32, i32* %b, align 4
%4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %3)
ret i32 0
}
declare i32 @printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}

Because it is doing exactly what it is supposed to do. The pass checks in the IR whether an instruction is used by some other instruction, and removes it only if it is not. For example, in your code the declaration of variable %a (%a = alloca i32, align 4) is used by the store instruction store i32 2, i32* %a, align 4
If you had just declared a variable and not assigned any value to it, then the adce pass would have eliminated it. You can see that by just declaring a variable such as int e; and running the optimization on it.
Usually, passes in LLVM depend on the output of other passes in order to be effective. An individual pass on its own might not give you the result you expected. For example, running -mem2reg before -adce promotes these allocas to SSA registers, which removes the stores that keep them alive.

Related

How to inform the optimizer that NonZeroU32::get will never return zero?

Here's the code sample where I ran into the problem:
/// Divides `x` by `y`, where the divisor is a plain `u32`.
pub fn div(x: u32, y: u32) -> u32 {
x / y
}
/// Divides `x` by the `u32` obtained from the `NonZeroU32` divisor via `get()`.
pub fn safe_div(x: u32, y: std::num::NonZeroU32) -> u32 {
x / y.get() // an unchecked division expected
}
Godbolt's rustc 1.47.0 -O generates the same assembly for both functions:
example::div:
push rax
test esi, esi
je .LBB0_2
mov eax, edi
xor edx, edx
div esi
pop rcx
ret
.LBB0_2:
lea rdi, [rip + str.0]
lea rdx, [rip + .L__unnamed_1]
mov esi, 25
call qword ptr [rip + core::panicking::panic@GOTPCREL]
ud2
example::safe_div:
push rax
test esi, esi
je .LBB1_2
mov eax, edi
xor edx, edx
div esi
pop rcx
ret
.LBB1_2:
lea rdi, [rip + str.0]
lea rdx, [rip + .L__unnamed_2]
mov esi, 25
call qword ptr [rip + core::panicking::panic@GOTPCREL]
ud2
However, it is statically known that
checking the result of NonZeroU32::get against zero is pointless.
Can I somehow make the optimizer believe this
(maybe by creating new structs for it) without using unsafe code?
Related GitHub issue #49572
After I saw your question, I added the needed impls to std,
so now the nightly release (and the upcoming 1.51 stable release) supports this!
Godbolt for
/// Divides `x` by `y`, where the divisor is a plain `u32`.
pub fn div(x: u32, y: u32) -> u32 {
x / y
}
/// Divides `x` by `y` using `/` directly on the `NonZeroU32` divisor (no `get()` call).
pub fn safe_div(x: u32, y: std::num::NonZeroU32) -> u32 {
x / y
}
/// Computes `x % y` using `%` directly on the `NonZeroU32` divisor.
pub fn safe_rem(x: u32, y: std::num::NonZeroU32) -> u32 {
x % y
}
Produces the expected assembly:
example::div:
push rax
test esi, esi
je .LBB0_2
mov eax, edi
xor edx, edx
div esi
pop rcx
ret
.LBB0_2:
lea rdi, [rip + str.0]
lea rdx, [rip + .L__unnamed_1]
mov esi, 25
call qword ptr [rip + core::panicking::panic@GOTPCREL]
ud2
example::safe_div:
mov eax, edi
xor edx, edx
div esi
ret
example::safe_rem:
mov eax, edi
xor edx, edx
div esi
mov eax, edx
ret

Preserving the input control structure in MLIR's TF dialect

I'm trying to generate MLIR using Tensorflow (2.2.0) as a front-end and I would like to clarify the following.
Let's consider the example below that implements direct matrix multiplication of two 2x2 matrices.
import tensorflow as tf
import tensorflow.mlir as mlir

# Build the computation inside an explicit graph so its GraphDef can be
# handed to the MLIR converter at the end of the script.
with tf.Graph().as_default() as g:
    with tf.device('/cpu:0'):
        @tf.function
        def mymatmul(A, B, C):
            # All three loop bounds are plain Python ints (range(2)).
            for i in range(2):
                for j in range(2):
                    cij = 0.0
                    for k in range(2):
                        # NOTE(review): B is indexed [i, j] here, and the MLIR
                        # listed for this script reflects that; a textbook
                        # matmul would use B[k, j] -- confirm intent.
                        cij += A[i, k]*B[i, j]
                    C[i, j].assign(cij)

        A = tf.constant([[1., 2.], [3., 4.]])
        B = tf.constant([[2., 1.], [4., 3.]])
        C = tf.Variable([[0., 0.], [0., 0.]])
        mymatmul(A, B, C)

# Convert the recorded graph to MLIR and print it.
tf_mlir_graph = mlir.experimental.convert_graph_def(g.as_graph_def())
print(tf_mlir_graph)
This code emits the following MLIR.
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 175 : i32}} {
func @main() {
%0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%1 = "tf.Const"() {value = dense<[[2.000000e+00, 1.000000e+00], [4.000000e+00, 3.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%2 = "tf.Const"() {value = dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%3 = "tf.VarHandleOp"() {_class = ["loc:@Variable"], container = "", device = "/device:CPU:0", dtype = f32, shape = "tfshape$dim { size: 2 } dim { size: 2 }", shared_name = "Variable"} : () -> tensor<!tf.resource<tensor<2x2xf32>>>
"tf.StatefulPartitionedCall"(%2, %1, %3) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_RESOURCE"], Tout = [], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "/device:CPU:0", executor_type = "", f = @__inference_mymatmul_1160} : (tensor<2x2xf32>, tensor<2x2xf32>, tensor<!tf.resource<tensor<2x2xf32>>>) -> ()
%4 = "tf.VarIsInitializedOp"(%3) {device = "/device:CPU:0"} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<i1>
%5 = "tf.ReadVariableOp"(%3) {device = "/device:CPU:0", dtype = f32} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<2x2xf32>
"tf.AssignVariableOp"(%3, %0) {device = "/device:CPU:0", dtype = f32} : (tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>) -> ()
return
}
func @__inference_mymatmul_1160(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>, %arg2: tensor<!tf.resource>) attributes {tf.signature.is_stateful} {
%0 = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32>
%1 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32>
%2 = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> tensor<2xi32>
%3 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32>
%4 = "tf.Const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32>
%5 = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32>
%6 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
%7 = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32>
%8 = "tf.ReadVariableOp"(%arg2) {device = "", dtype = f32} : (tensor<!tf.resource>) -> tensor<*xf32>
%9 = "tf.StridedSlice"(%arg0, %3, %0, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%10 = "tf.StridedSlice"(%arg1, %3, %0, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%11 = "tf.Mul"(%9, %10) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%12 = "tf.AddV2"(%11, %7) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%13 = "tf.StridedSlice"(%arg0, %6, %5, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%14 = "tf.StridedSlice"(%arg1, %6, %5, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%15 = "tf.Mul"(%13, %14) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%16 = "tf.AddV2"(%15, %7) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%17 = "tf.StridedSlice"(%arg0, %0, %4, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%18 = "tf.Mul"(%17, %14) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%19 = "tf.AddV2"(%16, %18) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%20 = "tf.StridedSlice"(%arg1, %0, %4, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%21 = "tf.Mul"(%13, %20) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%22 = "tf.AddV2"(%21, %7) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%23 = "tf.Mul"(%17, %20) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%24 = "tf.AddV2"(%22, %23) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%25 = "tf.StridedSlice"(%arg0, %2, %1, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%26 = "tf.Mul"(%25, %10) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%27 = "tf.AddV2"(%12, %26) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
"tf.ResourceStridedSliceAssign"(%arg2, %3, %0, %0, %27) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<!tf.resource>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<f32>) -> ()
%28 = "tf.ReadVariableOp"(%arg2) {device = "", dtype = f32} : (tensor<!tf.resource>) -> tensor<*xf32>
%29 = "tf.StridedSlice"(%arg1, %2, %1, %0) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%30 = "tf.Mul"(%9, %29) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%31 = "tf.AddV2"(%30, %7) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%32 = "tf.Mul"(%25, %29) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%33 = "tf.AddV2"(%31, %32) {T = f32, device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
"tf.ResourceStridedSliceAssign"(%arg2, %2, %1, %0, %33) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<!tf.resource>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<f32>) -> ()
%34 = "tf.ReadVariableOp"(%arg2) {device = "", dtype = f32} : (tensor<!tf.resource>) -> tensor<*xf32>
"tf.ResourceStridedSliceAssign"(%arg2, %6, %5, %0, %19) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<!tf.resource>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<f32>) -> ()
%35 = "tf.ReadVariableOp"(%arg2) {device = "", dtype = f32} : (tensor<!tf.resource>) -> tensor<*xf32>
"tf.ResourceStridedSliceAssign"(%arg2, %0, %4, %0, %24) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<!tf.resource>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<f32>) -> ()
return
}
}
What is interesting, at least for my purposes, is the loss of the loop structure of the computation. In the tf dialect, the loop structure is flattened but I would like the output MLIR to reflect/preserve the original loop structure expressed in the TF operator graph.
I suppose another way of phrasing this question is to ask whether the TensorFlow dialect supports control-flow constructs (I believe it does, via tf.IfOp and tf.WhileOp), and whether there are any particular syntax restrictions the input should adhere to in order to retain the loop structure.
What would be the best way to go about this?
P.S. I suspect this might have something to do with eager execution, which is the default behavior in TF >= 2.0. Maybe someone can verify this?
Thanks,
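Modifying the input computation to the following did the trick. I believe the problem was (at least partly) the mixing of Python variables with TF variables: with plain Python integers as loop bounds the loops are unrolled while the function is traced, whereas with tensor bounds (m and n below) they are kept as tf.While ops. The following effectively preserves the symbolic structure of the computation.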
# Build the computation inside an explicit graph, pinned to the CPU.
with tf.Graph().as_default() as g:
    with tf.device('/cpu:0'):
        @tf.function
        def mymatmul(A, B, C, m, n):
            # m and n are passed in as tf.constant tensors (see below), so the
            # loop bounds are tensors rather than Python ints.
            for i in range(m):
                for j in range(m):
                    for k in range(n):
                        # Accumulate directly into the variable instead of a
                        # Python-side temporary.
                        C[i,j].assign(tf.math.add(C[i, j], tf.math.multiply(A[i, k], B[k, j])))
            return C

        A = tf.constant([[1., 2.], [3., 4.]])
        B = tf.constant([[2., 1.], [4., 3.]])
        C = tf.Variable((tf.zeros((2, 2), dtype=tf.float32)))
        m = tf.constant(2)
        n = tf.constant(2)
        mymatmul(A, B, C, m, n)
This generates the following MLIR with tf.While.
module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 412 : i32}} {
func #main() {
%0 = "tf.Const"() {value = dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%1 = "tf.Const"() {value = dense<[[2.000000e+00, 1.000000e+00], [4.000000e+00, 3.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%2 = "tf.Const"() {value = dense<2> : tensor<i32>} : () -> tensor<i32>
%3 = "tf.Const"() {value = dense<0.000000e+00> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%4 = "tf.VarHandleOp"() {_class = ["loc:#Variable"], allowed_devices = [], container = "", device = "/device:CPU:0", shared_name = "Variable"} : () -> tensor<!tf.resource<tensor<2x2xf32>>>
%5 = "tf.StatefulPartitionedCall"(%0, %1, %4, %2, %2) {_collective_manager_ids = [], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "/device:CPU:0", executor_type = "", f = #__inference_mymatmul_3650} : (tensor<2x2xf32>, tensor<2x2xf32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<i32>) -> tensor<2x2xf32>
%6 = "tf.VarIsInitializedOp"(%4) {device = "/device:CPU:0"} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<i1>
%7 = "tf.ReadVariableOp"(%4) {device = "/device:CPU:0"} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<2x2xf32>
"tf.AssignVariableOp"(%4, %3) {device = "/device:CPU:0"} : (tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>) -> ()
return
}
func #__inference_mymatmul_3650(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>, %arg2: tensor<!tf.resource<tensor<2x2xf32>>>, %arg3: tensor<i32>, %arg4: tensor<i32>) -> tensor<2x2xf32> attributes {tf.signature.is_stateful} {
%0 = "tf.Const"() {value = dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%1 = "tf.Const"() {value = dense<[[2.000000e+00, 1.000000e+00], [4.000000e+00, 3.000000e+00]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32>
%2 = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
%3 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%4 = "tf.Const"() {value = dense<2> : tensor<i32>} : () -> tensor<i32>
%5:10 = "tf.While"(%3, %4, %3, %2, %4, %4, %4, %arg2, %0, %1) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = #while_body_1410, cond = #while_cond_1400, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<2x2>, #tf.shape<2x2>], parallel_iterations = 10 : i64} : (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>, tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>, tensor<2x2xf32>)
%6 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<2x2xf32>
%7 = "tf.Identity"(%6) {device = ""} : (tensor<2x2xf32>) -> tensor<2x2xf32>
return %7 : tensor<2x2xf32>
}
func #while_body_1410(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<i32>, %arg6: tensor<i32>, %arg7: tensor<!tf.resource<tensor<2x2xf32>>>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>, tensor<2x2xf32>) attributes {tf.signature.is_stateful} {
%0 = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%2 = "tf.Maximum"(%arg5, %1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%3 = "tf.FloorDiv"(%2, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%4 = "tf.FloorMod"(%2, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%5 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%6 = "tf.AddV2"(%arg0, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%7 = "tf.NotEqual"(%4, %1) {device = "", incompatible_shape_error = true} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<i1>) -> tensor<i32>
%9 = "tf.AddV2"(%3, %8) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%10 = "tf.Maximum"(%9, %1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%11:10 = "tf.While"(%1, %10, %1, %0, %2, %arg6, %arg7, %arg2, %arg8, %arg9) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = #while_body_1830, cond = #while_cond_1820, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<2x2>, #tf.shape<2x2>], parallel_iterations = 10 : i64} : (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>)
%12 = "tf.Identity"(%6) {device = ""} : (tensor<i32>) -> tensor<i32>
%13 = "tf.Identity"(%arg1) {device = ""} : (tensor<i32>) -> tensor<i32>
%14 = "tf.Identity"(%5) {device = ""} : (tensor<i32>) -> tensor<i32>
return %12, %13, %14, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9 : tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2x2xf32>, tensor<2x2xf32>
}
func #while_body_1830(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<i32>, %arg6: tensor<!tf.resource<tensor<2x2xf32>>>, %arg7: tensor<i32>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>) attributes {tf.signature.is_stateful} {
%0 = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%2 = "tf.Maximum"(%arg5, %1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%3 = "tf.FloorDiv"(%2, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%4 = "tf.FloorMod"(%2, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%5 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%6 = "tf.AddV2"(%arg0, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%7 = "tf.NotEqual"(%4, %1) {device = "", incompatible_shape_error = true} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%8 = "tf.Cast"(%7) {Truncate = false, device = ""} : (tensor<i1>) -> tensor<i32>
%9 = "tf.AddV2"(%3, %8) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%10 = "tf.Maximum"(%9, %1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%11:10 = "tf.While"(%1, %10, %1, %0, %2, %arg6, %arg7, %arg2, %arg8, %arg9) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = #while_body_2250, cond = #while_cond_2240, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<2x2>, #tf.shape<2x2>], parallel_iterations = 10 : i64} : (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>)
%12 = "tf.Identity"(%6) {device = ""} : (tensor<i32>) -> tensor<i32>
%13 = "tf.Identity"(%arg1) {device = ""} : (tensor<i32>) -> tensor<i32>
%14 = "tf.Identity"(%5) {device = ""} : (tensor<i32>) -> tensor<i32>
return %12, %13, %14, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9 : tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>
}
func #while_body_2250(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<!tf.resource<tensor<2x2xf32>>>, %arg6: tensor<i32>, %arg7: tensor<i32>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> (tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>) attributes {tf.signature.is_stateful} {
%0 = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32>
%2 = "tf.AddV2"(%arg7, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%3 = "tf.AddV2"(%arg6, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%4 = "tf.Pack"(%3, %2) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%5 = "tf.Pack"(%arg6, %arg7) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%6 = "tf.AddV2"(%arg2, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%7 = "tf.Pack"(%3, %6) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%8 = "tf.Pack"(%6, %2) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%9 = "tf.Pack"(%arg6, %arg2) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%10 = "tf.Pack"(%arg2, %arg7) {axis = 0 : i64, device = ""} : (tensor<i32>, tensor<i32>) -> tensor<2xi32>
%11 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%12 = "tf.ReadVariableOp"(%arg5) {device = ""} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<2x2xf32>
%13 = "tf.ReadVariableOp"(%arg5) {device = ""} : (tensor<!tf.resource<tensor<2x2xf32>>>) -> tensor<2x2xf32>
%14 = "tf.StridedSlice"(%13, %5, %4, %1) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%15 = "tf.StridedSlice"(%arg8, %9, %7, %1) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%16 = "tf.StridedSlice"(%arg9, %10, %8, %1) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<2x2xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<f32>
%17 = "tf.Mul"(%15, %16) {device = ""} : (tensor<f32>, tensor<f32>) -> tensor<f32>
%18 = "tf.AddV2"(%14, %17) : (tensor<f32>, tensor<f32>) -> tensor<f32>
"tf.ResourceStridedSliceAssign"(%arg5, %5, %4, %1, %18) {Index = i32, T = f32, begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 3 : i64} : (tensor<!tf.resource<tensor<2x2xf32>>>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<f32>) -> ()
%19 = "tf.Identity"(%arg1) {device = ""} : (tensor<i32>) -> tensor<i32>
%20 = "tf.Identity"(%11) {device = ""} : (tensor<i32>) -> tensor<i32>
%21 = "tf.AddV2"(%arg0, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%22 = "tf.Identity"(%21) {device = ""} : (tensor<i32>) -> tensor<i32>
return %22, %19, %20, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9 : tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<!tf.resource<tensor<2x2xf32>>>, tensor<i32>, tensor<i32>, tensor<2x2xf32>, tensor<2x2xf32>
}
func #while_cond_2240(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<!tf.resource<tensor<2x2xf32>>>, %arg6: tensor<i32>, %arg7: tensor<i32>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> tensor<i1> {
%0 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.GreaterEqual"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%2 = "tf.Less"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%3 = "tf.Greater"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%4 = "tf.LogicalAnd"(%2, %3) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%5 = "tf.Less"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%6 = "tf.LogicalAnd"(%1, %5) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%7 = "tf.LogicalOr"(%6, %4) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%8 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%9 = "tf.LogicalAnd"(%8, %7) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%10 = "tf.Identity"(%9) {device = ""} : (tensor<i1>) -> tensor<i1>
return %10 : tensor<i1>
}
func #while_cond_1820(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<i32>, %arg6: tensor<!tf.resource<tensor<2x2xf32>>>, %arg7: tensor<i32>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> tensor<i1> {
%0 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.GreaterEqual"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%2 = "tf.Less"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%3 = "tf.Greater"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%4 = "tf.LogicalAnd"(%2, %3) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%5 = "tf.Less"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%6 = "tf.LogicalAnd"(%1, %5) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%7 = "tf.LogicalOr"(%6, %4) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%8 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%9 = "tf.LogicalAnd"(%8, %7) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%10 = "tf.Identity"(%9) {device = ""} : (tensor<i1>) -> tensor<i1>
return %10 : tensor<i1>
}
func #while_cond_1400(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3: tensor<i32>, %arg4: tensor<i32>, %arg5: tensor<i32>, %arg6: tensor<i32>, %arg7: tensor<!tf.resource<tensor<2x2xf32>>>, %arg8: tensor<2x2xf32>, %arg9: tensor<2x2xf32>) -> tensor<i1> {
%0 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%1 = "tf.GreaterEqual"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%2 = "tf.Less"(%arg3, %0) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%3 = "tf.Greater"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%4 = "tf.LogicalAnd"(%2, %3) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%5 = "tf.Less"(%arg2, %arg4) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%6 = "tf.LogicalAnd"(%1, %5) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%7 = "tf.LogicalOr"(%6, %4) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%8 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor<i32>, tensor<i32>) -> tensor<i1>
%9 = "tf.LogicalAnd"(%8, %7) {device = ""} : (tensor<i1>, tensor<i1>) -> tensor<i1>
%10 = "tf.Identity"(%9) {device = ""} : (tensor<i1>) -> tensor<i1>
return %10 : tensor<i1>
}
}
This is still too verbose and dense for my liking (and `for` is my preferred loop construct whenever possible), but that will have to be addressed during dialect conversion and is a different problem altogether.

clang/LLVM project level optimization

So periodically I try LLVM as I have this theory it should outperform GNU. And then it sadly doesn't.
Part of the theory has to do with its ability to link modules/objects together and THEN optimize, where normally optimization happens on a per file/object basis.
Instead of using a generic build, I worked out how to build for a specific default target:
rm -rf llvm-project
git clone https://github.com/llvm/llvm-project.git
cd llvm-project
git checkout llvmorg-10.0.0
mkdir build
cd build
cmake -DLLVM_ENABLE_PROJECTS='clang;lld' -DCMAKE_CROSSCOMPILING=True -DCMAKE_INSTALL_PREFIX=/opt/llvm/llvm10armv6m -DLLVM_DEFAULT_TARGET_TRIPLE=armv6m-none-eabi -DLLVM_TARGET_ARCH=ARM -DLLVM_TARGETS_TO_BUILD=ARM -G "Unix Makefiles" ../llvm
make -j 8
make -j 4
make
sudo make install
And the test files
test.c
unsigned int one ( void )
{
return(1);
}
unsigned int two ( void );
unsigned int testone ( void )
{
return(one());
}
unsigned int testtwo ( void )
{
return(two());
}
two.c
unsigned int two ( void )
{
return(2);
}
basic run
clang -O2 -fomit-frame-pointer -c test.c -o test.o
llvm-objdump -D test.o
00000000 one:
0: 01 20 movs r0, #1
2: 70 47 bx lr
00000004 testone:
4: 01 20 movs r0, #1
6: 70 47 bx lr
00000008 testtwo:
8: 80 b5 push {r7, lr}
a: ff f7 fe ff bl #-4
e: 80 bd pop {r7, pc}
As one would expect, one() has been inlined into testone().
The goal is to get two() inlined into testtwo() as well.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llc both.bc -o both.s
cat both.s
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
gives
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
and
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
pop {r7, pc}
that is worse.
opt -std-link-opts both.bc -o both.opt.bc
same, no better
Now this works
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O2 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
movs r0, #1
bx lr
testtwo:
.fnstart
# %bb.0: # %entry
movs r0, #2
bx lr
One would think that not optimizing the parts would give more meat for the optimization of the whole to chew on. Yes? Although this indicates otherwise.
clang -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
opt -O3 both.bc -o both.opt.bc
llc both.opt.bc -o both.opt.s
cat both.opt.s
testone:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl one
movs r0, #1
pop {r7, pc}
testtwo:
.fnstart
# %bb.0: # %entry
.save {r7, lr}
push {r7, lr}
bl two
movs r0, #2
pop {r7, pc}
-O3 doesn't help either, and this output is pretty bad: it calls the function AND inlines it. What is going on there?!
llvm-dis both.opt.bc
cat both.opt.ll
; ModuleID = 'both.opt.bc'
source_filename = "llvm-link"
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv6m-none-unknown-eabi"
; Function Attrs: noinline nounwind optnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
%call = call i32 #one()
ret i32 1
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #testtwo() local_unnamed_addr #0 {
entry:
%call = call i32 #two()
ret i32 2
}
; Function Attrs: noinline nounwind optnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
How does one undo that?
clang -O2 -fomit-frame-pointer -c -emit-llvm test.c -o test.bc
clang -O2 -fomit-frame-pointer -c -emit-llvm two.c -o two.bc
llvm-link test.bc two.bc -o both.bc
llvm-dis both.bc
cat both.ll
opt -O3 both.bc -o both.opt.bc
llvm-dis both.opt.bc
cat both.opt.ll
gives
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 #testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 #two() #2
ret i32 %call
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
and
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testtwo() local_unnamed_addr #0 {
entry:
ret i32 2
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #two() local_unnamed_addr #0 {
entry:
ret i32 2
}
So is it correct that you have to apply the optimizations everywhere, at the file/object level in order to get the project level to optimize?
And then there is the question of tail-call, leaf, etc. optimization: if nothing else, testtwo, even in the first case
clang -O2 -fomit-frame-pointer -c test.c -o test.o
could simply branch to two() and not set up a stack frame or do any of that. Or is this a Thumb thing? Can b not reach?
one:
0: b8 01 00 00 00 movl $1, %eax
5: c3 retq
testone:
10: b8 01 00 00 00 movl $1, %eax
15: c3 retq
testtwo:
20: e9 00 00 00 00 jmp 0 <testtwo+5>
In gnu the linker patches up any branch reaching or mode issues with trampolines
arm-none-eabi-gcc -c -O2 -mcpu=cortex-m0 test.c -o test.o
arm-none-eabi-objdump -D test.o
00000000 <one>:
0: 2001 movs r0, #1
2: 4770 bx lr
00000004 <testone>:
4: 2001 movs r0, #1
6: 4770 bx lr
00000008 <testtwo>:
8: b510 push {r4, lr}
a: f7ff fffe bl 0 <two>
e: bd10 pop {r4, pc}
Okay I stand corrected...
clang --version
clang version 10.0.0 (https://github.com/llvm/llvm-project.git d32170dbd5b0d54436537b6b75beaf44324e0c28)
Target: armv6m-none-unknown-eabi
Thread model: posix
InstalledDir: /opt/llvm/llvm10armv6m/bin
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
I guess the question is: if one wants to do a project-level optimization using llvm-link and opt, is optimization of each of the individual items required, or is there a command line option I am missing? I am not interested in compiler-specific attributes that go into the source code itself; I want the code infected with neither gcc nor llvm specifics.
After gcc 5.x.x the code got more bloated, so I was hoping that llvm would have a chance, but whenever I try this (on real projects, not just 10 lines of code) gcc ends up with fewer executed instructions and/or fewer memory accesses, etc. For simple demonstration functions like the ones above, with some exceptions, they produce the same/equivalent output.
Is there something, another one of the tools, or command line options, that I am missing in order to get more out of clang/llvm?
Is it that this is too trivial of an example for the tool to shine?
EDIT based on answer
clang -c start.s -o start.o
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
clang -O2 -flto=thin -fomit-frame-pointer -c two.c
ld.lld start.o test.o two.o -o test.elf
llvm-objdump -D test.elf
000110fc testtwo:
110fc: 02 20 movs r0, #2
110fe: 70 47 bx lr
00011100 two:
11100: 02 20 movs r0, #2
11102: 70 47 bx lr
so getting rid of the -emit-llvm and using lto basically gives the desired result.
Looking at the bc disassembly
clang -O2 -flto=thin -fomit-frame-pointer -c test.c
llvm-dis test.o
cat test.o.ll
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #one() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: norecurse nounwind readnone
define dso_local i32 #testone() local_unnamed_addr #0 {
entry:
ret i32 1
}
; Function Attrs: nounwind
define dso_local i32 #testtwo() local_unnamed_addr #1 {
entry:
%call = tail call i32 #two() #3
ret i32 %call
}
enables/adds the tail call. I really dislike using the compiler/shell as a linker (for embedded projects that have their own bootstrap and linker script). llvm-ldd usage wasn't easy to figure out (basically, I couldn't figure it out), but ld.lld also supports the LTO stuff, so that worked out.
The answer is pretty easy actually: one should never want to use llc / opt / llvm-link for performing "end-user" project level kind of optimizations. These are developer-side tools with different defaults, thresholds, etc. Basically, they are just simple command-line frontends to various pieces of LLVM toolbox.
In order to perform proper link-time optimization you need to use the pipelines that were intended for that task. Basically, compiling everything using "clang -flto" and then linking everything again via "clang -flto" would work. Using an LTO-aware linker such as lld is a prerequisite as well.
Some further information about ThinLTO could also be found here: https://clang.llvm.org/docs/ThinLTO.html and http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html

iVar Shown as Private Global In LLVMIR

I have declared an ivar in a class:
#implementation LLVMIRTest{
NSString* ivarTest;
}
When I check for LLVM IR it shows me:
#OBJC_METH_VAR_NAME_ = private global [9 x i8] c"ivarTest\00", section "__TEXT,__objc_methname,cstring_literals", align 1
#OBJC_METH_VAR_TYPE_ = private global [12 x i8] c"#\22NSString\22\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
Why is it marked "private global" in the LLVM IR? Why not only "private"?
This is full Module LLVM IR:
; ModuleID = 'LLVMIRTest.m'
source_filename = "LLVMIRTest.m"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.12.0"
%struct._objc_cache = type opaque
%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
%struct._objc_method = type { i8*, i8*, i8* }
%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
%struct._ivar_t = type { i64*, i8*, i8*, i32, i32 }
%struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
%struct._prop_t = type { i8*, i8* }
#_objc_empty_cache = external global %struct._objc_cache
#"OBJC_METACLASS_$_NSObject" = external global %struct._class_t
#OBJC_CLASS_NAME_ = private global [11 x i8] c"LLVMIRTest\00", section "__TEXT,__objc_classname,cstring_literals", align 1
#"\01l_OBJC_METACLASS_RO_$_LLVMIRTest" = private global %struct._class_ro_t { i32 1, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([11 x i8], [11 x i8]* #OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
#"OBJC_METACLASS_$_LLVMIRTest" = global %struct._class_t { %struct._class_t* #"OBJC_METACLASS_$_NSObject", %struct._class_t* #"OBJC_METACLASS_$_NSObject", %struct._objc_cache* #_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* #"\01l_OBJC_METACLASS_RO_$_LLVMIRTest" }, section "__DATA, __objc_data", align 8
#"OBJC_CLASS_$_NSObject" = external global %struct._class_t
#"OBJC_IVAR_$_LLVMIRTest.ivarTest" = hidden global i64 8, section "__DATA, __objc_ivar", align 8
#OBJC_METH_VAR_NAME_ = private global [9 x i8] c"ivarTest\00", section "__TEXT,__objc_methname,cstring_literals", align 1
#OBJC_METH_VAR_TYPE_ = private global [12 x i8] c"#\22NSString\22\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
#"\01l_OBJC_$_INSTANCE_VARIABLES_LLVMIRTest" = private global { i32, i32, [1 x %struct._ivar_t] } { i32 32, i32 1, [1 x %struct._ivar_t] [%struct._ivar_t { i64* #"OBJC_IVAR_$_LLVMIRTest.ivarTest", i8* getelementptr inbounds ([9 x i8], [9 x i8]* #OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* #OBJC_METH_VAR_TYPE_, i32 0, i32 0), i32 3, i32 8 }] }, section "__DATA, __objc_const", align 8
#"\01l_OBJC_CLASS_RO_$_LLVMIRTest" = private global %struct._class_ro_t { i32 0, i32 8, i32 16, i8* null, i8* getelementptr inbounds ([11 x i8], [11 x i8]* #OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* bitcast ({ i32, i32, [1 x %struct._ivar_t] }* #"\01l_OBJC_$_INSTANCE_VARIABLES_LLVMIRTest" to %struct._ivar_list_t*), i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
#"OBJC_CLASS_$_LLVMIRTest" = global %struct._class_t { %struct._class_t* #"OBJC_METACLASS_$_LLVMIRTest", %struct._class_t* #"OBJC_CLASS_$_NSObject", %struct._objc_cache* #_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* #"\01l_OBJC_CLASS_RO_$_LLVMIRTest" }, section "__DATA, __objc_data", align 8
#"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* #"OBJC_CLASS_$_LLVMIRTest" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
#llvm.compiler.used = appending global [5 x i8*] [i8* getelementptr inbounds ([11 x i8], [11 x i8]* #OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* #OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* #OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._ivar_t] }* #"\01l_OBJC_$_INSTANCE_VARIABLES_LLVMIRTest" to i8*), i8* bitcast ([1 x i8*]* #"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
!llvm.ident = !{!6}
!0 = !{i32 1, !"Objective-C Version", i32 2}
!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
!4 = !{i32 1, !"Objective-C Class Properties", i32 64}
!5 = !{i32 1, !"PIC Level", i32 2}
!6 = !{!"Apple LLVM version 8.0.0 (clang-800.0.38)"}
Global variables are module-scoped and are initialized at compile time rather than at runtime.
Most front-ends to LLVM put strings at the module scope and those require a global or a constant tag. There is a way to store strings on the stack instead but that is a rare occurrence in my experience. I typically create strings as
#somename = internal constant....
The "private" linkage instructs LLVM not to expose the symbol outside of the module.
I assume this is how Objective-C adds type and instrumentation support for classes. If you look at the rest of the output it is likely that the pointers to those strings are getting passed into the RT library (e.g. %x = load ...) prior to some call.
Update after OP added listing
What you've listed is the 'static' module-level constructs representing your class. If you look closely, the class declaration consists of a number of embedded structures and strings. Because these declarations are at the module level, they are declared global, which puts them, as you would expect, in the data segment, as they are data after all and not method implementation. Think of your class as a structure containing not only space for your variable 'iVar' but also the additional information necessary for the Objective-C RT.
So, classes and their variables are considered module-level declarations, which are represented as data structures at the module level and are marked global by LLVM standards. See the LLVM Global Variable documentation.

Why do these simple methods compile differently?

I'm slightly confused as to why clang is emitting different code for the following two methods:
#interface ClassA : NSObject
#end
#implementation ClassA
+ (ClassA*)giveMeAnObject1 {
return [[ClassA alloc] init];
}
+ (id)giveMeAnObject2 {
return [[ClassA alloc] init];
}
#end
If we look at the ARMv7 assembly emitted at O3, with ARC enabled, we see this:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject1]"
"+[ClassA giveMeAnObject1]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC0_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC0_1+4))
LPC0_0:
add r1, pc
LPC0_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC0_2+4))
LPC0_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autorelease
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject2]"
"+[ClassA giveMeAnObject2]":
push {r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
mov r7, sp
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r7, lr}
b.w _objc_autoreleaseReturnValue
The only difference is the tail call to objc_autoreleaseReturnValue vs objc_autorelease. I would expect both to call objc_autoreleaseReturnValue, to be honest. In fact, the first method not using objc_autoreleaseReturnValue means that it will potentially be slower than the second, because there will definitely be an autorelease followed by a retain by the caller, rather than the faster bypass of this redundant pair that ARC can do if it's supported in the runtime.
The LLVM IR which is emitted gives some indication of why it's like that:
define internal %1* #"\01+[ClassA giveMeAnObject1]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** #"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* #objc_autorelease(i8* %6) nounwind
%8 = bitcast i8* %6 to %1*
ret %1* %8
}
define internal i8* #"\01+[ClassA giveMeAnObject2]"(i8* nocapture %self, i8* nocapture %_cmd) {
%1 = load %struct._class_t** #"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 4
%2 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_", align 4
%3 = bitcast %struct._class_t* %1 to i8*
%4 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %3, i8* %2)
%5 = load i8** #"\01L_OBJC_SELECTOR_REFERENCES_2", align 4
%6 = tail call i8* bitcast (i8* (i8*, i8*, ...)* #objc_msgSend to i8* (i8*, i8*)*)(i8* %4, i8* %5)
%7 = tail call i8* #objc_autoreleaseReturnValue(i8* %6) nounwind
ret i8* %6
}
But I'm struggling to see why it has decided to compile these two methods differently. Can anyone shed some light on it?
Update:
Even weirder are these other methods:
+ (ClassA*)giveMeAnObject3 {
ClassA *a = [[ClassA alloc] init];
return a;
}
+ (id)giveMeAnObject4 {
ClassA *a = [[ClassA alloc] init];
return a;
}
These compile to:
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject3]"
"+[ClassA giveMeAnObject3]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC2_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC2_1+4))
LPC2_0:
add r1, pc
LPC2_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC2_2+4))
LPC2_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
.align 2
.code 16
.thumb_func "+[ClassA giveMeAnObject4]"
"+[ClassA giveMeAnObject4]":
push {r4, r7, lr}
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
add r7, sp, #4
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_-(LPC3_0+4))
movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC3_1+4))
LPC3_0:
add r1, pc
LPC3_1:
add r0, pc
ldr r1, [r1]
ldr r0, [r0]
blx _objc_msgSend
movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC3_2+4))
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
blx _objc_retainAutoreleasedReturnValue
mov r4, r0
mov r0, r4
blx _objc_release
mov r0, r4
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
This time they are identical; however, there are a few things which could be optimised even more here:
There's a redundant mov r4, r0 followed by mov r0, r4.
There's a retain followed by a release.
Surely, the bottom bit of both of those methods can turn into:
LPC3_2:
add r1, pc
ldr r1, [r1]
blx _objc_msgSend
pop.w {r4, r7, lr}
b.w _objc_autoreleaseReturnValue
Obviously we could then also omit popping r4 because we don't actually clobber it any more. Then the method would turn into the exact same as giveMeAnObject2 which is exactly what we'd expect.
Why is clang not being clever and doing this?!
This appears to be a bug in the optimizer and is being tracked as rdar://problem/10813093.