Verilog: cannot be driven by primitive or continous assignment - module

Verilog: cannot be driven by primitive or continous assignment
ok, its giving me this error on on every line that I do an instance of FullAdder 32 bit.
module Multiplier_S(output reg [63:0] f_result, input [31:0] a, input [31:0] b);
wire [31:0] sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,
sum15,sum16,sum17,sum18,sum19,sum20,sum21,sum22,sum23,sum24,sum25,sum26,sum27,sum28,
sum29,sum30,sum31;
wire [31:0] and1,and2,and3,and4,and5,and6,and7,and8,and9,and10,and11,and12,and13,and14,
and15,and16,and17,and18,and19,and20,and21,and22,and23,and24,and25,and26,and27,and28,
and29,and30,and31,and32;
reg [63:0] result;
//bit0
AND_Bank a_1(and1, a[0], b);
initial begin
result[0] = and1[0];
end
Shift_Right_32bit shift1(and1, 1, and1);
initial begin
and1[31]=1'b0;
end
//bit 1
AND_Bank a_2(and2, a[1], b);
FullAdder_32bit adder1(sum1, result[1], and2, and1);
//bit 2
AND_Bank a_3(and3, a[2], b);
FullAdder_32bit adder2(sum2, result[2], and3, sum1);
//bit 3
AND_Bank a_4(and4, a[3], b);
FullAdder_32bit adder3(sum3, result[3], and4, sum2);
//bit 4
AND_Bank a_5(and5, a[4], b);
FullAdder_32bit adder4(sum4, result[4], and5, sum3);
//bit 5
AND_Bank a_6(and6, a[5], b);
FullAdder_32bit adder5(sum5, result[5], and6, sum4);
//bit 6
AND_Bank a_7(and7, a[6], b);
FullAdder_32bit adder6(sum6, result[6], and7, sum5);
//bit 7
AND_Bank a_8(and8, a[7], b);
FullAdder_32bit adder7(sum7, result[7], and8, sum6);
//bit 8
AND_Bank a_9(and10, a[8], b);
FullAdder_32bit adder8(sum8, result[8], and9, sum7);
//bit 9
AND_Bank a_10(and11, a[9], b);
FullAdder_32bit adder9(sum9, result[9], and10, sum8);
//bit 10
AND_Bank a_11(and12, a[10], b);
FullAdder_32bit adder10(sum10, result[10], and11, sum9);
//bit 11
AND_Bank a_12(and13, a[11], b);
FullAdder_32bit adder11(sum11, result[11], and12, sum10);
//bit 12
AND_Bank a_13(and14, a[12], b);
FullAdder_32bit adder12(sum12, result[12], and13, sum11);
//bit 13
AND_Bank a_14(and15, a[13], b);
FullAdder_32bit adder13(sum13, result[13], and14, sum12);
//bit 14
AND_Bank a_15(and16, a[14], b);
FullAdder_32bit adder14(sum14, result[14], and15, sum13);
//bit 15
AND_Bank a_16(and17, a[15], b);
FullAdder_32bit adder15(sum15, result[15], and16, sum14);
//bit 16
AND_Bank a_17(and18, a[16], b);
FullAdder_32bit adder16(sum16, result[16], and17, sum15);
//bit 17
AND_Bank a_18(and19, a[17], b);
FullAdder_32bit adder17(sum17, result[17], and18, sum16);
//bit 18
AND_Bank a_19(and20, a[18], b);
FullAdder_32bit adder18(sum18, result[18], and19, sum17);
//bit 19
AND_Bank a_20(and21, a[19], b);
FullAdder_32bit adder19(sum19, result[19], and20, sum18);
//bit 20
AND_Bank a_21(and22, a[20], b);
FullAdder_32bit adder20(sum20, result[20], and21, sum19);
//bit 21
AND_Bank a_22(and23, a[21], b);
FullAdder_32bit adder21(sum21, result[21], and22, sum20);
//bit 22
AND_Bank a_23(and24, a[22], b);
FullAdder_32bit adder22(sum22, result[22], and23, sum21);
//bit 23
AND_Bank a_24(and25, a[23], b);
FullAdder_32bit adder23(sum23, result[23], and24, sum22);
//bit 24
AND_Bank a_25(and25, a[24], b);
FullAdder_32bit adder24(sum24, result[24], and25, sum23);
//bit 25
AND_Bank a_26(and26, a[25], b);
FullAdder_32bit adder25(sum25, result[25], and26, sum24);
//bit 26
AND_Bank a_27(and27, a[26], b);
FullAdder_32bit adder26(sum26, result[26], and27, sum25);
//bit 27
AND_Bank a_28(and28, a[27], b);
FullAdder_32bit adder27(sum27, result[27], and28, sum26);
//bit 28
AND_Bank a_29(and29, a[28], b);
FullAdder_32bit adder28(sum28, result[28], and29, sum27);
//bit 29
AND_Bank a_30(and30, a[29], b);
FullAdder_32bit adder29(sum29, result[29], and30, sum28);
//bit 30
AND_Bank a_31(and31, a[30], b);
FullAdder_32bit adder30(sum30, result[30], and31, sum29);
//bit 31
AND_Bank a_32(and32, a[31], b);
FullAdder_32bit adder31(sum31, result[31], and32, sum30);
//bit 63 al 32
initial begin
result[62:32] = sum31[30:0];
end
if(a[31] || b[31])
begin
initial begin
result[63] = 1'b0;
end
end
else
begin
initial begin
result[63] = 1'b1;
end
end
initial begin
f_result[63:0] = result[31:0];
end
endmodule
and here is the full adder module:
module FullAdder_32bit(output [31:0] result, output reg carry, input [31:0] a, input [31:0] b);
reg [32:0] temp_sum;
initial begin
temp_sum = a + b;
checkCarryFlag;
assign result = temp_sum[31:0];
end
task checkCarryFlag;
begin
if( temp_sum[32] == 1 )
begin
carry = 1;
end
else carry = 0;
end
endtask
endmodule
and here is the command console with one error, but its really for everytime I call on the full adder. I am a begginer on verilog, and wish to learn why I am getting all these errors. thanks in advance.

When connecting modules the output of an instance must drive a wire.
For example
module top
wire block_wire_output;
block u_block(
.block_reg_output( block_wire_output)
);
endmodule
module block(
output reg block_reg_output
);
initial begin
block_reg_output =1'b1;
end
endmodule
In your code you have reg [63:0] result; being driven by the output port of an instance. This breaks the above rule, and result should be declared as a wire (wire [63:0] result;). This does mean that you can not define part of result in an initial or always block. Your use of initial does not look to be correct as they are only evaluated once. It looks like you really wanted to use:
assign result[0] = and1[0];

Related

GMS2 trigger for event not working properly

(First of all i'm new to all of this so sorry in advance if this was a waste of time)
So I'm creating a game and decided to follow Friendly Cosmonaut's guide on triggers and cutscenes.
(link) https://www.youtube.com/watch?v=LDLxCXexcxk
I followed everything there and created a few custom scripts and everything works like a charm on the first room, but when I try adding a trigger on the second room it just doesn't work at all.
For example, I create a trigger so when it collides with the player object, it makes the screen shake (as a test) and it creates an object in the room. But when the code is the following:
t_scene_info = [
[screen_shake, 6, 1000],
]
(I'll provide code for all functions further in the post)
Nothing happens on collision, but when I change it to:
t_scene_info = [
[screen_shake(6, 1000)],
]
The code runs before even the player has collided, as soon as the trigger object gets created.
(Again, I tried this in the first room with the first 2 triggers, and it works fine.)
Here's the code for:
oCutscene (Create event):
scene_info = -1;
scene = 0;
timer = 0;
scene_info = [
[create_box_at_mouse],
[cutscene_wait, 2],
[create_box_at_mouse],
[cutscene_wait, 4],
[create_box_at_mouse]
];
event_perform(ev_other, ev_user0);
x_dest = -1;
y_dest = -1;
oCutscene (Step event):
script_execute_alt(current_scene[0], current_scene_array);
oCutscene (User event 0):
current_scene = scene_info[scene]
var len = array_length_1d(current_scene) -1;
current_scene_array = -1;
current_scene_array = array_create(len, 0);
array_copy(current_scene_array, 0, current_scene, 1, len);
oTriggerP (Step event) (The trigger that's supposed to execute the code when colliding with the player or playerd (just a dummy)):
if(!instance_exists(oCutscene)){
if(place_meeting(x, y, oPlayerD) || place_meeting(x, y, oPlayer)){
create_cutscene(t_scene_info);
instance_destroy();
}
}
oTriggerP (Create event):
t_scene_info = -1;
create_cutscene (Script):
if(!instance_exists(oCutscene)){
if(place_meeting(x, y, oPlayerD) || place_meeting(x, y, oPlayer)){
create_cutscene(t_scene_info);
instance_destroy();
}
}
cutscene_end_action (Script):
with(oCutscene){
scene++;
if(scene > array_length_1d(scene_info)-1){
instance_destroy();
exit;
}
event_perform(ev_other, ev_user0);
}
cutscene_wait (Script):
with(oCutscene){
scene++;
if(scene > array_length_1d(scene_info)-1){
instance_destroy();
exit;
}
event_perform(ev_other, ev_user0);
}
script_execute_alt (Script):
///#description script_execute_alt
///#arg ind
///#arg [arg1,arg2,...]
var num1 = argument0;
var a = argument1;
var len = array_length_1d(argument1);
switch(len){
case 0 : script_execute(0) break;
case 1 : script_execute(num1, a[0]); break;
case 2: script_execute(num1, a[0], a[1]); break;
case 3: script_execute(num1, a[0], a[1], a[2]); break;
case 4: script_execute(num1, a[0], a[1], a[2], a[3]); break;
case 5: script_execute(num1, a[0], a[1], a[2], a[3], a[4]); break;
case 6: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5]); break;
case 7: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6]); break;
case 8: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); break;
case 9: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]); break;
case 10: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
break;
case 11: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10]); break;
case 12: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11]); break;
case 13: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12]); break;
case 14: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13]); break;
case 15: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14]); break;
case 16: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14], a[15]); break;
case 17: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14], a[15], a[16]); break;
case 18: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17]); break;
case 19: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17], a[18]); break;
case 20: script_execute(num1, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9],
a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17], a[18], a[19]); break;
}
screen_shake (Script):
///#arg Magnitude
///#arg Time
with(oCamera){
if(argument0 > shake_remain){
shake_magnitude = argument0;
shake_remain = argument0;
shake_length = argument1;
}
cutscene_end_action();
}
Here's the creation code for the 2 triggers in the first room:
First trigger (oTriggerP):
t_scene_info = [
[cutscene_move_character, oPlayerD, 800, 279, false, 1],
[cutscene_wait, 1],
[instance_create_layer, 0, 0, 0, oIntro],
[cutscene_move_character, oPlayerD, 1170, 279, false, 1],
];
Second trigger (Another oTriggerP):
t_scene_info = [
[cutscene_move_character, oPlayerD, 800, 279, false, 1],
[cutscene_wait, 1],
[instance_create_layer, 0, 0, 0, oIntro],
[cutscene_move_character, oPlayerD, 1170, 279, false, 1],
[room_goto(Prehistoric)]
];
I really don't think the cutscene_wait and cutscene_move_character scripts have anything to do with this but I'll throw 'em in just in case
cutscene_wait (Script):
/// #description
/// #arg second
with(oCutscene){
timer++;
if(timer >= argument[0] * room_speed){
timer = 0;
cutscene_end_action();
}
}
cutscene_move_character (Script)
with(oCutscene){
var obj = argument0, relative = argument3, spd = argument4;
if(x_dest == -1){
if(!relative){
x_dest = argument1;
y_dest = argument2;
} else{
x_dest = obj.x + argument1;
y_dest = obj.y + argument2;
}
}
var xx = x_dest;
var yy = y_dest;
with(obj){
sprite_index = sPlayerR;
if(point_distance(x, y, xx, yy) >= spd){
var dir = point_direction(x, y, xx, yy);
var ldirx = lengthdir_x(spd, dir);
var ldiry = lengthdir_y(spd, dir);
if(dir != 0) { image_xscale = sign(ldirx); }
x += ldirx;
y += ldiry;
}else {
x = xx;
y = yy;
with(other){
x_dest = -1;
y_dest = -1;
cutscene_end_action();
}
}
}
}
Thank you in advance for your time and patience, and I apologize if I didn't include something or messed up on anything.

STM32F4-Disc1: user defined software delay in keil MDK version 5 not working

I am getting into learning embedded systems and I tried to implement blinky but the software delay gets skipped for some reason. I was expecting it to blink when I am pushing the button but instead the LEDs kept on.
Code I have used is shown below,
#include Board_LED.h
#include Board_Buttons.h
#include <stdint.h>
void delay(void);
void delay(void) {
int i;
for (i = 0; i < 5000000; i++)
;
}
int main(void) {
LED_Initialize();
Buttons_Initialize();
while (1) {
if (Buttons_GetState() == 1) {
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay();
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay();
}
}
return 0;
}
I'm using board support LED and button APIs.
How do I fix this?
My debugger starts as follows:
The problem here is this is dead code, it does nothing interacts with nothing so can/should be optimized out. And an optimizer will often do this.
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++);
}
optimized output:
00000000 <delay>:
0: 4770 bx lr
One way is to not optimize
00000000 <delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a04 ldr r2, [pc, #16] ; (28 <delay+0x28>)
16: 4293 cmp r3, r2
18: ddf8 ble.n c <delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46c0 nop ; (mov r8, r8)
1e: 46bd mov sp, r7
20: b002 add sp, #8
22: bc80 pop {r7}
24: bc01 pop {r0}
26: 4700 bx r0
But that's a bit brutal for an embedded platform so another is to beg the compiler to do something with the variable, keep it in memory and up to date:
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
It's still a bit ugly but that will burn some time:
00000000 <delay>:
0: 2300 movs r3, #0
2: b082 sub sp, #8
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 4a05 ldr r2, [pc, #20] ; (20 <delay+0x20>)
a: 4293 cmp r3, r2
c: dc05 bgt.n 1a <delay+0x1a>
e: 9b01 ldr r3, [sp, #4]
10: 3301 adds r3, #1
12: 9301 str r3, [sp, #4]
14: 9b01 ldr r3, [sp, #4]
16: 4293 cmp r3, r2
18: ddf9 ble.n e <delay+0xe>
1a: b002 add sp, #8
1c: 4770 bx lr
1e: 46c0 nop ; (mov r8, r8)
20: 004c4b3f .word 0x004c4b3f
The win-win way is to have another function outside the compile domain and let the optimizer work.
void dummy ( int );
void delay(void)
{
int i;
for(i=0; i<5000000 ;i++) dummy(i);
}
00000000 <delay>:
0: b570 push {r4, r5, r6, lr}
2: 2400 movs r4, #0
4: 4d04 ldr r5, [pc, #16] ; (18 <delay+0x18>)
6: 0020 movs r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bc70 pop {r4, r5, r6}
14: bc01 pop {r0}
16: 4700 bx r0
18: 004c4b40 .word 0x004c4b40
A little cleaner, burns some time but isn't excessive, yes note this is all-thumb-variants code. The called function can simply be a bx lr since you don't care what it does with the call.
00000000 <delay>:
0: b538 push {r3, r4, r5, lr}
2: 2400 movs r4, #0
4: 4d03 ldr r5, [pc, #12] ; (14 <delay+0x14>)
6: 4620 mov r0, r4
8: 3401 adds r4, #1
a: f7ff fffe bl 0 <dummy>
e: 42ac cmp r4, r5
10: d1f9 bne.n 6 <delay+0x6>
12: bd38 pop {r3, r4, r5, pc}
14: 004c4b40 .word 0x004c4b40
Building for the mcu cleans up the pop as after armv4t or 5t you could pop the pc to return to either mode, even though this is thumb mode only you still deal with that with these tools.
Now as shown by others, since you don't care about order just want to count you can, depending on architecture (often this is supported) count down. We are asking the compiler to not make this dead code so it has to do it in the order we asked, to be a functional representation of the C code.
void dummy ( int );
void delay(void)
{
int i=5000000;
while(--i) dummy(i);
}
00000000 <delay>:
0: b510 push {r4, lr}
2: 4c03 ldr r4, [pc, #12] ; (10 <delay+0x10>)
4: 4620 mov r0, r4
6: f7ff fffe bl 0 <dummy>
a: 3c01 subs r4, #1
c: d1fa bne.n 4 <delay+0x4>
e: bd10 pop {r4, pc}
10: 004c4b3f .word 0x004c4b3f
And now the compare went away (i-- vs --i made a difference i-- makes for more code)
With volatile:
void delay(void)
{
volatile int i=5000000;
while(--i) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 3b01 subs r3, #1
a: 9301 str r3, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
void delay(void)
{
volatile int i=5000000;
while(i--) continue;
}
00000000 <delay>:
0: b082 sub sp, #8
2: 4b04 ldr r3, [pc, #16] ; (14 <delay+0x14>)
4: 9301 str r3, [sp, #4]
6: 9b01 ldr r3, [sp, #4]
8: 1e5a subs r2, r3, #1
a: 9201 str r2, [sp, #4]
c: 2b00 cmp r3, #0
e: d1fa bne.n 6 <delay+0x6>
10: b002 add sp, #8
12: 4770 bx lr
14: 004c4b40 .word 0x004c4b40
And that doesn't take advantage of the instruction set, oh well. (Being higher or lower one count doesn't matter as this really can't/won't be a tuned loop, to tune it on a platform like this you really need to use asm and even there it is difficult to tune).
Even cleaner just do it in assembly
.globl delay
delay:
ldr r0,=5000000
dinner:
sub r0,#1
bne dinner
bx lr
00000000 <delay>:
0: 4801 ldr r0, [pc, #4] ; (8 <dinner+0x6>)
00000002 <dinner>:
2: 3801 subs r0, #1
4: d1fd bne.n 2 <dinner>
6: 4770 bx lr
8: 004c4b40 .word 0x004c4b40
or make it generic
.globl delay
delay:
sub r0,#1
bne delay
bx lr
00000000 <delay>:
0: 3801 subs r0, #1
2: d1fe bne.n 0 <delay>
4: 4770 bx lr
and then call it from C with
delay(5000000);
Lots of options, but what others didn't show is the code being optimized away and what the choices do to the code. It is quite easy to see in the compiler output using the tools what is going on and why this happened.
And there are various ways to make it or request it to not be dead code. Most people just toss in a volatile and move on. Nothing wrong with that, usually.
Either specify -O0 as optimization flag in the compiler settings to avoid that the useless loop (from the compiler point of view) is optimized away.
Alternatively check the MDK or BSP for a provided delay() function known to work.
How did you discover that the loop was skipped (maybe your button function is not working)
Test it with:
void delay(volatile uint32_t del)
{
while(del--);
}
int main(void)
{
LED_Initialize();
Buttons_Initialize();
while(1){
if( 1 || Buttons_GetState() == 1){ //it skips the if checks
LED_On(0);
LED_On(1);
LED_On(2);
LED_On(3);
delay(500000);
LED_Off(0);
LED_Off(1);
LED_Off(2);
LED_Off(3);
delay(500000);
}
}
}
void delay(void)
{
volatile int i;
for(i=0; i<5000000 ;i++);
}
This should work provided that Buttons_GetState() is working fine. Declared variable 'i' as volatile so that no optimization happens by compiler.

What is the fastest way for adding the vector elements horizontally in odd order?

According to this question I implemented the horizontal addition this time 5 by 5 and 7 by 7. It does the job correctly but it is not fast enough.
Can it be faster than what it is? I tried to use hadd and other instruction but the improvement is restricted. For examlple, when I use _mm256_bsrli_epi128 it is slightly better but it needs some extra permutation that ruins the benefit because of the lanes. So the question is how it should be implemented to gain more performance. The same story is for 9 elements, etc.
This adds 5 elements horizontally and puts the results in places 0, 5, and 10:
//it put the results in places 0, 5, and 10
inline __m256i _mm256_hadd5x5_epi16(__m256i a )
{
__m256i a1, a2, a3, a4;
a1 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 1 * 2);
a2 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 2 * 2);
a3 = _mm256_bsrli_epi128(a2, 2);
a4 = _mm256_bsrli_epi128(a3, 2);
return _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a1, a2), _mm256_add_epi16(a3, a4)) , a );
}
And this adds 7 elements horizontally and puts the results in places 0 and 7:
inline __m256i _mm256_hadd7x7_epi16(__m256i a )
{
__m256i a1, a2, a3, a4, a5, a6;
a1 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 1 * 2);
a2 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 2 * 2);
a3 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 3 * 2);
a4 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 4 * 2);
a5 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 5 * 2);
a6 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 6 * 2);
return _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a1, a2), _mm256_add_epi16(a3, a4)) , _mm256_add_epi16(_mm256_add_epi16(a5, a6), a ));
}
Indeed it is possible calculate these sums with less instructions. The idea is to accumulate
the partial sums not only in columns 10, 5 and 0, but also in other columns. This reduces the number of
vpaddw instructions and the number of 'shuffles' compared to your solution.
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=haswell hor_sum5x5.c */
int print_vec_short(__m256i x);
int print_10_5_0_short(__m256i x);
__m256i _mm256_hadd5x5_epi16(__m256i a );
__m256i _mm256_hadd7x7_epi16(__m256i a );
int main() {
short x[16];
for(int i=0; i<16; i++) x[i] = i+1; /* arbitrary initial values */
__m256i t0 = _mm256_loadu_si256((__m256i*)x);
__m256i t2 = _mm256_permutevar8x32_epi32(t0,_mm256_set_epi32(0,7,6,5,4,3,2,1));
__m256i t02 = _mm256_add_epi16(t0,t2);
__m256i t3 = _mm256_bsrli_epi128(t2,4); /* byte shift right */
__m256i t023 = _mm256_add_epi16(t02,t3);
__m256i t13 = _mm256_srli_epi64(t02,16); /* bit shift right */
__m256i sum = _mm256_add_epi16(t023,t13);
printf("t0 = ");print_vec_short(t0 );
printf("t2 = ");print_vec_short(t2 );
printf("t02 = ");print_vec_short(t02 );
printf("t3 = ");print_vec_short(t3 );
printf("t023= ");print_vec_short(t023);
printf("t13 = ");print_vec_short(t13 );
printf("sum = ");print_vec_short(sum );
printf("\nVector elements of interest: columns 10, 5, 0:\n");
printf("t0 [10, 5, 0] = ");print_10_5_0_short(t0 );
printf("t2 [10, 5, 0] = ");print_10_5_0_short(t2 );
printf("t02 [10, 5, 0] = ");print_10_5_0_short(t02 );
printf("t3 [10, 5, 0] = ");print_10_5_0_short(t3 );
printf("t023[10, 5, 0] = ");print_10_5_0_short(t023);
printf("t13 [10, 5, 0] = ");print_10_5_0_short(t13 );
printf("sum [10, 5, 0] = ");print_10_5_0_short(sum );
printf("\nSum with _mm256_hadd5x5_epi16(t0)\n");
sum = _mm256_hadd5x5_epi16(t0);
printf("sum [10, 5, 0] = ");print_10_5_0_short(sum );
/* now the sum of 7 elements: */
printf("\n\nSum of short ints 13...7 and short ints 6...0:\n");
__m256i t = _mm256_loadu_si256((__m256i*)x);
t0 = _mm256_permutevar8x32_epi32(t0,_mm256_set_epi32(3,6,5,4,3,2,1,0));
t0 = _mm256_and_si256(t0,_mm256_set_epi16(0xFFFF,0,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF));
__m256i t1 = _mm256_alignr_epi8(t0,t0,2);
__m256i t01 = _mm256_add_epi16(t0,t1);
__m256i t23 = _mm256_alignr_epi8(t01,t01,4);
__m256i t0123 = _mm256_add_epi16(t01,t23);
__m256i t4567 = _mm256_alignr_epi8(t0123,t0123,8);
__m256i sum08 = _mm256_add_epi16(t0123,t4567); /* all elements are summed, but another permutation is needed to get the answer at position 7 */
sum = _mm256_permutevar8x32_epi32(sum08,_mm256_set_epi32(4,4,4,4,4,0,0,0));
printf("t = ");print_vec_short(t );
printf("t0 = ");print_vec_short(t0 );
printf("t1 = ");print_vec_short(t1 );
printf("t01 = ");print_vec_short(t01 );
printf("t23 = ");print_vec_short(t23 );
printf("t0123 = ");print_vec_short(t0123 );
printf("t4567 = ");print_vec_short(t4567 );
printf("sum08 = ");print_vec_short(sum08 );
printf("sum = ");print_vec_short(sum );
printf("\nSum with _mm256_hadd7x7_epi16(t) (the answer is in column 0 and in column 7)\n");
sum = _mm256_hadd7x7_epi16(t);
printf("sum = ");print_vec_short(sum );
return 0;
}
inline __m256i _mm256_hadd5x5_epi16(__m256i a )
{
__m256i a1, a2, a3, a4;
a1 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 1 * 2);
a2 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 2 * 2);
a3 = _mm256_bsrli_epi128(a2, 2);
a4 = _mm256_bsrli_epi128(a3, 2);
return _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a1, a2), _mm256_add_epi16(a3, a4)) , a );
}
inline __m256i _mm256_hadd7x7_epi16(__m256i a )
{
__m256i a1, a2, a3, a4, a5, a6;
a1 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 1 * 2);
a2 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 2 * 2);
a3 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 3 * 2);
a4 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 4 * 2);
a5 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 5 * 2);
a6 = _mm256_alignr_epi8(_mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31), a, 6 * 2);
return _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(a1, a2), _mm256_add_epi16(a3, a4)) , _mm256_add_epi16(_mm256_add_epi16(a5, a6), a ));
}
int print_vec_short(__m256i x){
short int v[16];
_mm256_storeu_si256((__m256i *)v,x);
printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi \n",
v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
return 0;
}
int print_10_5_0_short(__m256i x){
short int v[16];
_mm256_storeu_si256((__m256i *)v,x);
printf("%4hi %4hi %4hi \n",v[10],v[5],v[0]);
return 0;
}
The output is:
$ ./a.out
t0 = 16 15 14 13 | 12 11 10 9 | 8 7 6 5 | 4 3 2 1
t2 = 2 1 16 15 | 14 13 12 11 | 10 9 8 7 | 6 5 4 3
t02 = 18 16 30 28 | 26 24 22 20 | 18 16 14 12 | 10 8 6 4
t3 = 0 0 2 1 | 16 15 14 13 | 0 0 10 9 | 8 7 6 5
t023= 18 16 32 29 | 42 39 36 33 | 18 16 24 21 | 18 15 12 9
t13 = 0 18 16 30 | 0 26 24 22 | 0 18 16 14 | 0 10 8 6
sum = 18 34 48 59 | 42 65 60 55 | 18 34 40 35 | 18 25 20 15
Vector elements of interest: columns 10, 5, 0:
t0 [10, 5, 0] = 11 6 1
t2 [10, 5, 0] = 13 8 3
t02 [10, 5, 0] = 24 14 4
t3 [10, 5, 0] = 15 10 5
t023[10, 5, 0] = 39 24 9
t13 [10, 5, 0] = 26 16 6
sum [10, 5, 0] = 65 40 15
Sum with _mm256_hadd5x5_epi16(t0)
sum [10, 5, 0] = 65 40 15
Sum of short ints 13...7 and short ints 6...0:
t = 16 15 14 13 | 12 11 10 9 | 8 7 6 5 | 4 3 2 1
t0 = 8 0 14 13 | 12 11 10 9 | 0 7 6 5 | 4 3 2 1
t1 = 9 8 0 14 | 13 12 11 10 | 1 0 7 6 | 5 4 3 2
t01 = 17 8 14 27 | 25 23 21 19 | 1 7 13 11 | 9 7 5 3
t23 = 21 19 17 8 | 14 27 25 23 | 5 3 1 7 | 13 11 9 7
t0123 = 38 27 31 35 | 39 50 46 42 | 6 10 14 18 | 22 18 14 10
t4567 = 39 50 46 42 | 38 27 31 35 | 22 18 14 10 | 6 10 14 18
sum08 = 77 77 77 77 | 77 77 77 77 | 28 28 28 28 | 28 28 28 28
sum = 77 77 77 77 | 77 77 77 77 | 77 77 28 28 | 28 28 28 28
Sum with _mm256_hadd7x7_epi16(t) (the answer is in column 0 and in column 7)
sum = 16 31 45 58 | 70 81 91 84 | 77 70 63 56 | 49 42 35 28

Optimization using prefetch

I want to understand how to use PREFETCH* instructions.
For this I wrote some code:
.model flat
.code
?fast_mem_copy_sse##YAXPAH0H#Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_1:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_1
RET
?fast_mem_copy_sse##YAXPAH0H#Z ENDP
?fast_mem_copy_sse_movntdq##YAXPAH0H#Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_2:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVNTDQ [esi + 0 * 4 * 4], xmm0
MOVNTDQ [esi + 1 * 4 * 4], xmm1
MOVNTDQ [esi + 2 * 4 * 4], xmm2
MOVNTDQ [esi + 3 * 4 * 4], xmm3
MOVNTDQ [esi + 4 * 4 * 4], xmm4
MOVNTDQ [esi + 5 * 4 * 4], xmm5
MOVNTDQ [esi + 6 * 4 * 4], xmm6
MOVNTDQ [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_2
RET
?fast_mem_copy_sse_movntdq##YAXPAH0H#Z ENDP
?fast_mem_copy_sse_prefetch##YAXPAH0H#Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_3:
;PREFETCHT0 [edi + 0 * 4 * 4]
;PREFETCHT0 [edi + 1 * 4 * 4]
;PREFETCHT0 [edi + 2 * 4 * 4]
;PREFETCHT0 [edi + 3 * 4 * 4]
;PREFETCHT0 [edi + 4 * 4 * 4]
;PREFETCHT0 [edi + 5 * 4 * 4]
;PREFETCHT0 [edi + 6 * 4 * 4]
;PREFETCHT0 [edi + 7 * 4 * 4]
PREFETCHT0 [edi]
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_3
RET
?fast_mem_copy_sse_prefetch##YAXPAH0H#Z ENDP
END
#include <string.h>
#include <iostream>
#include <time.h>
//#define CHECK
#define BLOCK_SIZE 8*8
#define AMOUNT_OF_BLOCKS 200*4
#define AMOUNT_OF_RUNS 100000
void fast_mem_copy_sse(int *dst, int *src, int n);
void fast_mem_copy_sse_movntdq(int *dst, int *src, int n);
void fast_mem_copy_sse_prefetch(int *dst, int *src, int n);
void fast_mem_copy(int *dst, int *src, int n)
{
for (int i = 0; i < n; i++) {
*(dst + i) = *(src + i);
}
}
int main()
{
clock_t t;
_declspec(align(16)) int a[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
_declspec(align(16)) int b[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_movntdq(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_movntdq work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_movntdq took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_prefetch(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_prefetch work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_prefetch took me " << t << " clicks (" << ((float)t / CLOCKS_PER_SEC) << " seconds).\n";
system("PAUSE");
return 0;
}
I got the following result:
fast_mem_copy took me 11262 clicks (11.262 seconds).
fast_mem_copy_sse took me 1940 clicks (1.94 seconds).
fast_mem_copy_sse_movntdq took me 3570 clicks (3.57 seconds).
fast_mem_copy_sse_prefetch took me 1970 clicks (1.97 seconds).
So what is wrong?
Or in fast_mem_copy_sse are using hardware prefetch and there is no any sense to use instruction for prefetch?
Also I used VTune and it told me that there is no cache misses.
Prefetching will only help if you do it far enough ahead to matter. I believe CPU speeds are up to the point that it now takes about 200 CPU cycles to fetch from RAM. With a loop like yours you'd need to be prefetching probably 10 iterations ahead.
Also, if you are doing simple copy loops that proceed in sequential access, the CPU hardware is already doing prefetch for you.

#autorelease Pool and Loops (for, while, do) Syntax

clang allows the following loop syntax:
for (...) #autorelease { ... }
while (...) #autorelease { ... }
do #autorelease { ... } while (...);
I haven't found any documentation on that syntax so far (Apple doesn't use this syntax in their guides, at least no in the guides introducing the #autorelease construct), but is it reasonable to assume that the three statement above are equivalent to the three statements below:
for (...) { #autorelease { ... } }
while (...) { #autorelease { ... } }
do { #autorelease { ... } } while (...);
Since that is what I would expect them to be (going by standard C syntax rules), yet I'm not entirely sure if that's really the case. It could also be some "special syntax", where the autorelease pool is not renewed for every loop iteration.
The reason that the first syntax example works is clear when you consider that any conditional statement can omit the { ... } block, resulting in only the following statement being executed.
For example:
if (something == YES)
NSLog(#"Something is yes");
is equivalent to
if (something == YES)
{
NSLog(#"Something is yes");
}
The #autoreleasepool { ... } block is simply the next statement following the conditional.
Personally I use the second syntax as it's less error-prone when making changes, and I find it easier to read. Imagine that when you add a statement between the conditional and the #autoreleasepool { ... } block, the result is considerably different from the original. See this naive example...
int i = 1;
while (i <= 10)
#autoreleasepool
{
NSLog(#"Iteration %d", i);
++i;
}
Will output "Iteration 1" through "Iteration 10". However:
int i = 1;
int total = 0;
while (i <= 10)
total += i;
#autoreleasepool
{
NSLog(#"Iteration %d", i);
++i;
}
Will actually cause an infinite loop because the ++i statement is never reached as it is syntactically equivalent to:
int i = 1;
int total = 0;
while (i <= 10)
{
total += i;
}
#autoreleasepool
{
NSLog(#"Iteration %d", i);
++i;
}
Both syntax are same
-(void)aFunc
{
int i=0;
for(;i<5;)
#autoreleasepool {
++i;
}
}
-(void)bFunc
{
int i=0;
for(;i<5;)
{
#autoreleasepool {
++i;
}
}
}
Assembly code
"-[AppDelegate aFunc]": ## #"\01-[AppDelegate aFunc]"
.cfi_startproc
Lfunc_begin0:
.loc 1 12 0 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:12:0
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
subq $32, %rsp
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 1 14 12 prologue_end ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:14:12
Ltmp5:
movl $0, -20(%rbp)
LBB0_1: ## =>This Inner Loop Header: Depth=1
.loc 1 15 5 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:15:5
Ltmp6:
cmpl $5, -20(%rbp)
jge LBB0_3
## BB#2: ## in Loop: Header=BB0_1 Depth=1
.loc 1 16 26 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:16:26
Ltmp7:
callq _objc_autoreleasePoolPush
.loc 1 17 13 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:17:13
movl -20(%rbp), %ecx
addl $1, %ecx
movl %ecx, -20(%rbp)
.loc 1 18 9 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:18:9
movq %rax, %rdi
callq _objc_autoreleasePoolPop
jmp LBB0_1
Ltmp8:
LBB0_3:
.loc 1 19 1 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:19:1
addq $32, %rsp
popq %rbp
ret
Ltmp9:
Lfunc_end0:
.file 2 "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/System/Library/Frameworks/Foundation.framework/Headers/NSObject.h"
.file 3 "/Users/Parag/Desktop/Test/Test/AppDelegate.h"
.cfi_endproc
.align 4, 0x90
"-[AppDelegate bFunc]": ## #"\01-[AppDelegate bFunc]"
.cfi_startproc
Lfunc_begin1:
.loc 1 20 0 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:20:0
## BB#0:
pushq %rbp
Ltmp12:
.cfi_def_cfa_offset 16
Ltmp13:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp14:
.cfi_def_cfa_register %rbp
subq $32, %rsp
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 1 22 12 prologue_end ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:22:12
Ltmp15:
movl $0, -20(%rbp)
LBB1_1: ## =>This Inner Loop Header: Depth=1
.loc 1 23 5 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:23:5
Ltmp16:
cmpl $5, -20(%rbp)
jge LBB1_3
## BB#2: ## in Loop: Header=BB1_1 Depth=1
.loc 1 25 26 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:25:26
Ltmp17:
callq _objc_autoreleasePoolPush
.loc 1 26 14 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:26:14
movl -20(%rbp), %ecx
addl $1, %ecx
movl %ecx, -20(%rbp)
.loc 1 27 9 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:27:9
movq %rax, %rdi
callq _objc_autoreleasePoolPop
Ltmp18:
.loc 1 28 5 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:28:5
jmp LBB1_1
Ltmp19:
LBB1_3:
.loc 1 29 1 ## /Users/Parag/Desktop/Test/Test/AppDelegate.m:29:1
addq $32, %rsp
popq %rbp
ret
Ltmp20:
Lfunc_end1:
I have tried the following code:
#interface Foo : NSObject
#end
#implementation Foo
- (void) dealloc
{
NSLog(#"Deallocating %#.", self);
[super dealloc];
}
#end
for (;;) #autoreleasepool {
[[[Foo alloc] init] autorelease];
sleep(1);
}
The console starts to fill with the deallocated Foo instances, so the syntax appears to work as expected.
This is just the normal C syntax for blocks and statements. When if, else, for, while, etc. do not have braces, they take the following statement, which could be a compound statement.
For example, you can do:
for (...) if (...) { ... }
if (...) while (...) { ... }
and so on... #autoreleasepool blocks are no different.