ST7789 TFT driver parallel read - embedded

I want to read the ID of my ST7789 driver through the parallel interface.
I'm using a 16-bit parallel interface (DB1-DB16).
I am running the Read ID3 command, writing the DCh command code to the controller.
Then I try to read; the value I expect to read is 0x52.
When reading, I drive the D/CX, WRX and RDX signals.
But I never read the correct values from the parallel pins.
The code looks like this.
/////////////////////////////////////////////////////////////////////////////
  GPIO_InitTypeDef GPIO_InitStructure;
  set_rs;   /* RS high */
  set_nw;   /* WR high */
  clr_nrd;  /* RD low */
  // DB1-DB8 <-> PC1-PC8 and DB10-DB12 <-> PC10-PC12
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_1 | GPIO_Pin_2 | GPIO_Pin_3 |
      GPIO_Pin_4 | GPIO_Pin_5 | GPIO_Pin_6 | GPIO_Pin_7 | GPIO_Pin_8 |
      GPIO_Pin_10 | GPIO_Pin_11 | GPIO_Pin_12;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_IN;
  GPIO_Init(GPIOC, &GPIO_InitStructure);
  // DB13 <-> PD2
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_2;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_IN;
  GPIO_Init(GPIOD, &GPIO_InitStructure);
  // DB14 <-> PB9, DB15 <-> PB10, DB16 <-> PB1, DB17 <-> PB2
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_1 | GPIO_Pin_2 | GPIO_Pin_9 |
      GPIO_Pin_10;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_IN;
  GPIO_Init(GPIOB, &GPIO_InitStructure);
  // We read twice to be sure.
  PORTREAD();  // 16-bit parallel read
  PORTREAD();
  // DB1-DB8 <-> PC1-PC8 and DB10-DB12 <-> PC10-PC12
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_1 | GPIO_Pin_2 | GPIO_Pin_3 |
      GPIO_Pin_4 | GPIO_Pin_5 | GPIO_Pin_6 | GPIO_Pin_7 | GPIO_Pin_8 |
      GPIO_Pin_10 | GPIO_Pin_11 | GPIO_Pin_12;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_OUT;
  GPIO_Init(GPIOC, &GPIO_InitStructure);
  // DB13 <-> PD2
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_2;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_OUT;
  GPIO_Init(GPIOD, &GPIO_InitStructure);
  // DB14 <-> PB9, DB15 <-> PB10, DB16 <-> PB1, DB17 <-> PB2
  GPIO_InitStructure.GPIO_Pin = GPIO_Pin_1 | GPIO_Pin_2 | GPIO_Pin_9 |
      GPIO_Pin_10;
  GPIO_InitStructure.GPIO_Speed = GPIO_Speed_50MHz;
  GPIO_InitStructure.GPIO_Mode = GPIO_Mode_OUT;
  GPIO_Init(GPIOB, &GPIO_InitStructure);
  set_nrd;  /* RD high */
/////////////////////////////////////////////////////////////////////////////
First, the D/CX, WRX and RDX signals are set. Next, the parallel pins are switched from outputs to inputs for reading, and then the port is read.
Am I driving the RD signal in the wrong sequence (high and low in the wrong order)?
What should the D/CX, WRX and RDX signal sequence be?
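For reference, a hedged sketch of how an 8080-style read cycle is usually sequenced (based on typical datasheet timing, not code from the question; clr_rs, clr_nw, write_bus(), bus_to_input() and bus_to_output() are assumed helpers or counterparts of the macros used above). If the datasheet specifies a dummy read cycle for this command, one extra RDX low/high pulse would precede the sampling pulse:
/* Hedged sketch only: one common RDID3 (DCh) sequence on an 8080-type bus,
 * with CSX assumed low throughout. */
uint16_t st7789_read_id3(void)
{
    uint16_t id;

    /* 1. Command phase: D/CX low, RDX idle high, data pins as outputs. */
    set_nrd;          /* RDX high (idle) */
    clr_rs;           /* D/CX = 0 -> command byte */
    clr_nw;           /* WRX low */
    write_bus(0xDC);  /* put DCh on the DB pins (assumed helper) */
    set_nw;           /* command is latched on the rising edge of WRX */

    /* 2. Read phase: D/CX high, WRX idle high, bus pins switched to inputs. */
    set_rs;
    bus_to_input();   /* the GPIO_Mode_IN setup shown above (assumed helper) */

    clr_nrd;          /* RDX falling edge: the controller starts driving the bus */
    /* wait at least the read access time given in the datasheet */
    id = PORTREAD();  /* sample the pins while RDX is still low */
    set_nrd;          /* RDX back high ends the read cycle */

    bus_to_output();  /* the GPIO_Mode_OUT setup shown above (assumed helper) */
    return id;
}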

Related

How to visualize connected status reports per 5 minutes

IoT Hub sends connected and disconnected events to a Log Analytics workspace.
How can we create a diagram, using a Log Analytics Kusto query, that shows the availability of a device during the day with 5-minute sampling?
Source data looks like this:
OperationName     DeviceId  TimeGenerated
deviceConnect     device1   2022-09-22T09:43:20
deviceDisconnect  device1   2022-09-22T09:53:20
deviceDisconnect  device2   2022-09-22T09:55:20
deviceConnect     device3   2022-09-22T10:00:20
deviceConnect     device4   2022-09-22T10:43:20
...
Resulting data set should be like:
Interval             DeviceId  Status
... assuming all devices disconnected
2022-09-22T09:40:00  device1   Disconnected
2022-09-22T09:40:00  device2   Disconnected
2022-09-22T09:40:00  device3   Disconnected
2022-09-22T09:40:00  device4   Disconnected
2022-09-22T09:45:00  device1   Connected
2022-09-22T09:45:00  device2   Disconnected
2022-09-22T09:45:00  device3   Disconnected
2022-09-22T09:45:00  device4   Disconnected
2022-09-22T09:50:00  device1   Connected
2022-09-22T09:50:00  device2   Connected
2022-09-22T09:50:00  device3   Disconnected
2022-09-22T09:50:00  device4   Disconnected
2022-09-22T09:55:00  device1   Disconnected
2022-09-22T09:55:00  device2   Connected
2022-09-22T09:55:00  device3   Disconnected
2022-09-22T09:55:00  device4   Disconnected
2022-09-22T10:00:00  device1   Disconnected
2022-09-22T10:00:00  device2   Disconnected
2022-09-22T10:00:00  device3   Connected
2022-09-22T10:00:00  device4   Connected
...
The Status column can then be projected to an Availability of 100% or 0%.
Sparse display.
Each device is displayed on its own y-axis band.
10, 20, 30, etc. stand for disconnected.
15, 25, 35, etc. stand for connected.
let t = datatable(OperationName:string, DeviceId:string, TimeGenerated:datetime)
[
"deviceConnect" ,"device1" ,datetime("2022-09-22T09:43:20")
,"deviceDisconnect" ,"device1" ,datetime("2022-09-22T09:53:20")
,"deviceDisconnect" ,"device2" ,datetime("2022-09-22T09:55:20")
,"deviceConnect" ,"device3" ,datetime("2022-09-22T10:00:20")
,"deviceConnect" ,"device4" ,datetime("2022-09-22T10:43:20")
];
// Solution starts here
let resolution = 5m;
let y_axis_devices_distance = 10;
let y_axis_device_states_distance = 5;
let TimeGenerated_start = toscalar(t | summarize bin(min(TimeGenerated), resolution));
let TimeGenerated_end = toscalar(t | summarize max(TimeGenerated)) + resolution;
let data_points = toint((bin(TimeGenerated_end, resolution) - bin(TimeGenerated_start, resolution)) / resolution) + 1;
let fictive_connects =
t
| summarize arg_min(TimeGenerated, OperationName) by DeviceId
| where OperationName == "deviceDisconnect"
| extend TimeGenerated = TimeGenerated_start, OperationName = "deviceConnect";
let devices =
t
| distinct DeviceId
| serialize
| extend rn = row_number();
union t, fictive_connects
| make-series Availability = y_axis_device_states_distance * sum(case(OperationName == "deviceConnect", 1, -1)) on TimeGenerated from TimeGenerated_start to TimeGenerated_end step resolution by DeviceId
| lookup devices on DeviceId
| extend Availability = array_concat(pack_array(Availability[0] + rn*y_axis_devices_distance), array_slice(Availability, 1, data_points - 1))
| render timechart with (accumulate=true)
Fiddle
Unified display of all devices (number of connected devices)
let t = datatable(OperationName:string, DeviceId:string, TimeGenerated:datetime)
[
"deviceConnect" ,"device1" ,datetime("2022-09-22T09:43:20")
,"deviceDisconnect" ,"device1" ,datetime("2022-09-22T09:53:20")
,"deviceDisconnect" ,"device2" ,datetime("2022-09-22T09:55:20")
,"deviceConnect" ,"device3" ,datetime("2022-09-22T10:00:20")
,"deviceConnect" ,"device4" ,datetime("2022-09-22T10:43:20")
];
// Solution starts here
let resolution = 5m;
let TimeGenerated_start = toscalar(t | summarize bin(min(TimeGenerated), resolution));
let TimeGenerated_end = toscalar(t | summarize max(TimeGenerated)) + resolution;
let fictive_connects = t
| summarize arg_min(TimeGenerated, OperationName) by DeviceId
| where OperationName == "deviceDisconnect"
| extend TimeGenerated = TimeGenerated_start, OperationName = "deviceConnect";
union t, fictive_connects
| make-series Availability = sum(case(OperationName == "deviceConnect", 1, -1)) on TimeGenerated from TimeGenerated_start to TimeGenerated_end step resolution
| render timechart with (accumulate=true)
Fiddle
Basic solution.
Each device state is 0 or 1.
The graphs of the devices overlap.
let t = datatable(OperationName:string, DeviceId:string, TimeGenerated:datetime)
[
"deviceConnect" ,"device1" ,datetime("2022-09-22T09:43:20")
,"deviceDisconnect" ,"device1" ,datetime("2022-09-22T09:53:20")
,"deviceDisconnect" ,"device2" ,datetime("2022-09-22T09:55:20")
,"deviceConnect" ,"device3" ,datetime("2022-09-22T10:00:20")
,"deviceConnect" ,"device4" ,datetime("2022-09-22T10:43:20")
];
// Solution starts here
let resolution = 5m;
let TimeGenerated_start = toscalar(t | summarize bin(min(TimeGenerated), resolution));
let TimeGenerated_end = toscalar(t | summarize max(TimeGenerated)) + resolution;
let fictive_connects = t
| summarize arg_min(TimeGenerated, OperationName) by DeviceId
| where OperationName == "deviceDisconnect"
| extend TimeGenerated = TimeGenerated_start, OperationName = "deviceConnect";
union t, fictive_connects
| make-series Availability = sum(case(OperationName == "deviceConnect", 1, -1)) on TimeGenerated from TimeGenerated_start to TimeGenerated_end step resolution by DeviceId
| render timechart with (accumulate=true)
Fiddle

Telegram User Adder

Hi, recently I made a Telegram scraper that scrapes users from Telegram groups.
Now I am trying to make a user adder for it.
#!/bin/env python3
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetDialogsRequest
from telethon.tl.types import InputPeerEmpty, InputPeerChannel, InputPeerUser
from telethon.errors.rpcerrorlist import PeerFloodError, FloodWaitError, UserPrivacyRestrictedError
from telethon.tl.functions.channels import InviteToChannelRequest
import configparser
import os, sys
import csv
import traceback
import time
import random
re="\033[1;31m"
gr="\033[1;32m"
cy="\033[1;36m"
def banner():
    print(f"""
_____ __ ____ ____ ____ ___ ____ _____ __ ____ ____ ____ ___ ____
.----------------. .----------------. .----------------. .----------------. .----------------.
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| | __ | || | ________ | || | ________ | || | _________ | || | _______ | |
| | / \ | || | |_ ___ `. | || | |_ ___ `. | || | |_ ___ | | || | |_ __ \ | |
| | / /\ \ | || | | | `. \ | || | | | `. \ | || | | |_ \_| | || | | |__) | | |
| | / ____ \ | || | | | | | | || | | | | | | || | | _| _ | || | | __ / | |
| | _/ / \ \_ | || | _| |___.' / | || | _| |___.' / | || | _| |___/ | | || | _| | \ \_ | |
| ||____| |____|| || | |________.' | || | |________.' | || | |_________| | || | |____| |___| | |
| | | || | | || | | || | | || | | |
| '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
'----------------' '----------------' '----------------' '----------------' '----------------'
_____ __ ____ ____ ____ ___ ____ _____ __ ____ ____ ____ ___ ____
version : 2.0
""")
cpass = configparser.RawConfigParser()
cpass.read('config.data')

try:
    api_id = cpass['cred']['id']
    api_hash = cpass['cred']['hash']
    phone = cpass['cred']['phone']
    client = TelegramClient(phone, api_id, api_hash)
except KeyError:
    os.system('clear')
    banner()
    print(re + "[!] run python3 setup.py first !!\n")
    sys.exit(1)

client.connect()
if not client.is_user_authorized():
    client.send_code_request(phone)
    os.system('clear')
    banner()
    client.sign_in(phone, input(gr + '[+] Enter the code: ' + re))

os.system('clear')
banner()

# Load the scraped users from the CSV file passed as the first argument
input_file = sys.argv[1]
users = []
with open(input_file, encoding='UTF-8') as f:
    rows = csv.reader(f, delimiter=",", lineterminator="\n")
    next(rows, None)
    for row in rows:
        user = {}
        user['username'] = row[0]
        user['id'] = int(row[1])
        user['access_hash'] = int(row[2])
        user['name'] = row[3]
        users.append(user)

# Fetch the dialog list and collect candidate target chats
chats = []
last_date = None
chunk_size = 200
groups = []
result = client(GetDialogsRequest(
    offset_date=last_date,
    offset_id=0,
    offset_peer=InputPeerEmpty(),
    limit=chunk_size,
    hash=0
))
chats.extend(result.chats)
for chat in chats:
    try:
        if chat.megagroup == False:
            groups.append(chat)
    except:
        continue

i = 0
for group in groups:
    print(gr + '[' + cy + str(i) + gr + ']' + cy + ' - ' + group.title)
    i += 1

print(gr + '[+] Choose a group to add members')
g_index = input(gr + "[+] Enter a Number : " + re)
target_group = groups[int(g_index)]
target_group_entity = InputPeerChannel(target_group.id, target_group.access_hash)

print(gr + "[1] add member by user ID\n[2] add member by username ")
mode = int(input(gr + "Input : " + re))

n = 0
for user in users:
    n += 1
    if n % 50 == 0:
        time.sleep(1)
    try:
        print("Adding {}".format(user['id']))
        if mode == 1:
            if user['username'] == "":
                continue
            user_to_add = client.get_input_entity(user['username'])
        elif mode == 2:
            user_to_add = InputPeerUser(user['id'], user['access_hash'])
        else:
            sys.exit(re + "[!] Invalid Mode Selected. Please Try Again.")
        client(InviteToChannelRequest(target_group_entity, [user_to_add]))
        print(gr + "[+] Waiting for 2-10 Seconds...")
        time.sleep(random.randrange(2, 10))
    except FloodWaitError:
        print(re + "[!] Getting Flood Error from telegram. \n[!] Script is stopping now. \n[!] Please try again after some time.")
    except UserPrivacyRestrictedError:
        print(re + "[!] The user's privacy settings do not allow you to do this. Skipping.")
    except:
        traceback.print_exc()
        print(re + "[!] Unexpected Error")
        continue
It partly works, but I can hardly add 1-10 users at a time, and some of the add operations show errors.
I have tried most things; the error says it needs to wait a long time, but the sleep timer I added doesn't seem to have any effect. Any suggestions or help?
Adding 1456428294
[!] Getting FloodWaitError from telegram.
[!] Script is stopping now.
[!] Please try again after some time.
FloodWaitError (420)
The same request was repeated many times. You must wait; the number of seconds is available via the error's .seconds attribute. For example:
from telethon import errors
try:
    messages = await client.get_messages(chat)
    print(messages[0].text)
except errors.FloodWaitError as e:
    print('Have to sleep', e.seconds, 'seconds')
    time.sleep(e.seconds)
Read the documentation:
https://docs.telethon.dev/en/latest/concepts/errors.html
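Adapted to the invite loop in the question (a hedged sketch, not from the original answer), the same pattern would look like this:
from telethon import errors

try:
    client(InviteToChannelRequest(target_group_entity, [user_to_add]))
except errors.FloodWaitError as e:
    # Telegram reports exactly how long to back off; sleep that long, then continue
    print('Have to sleep', e.seconds, 'seconds')
    time.sleep(e.seconds)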

Data frame parsing column in Scala

I have a problem with parsing a DataFrame:
val result = df_app_clickstream.withColumn(
"attributes",
explode(expr(raw"transform(attributes, x -> str_to_map(regexp_replace(x, '{\\}',''), ' '))"))
).select(
col("userId"),
col("attributes").getField("campaign_id").alias("app_campaign_id"),
col("attributes").getField("channel_id").alias("app_channel_id")
)
result.show()
I have input like this :
-------------------------------------------------------------------------------
| userId | attributes |
-------------------------------------------------------------------------------
| f6e8252f-b5cc-48a4-b348-29d89ee4fa9e |{'campaign_id':082,'channel_id':'Chnl'}|
-------------------------------------------------------------------------------
and need to get output like this :
--------------------------------------------------------------------
| userId | campaign_id | channel_id|
--------------------------------------------------------------------
| f6e8252f-b5cc-48a4-b348-29d89ee4fa9e | 082 | Facebook |
--------------------------------------------------------------------
but I get an error.
You can try the solution below:
import org.apache.spark.sql.functions._
val data = Seq(("f6e8252f-b5cc-48a4-b348-29d89ee4fa9e", """{'campaign_id':082, 'channel_id':'Chnl'}""")).toDF("user_id", "attributes")
val out_df = data.withColumn("splitted_col", split(regexp_replace(col("attributes"),"'|\\}|\\{", ""), ","))
.withColumn("campaign_id", split(element_at(col("splitted_col"), 1), ":")(1))
.withColumn("channel_id", split(element_at(col("splitted_col"), 2), ":")(1))
out_df.show(truncate = false)
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+
|user_id |attributes |splitted_col |campaign_id|channel_id|
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+
|f6e8252f-b5cc-48a4-b348-29d89ee4fa9e|{'campaign_id':082, 'channel_id':'Chnl'}|[campaign_id:082, channel_id:Chnl]|082 |Chnl |
+------------------------------------+----------------------------------------+-----------------------------------+-----------+----------+
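If only the three columns from the desired output are needed, a final select could be appended (a hedged follow-up, not part of the original answer):
// Hedged follow-up: keep only the columns the question asks for
val final_df = out_df.select(
  col("user_id").alias("userId"),
  col("campaign_id"),
  col("channel_id")
)
final_df.show(truncate = false)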

Optimizing an OpenCL kernel

I am trying to optimize this kernel. The CPU version of this kernel is 4 times faster than the GPU version; I would expect the GPU version to be faster.
It might be that we have a lot of memory accesses, and that is why performance is low. I am using an Intel HD 2500 and OpenCL 1.2.
The GPU kernel is:
__kernel void mykernel(__global unsigned char *inp1,
__global unsigned char *inp2,
__global unsigned char *inp3,
__global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__global unsigned char *lut,
uint size
)
{
unsigned char x1, x2, x3, x4;
unsigned char y1, y2, y3, y4;
const int x = get_global_id(0);
const int y = get_global_id(1);
const int width = get_global_size(0);
const uint id = y * width + x;
x1 = inp1[id];
x2 = inp2[id];
x3 = inp3[id];
x4 = inp4[id];
y1 = (x1 & 0xff) | (x2>>2 & 0xaa) | (x3>>4 & 0x0d) | (x4>>6 & 0x02);
y2 = (x1<<2 & 0xff) | (x2 & 0xaa) | (x3>>2 & 0x0d) | (x4>>4 & 0x02);
y3 = (x1<<4 & 0xff) | (x2<<2 & 0xaa) | (x3 & 0x0d) | (x4>>2 & 0x02);
y4 = (x1<<6 & 0xff) | (x2<<4 & 0xaa) | (x3<<2 & 0x0d) | (x4 & 0x02);
// lookup table
y1 = lut[y1];
y2 = lut[y2];
y3 = lut[y3];
y4 = lut[y4];
outp1[id] = (y1 & 0xc0)
| ((y2 & 0xc0) >> 2)
| ((y3 & 0xc0) >> 4)
| ((y4 & 0xc0) >> 6);
outp2[id] = ((y1 & 0x30) << 2)
| (y2 & 0x30)
| ((y3 & 0x30) >> 2)
| ((y4 & 0x30) >> 4);
outp3[id] = ((y1 & 0x0c) << 4)
| ((y2 & 0x0c) << 2)
| (y3 & 0x0c)
| ((y4 & 0x0c) >> 2);
outp4[id] = ((y1 & 0x03) << 6)
| ((y2 & 0x03) << 4)
| ((y3 & 0x03) << 2)
| (y4 & 0x03);
}
I use:
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 1;
globalWorkSize[0] = X*Y; // X,Y define a data space of 15 - 20 MB
localWorkSize can vary between 1 and 256.
For localWorkSize = 1 I get:
CPU = 0.067 s
GPU = 0.20 s
For localWorkSize = 256 I get:
CPU = 0.067 s
GPU = 0.34 s
This is really weird. Can you give me some ideas about why I get these strange numbers, and do you have any tips on how I can optimize this kernel?
My main looks like this:
int main(int argc, char** argv)
{
int err,err1,j,i; // error code returned from api calls and other
clock_t start, end; // measuring performance variables
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program_ms_naive; // compute program
cl_kernel kernel_ms_naive; // compute kernel
// ... dynamically allocate arrays
// ... initialize arrays
cl_uint dev_cnt = 0;
clGetPlatformIDs(0, 0, &dev_cnt);
cl_platform_id platform_ids[100];
clGetPlatformIDs(dev_cnt, platform_ids, NULL);
// Connect to a compute device
err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a compute context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
commands = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute programs from the source file
program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);
// Build the programs executable
err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel_ms_naive = clCreateKernel(program_ms_naive, "ms_naive", &err);
d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
int size = YCOLUMNS*XROWS/4;
int size_b = size * 4;
err = clSetKernelArg(kernel_ms_naive, 0, sizeof(cl_mem), (void *)&(d_A1));
err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&(d_A2));
err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&(d_A3));
err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&(d_A4));
err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&(d_B2));
err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&(d_B3));
err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&(d_B4));
err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); //__global
err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 256;
globalWorkSize[0] = XROWS*YCOLUMNS;
start = clock();
for (i=0;i< EXECUTION_TIMES;i++)
{
err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
err = clFinish(commands);
}
end = clock();
return 0;
}
Constant memory is used to broadcast a small number of values to all work items and acts much like a constant private register, so access is very fast. Normal GPU devices support up to 16 KB of constant memory, which should be enough to hold the LUT.
You can try constant memory as a simple solution for the global-access bottleneck:
__kernel void mykernel(const __global unsigned char *inp1,
const __global unsigned char *inp2,
const __global unsigned char *inp3,
const __global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__constant unsigned char *lut,
uint size
)
{
...
}
But a proper solution would be to reshape your code:
- Use vectors of char4 instead of 4 different buffers (the separate buffers break coalescing) [it can give you a big boost, up to 4x]; a rough sketch follows after this list.
- Operate on vectors [slight boost].
- Use local/constant memory for the LUT [it can remove one non-coalesced read of the LUT, maybe 2x-3x].
Still, it will be difficult to beat the CPU approach, due to the big I/O constraints.
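A rough sketch of that char4 reshaping (hedged; it assumes the host packs inp1..inp4 and outp1..outp4 into single interleaved uchar4 buffers, which changes the host code as well):
// Hypothetical reshaped kernel: one uchar4 element replaces the four separate
// planes, so each work item does one coalesced load and one coalesced store.
__kernel void mykernel_vec(__global const uchar4 *inp,
                           __global uchar4 *outp,
                           __constant uchar *lut)
{
    const uint id = get_global_id(0);
    const uchar4 x = inp[id];   /* x.s0..x.s3 take the roles of inp1..inp4 */
    uchar y1, y2, y3, y4;
    y1 = (x.s0 & 0xff) | (x.s1 >> 2 & 0xaa) | (x.s2 >> 4 & 0x0d) | (x.s3 >> 6 & 0x02);
    y2 = (x.s0 << 2 & 0xff) | (x.s1 & 0xaa) | (x.s2 >> 2 & 0x0d) | (x.s3 >> 4 & 0x02);
    y3 = (x.s0 << 4 & 0xff) | (x.s1 << 2 & 0xaa) | (x.s2 & 0x0d) | (x.s3 >> 2 & 0x02);
    y4 = (x.s0 << 6 & 0xff) | (x.s1 << 4 & 0xaa) | (x.s2 << 2 & 0x0d) | (x.s3 & 0x02);
    y1 = lut[y1];               /* the LUT lookups stay scalar */
    y2 = lut[y2];
    y3 = lut[y3];
    y4 = lut[y4];
    uchar4 out;
    out.s0 = (y1 & 0xc0) | ((y2 & 0xc0) >> 2) | ((y3 & 0xc0) >> 4) | ((y4 & 0xc0) >> 6);
    out.s1 = ((y1 & 0x30) << 2) | (y2 & 0x30) | ((y3 & 0x30) >> 2) | ((y4 & 0x30) >> 4);
    out.s2 = ((y1 & 0x0c) << 4) | ((y2 & 0x0c) << 2) | (y3 & 0x0c) | ((y4 & 0x0c) >> 2);
    out.s3 = ((y1 & 0x03) << 6) | ((y2 & 0x03) << 4) | ((y3 & 0x03) << 2) | (y4 & 0x03);
    outp[id] = out;
}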

Best practice to use bit operations to set some flags

I would like to turn on/off 3 stars that represent a level of difficulty. I don't want to use several if conditions; would it be possible to do this with just bitwise operations?
Let's say I have declared an enum like this:
enum
{
EASY = 0,
MODERATE,
CHALLENGING
} Difficulty;
I would like to find a bit operation that lets me determine which star to turn on or off, for example (a branch-free sketch follows these examples):
level 2 (challenging)
star 0 -> 1
star 1 -> 1
star 2 -> 1
level 1 (moderate)
star 0 -> 1
star 1 -> 1
star 2 -> 0
level 0 (easy)
star 0 -> 1
star 1 -> 0
star 2 -> 0
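For illustration only (this is not from the question or the answers below): with the 0/1/2 levels above, the three star bits can be derived without any branching, because level n should light the lowest n+1 bits:
// Hypothetical helper, assuming EASY = 0, MODERATE = 1, CHALLENGING = 2 as declared above.
// Level n lights the lowest n+1 bits: 0 -> 0b001, 1 -> 0b011, 2 -> 0b111.
unsigned stars_for_level(unsigned level)
{
    return (1u << (level + 1)) - 1;
}
// star i is then bit i of the result, e.g. star2 = (stars_for_level(level) >> 2) & 1;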
If you want to use 3 bits to store your star states, instead of having three boolean flags, then you should do:
typedef enum
{
DifficultyEasy = 1 << 0,
DifficultyModerate = 1 << 1,
DifficultyChallenging = 1 << 2
} Difficulty;
Difficulty state = 0; // default
To set Easy:
state |= DifficultyEasy;
To add Challenging:
state |= DifficultyChallenging;
To reset Easy:
state &= ~DifficultyEasy;
To check whether Challenging is set:
BOOL isChallenging = DifficultyChallenging & state;
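Put together, a minimal self-contained sketch of the same pattern (plain C for brevity, using the names above; not part of the original answer):
#include <stdio.h>

typedef enum
{
    DifficultyEasy        = 1 << 0,
    DifficultyModerate    = 1 << 1,
    DifficultyChallenging = 1 << 2
} Difficulty;

int main(void)
{
    Difficulty state = 0;               /* default: nothing set */

    state |= DifficultyEasy;            /* set Easy */
    state |= DifficultyChallenging;     /* add Challenging */
    state &= ~DifficultyEasy;           /* reset Easy */

    int isChallenging = (DifficultyChallenging & state) != 0;
    printf("challenging: %d\n", isChallenging);   /* prints 1 */
    return 0;
}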
In case somebody needs an explanation of how it works:
1 << x means set x bit to 1 (from right);
// actually it means move 0b00000001 left by x, but I said 'set' to simplify it
1 << 5 = 0b00100000; 1 << 2 = 0b00000100; 1 << 0 = 0b00000001;
0b00001111 | 0b11000011 = 0b11001111 (0 | 0 = 0, 1 | 0 = 1, 1 | 1 = 1)
0b00001111 & 0b11000011 = 0b00000011 (0 & 0 = 0, 1 & 0 = 0, 1 & 1 = 1)
~0b00001111 = 0b11110000 (~0 = 1, ~1 = 0)
You would want to do something like this:
typedef enum Difficulty : NSUInteger
{
EASY = 1 << 0,
MODERATE = 1 << 1,
CHALLENGING = 1 << 2
} Difficulty;
And then to check it:
- (void) setStarsWithDifficulty:(Difficulty)diff
{
star0 = (diff & (EASY | MODERATE | CHALLENGING));
star1 = (diff & (MODERATE | CHALLENGING));
star2 = (diff & CHALLENGING);
}
Are you talking about something like:
star0 = 1
star1 = value & CHALLENGING || value & MODERATE
star2 = value & CHALLENGING
#define STAR0 1
#define STAR1 2
#define STAR2 4
#define EASY        STAR0
#define MODERATE    (STAR1 | STAR0)
#define CHALLENGING (STAR0 | STAR1 | STAR2)
ANDing a value d with each mask and comparing against 0 produces the required 0/1 mapping; some of the samples above give you the masked value instead. Take a look:
int d = EASY;
NSLog(@"Star 0 %d", (d & STAR0) != 0);
NSLog(@"Star 1 %d", (d & STAR1) != 0);
NSLog(@"Star 2 %d", (d & STAR2) != 0);
d = MODERATE;
NSLog(@"Star 0 %d", (d & STAR0) != 0);
NSLog(@"Star 1 %d", (d & STAR1) != 0);
NSLog(@"Star 2 %d", (d & STAR2) != 0);
d = CHALLENGING;
NSLog(@"Star 0 %d", (d & STAR0) != 0);
NSLog(@"Star 1 %d", (d & STAR1) != 0);
NSLog(@"Star 2 %d", (d & STAR2) != 0);