The OBJ files I am trying to load contain multiple "o" (object) statements, so there are multiple meshes. I am trying to load them all into a single VAO and draw them by recording each mesh's offset and size. Note that the offset and size are counted in vertices rather than faces, so they are multiplied by 3: the first mesh starts at offset 0 with size mesh1's mNumFaces * 3, the second mesh starts at offset mesh1's mNumFaces * 3 with size mesh2's mNumFaces * 3, and so on. However, only the first mesh is drawn correctly; the rest of the meshes all come out distorted.
This is my loading logic:
Object* obj = new Object(objName);
// Initialize the meshes in the obj file one by one
std::vector<glm::vec3> vert, norm;
std::vector<glm::vec2> text;
std::vector<glm::ivec3> indices;
int vertexOffset = 0;
std::cout << objName << " numMeshes: " << pScene->mNumMeshes << std::endl;
for (unsigned int i = 0; i < pScene->mNumMeshes; i++) {
    std::cout << objName << ": vOffset " << vertexOffset << " numV " << pScene->mMeshes[i]->mNumFaces * 3 << std::endl;
    aiMesh* pMesh = pScene->mMeshes[i];
    aiVector3D Zero3D(0.0f, 0.0f, 0.0f);
    for (unsigned int j = 0; j < pMesh->mNumVertices; j++) {
        vert.push_back(glm::vec3(pMesh->mVertices[j].x, pMesh->mVertices[j].y, pMesh->mVertices[j].z));
        norm.push_back(glm::vec3(pMesh->mNormals[j].x, pMesh->mNormals[j].y, pMesh->mNormals[j].z));
        aiVector3D textCoord = pMesh->HasTextureCoords(0) ? pMesh->mTextureCoords[0][j] : Zero3D;
        text.push_back(glm::vec2(textCoord.x, textCoord.y));
    }
    for (unsigned int j = 0; j < pMesh->mNumFaces; j++) {
        aiFace face = pMesh->mFaces[j];
        indices.push_back(glm::ivec3(face.mIndices[0], face.mIndices[1], face.mIndices[2]));
    }
    aiMaterial* mtl = pScene->mMaterials[pMesh->mMaterialIndex];
    std::string meshName = std::string(pMesh->mName.C_Str());
    Mesh* mesh = new Mesh(meshName, loadMaterial(mtl), vertexOffset, pMesh->mNumFaces * 3);
    obj->meshList.push_back(mesh);
    vertexOffset = vertexOffset + 3 * pMesh->mNumFaces;
}
//create the obj's node structure
//obj->root = processNode(pScene->mRootNode, obj->meshList);
//send the data to the gpu
GLuint vao;
GLuint vbo[3];
GLuint ebo;
glcheck(glGenVertexArrays(1, &vao));
glcheck(glBindVertexArray(vao));
glcheck(glGenBuffers(3, vbo));
glcheck(glBindBuffer(GL_ARRAY_BUFFER, vbo[0]));
glcheck(glBufferData(GL_ARRAY_BUFFER, sizeof(glm::vec3) * vert.size(), vert.data(), GL_STATIC_DRAW));
glcheck(glEnableVertexAttribArray(0));
glcheck(glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 0, 0));
glcheck(glBindBuffer(GL_ARRAY_BUFFER, vbo[1]));
glcheck(glBufferData(GL_ARRAY_BUFFER, sizeof(glm::vec3) * norm.size(), norm.data(), GL_STATIC_DRAW));
glcheck(glEnableVertexAttribArray(1));
glcheck(glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 0, 0));
glcheck(glBindBuffer(GL_ARRAY_BUFFER, vbo[2]));
glcheck(glBufferData(GL_ARRAY_BUFFER, sizeof(glm::vec2) * text.size(), text.data(), GL_STATIC_DRAW));
glcheck(glEnableVertexAttribArray(2));
glcheck(glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, 0, 0));
glcheck(glGenBuffers(1, &ebo));
glcheck(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ebo));
glcheck(glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(glm::ivec3) * indices.size(), indices.data(), GL_STATIC_DRAW));
// Unbind the VBO/VAO
glcheck(glBindVertexArray(0));
//glcheck(glBindBuffer(GL_ARRAY_BUFFER, 0));
//glcheck(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0));
obj->vao = vao; //shared vao variable
objMap[objName] = obj;
objList.push_back(obj);
return obj;
This is my drawing logic:
for (int i = 0; i < instObj->meshList.size(); i++) {
    Mesh* mesh = instObj->meshList[i];
    glcheck(glDrawElements(GL_TRIANGLES, mesh->size, GL_UNSIGNED_INT, (GLvoid*)(sizeof(GLuint) * mesh->vertexOffset)));
}
This is the first mesh, which is drawn correctly: [screenshot: first mesh]
The second mesh and onward are all messed up, however: [screenshot: second mesh]
The complete model: [screenshot: complete mesh]
I've been working with Cypress BLE PSoC 4200, and I've set up my GATT database to send int32 data packets to my iPhone. However, you can only write to the GATT database with uint8 pieces of data. So I wrote the following to take this int32 voltage reading and put it into a uint8 byte array:
// function passes in int32 variable 'result'
uint8 array[4];
array[0] = result & 0xFF;
array[1] = (result >> 8) & 0xFF;
array[2] = (result >> 16) & 0xFF;
array[3] = (result >> 24) & 0xFF;
So, with that in mind, when that int32 packet gets sent, I want to be able to take each byte and recombine them into the original int32 value, then print it to the screen (e.g. 456000 becomes 0.456 V).
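(For reference, with the layout above array[0] holds the least-significant byte, so 456000, which is 0x0006F540 in hex, would be sent as the bytes 0x40, 0xF5, 0x06, 0x00.)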
Right now, I obtain the 4 bytes and handle them like so:
NSData* data = [characteristic value];
const uint8_t *reportData = [data bytes];
// variable to hold the eventual 32-bit data
uint32_t voltage = 0;
Is there a way to go through each index of *reportData and concatenate the bytes? Any help will do, thanks.
Would something like this not work?
uint32_t v0 = (uint32_t)reportData[0];
uint32_t v1 = (uint32_t)reportData[1] << 8;
uint32_t v2 = (uint32_t)reportData[2] << 16;
uint32_t v3 = (uint32_t)reportData[3] << 24;
uint32_t voltage = v0 | v1 | v2 | v3;
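Building on that, here is a minimal standalone sketch (my own, not from the original code) that recombines the four bytes and scales the result for display. It assumes the bytes arrive least-significant byte first, matching the packing code in the question, and that the reading is in microvolts, as the 456000 -> 0.456 V example suggests:

#include <stdint.h>
#include <stdio.h>

// Recombine four little-endian bytes into the original signed 32-bit reading.
static int32_t bytesToInt32LE(const uint8_t *b)
{
    uint32_t u = (uint32_t)b[0]
               | ((uint32_t)b[1] << 8)
               | ((uint32_t)b[2] << 16)
               | ((uint32_t)b[3] << 24);
    return (int32_t)u; // reinterpret as signed, matching the int32 that was packed
}

int main(void)
{
    // Example packet: 456000 packed exactly as in the PSoC snippet above.
    uint8_t packet[4] = { 0x40, 0xF5, 0x06, 0x00 };
    int32_t microvolts = bytesToInt32LE(packet);
    printf("%.3f V\n", microvolts / 1000000.0); // prints "0.456 V"
    return 0;
}

In the Objective-C handler the same helper could be called as bytesToInt32LE(reportData) once at least four bytes have been received.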
I am trying to optimize this kernel. The CPU version of this kernel is 4 times faster than the GPU version. I would expect that the GPU version would be faster.
My guess is that the large number of memory accesses is what causes the low performance. I am using an Intel HD 2500 and OpenCL 1.2.
The GPU kernel is:
__kernel void mykernel(__global unsigned char *inp1,
__global unsigned char *inp2,
__global unsigned char *inp3,
__global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__global unsigned char *lut,
uint size
)
{
unsigned char x1, x2, x3, x4;
unsigned char y1, y2, y3, y4;
const int x = get_global_id(0);
const int y = get_global_id(1);
const int width = get_global_size(0);
const uint id = y * width + x;
x1 = inp1[id];
x2 = inp2[id];
x3 = inp3[id];
x4 = inp4[id];
y1 = (x1 & 0xff) | (x2>>2 & 0xaa) | (x3>>4 & 0x0d) | (x4>>6 & 0x02);
y2 = (x1<<2 & 0xff) | (x2 & 0xaa) | (x3>>2 & 0x0d) | (x4>>4 & 0x02);
y3 = (x1<<4 & 0xff) | (x2<<2 & 0xaa) | (x3 & 0x0d) | (x4>>2 & 0x02);
y4 = (x1<<6 & 0xff) | (x2<<4 & 0xaa) | (x3<<2 & 0x0d) | (x4 & 0x02);
// lookup table
y1 = lut[y1];
y2 = lut[y2];
y3 = lut[y3];
y4 = lut[y4];
outp1[id] = (y1 & 0xc0)
| ((y2 & 0xc0) >> 2)
| ((y3 & 0xc0) >> 4)
| ((y4 & 0xc0) >> 6);
outp2[id] = ((y1 & 0x30) << 2)
| (y2 & 0x30)
| ((y3 & 0x30) >> 2)
| ((y4 & 0x30) >> 4);
outp3[id] = ((y1 & 0x0c) << 4)
| ((y2 & 0x0c) << 2)
| (y3 & 0x0c)
| ((y4 & 0x0c) >> 2);
outp4[id] = ((y1 & 0x03) << 6)
| ((y2 & 0x03) << 4)
| ((y3 & 0x03) << 2)
| (y4 & 0x03);
}
I use:
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 1;
globalWorkSize[0] = X*Y; // X,Y define a data space of 15 - 20 MB
localWorkSize can vary between 1 and 256.
For localWorkSize = 1: CPU = 0.067 s, GPU = 0.20 s.
For localWorkSize = 256: CPU = 0.067 s, GPU = 0.34 s.
This seems really weird. Can you give me some ideas why I get these numbers, and do you have any tips on how I can optimize this kernel?
My main looks like this:
int main(int argc, char** argv)
{
int err,err1,j,i; // error code returned from api calls and other
clock_t start, end; // measuring performance variables
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program_ms_naive; // compute program
cl_kernel kernel_ms_naive; // compute kernel
// ... dynamically allocate arrays
// ... initialize arrays
cl_uint dev_cnt = 0;
clGetPlatformIDs(0, 0, &dev_cnt);
cl_platform_id platform_ids[100];
clGetPlatformIDs(dev_cnt, platform_ids, NULL);
// Connect to a compute device
err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a compute context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
commands = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute programs from the source file
program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);
// Build the programs executable
err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel_ms_naive = clCreateKernel(program_ms_naive, "ms_naive", &err);
d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
int size = YCOLUMNS*XROWS/4;
int size_b = size * 4;
err = clSetKernelArg(kernel_ms_naive, 0, sizeof(cl_mem), (void *)&(d_A1));
err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&(d_A2));
err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&(d_A3));
err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&(d_A4));
err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&(d_B2));
err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&(d_B3));
err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&(d_B4));
err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); //__global
err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 256;
globalWorkSize[0] = XROWS*YCOLUMNS;
start = clock();
for (i=0;i< EXECUTION_TIMES;i++)
{
err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
err = clFinish(commands);
}
end = clock();
return 0;
}
Constant memory is used to broadcast a small amount of values to all of the work items and behaves like a constant private register, so access to it is very fast. GPU devices typically provide at least 64 KB of constant memory, which is more than enough to hold the 256-byte LUT.
As a simple remedy for the global-memory access bottleneck, you can try constant memory for the LUT:
__kernel void mykernel(const __global unsigned char *inp1,
const __global unsigned char *inp2,
const __global unsigned char *inp3,
const __global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__constant unsigned char *lut,
uint size
)
{
...
}
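(No host-side change should be needed for this: the LUT buffer already created with clCreateBuffer and CL_MEM_COPY_HOST_PTR can be passed to the __constant argument unchanged, as long as it fits within CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, which 256 bytes easily does.)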
But a proper solution would be to reshape your code:
- Use uchar4 vectors instead of 4 different buffers, because that breaks coalescing (see the sketch after this list) [it can give you a big boost, up to 4x]
- Operate on vectors [slight boost]
- Use local/constant memory for the LUT [it can remove a non-coalesced read of the LUT, maybe 2x-3x]
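To illustrate the first two points, here is a rough sketch of what a vectorized version could look like. It is an illustration only, not a drop-in replacement: it assumes the host repacks the four input planes into a single interleaved uchar4 buffer and reads the four output planes back from a single uchar4 buffer (the posted host code does not do this), and the bit logic is copied verbatim from the original kernel:

__kernel void mykernel_vec4(__global const uchar4 *in,   // in[id]  = (inp1, inp2, inp3, inp4) for one pixel
                            __global uchar4 *out,        // out[id] = (outp1, outp2, outp3, outp4)
                            __constant unsigned char *lut)
{
    const uint id = get_global_id(0);
    const uchar4 x = in[id];

    // Same bit shuffling as the scalar kernel, written per component.
    uchar4 m;
    m.s0 = (x.s0 & 0xff) | ((x.s1 >> 2) & 0xaa) | ((x.s2 >> 4) & 0x0d) | ((x.s3 >> 6) & 0x02);
    m.s1 = ((x.s0 << 2) & 0xff) | (x.s1 & 0xaa) | ((x.s2 >> 2) & 0x0d) | ((x.s3 >> 4) & 0x02);
    m.s2 = ((x.s0 << 4) & 0xff) | ((x.s1 << 2) & 0xaa) | (x.s2 & 0x0d) | ((x.s3 >> 2) & 0x02);
    m.s3 = ((x.s0 << 6) & 0xff) | ((x.s1 << 4) & 0xaa) | ((x.s2 << 2) & 0x0d) | (x.s3 & 0x02);

    // The LUT lookups stay per component; __constant keeps them in the constant cache.
    const uchar4 y = (uchar4)(lut[m.s0], lut[m.s1], lut[m.s2], lut[m.s3]);

    uchar4 o;
    o.s0 = (y.s0 & 0xc0) | ((y.s1 & 0xc0) >> 2) | ((y.s2 & 0xc0) >> 4) | ((y.s3 & 0xc0) >> 6);
    o.s1 = ((y.s0 & 0x30) << 2) | (y.s1 & 0x30) | ((y.s2 & 0x30) >> 2) | ((y.s3 & 0x30) >> 4);
    o.s2 = ((y.s0 & 0x0c) << 4) | ((y.s1 & 0x0c) << 2) | (y.s2 & 0x0c) | ((y.s3 & 0x0c) >> 2);
    o.s3 = ((y.s0 & 0x03) << 6) | ((y.s1 & 0x03) << 4) | ((y.s2 & 0x03) << 2) | (y.s3 & 0x03);

    out[id] = o;
}

Each work item then issues one coalesced 4-byte load and one coalesced 4-byte store instead of eight scattered single-byte accesses, which is where the potential gain comes from.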
Still, it will be difficult to beat the CPU approach, due to the big I/O constraints.
I'm struggling to learn how to pack four separate values into a single byte. I'm trying to get a hex output of 0x91, whose binary representation is supposed to be 10010001, but instead I'm getting 16842753 and 0x1010001 from the logs below. Or is there a better way to do this?
uint8_t globalColorTableFlag = 1;
uint8_t colorResolution = 001;
uint8_t sortFlag = 0;
uint8_t sizeOfGlobalColorTable = 001;
uint32_t packed = ((globalColorTableFlag << 24) | (colorResolution << 16) | (sortFlag << 8) | (sizeOfGlobalColorTable << 0));
NSLog(#"%d",packed); // Logs 16842753, should be: 10010001
NSLog(#"0x%02X",packed); // Logs 0x1010001, should be: 0x91
Try the following:
/* packed starts at 0 */
uint8_t packed = 0;
/* one bit of the flag is kept and shifted to the last position */
packed |= ((globalColorTableFlag & 0x1) << 7);
/* three bits of the resolution are kept and shifted to the fifth position */
packed |= ((colorResolution & 0x7) << 4);
/* one bit of the flag is kept and shifted to the fourth position */
packed |= ((sortFlag & 0x1) << 3);
/* three bits are kept and left in the first position */
packed |= ((sizeOfGlobalColorTable & 0x7) << 0);
For an explanation about the relation between hexadecimal and binary digits see this answer: https://stackoverflow.com/a/17914633/4178025
For bitwise operations see: https://stackoverflow.com/a/3427633/4178025
packed = ((globalColorTableFlag & 1) << 7) +
         ((colorResolution & 0x7) << 4) +
         ((sortFlag & 1) << 3) +
         (sizeOfGlobalColorTable & 0x7);
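As a quick sanity check (a standalone snippet of my own, in plain C, using the values from the question), the masked-and-shifted form produces the expected byte:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t globalColorTableFlag = 1;
    uint8_t colorResolution = 1;        // 3-bit field
    uint8_t sortFlag = 0;
    uint8_t sizeOfGlobalColorTable = 1; // 3-bit field

    uint8_t packed = 0;
    packed |= (uint8_t)((globalColorTableFlag & 0x1) << 7);
    packed |= (uint8_t)((colorResolution & 0x7) << 4);
    packed |= (uint8_t)((sortFlag & 0x1) << 3);
    packed |= (uint8_t)(sizeOfGlobalColorTable & 0x7);

    printf("0x%02X\n", (unsigned)packed); // prints 0x91, i.e. binary 1001 0001
    return 0;
}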
Can someone explain how to read these two bit masks?
uint32_t i = 0x1 << 0;
uint32_t j = 0x1 << 1;
Basically, how would you explain this to a person who can't read code? Which one is smaller than the other?
Well, 0x1 is just the hex value of 1, which in binary is represented as ...001. When you apply a 0-bit shift to 0x1, the value is unchanged because you haven't actually shifted anything. When you shift left by 1, you're looking at the representation ...010, which in good ol' numerics is a 2 because you have a 1 in the twos column and zeros everywhere else.
Therefore, uint32_t i = 0x1 << 0; has a lesser value than uint32_t j = 0x1 << 1;.
uint32_t i = 0x1 << 0;
uint32_t j = 0x1 << 1;
NSLog(#"%u",i); // outputs 1
NSLog(#"%u",j); // outputs 2