Threads indexing out of bounds in CUDA kernel

I am running a CUDA kernel which seems to be indexing out of bounds and I cannot figure out why. cuda-memcheck reports an invalid write of size 8.
I have tried changing the number of blocks and the number of threads per block, as well as running only a fraction of all the iterations needed. Here is some useful information as well as a reproducible example which gives the error:
blockSize: 128
numBlocks: 512
Nvidia GTX 970
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iterator>
#include <cuda_profiler_api.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <stdio.h>
#include <fstream>
__host__
int NchooseK(const int &N, const int &K)
{
    int result = 1;
    for (int i = 1; i <= K; i++)
    {
        result *= N - (K - i);
        result /= i;
    }
    return result;
}
__host__
inline int get_flatten_size(const unsigned int N){
    int sum = 0;
    for(int i=1; i<=N; i++){
        sum += i*NchooseK(N,i);
    }
    return sum;
}
__host__
std::vector<int> comb(const int &N, const int &K, const int &length)
{
    int k;
    std::vector<int> vec(K);
    std::vector<int> flatten_vec(0);
    std::string bitmask(K, 1); // K leading 1's
    bitmask.resize(N, 0);      // N-K trailing 0's
    for (int j = 0; j < length; j++) {
        k = 0;
        for (int i = 0; i < N; ++i) // [0..N-1] integers
        {
            if (bitmask[i]) {
                vec[k] = i;
                k++;
            }
        }
        std::prev_permutation(bitmask.begin(), bitmask.end());
        flatten_vec.insert(flatten_vec.end(), vec.begin(), vec.end());
    }
    return flatten_vec;
}
__host__
void get_matrix_indices(const unsigned int N, int *sub_col, int *sub_size, int *cumulative_size)
{
    int size, iterator = 0;
    cumulative_size[0] = 0;
    std::vector<int> size_i_columns;
    std::vector<int> all_columns(0);
    for(int i=1; i<=N; i++){
        size = NchooseK(N,i);
        size_i_columns = comb(N,i,size);
        for(int j=0; j<size; j++){
            sub_size[iterator] = i;
            cumulative_size[iterator+1] = cumulative_size[iterator] + i;
            iterator++;
        }
        all_columns.insert(all_columns.end(), size_i_columns.begin(), size_i_columns.end());
    }
    for(size_t i = 0; i < all_columns.size(); i++) sub_col[i] = all_columns[i];
}
__global__
void comb_ols(const unsigned int M, const unsigned int N, int* sub_col, int *sub_size, int* cumulative_size, const unsigned int numberOfCalculations, const unsigned int max_size){
    int size;
    int start_index;
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    int stride = blockDim.x*gridDim.x;
    double *sub_matrix = new double[M*(1+max_size)];
    for(int i = index; i < numberOfCalculations; i += stride){
        size = sub_size[i];
        start_index = cumulative_size[i];
        for(int j = 0; j < size; j++){
            for(int k = 0; k < M; k++){
                sub_matrix[k] = 1;
            }
        }
    }
    delete [] sub_matrix;
}
And then the main function:
int main()
{
    int N = 17;
    int M = 263;
    const unsigned int regressors = N-1;
    const unsigned int numberOfCalculations = (int) (exp2((double) regressors) - 1);
    const unsigned int size_sub_col = get_flatten_size(regressors);
    int blockSize = 128;
    int numBlocks = (numberOfCalculations + blockSize - 1)/blockSize;
    std::cout << "\nblockSize :" << blockSize;
    std::cout << "\nnumBlocks :" << numBlocks;
    std::cout << "\nblockSize*numBlocks :" << blockSize*numBlocks;
    std::cout << "\nregressors :" << regressors;
    std::cout << "\nNumberOfCalculations :" << numberOfCalculations;
    std::cout << "\nsize_sub_col :" << size_sub_col << '\n';
    int *sub_size, *cumulative_size, *sub_columns;
    cudaMallocManaged(&sub_size, numberOfCalculations*sizeof(int));
    cudaMallocManaged(&cumulative_size, (numberOfCalculations+1)*sizeof(int));
    cudaMallocManaged(&sub_columns, size_sub_col*sizeof(int));
    get_matrix_indices(regressors, sub_columns, sub_size, cumulative_size);
    const unsigned int max_size = N*M;
    cudaProfilerStart();
    comb_ols<<<numBlocks, blockSize>>>(M, N, sub_columns, sub_size, cumulative_size, numberOfCalculations, max_size);
    cudaProfilerStop();
    cudaDeviceSynchronize();
    cudaFree(sub_size);
    cudaFree(cumulative_size);
    cudaFree(sub_columns);
    return 0;
}
I fail to see why the threads would try to access illegal memory space. The way I understood it, the matrix sub_matrix will be initialized once on each thread and then the parallel for loop happens. Thus each thread should have the necessary memory space. Am I allocating too much memory on the GPU? How is "new sub_matrix" handled here?

If I read your code correctly, each thread is attempting to allocate M * (1 + M*N) doubles, which is 263 * (1 + 263*17) = 1,176,136 doubles, or about 8.97 MB of heap memory per thread. You launch 128 * 512 threads. That would mean you require 588 GB of heap space for the kernel to run successfully.
Clearly your GPU lacks that amount of memory, and the out-of-bounds memory access comes from failures in the new call (which you can check for, BTW).
Might I suggest that something in the size calculations for the heap memory you require is wrong? Otherwise you have an extremely unrealistic problem for the GPU and will require some other approach.
Note that even if you manage to redesign things to limit the code to a feasible malloc heap memory size, you will still, in all likelihood, need to resize the malloc heap to a suitable size before running the kernel. The cudaDeviceSetLimit API can be used for this.
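To make both suggestions concrete, here is a minimal, self-contained sketch; the kernel name heap_user, the per-thread element count and the 2x headroom factor are illustrative assumptions, not values from the question:
#include <cuda_runtime_api.h>
#include <cstdio>

__global__ void heap_user(int n)
{
    double *buf = new double[n];
    if (buf == nullptr) {   // in-kernel new returns nullptr on failure
        printf("thread %d: allocation failed\n", blockIdx.x*blockDim.x + threadIdx.x);
        return;             // bail out instead of writing out of bounds
    }
    for (int k = 0; k < n; k++) buf[k] = 1.0;
    delete [] buf;
}

int main()
{
    const int blockSize = 128, numBlocks = 512, n = 1024; // hypothetical sizes
    // Reserve enough device heap for every thread's allocation, with headroom;
    // must be done before the first kernel launch that uses in-kernel new/malloc.
    size_t heapBytes = (size_t)blockSize * numBlocks * n * sizeof(double) * 2;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);
    heap_user<<<numBlocks, blockSize>>>(n);
    cudaDeviceSynchronize();
    return 0;
}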

Related

Degraded performance of CGAL::intersection between a 32 bit and 64 bit application while multi-threading

We have recently changed our application from 32 bit to 64 bit and have noticed a degradation in performance when using CGAL::intersection in multiple worker threads. Each thread is using a distinct set of shapes, so the threads do not interfere with each other.
Here is a simple example I put together to illustrate the issue:
#include <iostream>
#include <vector>
#include <thread>
#include <chrono>
#include <list>
#include <CGAL/Boolean_set_operations_2.h>
#include <CGAL/Exact_predicates_exact_constructions_kernel.h>
#include <CGAL/Gps_circle_segment_traits_2.h>

typedef CGAL::Exact_predicates_exact_constructions_kernel Kernel;
typedef CGAL::Gps_circle_segment_traits_2<Kernel> Traits_2;
typedef Kernel::Point_2 Point_2;
typedef Traits_2::General_polygon_2 Polygon_2;
typedef Traits_2::General_polygon_with_holes_2 Polygon_with_holes_2;
typedef Traits_2::X_monotone_curve_2 X_monotone_curve_2;

const bool multiThreading = true;
const int threadCount = 7;
const int attempts = 2520;
const int overallAttempts = 10;
Polygon_2 construct_polygon(const Point_2& p1, const Point_2& p2, const Point_2& p3, const Point_2& p4, const Point_2& p5)
{
    Polygon_2 pgn;
    X_monotone_curve_2 s1(p1, p2); pgn.push_back(s1);
    X_monotone_curve_2 s2(p2, p3); pgn.push_back(s2);
    X_monotone_curve_2 s3(p3, p4); pgn.push_back(s3);
    X_monotone_curve_2 s4(p4, p5); pgn.push_back(s4);
    X_monotone_curve_2 s5(p5, p1); pgn.push_back(s5);
    return pgn;
}
void do_intersection()
{
    Polygon_2 outerPolygon = construct_polygon(Point_2(0, 0), Point_2(300, 0), Point_2(310, 150), Point_2(300, 300), Point_2(0, 300));
    int numHoles = 100;
    std::vector<Polygon_2> holes(numHoles);
    for (int i = 0; i < numHoles; i++)
    {
        holes[i] = construct_polygon(Point_2(2 * i + 1, 2 * i + 1), Point_2(2 * i + 1, 2 * i + 1.5), Point_2(2 * i + 2, 2 * i + 2), Point_2(2 * i + 5, 2 * i + 2), Point_2(2 * i + 3, 2 * i + 1));
    }
    Polygon_with_holes_2 p(outerPolygon, holes.begin(), holes.end());
    Polygon_2 poly1 = construct_polygon(Point_2(1, 0), Point_2(50, 1), Point_2(12, 13), Point_2(25, 50), Point_2(12, 100));
    int intersectAttempts = multiThreading ? attempts / threadCount : attempts;
    for (int i = 0; i < intersectAttempts; i++)
    {
        std::list<Polygon_with_holes_2> intersect;
        CGAL::intersection(p, poly1, std::back_inserter(intersect));
    }
}
int main(int argc, char* argv[])
{
    long long averageTime = 0;
    for (int x = 0; x < overallAttempts; x++)
    {
        auto startTime = std::chrono::high_resolution_clock::now();
        if (multiThreading)
        {
            std::thread threads[threadCount];
            for (int i = 0; i < threadCount; i++)
            {
                threads[i] = std::thread(do_intersection);
            }
            for (int i = 0; i < threadCount; i++)
            {
                threads[i].join();
            }
        }
        else
        {
            do_intersection();
        }
        auto endTime = std::chrono::high_resolution_clock::now();
        auto diffTime = endTime - startTime;
        averageTime = averageTime + diffTime.count();
        std::cout << "Total cost: " << diffTime.count() * 1e-9 << std::endl;
    }
    std::cout << std::endl;
    std::cout << "Average cost: " << (averageTime * 1e-9)/overallAttempts << std::endl;
    return 0;
}
When using a single thread, this example is roughly 15% faster in x64 than in x86. However, when using 7 worker threads (my machine has 8 logical processors), this example is roughly 50% slower in x64 than in x86.
We are unsure why this is considerably slower, but I think it may be related to using CGAL::intersection on a polygon with a lot of holes. Any helpful suggestions would be greatly appreciated.

Comparing Execution time with Time Complexity in Merge & Quick Sort

I have implemented Merge & Quick Sort from the textbook I've been learning from, and it says the time complexities of the sorts are as follows:
Merge Sort: O(n log n) / Quick Sort: O(n log n) on average and O(n^2) in the worst case (if the key array is sorted).
So I executed the programs with two types of arrays, sorted and random, with different sizes.
Since I wanted to get the average time, I ran each case 10 times.
Here is the code of Merge & Quick Sort:
#include <iostream>
#include <ctime>
#include <vector>
#include <algorithm>
using namespace std;
void Merge(vector<int>& s, int low, int mid, int high) {
    int i = low;
    int j = mid + 1;
    int k = low;
    vector<int> u(s);
    while (i <= mid && j <= high) {
        if (s.at(i) < s.at(j)) {
            u.at(k) = s.at(i);
            i++;
        } else {
            u.at(k) = s.at(j);
            j++;
        }
        k++;
    }
    if (i > mid) {
        for (int a = j; a < high + 1; a++) {
            u.at(k) = s.at(a);
            k++;
        }
    } else {
        for (int a = i; a < mid + 1; a++) {
            u.at(k) = s.at(a);
            k++;
        }
    }
    for (int a = low; a < high + 1; a++)
        s.at(a) = u.at(a);
}
void MergeSort(vector<int>& s, int low, int high) {
    int mid;
    if (low < high) {
        mid = (low + high) / 2;
        MergeSort(s, low, mid);
        MergeSort(s, mid + 1, high);
        Merge(s, low, mid, high);
    }
}
void swap(int& a, int& b) {
    int tmp = a;
    a = b;
    b = tmp;
}
void Partition(vector<int>& s, int low, int high, int& pvpoint) {
    int j;
    int pvitem;
    pvitem = s.at(low);
    j = low;
    for (int i = low + 1; i <= high; i++) {
        if (s.at(i) < pvitem) {
            j++;
            swap(s.at(i), s.at(j));
        }
    }
    // move the pivot into its final position only after the scan is done
    pvpoint = j;
    swap(s.at(low), s.at(pvpoint));
}
void QuickSort(vector<int>& s, int low, int high) {
    int pvpoint;
    if (high > low) {
        Partition(s, low, high, pvpoint);
        QuickSort(s, low, pvpoint - 1);
        QuickSort(s, pvpoint + 1, high);
    }
}
Each of these main() functions prints the execution times for the SORTED and RANDOM key arrays.
You can see the result by adding one of these main functions in Visual Studio (C++):
//Sorted key array
int main() {
    int s;
    for (int i = 1; i < 21; i++) { // Size is from 300 to 6000
        s = i * 300;
        vector<int> Arr(s);
        cout << "N : " << s << "\n";
        // Assign random numbers to the elements
        Arr.front() = rand() % Arr.size();
        for (int j = 1; j < Arr.size(); j++) { Arr.at(j) = ((737 * Arr.at(j - 1) + 149) % (Arr.size() * 5)); }
        sort(Arr.begin(), Arr.end());
        //QuickSort(Arr, 0, Arr.size() - 1); <- you can switch to this instead of MergeSort(...) below
        for (int i = 0; i < 10; i++) { // print 10 execution times
            clock_t start, end;
            start = clock();
            MergeSort(Arr, 0, Arr.size() - 1);
            end = clock() - start;
            printf("%12.3f ", (double)end * 1000.0 / CLOCKS_PER_SEC);
        }
        cout << endl;
    }
    return 0;
}
//Random key array
int main() {
    int s;
    for (int i = 1; i < 21; i++) {
        s = i * 3000;
        vector<int> Arr(s);
        cout << "N : " << s << "\n";
        for (int i = 0; i < 10; i++) {
            // Assign random numbers to the elements
            Arr.front() = rand() % Arr.size();
            for (int j = 1; j < Arr.size(); j++) { Arr.at(j) = ((737 * Arr.at(j - 1) + 149) % (Arr.size() * 5)); }
            //QuickSort(Arr, 0, Arr.size() - 1); <- you can switch to this instead of MergeSort(...) below
            clock_t start, end;
            start = clock();
            MergeSort(Arr, 0, Arr.size() - 1);
            end = clock() - start;
            printf("%12.3f ", (double)end * 1000.0 / CLOCKS_PER_SEC);
        }
        cout << endl;
    }
    return 0;
}
And the THING is, the result does not match the time complexity. For example, Merge Sort on a RANDOM array of
size N=3000 prints 20 ms, but size N=60000 prints 1400~1600 ms! It is supposed to print almost 400 ms, because the time complexity (not the worst case) of these sorts is O(n log n), isn't it? I want to know what affects this time and how I could get the printed time I expected.
You posted the same code in this question: Calculate Execution Times in Sort algorithm, and you did not take my answer into account.
Your MergeSort function has a flaw: you duplicate the whole array in Merge, causing a lot of overhead and quadratic time complexity. This innocent looking definition: vector<int> u(s); defines u as a vector initialized as a copy of s, the full array.
C++ is a very powerful language, often too powerful, littered with traps and pitfalls such as this. It is a very good thing you tried to verify that your program meets the expected performance from the known time complexity of the algorithm. Such a concern is alas too rare.
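To make that concrete, here is a minimal sketch of one way to avoid the per-call copy (my sketch, not code from the original answer): allocate one scratch buffer up front and pass it down the recursion.
#include <vector>
using std::vector;

// Merge using a caller-provided scratch buffer instead of copying all of s.
void Merge(vector<int>& s, vector<int>& u, int low, int mid, int high) {
    int i = low, j = mid + 1, k = low;
    while (i <= mid && j <= high)
        u.at(k++) = (s.at(i) < s.at(j)) ? s.at(i++) : s.at(j++);
    while (i <= mid)  u.at(k++) = s.at(i++);
    while (j <= high) u.at(k++) = s.at(j++);
    for (int a = low; a <= high; a++) // copy only the merged range back
        s.at(a) = u.at(a);
}

void MergeSort(vector<int>& s, vector<int>& u, int low, int high) {
    if (low < high) {
        int mid = (low + high) / 2;
        MergeSort(s, u, low, mid);
        MergeSort(s, u, mid + 1, high);
        Merge(s, u, low, mid, high);
    }
}
// Usage: vector<int> u(Arr.size()); MergeSort(Arr, u, 0, Arr.size() - 1);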
Here are some guidelines:
For getting execution time:
#include <stdio.h>
#include <sys/time.h> // gettimeofday() lives here (POSIX), not in <time.h>

int main()
{
    struct timeval stop, start;
    int arr[10000];
    gettimeofday(&start, NULL);
    mergeSort(arr, 0, 9999); // your sort function goes here
    gettimeofday(&stop, NULL);
    printf("Time taken for Merge sort is: %ld microseconds\n",
           (stop.tv_sec-start.tv_sec)*1000000+stop.tv_usec-start.tv_usec);
}
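Since gettimeofday() is POSIX-only and the question uses Visual Studio, a portable alternative (my addition, not part of the original answer) is std::chrono; MergeSort here is assumed to be the question's function:
#include <chrono>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> arr(10000, 0); // fill with your test data
    auto start = std::chrono::steady_clock::now();
    MergeSort(arr, 0, (int)arr.size() - 1); // the question's sort function
    auto stop = std::chrono::steady_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    printf("Time taken for Merge sort is: %lld microseconds\n", (long long)us);
}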

Teaching myself OOP in C++

So I've been working on this program for the last month. The original code is from this tutorial https://www.youtube.com/watch?v=KjHKwCZyAhQ&list=PLHm_I0tE5kKPPWXkTTtOn8fkcwEGZNETh&index=3
However, I thought I would turn it into an object-oriented program before I went on. Doing rather than copying is the best way to learn. The code generated a bmp file before I divided it up, but not anymore. The program executes but it doesn't create a file. Additionally, I added Hello World to my .cpp files to see if they were even being executed, and it looks like they aren't. I realize in copying this that I have a lot of code; I think the problem is in the main file, so hopefully if anyone is nice enough to help me they can pick it out much more quickly!
*edit
Also, in the original code he had the struct as a global variable, but I wasn't sure which file to implement it in or even how to make something global in an object-oriented program! Would I just put it in main above int main()?
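(On that aside: one common approach, offered here only as a suggestion since the tutorial's layout isn't shown, is to define the plain struct once in its own header and include that header from every file that needs it:)
// RGBtype.h -- hypothetical shared header; include it from OutPut.h, BMP.h, etc.
#pragma once

struct RGBtype
{
    int r;
    int g;
    int b;
};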
Output.h
#pragma once
#include "ProProcess.h" // this is just a bunch of preprocessor directives

// this program creates a single color bmp file using red, blue, and green (rgb)
class OutPut
{
public:
    OutPut(const int height, std::string file_name, int dpi, int index);
    ~OutPut();
    //savebmp_str(std::string* file_name, const int width, const int height, int dpi, int pixels, struct RGBtype);
    //commented this out because I wasn't sure how I should pass all these values. Ultimately I used OutPut Object_Output in bmp.cpp so that these variables could be passed in there
    const int Getwidth() { return width; }
    const int Setwidth(const int x) { const int width = x; }
private:
    struct RGBtype // Could be a global variable but I don't know which file to put it in
    {
        int r;
        int g;
        int b;
    };
    const int width = 1960; // window size
    const int height = 1080;
    int dpi = 72;
    int number_of_pixels = width*height;
    int index;
    const char* file_name = "Scene.bmp";
    RGBtype *pixels = new RGBtype[number_of_pixels]; // creates an array so that each pixel is comprised of a mix of rgb
};
Output.cpp
#include "OutPut.h"
#include "ProProcess.h"
OutPut::OutPut(const int height, std::string file_name, int dpi, int index)
{
OutPut::RGBtype color;
for (int x = 0; x < height; x++) //nested for loop that draws out each pixel totalling 1920x1080 in all
{
for (int y = 0; y < width; y++)
{
index = y*height + x;
pixels[index].r = 311;//changing the number here changes the color
pixels[index].g = 311;
pixels[index].b = 311;
}
}
std::cout << "Hello World";
}
OutPut::~OutPut()
{
}
BMP.h
#pragma once
#include "ProProcess.h"
#include "OutPut.h"

struct RGBtype
{
    int r;
    int g;
    int b;
};

class BMP
{
public:
    BMP(const char *filename, int passed_width, int passed_height, int dpi, RGBtype* data);
    ~BMP();
private:
    OutPut Object_Output(std::string* file_name, const int width, const int height, int dpi, int pixels, struct RGBtype); // this is to pass the variables declared in output.h so bmp.h and bmp.cpp can use them too. Not sure how I would even verify I am doing this properly!
    //const char* savebmp_str();
    int passed_width;
    int passed_height;
    int dpi;
    RGBtype *data;
};
BMP.cpp
#include "BMP.h"
#include "ProProcess.h"
#include "OutPut.h"
BMP::BMP(const char *filename, int passed_width, int passed_height, int dpi, RGBtype *data)
{
std::cout << passed_height;
FILE *pFile;
int k = passed_width*passed_height;
std::cout << "The value k is" << k;
int s = 4 * k;
int filesize = 54 + s; //s is a function of width and height
double factor = 39.375;
int m = static_cast<int>(factor);
int ppm = dpi*m;
unsigned char bmpfileheader[14] = { 'B','M',0,0,0,0 ,0,0,0,0, 54,0,0,0 }; //B and M are case sensitive. They make a bmp file
unsigned char bmpinfoheader[40] = { 40,0,0,0, 0,0,0,0 ,0,0,0,0, 1,0,24,0 };// the header size 14 and 40 are part of the BMP format
bmpfileheader[2] = (unsigned char)(filesize);
bmpfileheader[3] = (unsigned char)(filesize >> 8);
bmpfileheader[4] = (unsigned char)(filesize >> 16);
bmpfileheader[5] = (unsigned char)(filesize >> 24);
bmpinfoheader[4] = (unsigned char)(passed_width);
bmpinfoheader[5] = (unsigned char)(passed_width >> 8);
bmpinfoheader[6] = (unsigned char)(passed_width >> 16);
bmpinfoheader[7] = (unsigned char)(passed_width >> 24);
bmpinfoheader[8] = (unsigned char)(passed_height);
bmpinfoheader[9] = (unsigned char)(passed_height >> 8);
bmpinfoheader[10] = (unsigned char)(passed_height >> 16);
bmpinfoheader[11] = (unsigned char)(passed_height >> 24);
bmpinfoheader[21] = (unsigned char)(s);
bmpinfoheader[22] = (unsigned char)(s >> 8);
bmpinfoheader[23] = (unsigned char)(s >> 16);
bmpinfoheader[24] = (unsigned char)(s >> 24);
bmpinfoheader[25] = (unsigned char)(ppm);
bmpinfoheader[26] = (unsigned char)(ppm >> 8);
bmpinfoheader[27] = (unsigned char)(ppm >> 16);
bmpinfoheader[28] = (unsigned char)(ppm >> 24);
bmpinfoheader[29] = (unsigned char)(ppm);
bmpinfoheader[30] = (unsigned char)(ppm >> 8);
bmpinfoheader[31] = (unsigned char)(ppm >> 16);
bmpinfoheader[32] = (unsigned char)(ppm >> 24);
pFile = fopen(filename, "wb");
fwrite(bmpfileheader, sizeof(char), 14, pFile);
fwrite(bmpinfoheader, sizeof(char), 40, pFile);
for (int i = 0; i < k; i++)
{
RGBtype rgb = data[i];
double red = (data[i].r);
double green = (data[i].g);
double blue = (data[i].b);
int color[3] = { (int)floor(blue), (int)floor(green), (int)floor(red) };
fwrite(color, 1, 3, pFile);
}
fclose(pFile);
std::cout << "Hello World";
}
BMP::~BMP()
{
}
main.cpp
#include <iostream>
#include "OutPut.h"
#include "ProProcess.h"
#include "BMP.h"

int main()
{
    OutPut Pixel_gen();
    BMP BMP_Format_Maker();
    OutPut Object_Output();
    system("Pause");
    return 0;
}

How to calculate CRC-16 from HEX values?

In my code I need to calculate CRC-16 (16-bit) values for the HEX values stored as NSData; below is the code snippet to calculate CRC-16 in C.
#include <stdbool.h> // for bool in C

void UpdateCRC(unsigned short int *CRC, unsigned char x)
{
    // This function uses the initial CRC value passed in the first
    // argument, then modifies it using the single character passed
    // as the second argument, according to a CRC-16 polynomial
    // Arguments:
    //     CRC -- pointer to starting CRC value
    //     x   -- new character to be processed
    // Returns:
    //     The function does not return any values, but updates the variable
    //     pointed to by CRC
    static int const Poly = 0xA001;
    int i;
    bool flag;
    *CRC ^= x;
    for (i=0; i<8; i++) // CRC-16 polynomial
    {
        flag = ((*CRC & 1) == 1);
        *CRC = (unsigned short int)(*CRC >> 1);
        if (flag)
            *CRC ^= Poly;
    }
    return;
}
The NSData which holds the hex values is created like below:
const char connectByteArray[] = {
    0x21,0x01,0x90,0x80,0x5F
};
NSData* data = [NSData dataWithBytes: connectByteArray length:sizeof(connectByteArray)];
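For reference, a minimal sketch of driving UpdateCRC over such a byte array; the 0xFFFF seed is an assumption (protocols using the 0xA001 reflected polynomial commonly seed with 0x0000 or 0xFFFF), so use whatever seed your protocol specifies:
#include <stdio.h>
#include <stdbool.h>

void UpdateCRC(unsigned short int *CRC, unsigned char x); // as defined above

int main(void)
{
    const unsigned char bytes[] = { 0x21, 0x01, 0x90, 0x80, 0x5F };
    unsigned short int crc = 0xFFFF; // assumed seed; check your protocol spec
    for (size_t i = 0; i < sizeof(bytes); i++)
        UpdateCRC(&crc, bytes[i]);
    printf("CRC-16: 0x%04X\n", crc);
    return 0;
}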
I solved it using the following C program; I hope it may help someone. Cheers!
#include <string.h>
#include <stdio.h>
const int order = 16;
const unsigned long polynom = 0x8005;
const int direct = 1;
const unsigned long crcinit = 0;
const unsigned long crcxor = 0;
const int refin = 1;
const int refout = 1;
// 'order' [1..32] is the CRC polynom order, counted without the leading '1' bit
// 'polynom' is the CRC polynom without leading '1' bit
// 'direct' [0,1] specifies the kind of algorithm: 1=direct, no augmented zero bits
// 'crcinit' is the initial CRC value belonging to that algorithm
// 'crcxor' is the final XOR value
// 'refin' [0,1] specifies if a data byte is reflected before processing (UART) or not
// 'refout' [0,1] specifies if the CRC will be reflected before XOR
// Data character string
const unsigned char string[] = {0x05,0x0f,0x01,0x00,0x00,0x99};
// internal global values:
unsigned long crcmask;
unsigned long crchighbit;
unsigned long crcinit_direct;
unsigned long crcinit_nondirect;
unsigned long crctab[256];
// subroutines
unsigned long reflect (unsigned long crc, int bitnum) {
    // reflects the lower 'bitnum' bits of 'crc'
    unsigned long i, j=1, crcout=0;
    for (i=(unsigned long)1<<(bitnum-1); i; i>>=1) {
        if (crc & i) crcout |= j;
        j <<= 1;
    }
    return (crcout);
}
void generate_crc_table() {
    // make CRC lookup table used by table algorithms
    int i, j;
    unsigned long bit, crc;
    for (i=0; i<256; i++) {
        crc = (unsigned long)i;
        if (refin) crc = reflect(crc, 8);
        crc <<= order-8;
        for (j=0; j<8; j++) {
            bit = crc & crchighbit;
            crc <<= 1;
            if (bit) crc ^= polynom;
        }
        if (refin) crc = reflect(crc, order);
        crc &= crcmask;
        crctab[i] = crc;
    }
}
unsigned long crctablefast (unsigned char* p, unsigned long len) {
    // fast lookup table algorithm without augmented zero bytes, e.g. used in pkzip.
    // only usable with polynom orders of 8, 16, 24 or 32.
    unsigned long crc = crcinit_direct;
    if (refin) crc = reflect(crc, order);
    if (!refin) while (len--) crc = (crc << 8) ^ crctab[ ((crc >> (order-8)) & 0xff) ^ *p++];
    else while (len--) crc = (crc >> 8) ^ crctab[ (crc & 0xff) ^ *p++];
    if (refout^refin) crc = reflect(crc, order);
    crc ^= crcxor;
    crc &= crcmask;
    return(crc);
}
unsigned long crctable (unsigned char* p, unsigned long len) {
    // normal lookup table algorithm with augmented zero bytes.
    // only usable with polynom orders of 8, 16, 24 or 32.
    unsigned long crc = crcinit_nondirect;
    if (refin) crc = reflect(crc, order);
    if (!refin) while (len--) crc = ((crc << 8) | *p++) ^ crctab[ (crc >> (order-8)) & 0xff];
    else while (len--) crc = ((crc >> 8) | (*p++ << (order-8))) ^ crctab[ crc & 0xff];
    if (!refin) while (++len < order/8) crc = (crc << 8) ^ crctab[ (crc >> (order-8)) & 0xff];
    else while (++len < order/8) crc = (crc >> 8) ^ crctab[crc & 0xff];
    if (refout^refin) crc = reflect(crc, order);
    crc ^= crcxor;
    crc &= crcmask;
    return(crc);
}
unsigned long crcbitbybit(unsigned char* p, unsigned long len) {
    // bit by bit algorithm with augmented zero bytes.
    // does not use lookup table, suited for polynom orders between 1...32.
    unsigned long i, j, c, bit;
    unsigned long crc = crcinit_nondirect;
    for (i=0; i<len; i++) {
        c = (unsigned long)*p++;
        if (refin) c = reflect(c, 8);
        for (j=0x80; j; j>>=1) {
            bit = crc & crchighbit;
            crc <<= 1;
            if (c & j) crc |= 1;
            if (bit) crc ^= polynom;
        }
    }
    for (i=0; i<order; i++) {
        bit = crc & crchighbit;
        crc <<= 1;
        if (bit) crc ^= polynom;
    }
    if (refout) crc = reflect(crc, order);
    crc ^= crcxor;
    crc &= crcmask;
    return(crc);
}
unsigned long crcbitbybitfast(unsigned char* p, unsigned long len) {
    // fast bit by bit algorithm without augmented zero bytes.
    // does not use lookup table, suited for polynom orders between 1...32.
    unsigned long i, j, c, bit;
    unsigned long crc = crcinit_direct;
    for (i=0; i<len; i++) {
        c = (unsigned long)*p++;
        if (refin) c = reflect(c, 8);
        for (j=0x80; j; j>>=1) {
            bit = crc & crchighbit;
            crc <<= 1;
            if (c & j) bit ^= crchighbit;
            if (bit) crc ^= polynom;
        }
    }
    if (refout) crc = reflect(crc, order);
    crc ^= crcxor;
    crc &= crcmask;
    return(crc);
}
int main() {
    // test program for checking four different CRC computing types, namely
    // crcbitbybit(), crcbitbybitfast(), crctable() and crctablefast(), see above.
    // parameters are at the top of this program.
    // Result will be printed on the console.
    int i;
    unsigned long bit, crc;

    // at first, compute constant bit masks for whole CRC and CRC high bit
    crcmask = ((((unsigned long)1<<(order-1))-1)<<1)|1;
    crchighbit = (unsigned long)1<<(order-1);

    // check parameters
    if (order < 1 || order > 32) {
        printf("ERROR, invalid order, it must be between 1..32.\n");
        return(0);
    }
    if (polynom != (polynom & crcmask)) {
        printf("ERROR, invalid polynom.\n");
        return(0);
    }
    if (crcinit != (crcinit & crcmask)) {
        printf("ERROR, invalid crcinit.\n");
        return(0);
    }
    if (crcxor != (crcxor & crcmask)) {
        printf("ERROR, invalid crcxor.\n");
        return(0);
    }

    // generate lookup table
    generate_crc_table();

    // compute missing initial CRC value
    if (!direct) {
        crcinit_nondirect = crcinit;
        crc = crcinit;
        for (i=0; i<order; i++) {
            bit = crc & crchighbit;
            crc <<= 1;
            if (bit) crc ^= polynom;
        }
        crc &= crcmask;
        crcinit_direct = crc;
    }
    else {
        crcinit_direct = crcinit;
        crc = crcinit;
        for (i=0; i<order; i++) {
            bit = crc & 1;
            if (bit) crc ^= polynom;
            crc >>= 1;
            if (bit) crc |= crchighbit;
        }
        crcinit_nondirect = crc;
    }

    // call CRC algorithms using the CRC parameters above and print result to the console.
    // Note: the data contains embedded zero bytes, so strlen() must not be used to get
    // its length; sizeof(string) is used instead.
    printf("\n");
    printf("CRC tester v1.1 written on 13/01/2003 by Sven Reifegerste (zorc/reflex)\n");
    printf("-----------------------------------------------------------------------\n");
    printf("\n");
    printf("Parameters:\n");
    printf("\n");
    printf(" polynom             : 0x%lx\n", polynom);
    printf(" order               : %d\n", order);
    printf(" crcinit             : 0x%lx direct, 0x%lx nondirect\n", crcinit_direct, crcinit_nondirect);
    printf(" crcxor              : 0x%lx\n", crcxor);
    printf(" refin               : %d\n", refin);
    printf(" refout              : %d\n", refout);
    printf("\n");
    printf(" data string         : %u bytes\n", (unsigned)sizeof(string));
    printf("\n");
    printf("Results:\n");
    printf("\n");
    printf(" crc bit by bit      : 0x%lx\n", crcbitbybit((unsigned char *)string, sizeof(string)));
    printf(" crc bit by bit fast : 0x%lx\n", crcbitbybitfast((unsigned char *)string, sizeof(string)));
    if (!(order&7)) printf(" crc table           : 0x%lx\n", crctable((unsigned char *)string, sizeof(string)));
    if (!(order&7)) printf(" crc table fast      : 0x%lx\n", crctablefast((unsigned char *)string, sizeof(string)));
    return(0);
}

CUDA optimization question

Here's a simple program:
void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
    for(int i=0; i<n_w; i++)
    {
        int sum = 0;
        for(int j=0; j<n_v; j++)
            sum += (w_in[i]*v_in[j])>>1;
        w_out[i] = sum;
    }
}
Presume n_v, n_w ~ 10^6. Clearly, there are at least a dozen equivalent ways to do this in CUDA, with different ways to subdivide the (n_v*n_w) operations into threads, with and without shared memory... Which way should, theoretically speaking, be the fastest?
The simplest:
__global__ void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
    extern __shared__ int v[]; // dynamic shared memory, one int per thread
    // written for a single-block launch: threadIdx.x/blockDim.x play the roles
    // of block.rank/block.size in the original pseudocode
    for(int i = threadIdx.x; i < n_w; i += blockDim.x)
    {
        int w = w_in[i]; // coalesced
        int sum = 0;
        for(int j = 0; j < n_v; j += blockDim.x) { // assumption: n_v (and n_w) are multiples of blockDim.x
            v[threadIdx.x] = v_in[j + threadIdx.x]; // stage a tile of v in shared memory
            __syncthreads();
            for(int k = 0; k < blockDim.x; ++k)
                sum += (w*v[k])>>1;
            __syncthreads(); // ouch
        }
        w_out[i] = sum; // ditto
    }
}
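A hypothetical launch for that kernel; the block size of 256 is an assumption, the single block matches the kernel's threadIdx.x-only indexing, and d_v, d_w, d_out are assumed device buffers allocated elsewhere:
// Launch fragment: one block of 256 threads, 256 ints of dynamic shared memory.
// Assumes n_v is a multiple of the block size, as the kernel's inner loop requires,
// and that d_v, d_w, d_out were cudaMalloc'd and filled beforehand.
int blockSize = 256;
size_t shmemBytes = blockSize * sizeof(int);
multiply<<<1, blockSize, shmemBytes>>>(d_v, d_w, n_v, n_w, d_out);
cudaDeviceSynchronize();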