CUDA optimization question

CUDA optimization question - optimization

Here's a simple program:
void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
for(int i=0; i<n_w; i++)
{
int sum=0;
for(int j=0; j<n_v; j++)
sum += (w_in[i]*v_in[j])>>1;
w_out[i]=sum;
}
}
Presume n_v, n_w ~10^6. Clearly, there's at least a dozen equivalent ways to do this in CUDA, with different ways to subdivide (n_v*n_w) operations into threads, with and without shared memory... Which way should, theoretically speaking, be the fastest?

simplest:
void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
int *v = shared; // dynamic
for(int i = block.rank; i < n_w; i += block.size)
{
int w = w_in[i]; // coalesced
int sum=0;
for(int j=0; j<n_v; j += block.size) { // assumption
v[block.rank] = v_in[j+block.rank];
__synch();
for(int k = 0; k < block.size; ++k)
sum += (w*v[k])>>1; //
__synch(); // ouch
}
w_out[i] = sum; // ditto
}
}

Related

How to draw new graph with new point? ROOT Cern

I wanted to ask for a information.
I am studying ROOT.
I have 100 files and each file has 9 columns.
I need to take the x data of the first column and for every other y the relative values.
using the push_back method i create the vectors X and Y[8].
Now to normalize it, I need to take the smallest value among the elements of X and translate all the points and make a graph with all the points translated.
I wanted to ask where I am going wrong.
M= 8 columns
I = 100 file with 9 columns-- 1 for X and 8 for Y — for (int i=0; i<N; i++){ etc…
int npoints = (int)X.size();
for (int k=0; k<M; k++) {
g[i][k]= new TGraph();
g[i][k]->SetNameTitle( Form("graphic_name_%d_%d",i,k), Form("graphic_name_ %d_%d",i,k) );
}
for (int p=0; p<M; p++) {
for( int b=0; b<npoints; b++){
float a=X[0];
double t;
t=b-a;
g[i][p]->SetPoint(b,X[t],Y[p][t]);
}
}
please,can you help me?
thanks

This code snippet might give you some ideas on how to approach your problem:
#define NFILES 100
#define NROWS 20
#define NCOLUMNS 9
void graphs()
{
// Create some fake data
float data[NFILES][NROWS][NCOLUMNS];
for (int k=0; k<NFILES; k++) {
for (int i=0; i<NROWS; i++) {
for (int j=0; j<NCOLUMNS; j++) {
data[k][i][j] = k*10+i*2.5+j*0.01;
}
}
}
// Allocate graphs
TGraph* graphs[NFILES][NCOLUMNS-1];
for (int k=0; k<NFILES; k++) {
for (int j=0; j<NCOLUMNS-1; j++) {
graphs[k][j]= new TGraph();
graphs[k][j]->SetNameTitle( Form("graphic_name_%d_%d",k,j), Form("graphic_name_ %d_%d",k,j) );
}
}
// Fill graphs
for (int k=0; k<NFILES; k++) {
const float a = data[k][0][0];
for (int j=0; j<NCOLUMNS-1; j++) {
for( int i=0; i<NROWS; i++) {
graphs[k][j]->SetPoint(i,data[k][i][0]-a,data[k][i][j+1]);
}
}
}
}

Threads indexing out of bounds in CUDA kernel

I am running a CUDA kernel which seems to be indexing out of bounds and I can not figure out why. I get error 8 write-of-size in cuda-memcheck.
I have tried to change the number of blocks and the number of threads in each block as well as only running a fraction of all iterations needed. Here is some usefull information as well as a replicable example which gives the error:
blockSize: 128
numBlocks: 512
Nvidia GTX 970
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iterator>
#include <cuda_profiler_api.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <stdio.h>
#include <fstream>
__host__
int NchooseK(const int &N, const int &K)
{
int result = 1;
for (int i = 1; i <= K; i++)
{
result *= N - (K - i);
result /= i;
}
return result;
}
__host__
inline int get_flatten_size(const unsigned int N){
int sum = 0;
for(int i=1; i<=N ; i++){
sum +=i*NchooseK(N,i);
}
return sum;
}
__host__
std::vector<int> comb(const int &N, const int &K, const int &length)
//void comb(int N, int K, int length)
{
int k;
std::vector<int> vec(K);
std::vector<int> flatten_vec(0);
std::string bitmask(K, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
for (int j = 0; j < length; j++) {
k = 0;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) {
//std::cout << i << " ";
vec[k] = i;
k++;
}
//std::cout << std::endl;
}
std::prev_permutation(bitmask.begin(), bitmask.end());
flatten_vec.insert(flatten_vec.end(), vec.begin(),vec.end());
}
return flatten_vec;
}
__host__
void get_matrix_indices(const unsigned int N, int *sub_col, int *sub_size, int *cumulative_size)
{
int size, itterator = 0;
cumulative_size[0] = 0;
std::vector<int> size_i_columns;
std::vector<int> all_columns(0);
for(int i=1; i<=N; i++){
size = NchooseK(N,i);
size_i_columns = comb(N,i,size);
for(int j=0; j<size; j++){
sub_size[itterator]=i;
cumulative_size[itterator+1]=cumulative_size[itterator]+i;
itterator++;
}
all_columns.insert(all_columns.end(),size_i_columns.begin(),size_i_columns.end());
}
//sub_col = &all_columns[0];
for(int i = 0; i < all_columns.size(); i++) sub_col[i] = all_columns[i];
}
__global__
void comb_ols(const unsigned int M, const unsigned int N, int* sub_col, int *sub_size, int* cumulative_size, const unsigned int numberOfCalculations, const unsigned int max_size){
int size;
int start_index;
int index = blockIdx.x*blockDim.x+threadIdx.x;
int stride = blockDim.x*gridDim.x;
double *sub_matrix = new double[M*(1+max_size)];
for(int i = index; i < numberOfCalculations; i+=stride){
size = sub_size[i];
start_index = cumulative_size[i];
for(int j = 0; j < size; j++){
for(int k = 0; k<M; k++){
sub_matrix[k] = 1;
}
}
}
delete [] sub_matrix;
}
And then we the main function:
int main()
{
int N = 17;
int M = 263;
const unsigned int regressors = N-1;
const unsigned int numberOfCalculations = (int) (exp2((double) regressors) - 1);
const unsigned int size_sub_col = get_flatten_size(regressors);
int blockSize =128;
int numBlocks = (numberOfCalculations + blockSize-1)/blockSize;
std::cout << "\nblockSize :" << blockSize;
std::cout << "\nnumBlocks :" << numBlocks;
std::cout << "\nblockSize*numBlocks :" << blockSize*numBlocks;
std::cout << "\nregressors :" << regressors;
std::cout << "\nNumberOfCalculations :" << numberOfCalculations;
std::cout << "\nsize_sub_col :" << size_sub_col << '\n' ;
int *sub_size, *cumulative_size, *sub_columns;
cudaMallocManaged(&sub_size, numberOfCalculations*sizeof(int));
cudaMallocManaged(&cumulative_size, (numberOfCalculations+1)*sizeof(int));
cudaMallocManaged(&sub_columns, size_sub_col*sizeof(int));
get_matrix_indices(regressors,sub_columns, sub_size, cumulative_size);
const unsigned int max_size = N*M;
cudaProfilerStart();
comb_ols<<<numBlocks, blockSize>>>(M,N,sub_columns, sub_size, cumulative_size, numberOfCalculations, max_size);
cudaProfilerStop();
cudaDeviceSynchronize();
cudaFree(sub_size);
cudaFree(cumulative_size);
cudaFree(sub_columns);
return 0;
}
I fail to see why the threads would try to access illegal memory space. The way I understood is that the matrix sub_matrix will be initilized on each thread once and then the parallel for loop happens. Thus should each thread have the necessary memory space. Am I allocating too much memory on the GPU? How is "new sub_matrix" handled here?

If I read your code correctly, each thread is attempting to allocate M * (1 + M*N) doubles, which is 263 * ( 1 + 263*17) = ‭1,176,136‬ doubles, or 8.97Mb of heap memory per thread. You launch 128 * 512 threads. That would mean you require 588Gb of heap space for the kernel to run successfully.
Clearly your GPU lacks that amount of memory and the out of bounds memory access comes from failures in the new call (which you can check for, BTW).
Might I suggest that something in the size calculations for the heap memory you require is wrong. Otherwise you have an extremely unrealistic problem for the GPU and will require some other approach.
Note that even if you manage to redesign things to limit the code to a feasible malloc heap memory size, you will still need, in all likelihood, to resize the malloc heap to a suitable size before running the kernel. The cudaDeviceSetLimit API can be used for this.

compare images using systemC

I wrote in this forum asking for help to solve this problem that took ame a lot of my time,i write my first program using systemC, I will expain my aim as much as I can , I stored 2 matrix of pixel value of image in two different text files, I write a systemC code that load two matrix and apply somme of absolute difference, if number of different superior of a Threshold the code displays message (motion).
My code composed of two modules, the first module check if there a number stored in a text file, if yes this Module will automates the other module to load the two matrix and compare them, I really need this code for my project graduation any help or suggestion.
#include "systemC.h"
#include "string.h"
#include "stdio.h"
#include"stdlib.h"
#include <time.h>
#include <math.h> /* fabs */
#include <fstream>
#include <iostream>
#include <fstream>
using namespace std;
#define _CRT_SECURE_NO_WARNINGS
_CRT_SECURE_NO_WARNINGS
double elapsed;
int H = 0;
int D = 0;
int a, b;
int in = false;
int L = 0;
char *mode1 = "r";
char *mode2 = "w";
int i, j, k;
int rows1, cols1, rows2, cols2;
bool fileFound = false;
FILE *SwitchContext;
FILE *image1;
FILE *image2;
FILE *image3;
int sum = 0;
clock_t start = clock();
SC_MODULE(synchronization)
{
sc_in<bool>sig ;
SC_CTOR(synchronization)
{
SC_METHOD(synchroprocess)
}
void synchroprocess()
{
cout << "\n Running Automation";
SwitchContext = fopen("F:/SWITCH CONTEXT.txt", mode2);
fscanf(SwitchContext, "%d", &L);
while (L != 0)
{
cout << "waiting...";
}
sig == true;
}
};
SC_MODULE(imageProcess)
{
sc_in<bool>sig;
SC_CTOR(imageProcess)
{
SC_METHOD(MotionDetector)
sensitive(sig);
}
void MotionDetector()
{
image3 = fopen("F:/image3.txt", mode2);
do
{
char *mode1 = "r";
char *mode2 = "w";
image1 = fopen("F:/image1.txt", mode1);
if (!image1)
{
printf("File Not Found!!\n");
fileFound = true;
}
else
fileFound = false;
}
while (fileFound);
do
{
image2 = fopen("F:/image2.txt", mode1);
if (!image2)
{
printf("File Not Found!!\n");
fileFound = true;
}
else
fileFound = false;
}
while (fileFound);
rows1 = rows2 = 384;
cols1 = cols2 = 512;
int **mat1 = (int **)malloc(rows1 * sizeof(int*));
for (i = 0; i < rows1; i++)
mat1[i] = (int *)malloc(cols1 * sizeof(int));
i = 0;
int **mat2 = (int **)malloc(rows2 * sizeof(int*));
for (i = 0; i < rows2; i++)
mat2[i] = (int *)malloc(cols2 * sizeof(int));
i = 0;
while (!feof(image1))
{
for (i = 0; i < rows1; i++)
{
for (j = 0; j < cols1; j++)
fscanf(image1, "%d%", &mat1[i][j]);
}
}
i = 0;
j = 0;
while (!feof(image2))
{
for (i = 0; i < rows2; i++)
{
for (j = 0; j < cols2; j++)
fscanf(image2, "%d%", &mat2[i][j]);
}
}
i = 0;
j = 0;
printf("\n\n");
for (i = 0; i < rows1; i++)
{
for (j = 0; j < cols1; j++) {
a = abs(mat1[i][j] = mat2[i][j]);
b = b + a;
}
}
i = j = 0;
D = b / 196608;
if (D > 0.9)
{
printf("%d,&K");
printf("MOTION...DETECTED");
getchar();
sc_pause;
for (i = 0; i < rows1; i++) {
for (j = 0; j < cols1; j++)
{
fprintf(image3, "%d ", mat2[i][j]);
}
fprintf(image3, "\n");
}
printf("\n Image Saved....");
std::ofstream mon_fichier("F:\toto.txt");
mon_fichier << elapsed << '\n';
}
fclose(image1);
fclose(image2);
fclose(image3);
clock_t end = clock();
elapsed = ((double)end - start) / CLOCKS_PER_SEC;
printf("time is %f", elapsed);
}
};
int sc_main(int argc, char* argv[])
{
imageProcess master("EE2");
master.MotionDetector();
sc_start();
return(0);
}

What you did is basically wrong.
You copy pasted code to SC_MODULE, this code is simple C code
(Do not mix C and C++ files)
This is not how you use clock
What you should do:
You need to check if your algorithm works, for this you do not need SystemC at all
Then you can replace data types with HW one and check if it still works
Then you have to find which data interface is used in HW and how to use this interface
Then you have to tweak your alg. to work with this interface (There you can use SC_MODULE, sc ports etc...)
Also take look at SC_CTHREAD, you will need it.
Without any informations about target platform I can not provide any other help.

TLE in Foe Pairs (Educational Codeforces Round 10)

On implementing O(N+M) complexity code for Foe Pairs problem
http://codeforces.com/contest/652/problem/C, I am getting TLE in Test Case 12.
Constraint : (1 ≤ N, M ≤ 3·105)
I am not getting, why for this constraint O(N+M) is getting TLE.
Here, is the code
#include<iostream>
#include<vector>
using namespace std;
int main()
{
int n,m;
cin>>n>>m;
std::vector<int> v(n+1);
for (int i = 0; i < n; ++i)
{
int x;
cin>>x;
v[x] = i;
}
std::vector<int> dp(n,0);
for (int i = 0; i < m; ++i)
{
int a,b;
cin>>a>>b;
if(v[a]>v[b])
swap(a,b);
dp[v[b]] = max(dp[v[b]], v[a]+1);
}
for (int i = 1; i < n; ++i)
{
dp[i] = max(dp[i], dp[i-1]);
}
long long s = 0;
for (int i = 0; i < n; ++i)
{
s+=(i+1-dp[i]);
}
cout<<s;
}
Is there anything, I am missing?

I changed all cin to scanf, it passed all test cases : http://codeforces.com/contest/652/submission/17014495
#include<cstdio>
#include<iostream>
#include<vector>
using namespace std;
int main()
{
int n,m;
scanf("%d%d", &n, &m);
//cin>>n>>m;
std::vector<int> v(n+1);
for (int i = 0; i < n; ++i)
{
int x;
//cin>>x;
scanf("%d", &x);
v[x] = i;
}
std::vector<int> dp(n,0);
for (int i = 0; i < m; ++i)
{
int a,b;
//cin>>a>>b;
scanf("%d%d", &a, &b);
if(v[a]>v[b])
swap(a,b);
dp[v[b]] = max(dp[v[b]], v[a]+1);
}
for (int i = 1; i < n; ++i)
{
dp[i] = max(dp[i], dp[i-1]);
}
long long s = 0;
for (int i = 0; i < n; ++i)
{
s+=(i+1-dp[i]);
}
cout<<s;
return 0;
}
You should always try to use scanf when the amount of input is large as it is faster.
You can read more about scanf being faster here : Using scanf() in C++ programs is faster than using cin?

Factorial in bignum library

Ive tried to create my own implementation of a bignum library
I cant seem to get the factorial to work. If I ask it to solve 4!, it gives out 96. It multiplies 4 twice. similarly, 5! is 600, not 120. I haven't implemented division, so I cant/dont want to divide the answer by the number
//bignum project
#include <iostream>
using namespace std;
class bignum
{
public:
int number[100];
int dpos;
int operator/ (bignum);
bignum operator- (bignum);
bignum operator* (bignum);
bignum operator+ (bignum);
bignum operator= (string);
void output()
{
int begin=0;
for(int i=0; i<=99; i++)
{
if(number[i]!=0 || begin==1)
{
cout<<number[i];
begin=1;
}
}
}
};
bool num_is_zero(bignum k)
{
for(int a=0; a<=99; a++)
{
if(k.number[a]!=0)
{
return false;
}
}
return true;
}
bignum factorial(bignum a)
{
bignum j;
bignum fact;
fact="1";
while(!num_is_zero(a))
{
j="1";
fact=fact*a;
a=a-j;
}
return fact;
}
bignum bignum::operator= (string k)
{
int l;
l=k.length()-1;
for(int h=0; h<=99; h++)
{
number[h]=0;
}
for(int a=99; a>=0 && l>=0; a--)
{
number[a]=k[l]-'0';
l--;
}
}
bignum bignum::operator+ (bignum b)
{
bignum a;
int carry=0;
for(int k=0; k<=99; k++)
{
a.number[k]=0;
}
for(int i=99; i>=0; i--)
{
a.number[i]= number[i]+b.number[i]+a.number[i];
if(a.number[i]>9)
{
carry=(a.number[i]/10);
a.number[i-1]+=carry;
a.number[i]=(a.number[i]%10);
}
}
return (a);
}
bignum bignum::operator- (bignum c)
{
bignum a;
int sign=0;
for(int k=0; k<=99; k++)
{
a.number[k]=0;
}
for(int i=99; i>=0; i--)
{
if(number[i]<c.number[i])
{
number[i]+=10;
if(i!=0)
number[i-1]--;
}
a.number[i]=number[i]-c.number[i];
}
return (a);
}
bignum bignum::operator* (bignum b)
{
bignum ans;
int ans_grid[100][100],x,lines=0,carry,sum[100];
for(int a=0; a<=99; a++)
{
for(int b=0; b<=99; b++)
{
ans_grid[a][b]=0;
}
}
for(int i=99; i>=0; i--)
{
for(int j=i,x=99; j>=0; j--,x--)
{
ans_grid[lines][j]=(number[i]*b.number[x]);
}
lines++;
}
//------------------------------------------------Carry Forward and assign to ans------------------------------------------------//
for(int j=99; j>=0; j--)
{
for(int i=99; i>=0; i--)
{
if(ans_grid[j][i]>9 && i!=0)
{
carry=(ans_grid[j][i]/10);
ans_grid[j][i-1]+=carry;
ans_grid[j][i]%=10;
}
}
}
for(int col=99; col>=0; col--)
{
for(int row=99; row>=0; row--)
{
sum[col]+=ans_grid[row][col];
}
}
for(int i=99; i>=0; i--)
{
if(sum[i]>9 && i!=0)
{
carry=(sum[i]/10);
sum[i-1]+=carry;
sum[i]%=10;
}
}
for(int l=0; l<=99; l++)
ans.number[l]=sum[l];
//-------------------------------------------------------------------------------------------------------------------------------//
return (ans);
}

I think your problem is that sum is uninitialized in your operator*. I'm reluctant to give a partial answer, since I took the code and fiddled with it a bit, so here's my version which appears to work (and also prints "0" correctly):
Update: I couldn't resist making several structural improvements. I didn't touch your multiplication routine, so you should still be able to extract the bugfix even if you don't care for the rest.
#include <iostream>
#include <string>
#include <algorithm>
using namespace std;
class bignum
{
public:
int number[100];
bignum() { std::fill(number, number + 100, 0); }
bignum(const bignum & other) { std::copy(other.number, other.number + 100, number); }
bignum(const std::string &);
bignum & operator-=(const bignum &);
inline bignum operator-(const bignum & c) const { return bignum(*this) -= c; }
bignum operator*(const bignum &) const;
inline bignum & operator= (const std::string & k) { return *this = bignum(k); }
inline operator bool() const
{
for (size_t a = 0; a < 100; ++a)
if (number[a] != 0) return true;
return false;
}
};
std::ostream & operator<<(std::ostream & o, const bignum & b)
{
bool begun = false;
for (size_t i = 0; i < 100; ++i)
{
if (begun || b.number[i] != 0)
{
cout << b.number[i];
begun = true;
}
}
if (!begun) o << "0";
return o;
}
bignum::bignum(const std::string & k)
{
std::fill(number, number + 100, 0);
for(size_t h = 0; h < std::min(k.length(), 100U); ++h)
number[99 - h] = k[k.length() - 1 - h] - '0';
}
bignum & bignum::operator-=(const bignum & c)
{
for (int i = 99; i >= 0; --i)
{
if (number[i] < c.number[i])
{
number[i] += 10;
if (i != 0) --number[i-1];
}
number[i] -= c.number[i];
}
return *this;
}
bignum bignum::operator*(const bignum & b) const
{
bignum ans;
int ans_grid[100][100], lines = 0, carry, sum[100];
std::fill(sum, sum + 100, 0);
for (size_t i = 0; i < 100; ++i) std::fill(ans_grid[i], ans_grid[i] + 100, 0);
for(int i=99; i>=0; i--)
{
for(int j=i,x=99; j>=0; j--,x--)
{
ans_grid[lines][j]=(number[i]*b.number[x]);
}
lines++;
}
//------------------------------------------------Carry Forward and assign to ans------------------------------------------------//
for(int j=99; j>=0; j--)
{
for(int i=99; i>=0; i--)
{
if(ans_grid[j][i]>9 && i!=0)
{
carry=(ans_grid[j][i]/10);
ans_grid[j][i-1]+=carry;
ans_grid[j][i]%=10;
}
}
}
for(int col=99; col>=0; col--)
{
for(int row=99; row>=0; row--)
{
sum[col]+=ans_grid[row][col];
}
}
for(int i=99; i>=0; i--)
{
if(sum[i]>9 && i!=0)
{
carry=(sum[i]/10);
sum[i-1]+=carry;
sum[i]%=10;
}
}
for(int l=0; l<=99; l++)
ans.number[l]=sum[l];
//-------------------------------------------------------------------------------------------------------------------------------//
return (ans);
}
bignum factorial(bignum a)
{
bignum j; j = "1";
bignum fact; fact="1";
while(a)
{
fact = fact * a;
a = a-j;
}
return fact;
}
int main(int argc, char * argv[])
{
if (argc < 2) return 0;
bignum a;
a = std::string(argv[1]);
bignum b = factorial(a);
cout << a << std::endl << b << std::endl;
}
The string assignment is still broken for strings with more than one digit, but that wasn't your question I suppose...

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

CUDA optimization question - optimization

Related

How to draw new graph with new point? ROOT Cern

Threads indexing out of bounds in CUDA kernel

compare images using systemC

TLE in Foe Pairs (Educational Codeforces Round 10)

Factorial in bignum library

Categories

Resources