CUDA-transfer 2D array from host to device - gpu

I have a 2D matrix in the main. I want to transfer if from host to device. Can you tell me how I can allocate memory for it and transfer it to the device memory?
#define N 5
__global__ void kernel(int a[N][N]){
}
int main(void){
int a[N][N];
cudaMalloc(?);
cudaMemcpy(?);
kernel<<<N,N>>>(?);
}

Perhaps something like this is what you really had in mind:
#define N 5
__global__ void kernel(int *a)
{
// Thread indexing within Grid - note these are
// in column major order.
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int tidy = threadIdx.y + blockIdx.y * blockDim.y;
// a_ij = a[i][j], where a is in row major order
int a_ij = a[tidy + tidx*N];
}
int main(void)
{
int a[N][N], *a_device;
const size_t a_size = sizeof(int) * size_t(N*N);
cudaMalloc((void **)&a_device, a_size);
cudaMemcpy(a_device, a, a_size, cudaMemcpyHostToDevice);
kernel<<<N,N>>>(a_device);
}
The point you might have missed is that when you statically declare an array like this A[N][N], it is really just a row major ordered piece of linear memory. The compiler is automatically converting between a[i][j] and a[j + i*N] when it emits code. On the GPU, you must use the second form of access to read the memory you copy from the host.

Related

Addressing pins of Register in microcontrollers

I'm working on Keil software and using LM3S316 microcontroller. Usually we address registers in microcontrollers in form of:
#define GPIO_PORTC_DATA_R (*((volatile uint32_t *)0x400063FC))
My question is how can I access to single pin of register for example, if I have this method:
char process_key(int a)
{ PC_0 = a ;}
How can I get PC_0 and how to define it?
Thank you
Given say:
#define PIN0 (1u<<0)
#define PIN1 (1u<<1)
#define PIN2 (1u<<2)
// etc...
Then:
char process_key(int a)
{
if( a != 0 )
{
// Set bit
GPIO_PORTC_DATA_R |= PIN0 ;
}
else
{
// Clear bit
GPIO_PORTC_DATA_R &= ~PIN0 ;
}
}
A generalisation of this idiomatic technique is presented at How do you set, clear, and toggle a single bit?
However the read-modify-write implied by |= / &= can be problematic if the register might be accessed in different thread/interrupt contexts, as well as adding a possibly undesirable overhead. Cortex-M3/4 parts have a feature known as bit-banding that allows individual bits to be addressed directly and atomically. Given:
volatile uint32_t* getBitBandAddress( volatile const void* address, int bit )
{
__IO uint32_t* bit_address = 0;
uint32_t addr = reinterpret_cast<uint32_t>(address);
// This bit maniplation makes the function valid for RAM
// and Peripheral bitband regions
uint32_t word_band_base = addr & 0xf0000000u;
uint32_t bit_band_base = word_band_base | 0x02000000u;
uint32_t offset = addr - word_band_base;
// Calculate bit band address
bit_address = reinterpret_cast<__IO uint32_t*>(bit_band_base + (offset * 32u) + (static_cast<uint32_t>(bit) * 4u));
return bit_address ;
}
Then you can have:
char process_key(int a)
{
static volatile uint32_t* PC0_BB_ADDR = getBitBandAddress( &GPIO_PORTC_DATA_R, 0 ) ;
*PC0_BB_ADDR = a ;
}
You could of course determine and hard-code the bit-band address; for example:
#define PC0 (*((volatile uint32_t *)0x420C7F88u))
Then:
char process_key(int a)
{
PC0 = a ;
}
Details of the bit-band address calculation can be found ARM Cortex-M Technical Reference Manual, and there is an on-line calculator here.

`const int* const int` initialisation with function

I want to define a constant array of constants at every MPI node using C++03. M_chunk_sizes defines the size of matrix that will be passed to other nodes and won't be changed during the runtime.
int* define_chunk_sizes( int S, int world) {
int out[world];
double quotient = static_cast<double> (S) / world;
int maj = ceil(quotient);
for (int i =0; i < world - 1; i++)
out[i] = maj;
out[world-1] = maj + (S - maj*world);
return out;
}
int main() {
const int M = 999; // rows
int world_size = 4;
const int* const M_chunk_sizes = define_chunk_sizes(M, world_size);
}
But i get a warning: address of stack memory associated with local variable 'out' returned [-Wreturn-stack-address]
return out;.
What is the right way of doing this?
funciton local variables(stack varibales) will go out of scope and life once function returns.
you have use dynamic memory management operators, so allocate memory to out using
new
and relase memory using
delete
once you done with it.

cudaMallocHost vs malloc for better performance shows no difference

I have gone through this site. From here I got that pinned memory using cudamallocHost gives better performance than cudamalloc. Then I use two different simple program and tested the execution time as
using cudaMallocHost
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
}
// main routine that executes on the host
int main(void)
{
clock_t start;
start=clock();/* Line 8 */
clock_t finish;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 100000; // Number of elements in arrays
size_t size = N * sizeof(float);
cudaMallocHost((void **) &a_h, size);
//a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
cudaFreeHost(a_h);
cudaFree(a_d);
finish = clock() - start;
double interval = finish / (double)CLOCKS_PER_SEC;
printf("%f seconds elapsed", interval);
}
using malloc
#include <stdio.h>
#include <cuda.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
}
// main routine that executes on the host
int main(void)
{
clock_t start;
start=clock();/* Line 8 */
clock_t finish;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 100000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
finish = clock() - start;
double interval = finish / (double)CLOCKS_PER_SEC;
printf("%f seconds elapsed", interval);
}
here during execution of both program, the execution time was almost similar.
Is there anything wrong in the implementation?? what is the exact difference in execution in cudamalloc and cudamallochost??
and also with each run the execution time decreases
If you want to see the difference in execution time for the copy operation, just time the copy operation. In many cases you will see approximately a 2x difference in execution time for just the copy operation when the underlying mememory is pinned. And make your copy operation large enough/long enough so that you are well above the granularity of whatever timing mechanism you are using. The various profilers such as the visual profiler and nvprof can help here.
The cudaMallocHost operation under the hood is doing something like a malloc plus additional OS functions to "pin" each page associated with the allocation. These additional OS operations take extra time, as compared to just doing a malloc. And note that as the size of the allocation increases, the registration ("pinning") cost will generally increase as well.
Therefore, for many examples, just timing the overall execution doesn't show much difference, because while the cudaMemcpy operation may be quicker from pinned memory, the cudaMallocHost takes longer than the corresponding malloc.
So what's the point?
You may be interested in using pinned memory (i.e. cudaMallocHost) when you will be doing repeated transfers from a single buffer. You only pay the extra cost to pin it once, but you benefit on each transfer/usage.
Pinned memory is required to overlap a data transfer operations (cudaMemcpyAsync) with compute activities (kernel calls). Refer to the programming guide.
I too found that just declaring cudaHostAlloc / cudaMallocHost on a piece of memory doesn't do much.
To be sure, do a nvprof with --print-gpu-trace and see whether the throughput for memcpyHtoD or memcpyDtoH is good. For PCI2.0, you should get around 6-8gbps.
However, pinned memory is a perquisite for cudaMemcpyAsync.
After I called cudaMemcpyAsync, I shifted whatever computations I had on the host right after it. In this way you can "layer" the asynchronous memcpys with the host computations.
I was surprised that I was able to save quite a lot of time this way, it's worth a try.

CUDA Crashes for big data set

My computer crashes (I have to manually reset it) when I run my kernel function in a loop for 600+ times (it would not crash if it were like 50 times or so), and I'm not sure what's causing the crash.
My main is as follows:
int main()
{
int *seam = new int [image->height];
int width = image->width;
int height = image->height;
int *fMC = (int*)malloc(width*height*sizeof(int*));
int *fNew = (int*)malloc(width*height*sizeof(int*));
for(int i=0;i<numOfSeams;i++)
{
seam = cpufindSeamV2(fMC,width,height,1);
fMC = kernel_shiftSeam(fMC,fNew,seam,width,height,nWidth,1);
for(int k=0;k<height;k++)
{
fMC[(nWidth-1)+width*k] = INT_MAX;
}
}
and my kernel is :
int* kernel_shiftSeam(int *MCEnergyMat, int *newE, int *seam, int width, int height, int x, int direction)
{
//time measurement
float elapsed_time_ms = 0;
cudaEvent_t start, stop; //threads per block
dim3 threads(16,16);
//blocks
dim3 blocks((width+threads.x-1)/threads.x, (height+threads.y-1)/threads.y);
//MCEnergy and Seam arrays on device
int *device_MC, *device_new, *device_Seam;
//MCEnergy and Seam arrays on host
int *host_MC, *host_new, *host_Seam;
//total number of bytes in array
int size = width*height*sizeof(int);
int seamSize;
if(direction == 1)
{
seamSize = height*sizeof(int);
host_Seam = (int*)malloc(seamSize);
for(int i=0;i<height;i++)
host_Seam[i] = seam[i];
}
else
{
seamSize = width*sizeof(int);
host_Seam = (int*)malloc(seamSize);
for(int i=0;i<width;i++)
host_Seam[i] = seam[i];
}
cudaMallocHost((void**)&host_MC, size );
cudaMallocHost((void**)&host_new, size );
host_MC = MCEnergyMat;
host_new = newE;
//allocate 1D flat array on device
cudaMalloc((void**)&device_MC, size);
cudaMalloc((void**)&device_new, size);
cudaMalloc((void**)&device_Seam, seamSize);
//copy host array to device
cudaMemcpy(device_MC, host_MC, size, cudaMemcpyHostToDevice);
cudaMemcpy(device_new, host_new, size, cudaMemcpyHostToDevice);
cudaMemcpy(device_Seam, host_Seam, seamSize, cudaMemcpyHostToDevice);
//measure start time for cpu calculations
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//perform gpu calculations
if(direction == 1)
{
gpu_shiftSeam<<< blocks,threads >>>(device_MC, device_new, device_Seam, width, height, x);
}
//measure end time for cpu calcuations
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed_time_ms, start, stop );
execTime += elapsed_time_ms;
//copy out the results back to host
cudaMemcpy(newE, device_new, size, cudaMemcpyDeviceToHost);
//free memory
free(host_Seam);
cudaFree(host_MC); cudaFree(host_new);
cudaFree(device_MC); cudaFree(device_new); cudaFree(device_Seam);
//destroy event objects
cudaEventDestroy(start); cudaEventDestroy(stop);
return newE;
}
So, my program crashes when I call "kernel_shiftSeam" for many times, I also freed the memory using cudaFree so I don't know whether or not its a memory leak problem. It would be great if someone can point me in the right direction.
Could be heap problems. Try reordering the cudaFree statements in your kernel to be LIFO. Check release notes for any newer CUDA drivers that contain heap/leak fixes. On windows try installing process explorer 15.12 or newer as it shows GPU memory usage - and a leaky heap is easy to spot.

CUDA program causes nvidia driver to crash

My monte carlo pi calculation CUDA program is causing my nvidia driver to crash when I exceed around 500 trials and 256 full blocks. It seems to be happening in the monteCarlo kernel function.Any help is appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#define NUM_THREAD 256
#define NUM_BLOCK 256
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
// Function to sum an array
__global__ void reduce0(float *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_odata[i];
__syncthreads();
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (2*s) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
__global__ void monteCarlo(float *g_odata, int trials, curandState *states){
// unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int incircle, k;
float x, y, z;
incircle = 0;
curand_init(1234, i, 0, &states[i]);
for(k = 0; k < trials; k++){
x = curand_uniform(&states[i]);
y = curand_uniform(&states[i]);
z =(x*x + y*y);
if (z <= 1.0f) incircle++;
}
__syncthreads();
g_odata[i] = incircle;
}
///////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////
int main() {
float* solution = (float*)calloc(100, sizeof(float));
float *sumDev, *sumHost, total;
const char *error;
int trials;
curandState *devStates;
trials = 500;
total = trials*NUM_THREAD*NUM_BLOCK;
dim3 dimGrid(NUM_BLOCK,1,1); // Grid dimensions
dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions
size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
sumHost = (float*)calloc(NUM_BLOCK*NUM_THREAD, sizeof(float));
cudaMalloc((void **) &sumDev, size); // Allocate array on device
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
cudaMalloc((void **) &devStates, (NUM_THREAD*NUM_BLOCK)*sizeof(curandState));
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
// Do calculation on device by calling CUDA kernel
monteCarlo <<<dimGrid, dimBlock>>> (sumDev, trials, devStates);
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
// call reduction function to sum
reduce0 <<<dimGrid, dimBlock, (NUM_THREAD*sizeof(float))>>> (sumDev);
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
dim3 dimGrid1(1,1,1);
dim3 dimBlock1(256,1,1);
reduce0 <<<dimGrid1, dimBlock1, (NUM_THREAD*sizeof(float))>>> (sumDev);
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
// Retrieve result from device and store it in host array
cudaMemcpy(sumHost, sumDev, sizeof(float), cudaMemcpyDeviceToHost);
error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
*solution = 4*(sumHost[0]/total);
printf("%.*f\n", 1000, *solution);
free (solution);
free(sumHost);
cudaFree(sumDev);
cudaFree(devStates);
//*solution = NULL;
return 0;
}
If smaller numbers of trials work correctly, and if you are running on MS Windows without the NVIDIA Tesla Compute Cluster (TCC) driver and/or the GPU you are using is attached to a display, then you are probably exceeding the operating system's "watchdog" timeout. If the kernel occupies the display device (or any GPU on Windows without TCC) for too long, the OS will kill the kernel so that the system does not become non-interactive.
The solution is to run on a non-display-attached GPU and if you are on Windows, use the TCC driver. Otherwise, you will need to reduce the number of trials in your kernel and run the kernel multiple times to compute the number of trials you need.
EDIT: According to the CUDA 4.0 curand docs(page 15, "Performance Notes"), you can improve performance by copying the state for a generator to local storage inside your kernel, then storing the state back (if you need it again) when you are finished:
curandState state = states[i];
for(k = 0; k < trials; k++){
x = curand_uniform(&state);
y = curand_uniform(&state);
z =(x*x + y*y);
if (z <= 1.0f) incircle++;
}
Next, it mentions that setup is expensive, and suggests that you move curand_init into a separate kernel. This may help keep the cost of your MC kernel down so you don't run up against the watchdog.
I recommend reading that section of the docs, there are several useful guidelines.
For those of you having a geforce GPU which does not support TCC driver there is another solution based on:
http://msdn.microsoft.com/en-us/library/windows/hardware/ff569918(v=vs.85).aspx
start regedit,
navigate to HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\GraphicsDrivers
create new DWORD key called TdrLevel, set value to 0,
restart PC.
Now your long-running kernels should not be terminated. This answer is based on:
Modifying registry to increase GPU timeout, windows 7
I just thought it might be useful to provide the solution here as well.