cooperative_groups::this_grid() causes any CUDA API call to return 'unknown error' - cmake

Following the same steps as the CUDA samples to launch a kernel and sync across the grid using cooperative_groups::this_grid().sync() causes every subsequent CUDA API call to fail, while using
cooperative_groups::this_thread_block().sync() works fine and gives correct results.
I used the following code and CMakeLists.txt (cmake version 3.11.1) to test it with CUDA 10 on a TITAN V GPU (driver version 410.73) under Ubuntu 16.04.5 LTS. The code is also available on GitHub to make it easy to reproduce the error.
The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]): each thread reads a single element from the array, syncs, and then writes its element to the right destination. The code can easily be modified to verify that this_thread_block().sync() works fine: simply change arr_size to be less than 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead.
test_cg.cu
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <cstdint>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
    //Error-handling macro; wrap it around CUDA calls whenever possible
    if (err != cudaSuccess) {
        printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
#ifdef _WIN32
        system("pause");
#else
        exit(EXIT_FAILURE);
#endif
    }
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************
//********************** cg kernel
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
                                     uint32_t *d_arr){
    uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;
    if (tid < num_elements){
        uint32_t my_element = d_arr[tid];
        //to sync across the whole grid
        cg::grid_group barrier = cg::this_grid();
        //to sync within a single block
        //cg::thread_block barrier = cg::this_thread_block();
        //wait for all reads
        barrier.sync();
        uint32_t tar_id = num_elements - tid - 1;
        d_arr[tar_id] = my_element;
    }
}
//******************************************************************************
//********************** execute
void execute_test(const int sm_count){
    //host array
    const uint32_t arr_size = 1 << 20; //1M
    uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
    //fill with sequential numbers
    std::iota(h_arr, h_arr + arr_size, 0);
    //device array
    uint32_t* d_arr;
    CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
    CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
                          cudaMemcpyHostToDevice));
    //launch config
    const int threads = 512;
    //following the same steps done in the conjugateGradientMultiBlockCG.cu
    //cuda sample to launch a kernel that syncs across the grid
    //https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436
    int num_blocks_per_sm = 0;
    CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
        (void*)testing_cg_grid_sync, threads, 0));
    dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);
    if(arr_size > grid_dim.x*block_dim.x){
        printf("\n The grid size (numBlocks*numThreads) is less than the array size.\n");
        exit(EXIT_FAILURE);
    }
    printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
           block_dim.x);
    //arguments passed to the kernel
    void *kernel_args[] = {
        (void *)&arr_size,
        (void *)&d_arr, };
    //finally launch the kernel
    cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
                                grid_dim, block_dim, kernel_args);
    //make sure everything went okay
    CUDA_ERROR(cudaGetLastError());
    CUDA_ERROR(cudaDeviceSynchronize());
    //get results on the host
    CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
                          cudaMemcpyDeviceToHost));
    //validate
    for (uint32_t i = 0; i < arr_size; i++){
        if (h_arr[i] != arr_size - i - 1){
            printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
            exit(EXIT_FAILURE);
        }
    }
}
//******************************************************************************
int main(int argc, char**argv) {
    //set to Titan V
    uint32_t device_id = 0;
    cudaSetDevice(device_id);
    //get sm count
    cudaDeviceProp devProp;
    CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
    int sm_count = devProp.multiProcessorCount;
    //execute
    execute_test(sm_count);
    printf("\n Mission accomplished \n");
    return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
set(CMAKE_BUILD_TYPE Release)
endif ()
SET(CUDA_SEPARABLE_COMPILATION ON)
########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################
########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################
Running this code gives:
unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67
This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device; the result is 700. Kindly let me know if you spot me doing something wrong in the code or the CMakeLists.txt file.
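For reference, a minimal sketch of that device-side architecture check (illustrative, not part of the test code above):
__global__ void report_cuda_arch(){
#if defined(__CUDA_ARCH__)
    //prints 700 on this setup
    printf("__CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#endif
}
//launched as report_cuda_arch<<<1, 1>>>(); followed by cudaDeviceSynchronize();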

With external help, the solution that got the code working was to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS...). My first explanation was that I only have libcudadevrt.a under /usr/local/cuda-10.0/lib64/, so I have to tell CUDA to link the shared/dynamic runtime library, since the default is to link the static one. However, string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70") alone turned out to be enough: the actual reason is that the sm_70 flag was not being passed to the linker properly.
Additionally, using only CUDA_NVCC_FLAGS passes the sm_70 info to the compiler but not to the linker, while using only CMAKE_CUDA_FLAGS reports the error: namespace "cooperative_groups" has no member "grid_group".
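As an extra sanity check (a sketch, not part of the fix above), the runtime can be asked whether the device supports cooperative launches at all before calling cudaLaunchCooperativeKernel:
int supports_coop_launch = 0;
CUDA_ERROR(cudaDeviceGetAttribute(&supports_coop_launch,
                                  cudaDevAttrCooperativeLaunch, device_id));
if (supports_coop_launch != 1){
    printf("\n Device does not support cooperative launch\n");
    exit(EXIT_FAILURE);
}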

Related

Building a CUDA program with CMake: undefined reference to __cudaRegisterLinkedBinary...cpp1_ii_main

My CUDA version is 10.1, and my GPU is a T4. My code is like this:
#include <iostream>
#include <algorithm>
#include <random>
#include <vector>
#include <numeric>
#include <chrono>
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <cooperative_groups.h>

using std::cout;
using std::endl;

void sort_2d_by_row();
thrust::device_vector<float> thrust_2d_by_row_even_odd(
    thrust::device_vector<float>&, int, int);
__global__ void even_odd_kernel(float *ptr, int M, int N);

int main() {
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1UL << 32);
    if (err) cout << "errors occur\n";
    sort_2d_by_row();
    return 0;
}
void sort_2d_by_row() {
    std::random_device rd;
    std::mt19937 engine;
    engine.seed(rd());
    std::uniform_real_distribution<float> u(0, 90.);
    int M = 19;
    int N = 8 * 768 * 768;
    /* int N = 10; */
    std::vector<float> v(M * N);
    std::generate(v.begin(), v.end(), [&](){return u(engine);});
    thrust::host_vector<float> hv(v.begin(), v.end());
    thrust::device_vector<float> dv = hv;
    thrust::device_vector<float> res_even_odd = thrust_2d_by_row_even_odd(dv, M, N);
}
thrust::device_vector<float> thrust_2d_by_row_even_odd(
        thrust::device_vector<float>& v, int M, int N) {
    thrust::device_vector<float> res(v.begin(), v.end());
    thrust::device_vector<int> index(M);
    thrust::sequence(thrust::device, index.begin(), index.end(), 0, 1);
    int blocky = 1;
    while (blocky < M) blocky *= 2;
    blocky /= 2;
    int blockx = 1;
    while (blockx < (N / 2) && blockx < 1024) blockx *= 2;
    blockx /= 2;
    int gridx = std::min(4096, N / blockx / 2);
    dim3 block(blockx, blocky);
    dim3 grid(gridx);
    even_odd_kernel<<<grid, block, 0>>>(
        thrust::raw_pointer_cast(&res[0]), M, N);
    cudaDeviceSynchronize();
    return res;
}
// descending
__global__ void even_odd_kernel(float *ptr, int M, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int m = threadIdx.y;
    int tstride = blockDim.x * gridDim.x * 2;
    cooperative_groups::grid_group g = cooperative_groups::this_grid();
    g.sync();
}
And CMakeLists.txt is like this:
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(cuda)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
set(CMAKE_CXX_FLAGS "-std=c++14 -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0")
set(CMAKE_CXX_FLAGS_RELEASE "-O2")
set(CUDA_NVCC_FLAGS "-std=c++14 -arch=sm_60 -Xptxas=-v -rdc=true")
set(CUDA_NVCC_FLAGS_DEBUG "-G -O0")
set(CUDA_NVCC_FLAGS_RELEASE "-O2")
set(CUDA_CUDA_FLAGS "-gencode arch=compute_70,code=sm_70 -rdc=true")
message (${CMAKE_BUILD_TYPE})
find_package(CUDA REQUIRED)
cuda_add_executable(sort sort.cu)
target_include_directories(
sort PUBLIC ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
target_link_libraries(
sort ${CUDA_LIBRARIES})
The error message is:
CMakeFiles/sort.dir/sort_generated_sort.cu.o: In function
`__sti____cudaRegisterAll()':
tmpxft_0004cd04_00000000-5_sort.cudafe1.cpp:(.text.startup+0x15):
undefined reference to
`__cudaRegisterLinkedBinary_39_tmpxft_0004cd04_00000000_6_sort_cpp1_ii_main'
collect2: error: ld returned 1 exit status
CMakeFiles/sort.dir/build.make:963: recipe for target 'sort' failed
How could I make it work, please? Besides, does g.sync() do big harm to the program's performance, or is the impact trivial?
The cooperative groups are not the issue, IMHO; that is just something requiring a recent version of CUDA. As for your linking trouble - I think it must be some sort of flag mess. I'll suggest an alternative CMakeLists.txt, which itself is not perfect, but is more appropriate for CMake versions of recent years. It also has a bunch of suggestions for you in comments:
cmake_minimum_required(VERSION 3.8.2)
# If you want to properly search for Thrust, you'll need a FindThrust.cmake
# script, which constitutes a "CMake module". You place it under cmake/Modules
# in your source directory and make it available by uncommenting the following
# line:
#list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
project(sort-with-cuda
DESCRIPTION "My project description here"
LANGUAGES CXX CUDA)
# Don't do this. Set your build type explicitly, once; and then it's
# cached and you don't have to worry about it when you run make.
#
#if (NOT CMAKE_BUILD_TYPE)
# set(CMAKE_BUILD_TYPE Release)
#endif ()
# In the future, this should not be necessary, but we need it for
# cuda_select_nvcc_arch_flags
include(FindCUDA)
# This will set the appropriate gencode parameters for the hardware
# on your system (although you could always force it manually)
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS_TMP Auto)
set(CUDA_ARCH_FLAGS ${CUDA_ARCH_FLAGS_TMP} CACHE STRING "CUDA -gencode parameters")
string(REPLACE ";" " " CUDA_ARCH_FLAGS_STR "${CUDA_ARCH_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS_STR}")
# The above may produce something like:
#
# -gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_70,code=compute_70;-gencode;arch=compute_75,code=compute_75
#
# But it may include older micro-architectures which have been
# deprecated/removed, in which case you'll need to edit that
# with ccmake and only keep what you need.
add_executable(sort-with-cuda sort.cu)
set_target_properties(
sort-with-cuda
PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS NO
)
# Note: I haven't added flags for compiling with warnings
# Thrust is very finicky: It provides a configuration script, but
# only for CMake >= 3.15. And - it doesn't provide a FindThrust.cmake
# script itself with targets appropriate for CMake >= 3.
#
# See https://github.com/NVIDIA/thrust/blob/main/thrust/cmake/README.md
#
# With CMake 3.15 or later you can enable the following two lines:
#
#find_package(Thrust REQUIRED CONFIG)
#thrust_create_target(Thrust)
#target_link_libraries(sort-with-cuda Thrust)
#
# With earlier CMake versions, get yourself a proper FindThrust.cmake
# script (which creates a Thrust::Thrust target I suppose) and
# then uncomment the following two lines:
#
#find_package(Thrust REQUIRED)
#target_link_libraries(sort-with-cuda Thrust::Thrust)
# The following sets -rdc=true , but you don't actually need that for your example
set_target_properties(sort-with-cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

problem with sprintf/printf with FreeRTOS on stm32f7

For two days I have been trying to get printf/sprintf working in my project...
MCU: STM32F722RETx
I tried newlib, heap_3, heap_4, etc., etc. - nothing works; HardFault_Handler runs every time.
Now I am trying the simple implementation from this link and still have the same problem. I suspect my device has some problem with double numbers, because the program enters HardFault_Handler from the line if (value != value) (a NaN check) in the _ftoa function - which is strange, because this STM32 supports an FPU.
Do you guys have any idea? (I am currently using heap_4.c)
My compiler options:
target_compile_options(${PROJ_NAME} PUBLIC
$<$<COMPILE_LANGUAGE:CXX>:
-std=c++14
>
-mcpu=cortex-m7
-mthumb
-mfpu=fpv5-d16
-mfloat-abi=hard
-Wall
-ffunction-sections
-fdata-sections
-O1 -g
-DLV_CONF_INCLUDE_SIMPLE
)
Linker options:
target_link_options(${PROJ_NAME} PUBLIC
${LINKER_OPTION} ${LINKER_SCRIPT}
-mcpu=cortex-m7
-mthumb
-mfloat-abi=hard
-mfpu=fpv5-sp-d16
-specs=nosys.specs
-specs=nano.specs
# -Wl,--wrap,malloc
# -Wl,--wrap,_malloc_r
-u_printf_float
-u_sprintf_float
)
Linker script:
/* Highest address of the user mode stack */
_estack = 0x20040000; /* end of RAM */
/* Generate a link error if heap and stack don't fit into RAM */
_Min_Heap_Size = 0x200; /* required amount of heap */
_Min_Stack_Size = 0x400; /* required amount of stack */
/* Specify the memory areas */
MEMORY
{
RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 256K
FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 512K
}
UPDATE:
I don't think it is a stack problem: I have set configCHECK_FOR_STACK_OVERFLOW to 2, but the hook function is never called. I did find something strange, though. This solution works:
float d = 23.5f;
char buffer[20];
sprintf(buffer, "temp %f", 23.5f);
but this one does not:
float d = 23.5f;
char buffer[20];
sprintf(buffer, "temp %f",d);
No idea why passing the variable by value generates a HardFault_Handler...
You can implement a hard fault handler that at least provides you with the SP location where the issue is occurring. This should give more insight.
https://www.freertos.org/Debugging-Hard-Faults-On-Cortex-M-Microcontrollers.html
It should let you know whether your issue is due to a floating-point error within the MCU or due to a branching error, possibly caused by some linking problem.
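Roughly, the handler described on that page looks like this (a sketch of the pattern, not project-specific code): an assembly stub picks the active stack pointer and hands the stacked register frame to a C function for inspection.
#include <stdint.h>

void prvGetRegistersFromStack(uint32_t *pulFaultStackAddress);

void HardFault_Handler(void)
{
    __asm volatile
    (
        " tst lr, #4                                              \n"
        " ite eq                                                  \n"
        " mrseq r0, msp                                           \n"
        " mrsne r0, psp                                           \n"
        " ldr r1, [r0, #24]                                       \n"
        " ldr r2, handler2_address_const                          \n"
        " bx r2                                                   \n"
        " handler2_address_const: .word prvGetRegistersFromStack  \n"
    );
}

void prvGetRegistersFromStack(uint32_t *pulFaultStackAddress)
{
    /* Stacked frame layout: r0, r1, r2, r3, r12, lr, pc, psr */
    volatile uint32_t lr = pulFaultStackAddress[5]; /* return address */
    volatile uint32_t pc = pulFaultStackAddress[6]; /* faulting instruction */
    (void)lr; (void)pc;
    for (;;) { } /* park here and read the values in the debugger */
}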
I also had errors with printf when using FreeRTOS on my SiFive HiFive Rev B.
To solve it, I rewrote the _fstat and _write functions to change the output function of printf:
/*
 * Retarget functions for printf()
 */
#include <errno.h>
#include <sys/stat.h>

int _fstat (int file, struct stat * st) {
    errno = ENOSYS;
    return -1;
}

int _write (int file, char * ptr, int len) {
    extern void uart_putc(int c);
    int i;
    /* Output each character to the UART port */
    for (i = 0; i < len; i++) uart_putc((int)*ptr++);
    return len;
}
And I created another uart_putc function for UART0 of the SiFive HiFive Rev B hardware:
#include <stdint.h>

void uart_putc(int c)
{
#define uart0_txdata (*(volatile uint32_t*)(0x10013000)) // uart0 txdata register
#define UART_TXFULL  (1u << 31)                          // uart0 txdata "full" flag
    while ((uart0_txdata & UART_TXFULL) != 0) { } // busy-wait while the TX FIFO is full
    uart0_txdata = c;
}
The newlib C runtime library (used in many embedded toolchains) internally uses its own malloc-family routines. newlib maintains some internal buffers and requires some support for thread-safety:
http://www.nadler.com/embedded/newlibAndFreeRTOS.html
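One of the fixes discussed on that page is to provide newlib's malloc lock hooks; a minimal sketch (assuming the FreeRTOS headers are available) could look like:
#include <reent.h>
#include "FreeRTOS.h"
#include "task.h"

/* newlib calls these hooks around its internal malloc; suspending the
   scheduler makes them safe under FreeRTOS (not usable from ISRs). */
void __malloc_lock(struct _reent *r)   { (void)r; vTaskSuspendAll(); }
void __malloc_unlock(struct _reent *r) { (void)r; (void)xTaskResumeAll(); }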
A hard fault can also be caused by an unaligned memory access:
https://www.keil.com/support/docs/3777.htm
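A sketch of the kind of code that can trigger this (the compiler may emit load instructions such as LDRD/VLDR, which require aligned addresses):
#include <stdint.h>

uint8_t buf[8];

double read_misaligned(void)
{
    double *p = (double *)(buf + 1); /* address is not naturally aligned */
    return *p;                       /* may raise a hard fault on Cortex-M */
}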

Problem using Lloyd optimization and Mesh_domain::create_labeled_image_mesh_domain

I'm using CGAL 4.13 (Linux, Fedora 29) to generate 3D meshes from segmented anatomical images. I would like to use Lloyd optimization, but I get a runtime error in a reproducible way.
To illustrate my problem, I modified the example mesh_3D_image.cpp by adding a Lloyd optimization step, as shown hereafter. The program compiles with no error or warning messages.
#include <fstream>
#include <iostream>
#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
#include <CGAL/Mesh_triangulation_3.h>
#include <CGAL/Mesh_complex_3_in_triangulation_3.h>
#include <CGAL/Mesh_criteria_3.h>
#include <CGAL/Labeled_mesh_domain_3.h>
#include <CGAL/make_mesh_3.h>
#include <CGAL/Image_3.h>
typedef CGAL::Exact_predicates_inexact_constructions_kernel K;
typedef CGAL::Labeled_mesh_domain_3<K> Mesh_domain;
typedef CGAL::Sequential_tag Concurrency_tag;
typedef CGAL::Mesh_triangulation_3<Mesh_domain,CGAL::Default,Concurrency_tag>::type Tr;
typedef CGAL::Mesh_complex_3_in_triangulation_3<Tr> C3t3;
typedef CGAL::Mesh_criteria_3<Tr> Mesh_criteria;
using namespace CGAL::parameters;
int main(int argc, char* argv[])
{
    const char* fname = (argc>1)?argv[1]:"data/liver.inr.gz";
    CGAL::Image_3 image;
    if(!image.read(fname)){
        std::cerr << "Error: Cannot read file " << fname << std::endl;
        return EXIT_FAILURE;
    }
    Mesh_domain domain = Mesh_domain::create_labeled_image_mesh_domain(image);
    Mesh_criteria criteria(facet_angle=30, facet_size=6, facet_distance=4,
                           cell_radius_edge_ratio=3, cell_size=8);
    C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria);
    // !!! THE FOLLOWING LINE MAKES THE PROGRAM CRASH !!!
    CGAL::lloyd_optimize_mesh_3(c3t3, domain, time_limit=30);
    std::ofstream medit_file("out.mesh");
    c3t3.output_to_medit(medit_file);
    return 0;
}
I compile it by using the following CMakeLists.txt file:
# Created by the script cgal_create_CMakeLists
project( executables )
cmake_minimum_required(VERSION 2.8.11)
find_package( CGAL QUIET COMPONENTS )
# !!! I had to add manually the following line !!!
find_package(CGAL COMPONENTS ImageIO)
include( ${CGAL_USE_FILE} )
find_package( Boost REQUIRED )
add_executable( executables lloyd.cpp )
add_to_cached_list( CGAL_EXECUTABLE_TARGETS executables )
target_link_libraries(executables ${CGAL_LIBRARIES} ${CGAL_3RD_PARTY_LIBRARIES} )
No mesh is generated. I obtain the following message:
$ ./build/mesh_3D_image
terminate called after throwing an instance of 'CGAL::Precondition_exception'
what(): CGAL ERROR: precondition violation!
Expr: std::distance(first,last) >= 3
File: /usr/include/CGAL/Mesh_3/Lloyd_move.h
Line: 419
Aborted (core dumped)
Where is my code wrong, and how can I trigger optimizations for meshes generated from 3D images?
Actually, when CGAL::make_mesh_3() is called like this:
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria);
it internally launches CGAL::perturb_mesh_3() and CGAL::exude_mesh_3(). The latter changes the weights of vertices in the regular triangulation and should always be called last (see the warning in the documentation of CGAL::exude_mesh_3()).
The only limitation on the order is that the exuder should be called last. So you can either call
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria, lloyd(time_limit=30));
or
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria, no_exude());
CGAL::lloyd_optimize_mesh_3(c3t3, domain, time_limit = 30);
CGAL::exude_mesh_3(c3t3);
You removed the part:
if(!image.read(fname)){
std::cerr << "Error: Cannot read file " << fname << std::endl;
return EXIT_FAILURE;
}
from the example, which is what actually reads the image from the file.

Why do I need separable compilation?

I have the code shown below. As far as I understand, separable compilation must be turned on when:
CUDA device code is separated into .h and .cu files
ObjectA's device code is used in ObjectB's device code
However, my main function involves neither of the cases above. Could you tell me why I have to set separable compilation for this sample project?
BitHelper.h
#pragma once
#include <cuda_runtime.h>
#define COMPILE_TARGET __host__ __device__
class BitHelper
{
public:
    COMPILE_TARGET BitHelper();
    COMPILE_TARGET ~BitHelper();
    COMPILE_TARGET static void clear(unsigned int& val0);
};
BitHelper.cu
#include "bithelper.h"
BitHelper::BitHelper()
{}

BitHelper::~BitHelper()
{}

void BitHelper::clear(unsigned int& val0)
{
    val0 = 0x0000;
}
Consume_BitHelper.h
#pragma once
class Consume_BitHelper
{
public:
    void apply();
private:
    bool test_cpu();
    bool test_gpu();
};
Consume_BitHelper.cu
#include "consume_bithelper.h"
#include <cuda_runtime.h>
#include <iostream>
#include "bithelper.h"
__global__
void myKernel()
{
    unsigned int FLAG_VALUE = 0x2222;
    printf("GPU before: %d\n", FLAG_VALUE);
    BitHelper::clear(FLAG_VALUE);
    printf("GPU after: %d\n", FLAG_VALUE);
}

void Consume_BitHelper::apply()
{
    test_cpu();
    test_gpu();
    cudaDeviceSynchronize();
}

bool Consume_BitHelper::test_cpu()
{
    std::cout << "TEST CPU" << std::endl;
    unsigned int FLAG_VALUE = 0x1111;
    std::cout << "CPU before: " << FLAG_VALUE << std::endl;
    BitHelper::clear(FLAG_VALUE);
    std::cout << "CPU after : " << FLAG_VALUE << std::endl;
    return true;
}

bool Consume_BitHelper::test_gpu()
{
    std::cout << "TEST GPU" << std::endl;
    myKernel<<<1, 1>>>();
    return true;
}
main.cu
#include "consume_bithelper.h"
#include "bithelper.h"
#include <iostream>
int main(int argc, char** argv)
{
    Consume_BitHelper cbh;
    cbh.apply();
    std::cout << "\nPress any key to continue...";
    std::cin.get();
    return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(cuda_class LANGUAGES CXX CUDA)
#BitHelper needs separable compilation because we have separated declaration from definition
add_library(bithelper_lib STATIC bithelper.cu)
set_property(TARGET bithelper_lib PROPERTY CUDA_SEPARABLE_COMPILATION ON)
#Consume_BitHelper needs separable compilation because we call BitHelper's device code
#from Consume_BitHelper's kernel
add_library(consume_bithelper_lib STATIC consume_bithelper.cu)
set_property(TARGET consume_bithelper_lib PROPERTY CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(consume_bithelper_lib bithelper_lib)
#We only call CPU code so no need of separable compilation?
add_executable(${PROJECT_NAME} main.cu)
target_link_libraries(${PROJECT_NAME} bithelper_lib consume_bithelper_lib)
The errors I'm getting are these
EDIT
According to Robert Crovella's post, Consume_BitHelper.cu uses BitHelper::clear, which is defined in a separate compilation unit.
Does that mean I have to activate separable compilation only for BitHelper, since separable compilation only concerns device code called from device code?
And why am I getting the mentioned errors when separable compilation is NOT on for cuda_class, which is the executable created by CMake and does not call any device code?
Separable compilation has to do with how the compiler handles function calls. In exchange for a little bit of overhead, you get the ability to make true function calls and thus access code from other "compilation units" (i.e. .cu source files).
As GPU programmers are obsessed with performance (particularly the extra registers that get used when separable compilation is enabled), Nvidia made it an option instead of the default.
You should only need separable compilation for .cu files that access functions/globals defined in other .cu files.
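A minimal sketch of that rule (hypothetical files, not taken from the question):
// a.cu -- defines a device function
__device__ int times_two(int x) { return 2 * x; }

// b.cu -- calls the device function defined in a.cu
__device__ int times_two(int x); // declaration; the definition lives in a.cu
__global__ void times_two_kernel(int *out) { *out = times_two(21); }

// The device linker can only resolve times_two across the two compilation
// units with relocatable device code enabled, e.g. nvcc -rdc=true a.cu b.cu
// (which is what CUDA_SEPARABLE_COMPILATION ON requests in CMake).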

Why doesn't CMake find Boost?

I am trying to use the Boost unit test framework to test my static library, but CMake cannot find Boost, even though BOOST_ROOT, CMAKE_INCLUDE_PATH, and CMAKE_LIBRARY_PATH are set before the call to find_package. I've tried a lot of options based on the answers to similar questions, but still without any success.
cmake version: 3.6.1
boost version: 1.60.0
Here is link to my project on GitHub
And here is log output from TravisCI
CMakeLists.txt:
cmake_minimum_required(VERSION ${cmake_version})
project(${LibName}_test)
set(Boost_DEBUG ON)
set(Boost_DETAILED_FAILURE_MSG ON)
set(BOOST_ROOT $ENV{BOOST_ROOT})
message(STATUS "BOOST_ROOT >> ${BOOST_ROOT}")
set(BOOST_LIBRARYDIR $ENV{BOOST_ROOT}/lib)
set(BOOST_INCLUDEDIR $ENV{BOOST_ROOT}/include)
enable_testing()
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "$ENV{BOOST_ROOT}/include")
SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "$ENV{BOOST_ROOT}/lib")
find_package(Boost 1.60.0 EXACT REQUIRED COMPONENTS unit_test_framework)
#if(Boost_FOUND)
include_directories("${LIB_HEADERS}")
add_executable(test_executable
main.cpp
SomeClass_test.cpp
)
target_include_directories(test_executable PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(test_executable
#${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}
Boost::unit_test_framework
${LibName}
)
add_test(NAME test1 COMMAND test_executable)
#else()
# message(STATUS "No Boost library were found!")
#endif()
main.cpp:
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <iostream>
// initialization function:
bool init_unit_test()
{
    return true;
}

// entry point:
int main(int argc, char * argv[]) {
    // insert code here...
    std::cout << "Boost version: "
              << BOOST_VERSION / 100000 << "."     // major version
              << BOOST_VERSION / 100 % 1000 << "." // minor version
              << BOOST_VERSION % 100               // patch level
              << std::endl;
    return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
}
SomeClass_test.cpp:
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "config.h"
#include "Rectangle.h"
#include "SomeClass_test.hpp"
BOOST_AUTO_TEST_CASE(int_test)
{
    int i = 2;
    BOOST_TEST(i);
    BOOST_TEST(i == 2);
}

BOOST_AUTO_TEST_CASE(area_test)
{
    Rectangle rect;
    rect.set_values(2, 3);
    BOOST_TEST(6 == rect.area());
}

BOOST_AUTO_TEST_CASE(config_test)
{
    auto major = SimpleNet::Version::Major;
    BOOST_TEST(0 == major);
}