i try to use Boost unit test framework to test my static library but CMake cannot find Boost in spite of BOOST_ROOT, CMAKE_INCLUDE_PATH, CMAKE_LIBRARY_PATH are set before call to find_package. Actually i've tried a lot of options based on the answers to similar question but still without any success.
cmake version:3.6.1
boost version:1.60.0
Here is link to my project on GitHub
And here is log output from TravisCI
CMakeLists.txt:
cmake_minimum_required(VERSION ${cmake_version})
project(${LibName}_test)
set(Boost_DEBUG ON)
set(Boost_DETAILED_FAILURE_MSG ON)
set(BOOST_ROOT $ENV{BOOST_ROOT})
message(STATUS "BOOST_ROOT >> ${BOOST_ROOT}")
set(BOOST_LIBRARYDIR $ENV{BOOST_ROOT}/lib)
set(BOOST_INCLUDEDIR $ENV{BOOST_ROOT}/include)
enable_testing()
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "$ENV{BOOST_ROOT}/include")
SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "$ENV{BOOST_ROOT}/lib")
find_package(Boost 1.60.0 EXACT REQUIRED COMPONENTS unit_test_framework)
#if(Boost_FOUND)
include_directories("${LIB_HEADERS}")
add_executable(test_executable
main.cpp
SomeClass_test.cpp
)
target_include_directories(test_executable PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(test_executable
#${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}
Boost::unit_test_framework
${LibName}
)
add_test(NAME test1 COMMAND test_executable)
#else()
# message(STATUS "No Boost library were found!")
#endif()
main.cpp:
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include <iostream>
// initialization function:
bool init_unit_test()
{
return true;
}
// entry point:
int main(int argc, char * argv[]) {
// insert code here...
std::cout << "Boost version: "
<< BOOST_VERSION / 100000 << "." // major version
<< BOOST_VERSION / 100 % 1000 << "." // minor version
<< BOOST_VERSION % 100 // patch level
<< std::endl;
return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
}
SomeClass_test.cpp:
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "config.h"
#include "Rectangle.h"
#include "SomeClass_test.hpp"
BOOST_AUTO_TEST_CASE(int_test)
{
int i = 2;
BOOST_TEST(i);
BOOST_TEST(i == 2);
}
BOOST_AUTO_TEST_CASE(area_test)
{
Rectangle rect;
rect.set_values(2, 3);
BOOST_TEST(6 == rect.area());
}
BOOST_AUTO_TEST_CASE(config_test)
{
auto major = SimpleNet::Version::Major;
BOOST_TEST(0 == major);
}
Related
My cuda version is 10.1, and GPU is T4. My code is like this:
#include <iostream>
#include <algorithm>
#include <random>
#include <vector>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <cooperative_groups.h>
using std::cout;
using std::endl;
void sort_2d_by_row();
thrust::device_vector<float> thrust_2d_by_row_even_odd(
thrust::device_vector<float>&, int, int);
__global__ void even_odd_kernel(float *ptr, int M, int N);
int main() {
cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1UL << 32);
if (err) cout << "errors occur\n";
sort_2d_by_row();
return 0;
}
void sort_2d_by_row() {
std::random_device rd;
std::mt19937 engine;
engine.seed(rd());
std::uniform_real_distribution<float> u(0, 90.);
int M = 19;
int N = 8 * 768 * 768;
/* int N = 10; */
std::vector<float> v(M * N);
std::generate(v.begin(), v.end(), [&](){return u(engine);});
thrust::host_vector<float> hv(v.begin(), v.end());
thrust::device_vector<float> dv = hv;
thrust::device_vector<float> res_even_odd = thrust_2d_by_row_even_odd(dv, M, N);
}
thrust::device_vector<float> thrust_2d_by_row_even_odd(
thrust::device_vector<float>& v, int M, int N) {
thrust::device_vector<float> res(v.begin(), v.end());
thrust::device_vector<int> index(M);
thrust::sequence(thrust::device, index.begin(), index.end(), 0, 1);
int blocky = 1;
while (blocky < M) blocky *= 2;
blocky /= 2;
int blockx = 1;
while (blockx < (N / 2) && blockx < 1024) blockx *= 2;
blockx /= 2;
int gridx = std::min(4096, N / blockx / 2);
dim3 block(blockx, blocky);
dim3 grid(gridx);
even_odd_kernel<<<grid, block, 0>>>(
thrust::raw_pointer_cast(&res[0]), M, N);
cudaDeviceSynchronize();
return res;
}
// descending
__global__ void even_odd_kernel(float *ptr, int M, int N) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int m = threadIdx.y;
int tstride = blockDim.x * gridDim.x * 2;
cooperative_groups::grid_group g = cooperative_groups::this_grid();
g.sync();
}
And CMakeLists.txt is like this:
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(cuda)
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
set(CMAKE_CXX_FLAGS "-std=c++14 -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0")
set(CMAKE_CXX_FLAGS_RELEASE "-O2")
set(CUDA_NVCC_FLAGS "-std=c++14 -arch=sm_60 -Xptxas=-v -rdc=true")
set(CUDA_NVCC_FLAGS_DEBUG "-G -O0")
set(CUDA_NVCC_FLAGS_RELEASE "-O2")
set(CUDA_CUDA_FLAGS "-gencode arch=compute_70,code=sm_70 -rdc=true")
message (${CMAKE_BUILD_TYPE})
find_package(CUDA REQUIRED)
cuda_add_executable(sort sort.cu)
target_include_directories(
sort PUBLIC ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
target_link_libraries(
sort ${CUDA_LIBRARIES})
The error message is:
CMakeFiles/sort.dir/sort_generated_sort.cu.o: In function
`__sti____cudaRegisterAll()':
tmpxft_0004cd04_00000000-5_sort.cudafe1.cpp:(.text.startup+0x15):
undefined reference to
`__cudaRegisterLinkedBinary_39_tmpxft_0004cd04_00000000_6_sort_cpp1_ii_main'
collect2: error: ld returned 1 exit status
CMakeFiles/sort.dir/build.make:963: recipe for target 'sort' failed
How could I make it work please? Besides, Does g.sync() have big harms to the program performance, or is the impact travial?
The cooperative groups are not an issue, IMHO. That's just something requiring a recent version of CUDA. As for your linking trouble - I think it must be some sort of flag mess. I'll suggest an alternative CMakeLists.txt, which itself is not perfect, but is more appropriate for CMake versions of recent years. It also has a bunch of suggestions for you in comments:
cmake_minimum_required(VERSION 3.8.2)
# If you want to properly search for Thrust, you'll need a FindThrust.cmake
# script, which constitutes a "CMake module". You place it under cmake/Modules
# in your source directory and make it available by uncommenting the following
# line:
#list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
project(sort-with-cuda
DESCRIPTION "My project description here"
LANGUAGES CXX CUDA)
# Don't do this. Set your build type explicitly, once; and then it's
# cached and you don't have to worry about it when you run make.
#
#if (NOT CMAKE_BUILD_TYPE)
# set(CMAKE_BUILD_TYPE Release)
#endif ()
# In the future, this should not be necessary, but we need it for
# cuda_select_nvcc_arch_flags
include(FindCUDA)
# This will set the appropriate gencode parameters for the hardware
# on your system (although you could always force it manually)
cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS_TMP Auto)
set(CUDA_ARCH_FLAGS ${CUDA_ARCH_FLAGS_TMP} CACHE STRING "CUDA -gencode parameters")
string(REPLACE ";" " " CUDA_ARCH_FLAGS_STR "${CUDA_ARCH_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS_STR}")
# The above may produce something like:
#
# -gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_70,code=compute_70;-gencode;arch=compute_75,code=compute_75
#
# But it may include older micro-architectures which have been
# deprecated/removed, in which case you'll need to edit that
# with ccmake and only keep what you need.
add_executable(sort-with-cuda sort.cu)
set_target_properties(
sort-with-cuda
PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS NO
)
# Note: I haven't added flags for compiling with warnings
# Thrust is very finickey: It provies a configuration script, but
# only for CMake >= 3.15 . And - it doesn't provide a FindThrust.cmake
# script itself with targets appropriate for CMake >= 3.
#
# See https://github.com/NVIDIA/thrust/blob/main/thrust/cmake/README.md
#
# With CMake 3.15 or later you can enable the following two lines:
#
#find_package(Thrust REQUIRED CONFIG)
#thrust_create_target(Thrust)
#target_link_libraries(sort-with-cuda Thrust)
#
# With earlier CMake versions, get yourself a proper FindThrust.cmake
# script (which creates a Thrust::Thrust target I suppose) and
# then uncomment the following two lines:
#
#find_package(Thrust REQUIRED)
#target_link_libraries(sort-with-cuda Thrust::Thrust)
# The following sets -rdc=true , but you don't actually need that for your example
set_target_properties(sort-with-cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
I've been trying to run OpenCV using CLion IDE under Windows. When I try to run this sample code for loading and displaying an image
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
using namespace cv;
using namespace std;
int main( int argc, char** argv )
{
if( argc != 2)
{
cout <<" Usage: display_image ImageToLoadAndDisplay" << endl;
return -1;
}
Mat image;
image = imread("earth.jpg", CV_LOAD_IMAGE_COLOR); // Read the file
if(! image.data ) // Check for invalid input
{
cout << "Could not open or find the image" << std::endl ;
return -1;
}
namedWindow( "Display window", WINDOW_AUTOSIZE );// Create a window for display.
imshow( "Display window", image ); // Show our image inside it.
waitKey(0); // Wait for a keystroke in the window
return 0;
}
I get the error statement:
Process finished with exit code -1073741515 (0xC0000135)
As for the content in my CMakeLists.txt, it looks like this:
cmake_minimum_required(VERSION 3.6)
project(test)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# Where to find CMake modules and OpenCV
set(OpenCV_DIR "C:\\opencv\\mingw-build\\install")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(openCV main.cpp)
# add libs you need
set(OpenCV_LIBS opencv_core opencv_imgproc opencv_highgui opencv_imgcodecs)
# linking
target_link_libraries(openCV ${OpenCV_LIBS})
Thanks for helping me with this.
You need to add OpenCV binary path with DLLs to your PATH BEFORE CLion start.
I do it from script:
=== CLionWithMingwAndOpenCV.bat ==========================
#echo off
set PATH=C:\mingw-w64\x86_64-5.2.0-win32-seh-rt_v4-rev0\mingw64\bin;D:\opencv\release\bin;%PATH%
"C:\Program Files (x86)\JetBrains\CLion XXXX\bin\clion64.exe"
=== ==========================
I am totally new to cmake and its syntax .But fortunately I am able to run the cmake tutorial step 1 as per the introductions mention on below links :
https://cmake.org/cmake/help/latest/guide/tutorial/index.html
But I am totally stucked at step 2 project to run using cmake.
I have created the step 2 project and understand the syntax to link the library for doing square root of a number, But I did not understand how to run this as I am getting below error :
user#server:~/TER_CMAKE/Tutorial/step2_build$ cmake ../step2
CMake Error at CMakeLists.txt:19 (add_subdirectory):
The binary directory
/home/user/TER_CMAKE/Tutorial/step2/MathFunctions
is already used to build a source directory. It cannot be used to build
source directory
/home/user/TER_CMAKE/Tutorial/step2/MathFunctions
Specify a unique binary directory name.
-- Configuring incomplete, errors occurred!
The example is available at below location for step 2 under heading Adding a Library (Step 2)..
https://moodle.rrze.uni-erlangen.de/pluginfile.php/14829/mod_resource/content/5/CMakeTutorial.pdf
My intention is to run my example this way
step2_build$ cmake ../step2
step2_build$ cmake --build .
step2_build$ ./Tutorial 121
As I am not sure that is it good to ask this way on this platform ,But as I do not have any other guidance .I am doing this by my own .
Note: I do not wants to use any tool to run my step 2 example.I wants to run everything using command prompt and cmake command only .where I can understand the cmake .
Edit:
Adding my CMakeLists.txt =
cmake_minimum_required(VERSION 3.5)
#set the project name
project(Tutorial VERSION 1.0)
#specify the c++ std
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED True)
option(USE_MYMATH "Use tutorial provided math implementation" ON)
#Configure a header file to pass the version number to the source code
configure_file(TutorialConfig.h.in TutorialConfig.h)
#add the MathFunctions Library
add_subdirectory(MathFunctions)
if(USE_MYMATH)
add_subdirectory(MathFunctions)
list(APPEND EXTRA_LIBS MathFunctions)
list(APPEND EXTRA_INCLUDES "${PROJECT_SOURCE_DIR}/MathFunctions")
endif()
#add the executable
add_executable(Tutorial tutorial.cpp)
target_link_libraries(Tutorial PUBLIC ${EXTRA_LIBS})
# add the binary tree to the search path for include files
# so that we will find TutorialConfig.h
target_include_directories(Tutorial PUBLIC
"${PROJECT_BINARY_DIR}"
${EXTRA_LIBS}
)
My Source tutorial.cpp file:
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <string>
#ifdef USE_MYMATH
#include "MathFunctions.h"
#endif
#include "TutorialConfig.h"
using namespace std;
int main(int argc, char* argv[])
{
if (argc < 2) {
cout << "Usage: " << argv[0] << " number" << endl;
return 1;
}
// convert input to double
const double inputValue = atof(argv[1]);
// calculate square root
#ifdef USE_MYMATH
const double outputValue = mysqrt(inputValue);
#else
const double outputValue = sqrt(inputValue);
#endif
cout << "The square root of " << inputValue << " is " << outputValue << endl;
return 0;
}
ToturialConfig.h.in file :
#define Tutorial_VERSION_MAJOR #Tutorial_VERSION_MAJOR#
#define Tutorial_VERSION_MINOR #Tutorial_VERSION_MINOR#
#cmakedefine USE_MYMATH
EDIT:
Step2 has a folder MathFuctions,Which has Cmake file mysqrt.cpp file
/TER_CMAKE/Tutorial/step2/MathFunctions/CMakeLists.txt
add_library(MathFunctions mysqrt.cpp)
/TER_CMAKE/Tutorial/step2/MathFunctions/mysqrt.cpp
#include <iostream>
// a hack square root calculation using simple operations
double mysqrt(double x)
{
if (x <= 0) {
return 0;
}
double result = x;
// do ten iterations
for (int i = 0; i < 10; ++i) {
if (result <= 0) {
result = 0.1;
}
double delta = x - (result * result);
result = result + 0.5 * delta / result;
std::cout << "Computing sqrt of " << x << " to be " << result << std::endl;
}
return result;
}
In case USE_MYMATH variable is set add_subdirectory(MathFunctions) is invoked twice. You need to decide and remove one of the occurrences on lines 16 and 19 in you CMakeLists.txt.
Two issues I can see:
You're adding the subdirectory "MathFunctions" twice when you configure the build with -DUSE_MYMATH=ON. This is why you are getting "CMake Error at CMakeLists.txt:19 (add_subdirectory):"
To fix, remove
#add the MathFunctions Library
add_subdirectory(MathFunctions)
and rely on
if(USE_MYMATH)
add_subdirectory(MathFunctions)
list(APPEND EXTRA_LIBS MathFunctions)
list(APPEND EXTRA_INCLUDES "${PROJECT_SOURCE_DIR}/MathFunctions")
endif()
In your CMakeLists.txt file, you are doing
target_include_directories(Tutorial PUBLIC
"${PROJECT_BINARY_DIR}"
${EXTRA_LIBS}
)
Instead of
${EXTRA_LIBS}
It should be
${EXTRA_INCLUDES}
in Discourse Cmake Org -- help with tutorial step 2
Josef Angstenberger
jtxa said
The files in Step3 are the expected result if you do everything from Step2.
Can you please compare your files against the ones from Step3 to see if there are any relevant differences?
Blockquote
Marshallb's solution will solve nahesh relkar's problem
Loading Step2/CMakeLists.txt and Step3/CMakeLists.txt into vimdiff helped me to fix mine
Following the same steps in CUDA samples to launch a kernel and sync across the grid using cooperative_groups::this_grid().sync() causes any CUDA API call to fails. While using
cooperative_groups::this_thread_block().sync() works fine and gives correct results.
I used the following code and CMakeLists.txt (cmake version 3.11.1) to test it using CUDA 10 on TITAN V GPU (Driver Version 410.73) with Ubuntu 16.04.5 LTS. The code is also available on github in order to make it easy to reproduce the error.
The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]). In order to do this, each thread reads a single element from the array, sync, and then writes its element to the right destination. The code can be easily modified to ensure that this_thread_block().sync() works fine. Simply change arr_size to be less 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead.
test_cg.cu
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <cstdint>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
//Error handling micro, wrap it around function whenever possible
if (err != cudaSuccess) {
printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
#ifdef _WIN32
system("pause");
#else
exit(EXIT_FAILURE);
#endif
}
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************
//********************** cg kernel
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
uint32_t *d_arr){
uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;
if (tid < num_elements){
uint32_t my_element = d_arr[tid];
//to sync across the whole grid
cg::grid_group barrier = cg::this_grid();
//to sync within a single block
//cg::thread_block barrier = cg::this_thread_block();
//wait for all reads
barrier.sync();
uint32_t tar_id = num_elements - tid - 1;
d_arr[tar_id] = my_element;
}
}
//******************************************************************************
//********************** execute
void execute_test(const int sm_count){
//host array
const uint32_t arr_size = 1 << 20; //1M
uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
//fill with sequential numbers
std::iota(h_arr, h_arr + arr_size, 0);
//device array
uint32_t* d_arr;
CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
cudaMemcpyHostToDevice));
//launch config
const int threads = 512;
//following the same steps done in conjugateGradientMultiBlockCG.cu
//cuda sample to launch kernel that sync across grid
//https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436
int num_blocks_per_sm = 0;
CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
(void*)testing_cg_grid_sync, threads, 0));
dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);
if(arr_size > grid_dim.x*block_dim.x){
printf("\n The grid size (numBlocks*numThreads) is less than array size.\n");
exit(EXIT_FAILURE);
}
printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
block_dim.x);
//argument passed to the kernel
void *kernel_args[] = {
(void *)&arr_size,
(void *)&d_arr, };
//finally launch the kernel
cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
grid_dim, block_dim, kernel_args);
//make sure everything went okay
CUDA_ERROR(cudaGetLastError());
CUDA_ERROR(cudaDeviceSynchronize());
//get results on the host
CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
cudaMemcpyDeviceToHost));
//validate
for (uint32_t i = 0; i < arr_size; i++){
if (h_arr[i] != arr_size - i - 1){
printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
exit(EXIT_FAILURE);
}
}
}
//******************************************************************************
int main(int argc, char**argv) {
//set to Titan V
uint32_t device_id = 0;
cudaSetDevice(device_id);
//get sm count
cudaDeviceProp devProp;
CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
int sm_count = devProp.multiProcessorCount;
//execute
execute_test(sm_count);
printf("\n Mission accomplished \n");
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
set(CMAKE_BUILD_TYPE Release)
endif ()
SET(CUDA_SEPARABLE_COMPILATION ON)
########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################
########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################
Running this code gives:
unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67
This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device and the results is 700. Kindly let me know if you spot me doing something wrong in the code or the CMakeLists.txt file.
With external help, the solution that got the code working is to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS...... The reason is that I only have libcudadevrt.a under my /usr/local/cuda-10.0/lib64/ and so I have to signal CUDA to link shared/dynamic run-time library since the default is to link to static. string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70") after the second set(CUDA_NVCC_FLAGS...... The reason is that the sm_70 flag was not passed to the linker properly.
Additionally, using only CUDA_NVCC_FLAGS will only pass the sm_70 info to the compiler not the linker. While only using CMAKE_NVCC_FLAGS will report error: namespace "cooperative_groups" has no member "grid_group" error.
I have the code shown below. As far as I understood, separable compilation must be turned on when
CUDA device code is separated into .h and .cu files
Use ObjectA's device code into Object's B device code
however, in my main function I am not having any of the cases above. Could you tell me why do I have to set separable compilation for this sample project?
BitHelper.h
#pragma once
#include <cuda_runtime.h>
#define COMPILE_TARGET __host__ __device__
class BitHelper
{
public:
COMPILE_TARGET BitHelper();
COMPILE_TARGET ~BitHelper();
COMPILE_TARGET static void clear(unsigned int& val0);
};
BitHelper.cu
#include "bithelper.h"
BitHelper::BitHelper()
{}
BitHelper::~BitHelper()
{}
void BitHelper::clear(unsigned int& val0)
{
val0 = 0x0000;
}
Consume_BitHelper.h
#pragma once
class Consume_BitHelper
{
public:
void apply();
private:
bool test_cpu();
bool test_gpu();
};
Consume_BitHelper.cu
#include "consume_bithelper.h"
#include <cuda_runtime.h>
#include <iostream>
#include "bithelper.h"
__global__
void myKernel()
{
unsigned int FLAG_VALUE = 0x2222;
printf("GPU before: %d\n", FLAG_VALUE);
BitHelper::clear(FLAG_VALUE);
printf("GPU after: %d\n", FLAG_VALUE);
}
void Consume_BitHelper::apply()
{
test_cpu();
test_gpu();
cudaDeviceSynchronize();
}
bool Consume_BitHelper::test_cpu()
{
std::cout << "TEST CPU" << std::endl;
unsigned int FLAG_VALUE = 0x1111;
std::cout << "CPU before: " << FLAG_VALUE << std::endl;
BitHelper::clear(FLAG_VALUE);
std::cout << "CPU after : " << FLAG_VALUE << std::endl;
return true;
}
bool Consume_BitHelper::test_gpu()
{
std::cout << "TEST GPU" << std::endl;
myKernel << <1, 1 >> > ();
return true;
}
main.cu
#include "consume_bithelper.h"
#include "bithelper.h"
#include <iostream>
int main(int argc, char** argv)
{
Consume_BitHelper cbh;
cbh.apply();
std::cout << "\nPress any key to continue...";
std::cin.get();
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(cuda_class LANGUAGES CXX CUDA)
#BitHelper needs separable compilation because we have separated declaration from definition
add_library(bithelper_lib STATIC bithelper.cu)
set_property(TARGET bithelper_lib PROPERTY CUDA_SEPARABLE_COMPILATION ON)
#Consume_BitHelper needs separable compilation because we call BitHelper's device code
#from Consume_BitHelper's kernel
add_library(consume_bithelper_lib STATIC consume_bithelper.cu)
set_property(TARGET consume_bithelper_lib PROPERTY CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(consume_bithelper_lib bithelper_lib)
#We only call CPU code so no need of separable compilation?
add_executable(${PROJECT_NAME} main.cu)
target_link_libraries(${PROJECT_NAME} bithelper_lib consume_bithelper_lib)
The errors I'm getting are these
EDIT
According to Robert Crovella's post Consume_BitHelper.cu uses BitHelper::clear defined in a separate compilation unit.
Does it mean I have to activate only separate compilation for BitHelper?
Since separate compilation has to do only with device code called from device code.
Why am I getting the mentioned errors when separate compilation is NOT on for cuda_class? (which is the executable created from CMake and is not calling any device code)
Separable compilation has to do with how the compiler handles function calls. In exchange for a little bit of overhead, you get the ability to make true function calls and thus access code from other "compilation units" (i.e. .cu source files).
As GPU programmers are obsessed with performance (particularly the extra registers that get used when separable compilation is enabled) Nvidia made it an option instead of default.
You should only need separable compilation for .cu files that access functions/globals defined in other .cu files.