This tutorial demonstrates how to make a C++/CUDA-based Python extension for PyTorch. But for ... reasons ... my use-case is more complicated than this and doesn't fit neatly within the Python setuptools framework described by the tutorial.
Is there a way to use cmake to compile a Python library that extends PyTorch?
Yes.
The trick is to use cmake to combine together all the C++ and CUDA files we'll need and to use PyBind11 to build the interface we want; fortunately, PyBind11 is included with PyTorch.
The code below is collected and kept up-to-date in this Github repo.
Our project consists of several files:
CMakeLists.txt
cmake_minimum_required (VERSION 3.9)
project(pytorch_cmake_example LANGUAGES CXX CUDA)
find_package(Python REQUIRED COMPONENTS Development)
find_package(Torch REQUIRED)
# Modify if you need a different default value
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 61)
endif()
# List all your code files here
add_library(pytorch_cmake_example SHARED
main.cu
)
target_compile_features(pytorch_cmake_example PRIVATE cxx_std_11)
target_link_libraries(pytorch_cmake_example PRIVATE ${TORCH_LIBRARIES} Python::Python)
# Use if the default GCC version gives issues.
# Similar syntax is used if we need better compilation flags.
target_compile_options(pytorch_cmake_example PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-ccbin g++-9>)
# Use a variant of this if you're on an earlier cmake than 3.18
# target_compile_options(pytorch_cmake_example PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-gencode arch=compute_61,code=sm_61>)
main.cu
#include <c10/cuda/CUDAException.h>
#include <torch/extension.h>
#include <torch/library.h>
using namespace at;
int64_t integer_round(int64_t num, int64_t denom){
return (num + denom - 1) / denom;
}
template<class T>
__global__ void add_one_kernel(const T *const input, T *const output, const int64_t N){
// Grid-strided loop
for(int i=blockDim.x*blockIdx.x+threadIdx.x;i<N;i+=blockDim.x*gridDim.x){
output[i] = input[i] + 1;
}
}
///Adds one to each element of a tensor
Tensor add_one(const Tensor &input){
auto output = torch::zeros_like(input);
// Common values:
// AT_DISPATCH_INDEX_TYPES
// AT_DISPATCH_FLOATING_TYPES
// AT_DISPATCH_INTEGRAL_TYPES
AT_DISPATCH_ALL_TYPES(
input.scalar_type(), "add_one_cuda", [&](){
const auto block_size = 128;
const auto num_blocks = std::min(65535L, integer_round(input.numel(), block_size));
add_one_kernel<<<num_blocks, block_size>>>(
input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
input.numel()
);
// Always test your kernel launches
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
);
return output;
}
///Note that we can have multiple implementations spread across multiple files, though there should only be one `def`
TORCH_LIBRARY(pytorch_cmake_example, m) {
m.def("add_one(Tensor input) -> Tensor");
m.impl("add_one", c10::DispatchKey::CUDA, TORCH_FN(add_one));
//c10::DispatchKey::CPU is also an option
}
Compilation
Compile it all using this command:
cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` -GNinja ..
test.py
You can then run the following test script.
import torch
torch.ops.load_library("build/libpytorch_cmake_example.so")
shape = (3,3,3)
a = torch.randint(0, 10, shape, dtype=torch.float).cuda()
a_plus_one = torch.ops.pytorch_cmake_example.add_one(a)
Related
I am totally new to cmake and its syntax .But fortunately I am able to run the cmake tutorial step 1 as per the introductions mention on below links :
https://cmake.org/cmake/help/latest/guide/tutorial/index.html
But I am totally stucked at step 2 project to run using cmake.
I have created the step 2 project and understand the syntax to link the library for doing square root of a number, But I did not understand how to run this as I am getting below error :
user#server:~/TER_CMAKE/Tutorial/step2_build$ cmake ../step2
CMake Error at CMakeLists.txt:19 (add_subdirectory):
The binary directory
/home/user/TER_CMAKE/Tutorial/step2/MathFunctions
is already used to build a source directory. It cannot be used to build
source directory
/home/user/TER_CMAKE/Tutorial/step2/MathFunctions
Specify a unique binary directory name.
-- Configuring incomplete, errors occurred!
The example is available at below location for step 2 under heading Adding a Library (Step 2)..
https://moodle.rrze.uni-erlangen.de/pluginfile.php/14829/mod_resource/content/5/CMakeTutorial.pdf
My intention is to run my example this way
step2_build$ cmake ../step2
step2_build$ cmake --build .
step2_build$ ./Tutorial 121
As I am not sure that is it good to ask this way on this platform ,But as I do not have any other guidance .I am doing this by my own .
Note: I do not wants to use any tool to run my step 2 example.I wants to run everything using command prompt and cmake command only .where I can understand the cmake .
Edit:
Adding my CMakeLists.txt =
cmake_minimum_required(VERSION 3.5)
#set the project name
project(Tutorial VERSION 1.0)
#specify the c++ std
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED True)
option(USE_MYMATH "Use tutorial provided math implementation" ON)
#Configure a header file to pass the version number to the source code
configure_file(TutorialConfig.h.in TutorialConfig.h)
#add the MathFunctions Library
add_subdirectory(MathFunctions)
if(USE_MYMATH)
add_subdirectory(MathFunctions)
list(APPEND EXTRA_LIBS MathFunctions)
list(APPEND EXTRA_INCLUDES "${PROJECT_SOURCE_DIR}/MathFunctions")
endif()
#add the executable
add_executable(Tutorial tutorial.cpp)
target_link_libraries(Tutorial PUBLIC ${EXTRA_LIBS})
# add the binary tree to the search path for include files
# so that we will find TutorialConfig.h
target_include_directories(Tutorial PUBLIC
"${PROJECT_BINARY_DIR}"
${EXTRA_LIBS}
)
My Source tutorial.cpp file:
#include <iostream>
#include <cmath>
#include <cstdlib>
#include <string>
#ifdef USE_MYMATH
#include "MathFunctions.h"
#endif
#include "TutorialConfig.h"
using namespace std;
int main(int argc, char* argv[])
{
if (argc < 2) {
cout << "Usage: " << argv[0] << " number" << endl;
return 1;
}
// convert input to double
const double inputValue = atof(argv[1]);
// calculate square root
#ifdef USE_MYMATH
const double outputValue = mysqrt(inputValue);
#else
const double outputValue = sqrt(inputValue);
#endif
cout << "The square root of " << inputValue << " is " << outputValue << endl;
return 0;
}
ToturialConfig.h.in file :
#define Tutorial_VERSION_MAJOR #Tutorial_VERSION_MAJOR#
#define Tutorial_VERSION_MINOR #Tutorial_VERSION_MINOR#
#cmakedefine USE_MYMATH
EDIT:
Step2 has a folder MathFuctions,Which has Cmake file mysqrt.cpp file
/TER_CMAKE/Tutorial/step2/MathFunctions/CMakeLists.txt
add_library(MathFunctions mysqrt.cpp)
/TER_CMAKE/Tutorial/step2/MathFunctions/mysqrt.cpp
#include <iostream>
// a hack square root calculation using simple operations
double mysqrt(double x)
{
if (x <= 0) {
return 0;
}
double result = x;
// do ten iterations
for (int i = 0; i < 10; ++i) {
if (result <= 0) {
result = 0.1;
}
double delta = x - (result * result);
result = result + 0.5 * delta / result;
std::cout << "Computing sqrt of " << x << " to be " << result << std::endl;
}
return result;
}
In case USE_MYMATH variable is set add_subdirectory(MathFunctions) is invoked twice. You need to decide and remove one of the occurrences on lines 16 and 19 in you CMakeLists.txt.
Two issues I can see:
You're adding the subdirectory "MathFunctions" twice when you configure the build with -DUSE_MYMATH=ON. This is why you are getting "CMake Error at CMakeLists.txt:19 (add_subdirectory):"
To fix, remove
#add the MathFunctions Library
add_subdirectory(MathFunctions)
and rely on
if(USE_MYMATH)
add_subdirectory(MathFunctions)
list(APPEND EXTRA_LIBS MathFunctions)
list(APPEND EXTRA_INCLUDES "${PROJECT_SOURCE_DIR}/MathFunctions")
endif()
In your CMakeLists.txt file, you are doing
target_include_directories(Tutorial PUBLIC
"${PROJECT_BINARY_DIR}"
${EXTRA_LIBS}
)
Instead of
${EXTRA_LIBS}
It should be
${EXTRA_INCLUDES}
in Discourse Cmake Org -- help with tutorial step 2
Josef Angstenberger
jtxa said
The files in Step3 are the expected result if you do everything from Step2.
Can you please compare your files against the ones from Step3 to see if there are any relevant differences?
Blockquote
Marshallb's solution will solve nahesh relkar's problem
Loading Step2/CMakeLists.txt and Step3/CMakeLists.txt into vimdiff helped me to fix mine
I'm using CGAL 4.13 (Linux Fedora 29) to generate 3D meshes from segmented anathomical images. I would like to use Lloyd optimization, but I got in a reproductible way a runtime error.
In order to illustrate my problem, I modified the example mesh_3D_image.cpp by adding a Lloyd optimization step, as shown hereafter. The program compiles with no error/warning message.
#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
#include <CGAL/Mesh_triangulation_3.h>
#include <CGAL/Mesh_complex_3_in_triangulation_3.h>
#include <CGAL/Mesh_criteria_3.h>
#include <CGAL/Labeled_mesh_domain_3.h>
#include <CGAL/make_mesh_3.h>
#include <CGAL/Image_3.h>
typedef CGAL::Exact_predicates_inexact_constructions_kernel K;
typedef CGAL::Labeled_mesh_domain_3<K> Mesh_domain;
typedef CGAL::Sequential_tag Concurrency_tag;
typedef CGAL::Mesh_triangulation_3<Mesh_domain,CGAL::Default,Concurrency_tag>::type Tr;
typedef CGAL::Mesh_complex_3_in_triangulation_3<Tr> C3t3;
typedef CGAL::Mesh_criteria_3<Tr> Mesh_criteria;
using namespace CGAL::parameters;
int main(int argc, char* argv[])
{
const char* fname = (argc>1)?argv[1]:"data/liver.inr.gz";
CGAL::Image_3 image;
if(!image.read(fname)){
std::cerr << "Error: Cannot read file " << fname << std::endl;
return EXIT_FAILURE;
}
Mesh_domain domain = Mesh_domain::create_labeled_image_mesh_domain(image);
Mesh_criteria criteria(facet_angle=30, facet_size=6, facet_distance=4,
cell_radius_edge_ratio=3, cell_size=8);
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria);
// !!! THE FOLLOWING LINE MAKES THE PROGRAM CRASH !!!
CGAL::lloyd_optimize_mesh_3(c3t3, domain, time_limit=30);
std::ofstream medit_file("out.mesh");
c3t3.output_to_medit(medit_file);
return 0;
}
I compile it by using the following CMakeLists.txt file:
# Created by the script cgal_create_CMakeLists
project( executables )
cmake_minimum_required(VERSION 2.8.11)
find_package( CGAL QUIET COMPONENTS )
# !!! I had to add manually the following line !!!
find_package(CGAL COMPONENTS ImageIO)
include( ${CGAL_USE_FILE} )
find_package( Boost REQUIRED )
add_executable( executables lloyd.cpp )
add_to_cached_list( CGAL_EXECUTABLE_TARGETS executables )
target_link_libraries(executables ${CGAL_LIBRARIES} ${CGAL_3RD_PARTY_LIBRARIES} )
No mesh is generated. I obtain the following message:
$ ./build/mesh_3D_image
terminate called after throwing an instance of 'CGAL::Precondition_exception'
what(): CGAL ERROR: precondition violation!
Expr: std::distance(first,last) >= 3
File: /usr/include/CGAL/Mesh_3/Lloyd_move.h
Line: 419
Aborted (core dumped)
Where my code is wrong, and how can I trigger optimizations for meshes generated by 3D images?
actually, when CGAL::make_mesh_3() is called like this :
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria);
it internally launches CGAL::perturb_mesh_3() and CGAL::exude_mesh_3(). The latest changes the weights of vertices in the Regular triangulation, and should always be called last (see the Warning in the documentation of CGAL::exude_mesh_3().
The only limitation on the order is that exuder should be called last. So you can either call
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria, lloyd(time_limit=30));
or
C3t3 c3t3 = CGAL::make_mesh_3<C3t3>(domain, criteria, no_exude());
CGAL::lloyd_optimize_mesh_3(c3t3, domain, time_limit = 30);
CGAL::exude_mesh_3(c3t3);
You removed the part:
if(!image.read(fname)){
std::cerr << "Error: Cannot read file " << fname << std::endl;
return EXIT_FAILURE;
}
from the example, which is what actually reads the image from the file.
Following the same steps in CUDA samples to launch a kernel and sync across the grid using cooperative_groups::this_grid().sync() causes any CUDA API call to fails. While using
cooperative_groups::this_thread_block().sync() works fine and gives correct results.
I used the following code and CMakeLists.txt (cmake version 3.11.1) to test it using CUDA 10 on TITAN V GPU (Driver Version 410.73) with Ubuntu 16.04.5 LTS. The code is also available on github in order to make it easy to reproduce the error.
The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]). In order to do this, each thread reads a single element from the array, sync, and then writes its element to the right destination. The code can be easily modified to ensure that this_thread_block().sync() works fine. Simply change arr_size to be less 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead.
test_cg.cu
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <cstdint>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
//Error handling micro, wrap it around function whenever possible
if (err != cudaSuccess) {
printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
#ifdef _WIN32
system("pause");
#else
exit(EXIT_FAILURE);
#endif
}
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************
//********************** cg kernel
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
uint32_t *d_arr){
uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;
if (tid < num_elements){
uint32_t my_element = d_arr[tid];
//to sync across the whole grid
cg::grid_group barrier = cg::this_grid();
//to sync within a single block
//cg::thread_block barrier = cg::this_thread_block();
//wait for all reads
barrier.sync();
uint32_t tar_id = num_elements - tid - 1;
d_arr[tar_id] = my_element;
}
}
//******************************************************************************
//********************** execute
void execute_test(const int sm_count){
//host array
const uint32_t arr_size = 1 << 20; //1M
uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
//fill with sequential numbers
std::iota(h_arr, h_arr + arr_size, 0);
//device array
uint32_t* d_arr;
CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
cudaMemcpyHostToDevice));
//launch config
const int threads = 512;
//following the same steps done in conjugateGradientMultiBlockCG.cu
//cuda sample to launch kernel that sync across grid
//https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436
int num_blocks_per_sm = 0;
CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
(void*)testing_cg_grid_sync, threads, 0));
dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);
if(arr_size > grid_dim.x*block_dim.x){
printf("\n The grid size (numBlocks*numThreads) is less than array size.\n");
exit(EXIT_FAILURE);
}
printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
block_dim.x);
//argument passed to the kernel
void *kernel_args[] = {
(void *)&arr_size,
(void *)&d_arr, };
//finally launch the kernel
cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
grid_dim, block_dim, kernel_args);
//make sure everything went okay
CUDA_ERROR(cudaGetLastError());
CUDA_ERROR(cudaDeviceSynchronize());
//get results on the host
CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
cudaMemcpyDeviceToHost));
//validate
for (uint32_t i = 0; i < arr_size; i++){
if (h_arr[i] != arr_size - i - 1){
printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
exit(EXIT_FAILURE);
}
}
}
//******************************************************************************
int main(int argc, char**argv) {
//set to Titan V
uint32_t device_id = 0;
cudaSetDevice(device_id);
//get sm count
cudaDeviceProp devProp;
CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
int sm_count = devProp.multiProcessorCount;
//execute
execute_test(sm_count);
printf("\n Mission accomplished \n");
return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
set(CMAKE_BUILD_TYPE Release)
endif ()
SET(CUDA_SEPARABLE_COMPILATION ON)
########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################
########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################
Running this code gives:
unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67
This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device and the results is 700. Kindly let me know if you spot me doing something wrong in the code or the CMakeLists.txt file.
With external help, the solution that got the code working is to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS...... The reason is that I only have libcudadevrt.a under my /usr/local/cuda-10.0/lib64/ and so I have to signal CUDA to link shared/dynamic run-time library since the default is to link to static. string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70") after the second set(CUDA_NVCC_FLAGS...... The reason is that the sm_70 flag was not passed to the linker properly.
Additionally, using only CUDA_NVCC_FLAGS will only pass the sm_70 info to the compiler not the linker. While only using CMAKE_NVCC_FLAGS will report error: namespace "cooperative_groups" has no member "grid_group" error.
I have a program to recognize files based on their signatures. Works great. But I'm new to C and am now trying to get an IDE called CLion to work. However, I can't figure out how to add command arguments to cmake - such that when I run main and want to pass a gifFile or a pdfFile to recognize I can do so. Here is my code so far for cmake.
cmake_minimum_required(VERSION 3.8)
project(Assignment6)
set(CMAKE_C_STANDARD 99)
set(SOURCE_FILES file_recognizer.c)
add_executable(Assignment6 ${SOURCE_FILES})
in the command line this would be something like
gcc file_recognizer.c -o Assignment6
and then you say
./Assignment6 gifFile.gif
How do I get cmake to accept the argument at the end, "gifFile.gif"?
Below is my main function for reference if needed
int main(int argc, char const *argv[]) {
FILE *yourFile;
unsigned char *fileBytes, *fileType;
long fileLength;
fileLength = 10;
if(argc != 2 || (yourFile = fopen(argv[1], "rb")) == NULL) {
printf("Invalid Input\n");
exit(-1);
}
fileBytes = readFile(yourFile, fileLength, fileBytes);
determineFileType(fileBytes, fileType);
return 0;
}
This isn't something CMake is designed to do. I mean, it's do-able with CMake:
add_custom_command(OUTPUT output COMMAND Assignment6 ARGS gifFile.gif)
But what it sounds like what you want to do isn't part of the build process and CMake is all about the building, testing, and packaging software, not running it.
Since you're using CLion, you might be better off setting the program arguements in CLion for debugging purposes.
Click here:
and enter your args here:
when I compile my code in Eclipse it works fine. But the process fails if I try to compile it with CMake. It says:
error: ‘default_random_engine’ does not name a type
default_random_engine generator;
error: ‘uniform_int_distribution’ does not name a type
uniform_int_distribution distribution;
and some more errors, which I believe are the consequences of these two.
class randomInt
{
private:
int m_max;
default_random_engine generator;
uniform_int_distribution<int> distribution;
public:
randomInt(int max = 0) :
m_max(max),
generator(time(0)),
distribution(0, m_max)
{}
int operator ()()
{
return distribution(generator);
}
};
int main(int argc, char **argv)
{
vector<int> vec(100);
generate(vec.begin(), vec.end(), randomInt(100));
ostream_iterator<int> streamIt(cout, ",\n");
copy(vec.begin(), vec.end(), streamIt);
return 0;
}
There is my CMakeLists.txt
project(TEST)
# States that CMake required version must be greater than 2.6
cmake_minimum_required(VERSION 2.8)
# Setup sources
set(TEST_SOURCES
aufgabe2_1.cpp
aufgabe2_2.cpp
aufgabe2_3.cpp
aufgabe2_4.cpp)
set(CMAKE_CXX_FLAGS_DEBUG "-g -Wall -std=c++11")
# Build executable
add_executable(main ${TEST_SOURCES})
These are symbols from the C++ standard library's pseudo-random number generation, but you haven't qualified the symbols with the namespace std, so use std::default_random_engine and std::uniform_int_distribution.