Simplest way to auto-restart a JVM on StackOverflowError - error-handling

It does not seem that there is a -XX option to restart a JVM on StackOverflowError. What is the simplest way to auto-restart a JVM when it gets a StackOverflowError?

HotSpot JVM has a built-in -XX:AbortVMOnException=java.lang.StackOverflowError option, but unfortunately this flag is available only in debug builds of the JVM.
A working solution is to use a JVM TI agent that intercepts all thrown exceptions and aborts the process whenever the exception belongs to the specified class. Here is an example of such an agent.
#include <jvmti.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

static const char* fatal_error_class;

void JNICALL ExceptionCallback(jvmtiEnv* jvmti, JNIEnv* env, jthread thread,
                               jmethodID method, jlocation location, jobject exception,
                               jmethodID catch_method, jlocation catch_location) {
    char* class_name;
    jclass exception_class = env->GetObjectClass(exception);

    // GetClassSignature returns e.g. "Ljava/lang/StackOverflowError;"
    jvmti->GetClassSignature(exception_class, &class_name, NULL);

    // Strip the trailing ';' and skip the leading 'L' before comparing
    class_name[strlen(class_name) - 1] = 0;
    if (strcmp(class_name + 1, fatal_error_class) == 0) {
        printf("Abort on fatal error\n");
        exit(1);
    }
    jvmti->Deallocate((unsigned char*)class_name);
}

extern "C" JNIEXPORT jint JNICALL Agent_OnLoad(JavaVM* vm, char* options, void* unused) {
    if (options == NULL || options[0] == 0) {
        printf("Usage: -agentpath:/path/to/libabort.so=java/lang/StackOverflowError\n");
        return 1;
    }
    fatal_error_class = strdup(options);

    jvmtiEnv* jvmti;
    vm->GetEnv((void**)&jvmti, JVMTI_VERSION_1_0);

    // Request the capability to receive Exception events, then subscribe to them
    jvmtiCapabilities capabilities = {0};
    capabilities.can_generate_exception_events = 1;
    jvmti->AddCapabilities(&capabilities);

    jvmtiEventCallbacks callbacks = {0};
    callbacks.Exception = ExceptionCallback;
    jvmti->SetEventCallbacks(&callbacks, sizeof(callbacks));
    jvmti->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_EXCEPTION, NULL);
    return 0;
}
How to compile it:
g++ -I $JAVA_HOME/include -I $JAVA_HOME/include/linux -fPIC -shared -o libabort.so abort.cpp
How to run:
java -agentpath:/path/to/libabort.so=java/lang/StackOverflowError ...
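Note that the agent only makes the JVM exit with a nonzero status; the restart itself has to come from outside. A minimal supervisor loop (a sketch, assuming a POSIX shell, with app.jar standing in for your actual application):

while true; do
    java -agentpath:/path/to/libabort.so=java/lang/StackOverflowError -jar app.jar
    [ $? -eq 0 ] && break  # clean exit: stop; abort from the agent: loop and restart
done

A service manager such as systemd with Restart=on-failure accomplishes the same thing.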

Related

Linux kernel module reference counter is always zero

I am implementing a kernel module that exposes some data to userspace using the mmap interface.
I create a file in the /proc file system, passing a struct file_operations with pointers to the needed functions:
static struct file_operations module_file_ops = {
    .owner = THIS_MODULE,
    .open = module_open,
    .mmap = module_mmap
};

proc_create(THIS_MODULE->name, 0444, NULL, &module_file_ops);
The userspace application is able to open and read from the file (mmap its contents) as expected.
When I run lsof I see the file is opened by the userspace app.
However, lsmod always gives zero as the usage counter even though I set .owner to THIS_MODULE, so I can remove the module with rmmod while the file is in use and lead the system to crash.
Please advise.
module.c
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>

static struct proc_dir_entry *proc_file;

static const struct file_operations test_file_ops = {
    .owner = THIS_MODULE
};

static int __init initialize(void) {
    int error = 0;
    proc_file = proc_create(THIS_MODULE->name, 0444, NULL, &test_file_ops);
    if (!proc_file) {
        error = -EIO;
    }
    return error;
}

static void __exit teardown(void) {
    proc_remove(proc_file);
}

module_init(initialize);
module_exit(teardown);
Makefile
obj-m += test.o
test-objs := module.o
CC=gcc
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
EXTRA_CFLAGS=-I/usr/include -I/usr/include/x86_64-linux-gnu
all:
	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules
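One thing worth testing (a sketch of mine, not a verified diagnosis for this procfs behaviour): if procfs does not translate .owner into a module reference the way character devices do, the reference can be taken explicitly in the open handler and dropped on release, so the module is pinned while the file is open:

/* sketch: pin the module for the lifetime of each open file */
static int module_open(struct inode *inode, struct file *file)
{
    if (!try_module_get(THIS_MODULE))   /* fails while the module is unloading */
        return -ENODEV;
    return 0;
}

static int module_release(struct inode *inode, struct file *file)
{
    module_put(THIS_MODULE);            /* balance the get from open */
    return 0;
}

static const struct file_operations module_file_ops = {
    .owner   = THIS_MODULE,
    .open    = module_open,
    .release = module_release,
    .mmap    = module_mmap,
};

If this works, lsmod should show a nonzero use count while the file is open, and rmmod should refuse to unload the module.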

cooperative_groups::this_grid() causes any CUDA API call to return 'unknown error'

Following the same steps as the CUDA samples to launch a kernel and sync across the grid using cooperative_groups::this_grid().sync() causes any CUDA API call to fail, while using cooperative_groups::this_thread_block().sync() works fine and gives correct results.
I used the following code and CMakeLists.txt (cmake version 3.11.1) to test it using CUDA 10 on a TITAN V GPU (Driver Version 410.73) with Ubuntu 16.04.5 LTS. The code is also available on github in order to make it easy to reproduce the error.
The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]). In order to do this, each thread reads a single element from the array, syncs, and then writes its element to the right destination. The code can easily be modified to verify that this_thread_block().sync() works fine: simply change arr_size to be less than 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead.
test_cg.cu
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <cstdint>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
    //Error handling macro, wrap it around a function call whenever possible
    if (err != cudaSuccess) {
        printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);
#ifdef _WIN32
        system("pause");
#else
        exit(EXIT_FAILURE);
#endif
    }
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************
//********************** cg kernel
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
                                     uint32_t *d_arr){
    uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;
    if (tid < num_elements){
        uint32_t my_element = d_arr[tid];

        //to sync across the whole grid
        cg::grid_group barrier = cg::this_grid();

        //to sync within a single block
        //cg::thread_block barrier = cg::this_thread_block();

        //wait for all reads
        barrier.sync();

        uint32_t tar_id = num_elements - tid - 1;
        d_arr[tar_id] = my_element;
    }
}
//******************************************************************************
//********************** execute
void execute_test(const int sm_count){
    //host array
    const uint32_t arr_size = 1 << 20; //1M
    uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
    //fill with sequential numbers
    std::iota(h_arr, h_arr + arr_size, 0);

    //device array
    uint32_t* d_arr;
    CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
    CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyHostToDevice));

    //launch config
    const int threads = 512;

    //following the same steps done in conjugateGradientMultiBlockCG.cu
    //cuda sample to launch kernel that sync across grid
    //https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436
    int num_blocks_per_sm = 0;
    CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
        (void*)testing_cg_grid_sync, threads, 0));

    dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);

    if(arr_size > grid_dim.x*block_dim.x){
        printf("\n The grid size (numBlocks*numThreads) is less than array size.\n");
        exit(EXIT_FAILURE);
    }
    printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
        block_dim.x);

    //arguments passed to the kernel
    void *kernel_args[] = {
        (void *)&arr_size,
        (void *)&d_arr, };

    //finally launch the kernel
    cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
        grid_dim, block_dim, kernel_args);

    //make sure everything went okay
    CUDA_ERROR(cudaGetLastError());
    CUDA_ERROR(cudaDeviceSynchronize());

    //get results on the host
    CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyDeviceToHost));

    //validate
    for (uint32_t i = 0; i < arr_size; i++){
        if (h_arr[i] != arr_size - i - 1){
            printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
            exit(EXIT_FAILURE);
        }
    }
}
//******************************************************************************
int main(int argc, char**argv) {
    //set to Titan V
    uint32_t device_id = 0;
    cudaSetDevice(device_id);

    //get sm count
    cudaDeviceProp devProp;
    CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
    int sm_count = devProp.multiProcessorCount;

    //execute
    execute_test(sm_count);

    printf("\n Mission accomplished \n");
    return 0;
}
CMakeLists.txt
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
    set(CMAKE_BUILD_TYPE Release)
endif ()
SET(CUDA_SEPARABLE_COMPILATION ON)
########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################
########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################
Running this code gives:
unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67
This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device, and the result is 700. Kindly let me know if you spot me doing something wrong in the code or the CMakeLists.txt file.
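For reference, a device-side check along those lines could look like this (my sketch; the original post does not show the code used for the query):

//sketch: print the architecture the device code was compiled for
__global__ void print_cuda_arch(){
#if defined(__CUDA_ARCH__)
    printf("__CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#endif
}
//launch with: print_cuda_arch<<<1, 1>>>(); followed by cudaDeviceSynchronize();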
With external help, the solution that got the code working is to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS...). The -gencode part is needed because the sm_70 flag was otherwise not passed to the linker properly, and --cudart shared is needed because I only have libcudadevrt.a under my /usr/local/cuda-10.0/lib64/, so I have to signal CUDA to link the shared/dynamic run-time library since the default is to link the static one.
Additionally, using only CUDA_NVCC_FLAGS will pass the sm_70 info to the compiler only, not the linker, while using only CMAKE_CUDA_FLAGS will report the error: namespace "cooperative_groups" has no member "grid_group".
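As a side note (my addition, not part of the fix above): cudaLaunchCooperativeKernel requires device support for cooperative launch, so it is also worth verifying that at run time, e.g. right after picking the device in main():

//sketch: fail early if the device cannot do cooperative launches
int supports_coop_launch = 0;
CUDA_ERROR(cudaDeviceGetAttribute(&supports_coop_launch,
                                  cudaDevAttrCooperativeLaunch, device_id));
if (!supports_coop_launch){
    printf("\n Device %u does not support cooperative launch\n", device_id);
    exit(EXIT_FAILURE);
}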

GetProcAddress returning the correct address with Visual C++ and an incorrect address with g++

This is going to sound really strange. I am using Visual Studio 2017 C++ (native mode) and also g++ 4.7.1-2 of the MinGW toolchain. The target is 64-bit Windows.
Using VS C++, I compile the following trivial program:
#include "stdafx.h"
#include <Windows.h>
#include <winternl.h>
typedef NTSTATUS (NTAPI* RTLINT64)(ULONGLONG, ULONG, PUNICODE_STRING);
RTLINT64 RtlInt64 = (RTLINT64) nullptr;
int main()
{
UNICODE_STRING unicodestring = { 0 };
WCHAR localbuffer[256] = { 0 }; // way more than enough
__int64 value = 0;
unicodestring.Length = 0;
unicodestring.MaximumLength = sizeof(localbuffer);
unicodestring.Buffer = (PWCH) &localbuffer;
// get ntdll's module handle
HMODULE NtDllModule = LoadLibrary(L"ntdll.dll");
if (NtDllModule)
{
RtlInt64 = (RTLINT64) GetProcAddress(NtDllModule,
"RtlInt64ToUnicodeString");
value = 0xFFFFFFFFF;
RtlInt64 (value, 10, &unicodestring);
wprintf(L"%s\n", unicodestring.Buffer);
}
return 0;
}
As expected, GetProcAddress returns the address of RtlInt64ToUnicodeString (no surprise there!)
The code below is, with the exception of the includes, pretty much a carbon copy of the above. Yet, somehow, in the version compiled with g++, GetProcAddress returns the address of RtlInterlockedSetBitRun instead of the address of RtlInt64ToUnicodeString (that IS a surprise!). Here is the code:
// GCC and MinGW version
#include <Windows.h>
#include <winbase.h>
#include <strsafe.h>
#include <winuser.h>
#include <winternl.h>
// --------------------------------------------------------------------------
typedef NTSTATUS(NTAPI* RTLINT64)(ULONGLONG, ULONG, PUNICODE_STRING);
RTLINT64 RtlInt64 = (RTLINT64) nullptr;
// --------------------------------------------------------------------------
int main(int argc, char *argv[])
{
    WCHAR localbuffer[256] = {0}; // way more than enough
    UNICODE_STRING unicodestring = {0};
    __int64 value = 0;
    unicodestring.Length = 0;
    unicodestring.MaximumLength = sizeof(localbuffer);
    unicodestring.Buffer = (PWCH) &localbuffer;
    // get ntdll's module handle
    HMODULE NtDllModule = LoadLibraryW(L"ntdll.dll");
    if (NtDllModule)
    {
        RtlInt64 = (RTLINT64) GetProcAddress(NtDllModule,
                                             "RtlInt64ToUnicodeString");
        // the above call to GetProcAddress returned the address of
        // RtlInterlockedSetBitRun instead of the address of the requested
        // function; as a result, the statements below don't work.
        value = 0xFFFFFFFFF;
        RtlInt64(value, 10, &unicodestring);
        wprintf(L"%s\n", unicodestring.Buffer);
    }
    return 0;
}
My question is: is there something in the above code that justifies the discrepancy?
Also note that I am using g++ with a tool called VisualGDB, which integrates the compiler and the debugger into Visual Studio. Normally things of that kind can cause strange "side effects", but in this case it seems rather unlikely that something that has nothing to do with ntdll would be the culprit.
Thank you for your help.
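A small diagnostic that might help narrow this down (my addition, not from the original post): print the raw pointer values in both builds, inside the if (NtDllModule) block. If the numbers agree, GetProcAddress is returning the same address under both toolchains, and the discrepancy lies in how the debugger maps that address back to a symbol name rather than in GetProcAddress itself.

// sketch: compare these two values between the VS and g++ builds
wprintf(L"ntdll base: %p, RtlInt64ToUnicodeString: %p\n",
        (void*)NtDllModule, (void*)RtlInt64);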

Find CPU times and system times of process in linux

I have a main program that creates two children, and each child calls execv. At the end of the program, how do I calculate the CPU times and system times of the parent and the two child processes?
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
int main()
{
    pid_t pid1, pid2, wid; // variables for the parent and two children
    char *my_args[3];      // string array containing the arguments for executing sigshooter1
    // int aInt = 368;
    char str[15];          // string to contain the pids of children when passed as command line arguments

    pid1 = fork();
    if (pid1 < 0)
    {
        fprintf(stderr, ": fork failed: %s\n", strerror(errno));
        exit(1);
    }
    if (pid1 == 0)
    {
        my_args[0] = "sigperf1";
        my_args[1] = "0";
        my_args[2] = NULL;
        execv("sigshooter1", my_args);
        fprintf(stderr, "sigshooter1 cannot be executed by first child...");
        exit(-1);
    }
    pid2 = fork();
    if (pid2 < 0)
    {
        fprintf(stderr, ": fork failed: %s\n", strerror(errno));
        exit(1);
    }
    if (pid2 == 0)
    {
        sprintf(str, "%d", pid1);
        my_args[0] = "sigperf1";
        my_args[1] = str;
        my_args[2] = NULL;
        // printf("this is converted = %s\n", my_args[1]);
        //sleep(1);
        execv("sigshooter1", my_args);
        fprintf(stderr, "sigshooter1 cannot be executed by second child...");
        exit(-1);
    }
    wid = wait(NULL);
}
You'll need a profiler for that. For starters, you can run perf stat ./a.out to get the total CPU time of all three processes, and perf stat -i ./a.out to get the CPU time of the parent process only.
If you need something more detailed, take a look at more serious tools like valgrind or gprof.
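If you want the numbers from inside the program instead, times(2) reports user and system CPU time for the calling process and, separately, for its waited-for children. A minimal sketch (my addition; it goes after the wait() calls and assumes both children have been waited for):

#include <sys/times.h>
#include <unistd.h>

struct tms t;
long ticks = sysconf(_SC_CLK_TCK); /* clock ticks per second */
times(&t);
printf("parent:   user %.2fs, system %.2fs\n",
       (double)t.tms_utime / ticks, (double)t.tms_stime / ticks);
printf("children: user %.2fs, system %.2fs\n",
       (double)t.tms_cutime / ticks, (double)t.tms_cstime / ticks);

Note that your code only calls wait() once, so you would need a second wait() for both children's times to be counted in tms_cutime/tms_cstime.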

running gzip on single core in multicore environment under unix

I have a requirement to use only a single core to test gzip performance in a multi-core CPU environment (I am not sure what the default behaviour of gzip is in this case). I need help finding the command to run gzip compression on a single core.
Thanks
gzip is single-threaded, so in effect it will only ever use one core's worth of CPU at a time; the scheduler may move it between physical cores, but nothing in it runs in parallel.
If you absolutely must run it on one particular core and you're on Linux, you would set its affinity to that core.
http://man7.org/linux/man-pages/man2/sched_setaffinity.2.html
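If all you need is a shell command rather than C, taskset(1) from util-linux wraps the same syscall (my suggestion, in addition to the code below): taskset -c 0 gzip bigfile.dat pins gzip to CPU 0 for the duration of the run, with bigfile.dat standing in for whatever input you are compressing.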
This is code that I got from the man page.
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
                        } while (0)

int
main(int argc, char *argv[])
{
    cpu_set_t set;
    int parentCPU, childCPU;
    int nloops, j;

    if (argc != 4) {
        fprintf(stderr, "Usage: %s parent-cpu child-cpu num-loops\n",
                argv[0]);
        exit(EXIT_FAILURE);
    }

    parentCPU = atoi(argv[1]);
    childCPU = atoi(argv[2]);
    nloops = atoi(argv[3]);

    CPU_ZERO(&set);

    switch (fork()) {
    case -1:            /* Error */
        errExit("fork");

    case 0:             /* Child */
        CPU_SET(childCPU, &set);

        if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
            errExit("sched_setaffinity");

        for (j = 0; j < nloops; j++)
            getppid();

        exit(EXIT_SUCCESS);

    default:            /* Parent */
        CPU_SET(parentCPU, &set);

        if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
            errExit("sched_setaffinity");

        for (j = 0; j < nloops; j++)
            getppid();

        wait(NULL);     /* Wait for child to terminate */
        exit(EXIT_SUCCESS);
    }
}
If you need to test with no interruptions from the kernel at all, you would need to write a kernel module for that.