TensorFlow with XLA hangs with both GPU and CPU at ~0% usage

I wanted to try out the XLA backend on TensorFlow 1.1.0, which I built from source with XLA support, on Ubuntu 16.04. My model runs fine without the XLA backend and takes about 0.8 seconds per training step on my GTX 1080 GPU. However, when I enable the XLA compiler, the program gets as far as the first call to session.run in my model and then just hangs there, with both CPU and GPU at roughly 0% usage.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config = config)
m = model.CharacterTranslator(sess, MAX_LENGTH)
m.init_variables()
best_cost = None
m.restore('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
while True:
    #session.run is called inside of here
    m.train(random.sample(training_data, 40000), 64, False)
    c = m.train(validation_data, 64, True)[0]
    if best_cost is None or c < best_cost:
        count = 0
        best_cost = c
        print('Saving...')
        m.save('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
    else:
        count += 1
        if count == 10:
            break
...
def train(self, training_data, batch_size, validate = False, verbose = True):
    total_cost = 0
    total_acc = 0
    total_time = 0
    last_chars = 0
    total_batches = len(training_data) // batch_size
    for i, batch in enumerate(_batch(training_data, batch_size, False)):
        x, y = zip(*batch)
        x, xl = zip(*[self._vectorize_sent(s) for s in x])
        y, yl = zip(*[self._vectorize_sent(s) for s in y])
        start_time = time.time()
        c, a, g, l, _ = self.session.run((self.cost, self.accuracy, self.global_step, self.learning_rate, self.null_train_step if validate else self.train_step), {
            self.source_text: x,
            self.target_text: y,
            self.target_length: yl,
            self.teacher_forcing: True,
        })
        end_time = time.time()
        total_cost += c
        total_acc += a
        total_time += end_time - start_time
        if verbose:
            msg = '%s b(%d / %d) g(%d) c(%e) a(%0.4f) lr(%e) dt(%0.2f)' % ('Validating' if validate else 'Training', i, total_batches, g, total_cost / (i + 1), total_acc / (i + 1), l, total_time / (i + 1))
            msg += ' ' * max(0, last_chars - len(msg))
            last_chars = len(msg)
            print(msg, end = '\r')
    if verbose:
        print()
    return total_cost / (i + 1), total_acc / (i + 1)
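The snippet above turns XLA on for the whole graph via the global JIT level. For bisecting which part of the model triggers the hang, XLA can instead be scoped to individual sections of the graph. A minimal sketch, assuming tf.contrib.compiler.jit is available in this 1.1.0 source build (build_decoder and encoder_state are hypothetical stand-ins for part of the model):
from tensorflow.contrib.compiler import jit

# Only ops built inside this scope are clustered for XLA compilation;
# the rest of the graph runs through the normal executors.
with jit.experimental_jit_scope():
    decoder_outputs = build_decoder(encoder_state)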
When I run it with the global JIT enabled, it produces the following TensorFlow output.
2017-04-26 05:15:58.421388: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-04-26 05:15:58.421698: I tensorflow/core/common_runtime/gpu/gpu_device.cc:887] Found device 0 with properties:
name: GeForce GTX 1080
major: 6 minor: 1 memoryClockRate (GHz) 1.7335
pciBusID 0000:01:00.0
Total memory: 7.92GiB
Free memory: 7.33GiB
2017-04-26 05:15:58.421708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:908] DMA: 0
2017-04-26 05:15:58.421711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:918] 0: Y
2017-04-26 05:15:58.421719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:93: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2017-04-26 05:17:17.107616: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.107635: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108265: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xa103840 executing computations on platform Host. Devices:
2017-04-26 05:17:17.108274: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): <undefined>, <undefined>
2017-04-26 05:17:17.108393: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.108398: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108602: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xe383100 executing computations on platform CUDA. Devices:
2017-04-26 05:17:17.108607: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
I then attached gdb to the process to see what it was doing. It looks like it is just sitting in a pthread condition wait.
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f715569291c in std::condition_variable::wait(std::unique_lock<std::mutex>&) ()
from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f716d85257b in tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#3 0x00007f716d85262d in tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#4 0x00007f716d85d287 in tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#5 0x00007f716c3259d1 in TF_Run_Helper(tensorflow::Session*, char const*, TF_Buffer const*, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Tensor**, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Buffer*, TF_Status*) [clone .constprop.554] ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#6 0x00007f716c32639a in TF_Run ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#7 0x00007f716c0ab351 in tensorflow::TF_Run_wrapper_helper(TF_DeprecatedSession*, char const*, TF_Buffer const*, _object*, tensorflow::gtl::InlinedVector<char const*, 8> const&, tensorflow::gtl::InlinedVector<char const*, 8> const&, TF_Status*, tensorflow::gtl::InlinedVector<_object*, 8>*, TF_Buffer*) ()
Does anyone have any idea why my TensorFlow model gets stuck when the XLA backend is enabled?
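One way to confirm that the run is genuinely stuck, rather than just spending a very long time in XLA compilation, is to set a timeout on the session so the blocked session.run raises an error instead of waiting forever. A minimal sketch; the 10-minute value is arbitrary:
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
config.operation_timeout_in_ms = 600000   # any single blocking run call aborts after 10 minutes
sess = tf.Session(config=config)
# a hung session.run should now fail with a DeadlineExceeded error instead of blocking forever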

Related

Exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED' when training ONNX's pretrained Emotion FerPlus model

I am trying to train the Emotion FerPlus emotion recognition model.
Training fails with a cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED error.
I am using an NVIDIA Titan RTX GPU with 24 GB of memory.
I changed the minibatch_size from 32 to 1, but the error persists.
I am using the CNTK-GPU Docker image.
The complete error message is:
About to throw exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))'
cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))
Traceback (most recent call last):
File "train.py", line 193, in <module>
main(args.base_folder, args.training_mode)
File "train.py", line 124, in main
trainer.train_minibatch({input_var : images, label_var : labels})
File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/train/trainer.py", line 184, in train_minibatch
device)
File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/cntk_py.py", line 3065, in train_minibatch
return _cntk_py.Trainer_train_minibatch(self, *args)
RuntimeError: cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))
[CALL STACK]
[0x7fc04da7ce89] + 0x732e89
[0x7fc045a71aaf] + 0xeabaaf
[0x7fc045a7b613] Microsoft::MSR::CNTK::CuDnnConvolutionEngine<float>:: ForwardCore (Microsoft::MSR::CNTK::Matrix<float> const&, Microsoft::MSR::CNTK::Matrix<float> const&, Microsoft::MSR::CNTK::Matrix<float>&, Microsoft::MSR::CNTK::Matrix<float>&) + 0x1a3
[0x7fc04dd4f8d3] Microsoft::MSR::CNTK::ConvolutionNode<float>:: ForwardProp (Microsoft::MSR::CNTK::FrameRange const&) + 0xa3
[0x7fc04dfba654] Microsoft::MSR::CNTK::ComputationNetwork::PARTraversalFlowControlNode:: ForwardProp (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&, Microsoft::MSR::CNTK::FrameRange const&) + 0xf4
[0x7fc04dcb6e33] std::_Function_handler<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&),void Microsoft::MSR::CNTK::ComputationNetwork::ForwardProp<std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&)::{lambda(std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)#1}>:: _M_invoke (std::_Any_data const&, std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&) + 0x63
[0x7fc04dd04ed9] void Microsoft::MSR::CNTK::ComputationNetwork:: TravserseInSortedGlobalEvalOrder <std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&, std::function<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)> const&) + 0x5b9
[0x7fc04dca64da] CNTK::CompositeFunction:: Forward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x15da
[0x7fc04dc3d603] CNTK::Function:: Forward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x93
[0x7fc04ddbf91b] CNTK::Trainer:: ExecuteForwardBackward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&) + 0x36b
[0x7fc04ddc06e4] CNTK::Trainer:: TrainLocalMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, bool, CNTK::DeviceDescriptor const&) + 0x94
[0x7fc04ddc178a] CNTK::Trainer:: TrainMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, bool, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&) + 0x5a
[0x7fc04ddc1852] CNTK::Trainer:: TrainMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, bool, CNTK::DeviceDescriptor const&) + 0x52
[0x7fc04eb2db22] + 0x229b22
[0x7fc057ea15e9] PyCFunction_Call + 0xf9
[0x7fc057f267c0] PyEval_EvalFrameEx + 0x6ba0
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f29cd8] PyEval_EvalCodeEx + 0x48
[0x7fc057f29d1b] PyEval_EvalCode + 0x3b
[0x7fc057f4f020] PyRun_FileExFlags + 0x130
[0x7fc057f50623] PyRun_SimpleFileExFlags + 0x173
[0x7fc057f6b8c7] Py_Main + 0xca7
[0x400add] main + 0x15d
[0x7fc056f06830] __libc_start_main + 0xf0
[0x4008b9]
CNTK is in maintenance mode now (essentially deprecated). While CNTK can export to ONNX reasonably well, importing ONNX models is not well supported.
ONNX Runtime (https://github.com/microsoft/onnxruntime) now supports training, so please try it. ONNX Runtime training is actively developed and supported, so if something doesn't quite work, the issues are likely to be resolved quickly.
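If the first goal is simply to check that the exported FerPlus ONNX model itself is usable, a quick inference pass with ONNX Runtime is enough. A minimal sketch; the file name and the 1x1x64x64 grayscale input shape are assumptions, so inspect get_inputs() for the real signature:
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("emotion-ferplus.onnx")        # hypothetical file name
print([(i.name, i.shape) for i in sess.get_inputs()])       # check the actual input name and shape

x = np.zeros((1, 1, 64, 64), dtype=np.float32)              # dummy grayscale face, assumed shape
scores = sess.run(None, {sess.get_inputs()[0].name: x})[0]
print(scores.shape)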

TensorFlow v2.0.0 C API crashes when loading "saved_model" format Keras model

TensorFlow v2.0.0 segfaults when trying to load a model using the C API.
The Keras model is absurdly simple:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(28*28,)))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.summary()
# ---- Compile the network
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
The Keras model was saved using the TensorFlow v2.0.0 backend (I checked this).
model_fname = '/tmp/test1.h5'
print('Saving model to {}'.format(model_fname))
tf.saved_model.save(network, model_fname);
It was then loaded via the C API:
const auto export_dir = format("{}", filename);
vector<const char*> tags;
tags.resize(1);
tags[0] = "serve";
TF_Graph* graph = TF_NewGraph();
TF_Session* session = TF_LoadSessionFromSavedModel(opts,
                                                   nullptr,
                                                   export_dir.c_str(),
                                                   &tags[0],
                                                   tags.size(),
                                                   graph,
                                                   nullptr,
                                                   status);
This results in a segfault. Below is the stack trace from ASAN:
2020-01-09 20:21:48.066441: I tensorflow/cc/saved_model/reader.cc:31] Reading SavedModel from: /tmp/test1.h5
2020-01-09 20:21:48.069571: I tensorflow/cc/saved_model/reader.cc:54] Reading meta graph with tags { serve }
AddressSanitizer:DEADLYSIGNAL
=================================================================
==36979==ERROR: AddressSanitizer: SEGV on unknown address 0x000002000904 (pc 0x7f33b0591e7d bp 0x7ffee61acb20 sp 0x7ffee61ac960 T0)
==36979==The signal is caused by a READ memory access.
#0 0x7f33b0591e7c in tensorflow::GPUCompatibleCPUDeviceFactory::CreateDevices(tensorflow::SessionOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> >, std::allocator<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> > > >*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0xfb1e7c)
#1 0x7f33b05d4f2a in tensorflow::DeviceFactory::AddDevices(tensorflow::SessionOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> >, std::allocator<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> > > >*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0xff4f2a)
#2 0x7f33b9529172 in tensorflow::DirectSessionFactory::NewSession(tensorflow::SessionOptions const&, tensorflow::Session**) (/opt/tensorflow/v2.0.0/lib/libtensorflow.so.2+0x818e172)
#3 0x7f33b065159f in tensorflow::NewSession(tensorflow::SessionOptions const&, tensorflow::Session**) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0x107159f)
#4 0x7f33afce3df8 in tensorflow::LoadSavedModel(tensorflow::SessionOptions const&, tensorflow::RunOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, tensorflow::SavedModelBundle*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0x703df8)
#5 0x7f33b1d22eb6 in TF_LoadSessionFromSavedModel (/opt/tensorflow/v2.0.0/lib/libtensorflow.so.2+0x987eb6)
#6 0x6f4812 in zero::test::load_model_test(std::basic_string_view<char, std::char_traits<char> >) src/zero/tf-tests/load-model_test.cpp:216
#7 0x713ba0 in main src/zero/main.cpp:35
#8 0x7f33aedd9b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96)
#9 0x417989 in _start (/home/amichaux/Documents/CrashPlan/Development/shinny/zero/build/gcc-10-asan/zero+0x417989)
I'm not sure what to do with this error message. The model loads and executes fine in Python 3.
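For completeness, this is roughly what that Python-side check looks like in TF 2.0: if the export really is a valid SavedModel with the serve tag, tf.saved_model.load should expose a serving signature. A minimal sketch, assuming the export path from above:
import tensorflow as tf

loaded = tf.saved_model.load('/tmp/test1.h5')      # the path is a directory despite the .h5 name
print(list(loaded.signatures.keys()))              # typically ['serving_default']
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)            # shows the expected input tensor spec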

Valgrind outputs error messages unrelated to the program on macOS High Sierra

I've been trying Valgrind out with the following program:
#include <cstdlib>
#define BUF_SIZE 1000
int main() {
    char *path = new char[BUF_SIZE];
    return 0;
}
Evidently the program has a memory leak, but when I run it under Valgrind I get a lot of unrelated leak errors in addition to the actual leak.
I installed Valgrind using the instructions I found in: https://www.gungorbudak.com/blog/2018/04/28/how-to-install-valgrind-on-macos-high-sierra/
I even tried using --suppressions with darwin*.supp (from the Git repository https://sourceware.org/git/?p=valgrind.git).
Any suggestions or advice will be welcome.
ARIAS-CSC-MBP:cpp arias$ valgrind --leak-check=yes ./test
==72896== Memcheck, a memory error detector
==72896== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==72896== Using Valgrind-3.14.0.GIT and LibVEX; rerun with -h for copyright info
==72896== Command: ./test
==72896==
==72896==
==72896== HEAP SUMMARY:
==72896== in use at exit: 19,411 bytes in 167 blocks
==72896== total heap usage: 188 allocs, 21 frees, 27,859 bytes allocated
==72896==
==72896== 64 bytes in 1 blocks are definitely lost in loss record 27 of 47
==72896== at 0x1000ACC32: calloc (in /usr/local/Cellar/valgrind/HEAD-5f900ed/lib/valgrind/vgpreload_memcheck-amd64-darwin.so)
==72896== by 0x10075ABA4: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10075AC5A: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100759363: _read_images (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100757AC4: map_images_nolock (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10076A7DA: objc_object::sidetable_retainCount() (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100007C64: dyld::notifyBatchPartial(dyld_image_states, bool, char const* (*)(dyld_image_states, unsigned int, dyld_image_info const*), bool, bool) (in /usr/lib/dyld)
==72896== by 0x100007E39: dyld::registerObjCNotifiers(void (*)(unsigned int, char const* const*, mach_header const* const*), void (*)(char const*, mach_header const*), void (*)(char const*, mach_header const*)) (in /usr/lib/dyld)
==72896== by 0x10022271D: _dyld_objc_notify_register (in /usr/lib/system/libdyld.dylib)
==72896== by 0x100757075: _objc_init (in /usr/lib/libobjc.A.dylib)
==72896== by 0x1001ACB34: _os_object_init (in /usr/lib/system/libdispatch.dylib)
==72896== by 0x1001ACB1B: libdispatch_init (in /usr/lib/system/libdispatch.dylib)
==72896==
==72896== 64 bytes in 1 blocks are definitely lost in loss record 28 of 47
==72896== at 0x1000ACC32: calloc (in /usr/local/Cellar/valgrind/HEAD-5f900ed/lib/valgrind/vgpreload_memcheck-amd64-darwin.so)
==72896== by 0x10075ABA4: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10075AC72: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10075AC5A: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100759363: _read_images (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100757AC4: map_images_nolock (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10076A7DA: objc_object::sidetable_retainCount() (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100007C64: dyld::notifyBatchPartial(dyld_image_states, bool, char const* (*)(dyld_image_states, unsigned int, dyld_image_info const*), bool, bool) (in /usr/lib/dyld)
==72896== by 0x100007E39: dyld::registerObjCNotifiers(void (*)(unsigned int, char const* const*, mach_header const* const*), void (*)(char const*, mach_header const*), void (*)(char const*, mach_header const*)) (in /usr/lib/dyld)
==72896== by 0x10022271D: _dyld_objc_notify_register (in /usr/lib/system/libdyld.dylib)
==72896== by 0x100757075: _objc_init (in /usr/lib/libobjc.A.dylib)
==72896== by 0x1001ACB34: _os_object_init (in /usr/lib/system/libdispatch.dylib)
==72896==
==72896== 72 bytes in 3 blocks are possibly lost in loss record 29 of 47
==72896== at 0x1000ACC32: calloc (in /usr/local/Cellar/valgrind/HEAD-5f900ed/lib/valgrind/vgpreload_memcheck-amd64-darwin.so)
==72896== by 0x1007577E2: map_images_nolock (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10076A7DA: objc_object::sidetable_retainCount() (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100007C64: dyld::notifyBatchPartial(dyld_image_states, bool, char const* (*)(dyld_image_states, unsigned int, dyld_image_info const*), bool, bool) (in /usr/lib/dyld)
==72896== by 0x100007E39: dyld::registerObjCNotifiers(void (*)(unsigned int, char const* const*, mach_header const* const*), void (*)(char const*, mach_header const*), void (*)(char const*, mach_header const*)) (in /usr/lib/dyld)
==72896== by 0x10022271D: _dyld_objc_notify_register (in /usr/lib/system/libdyld.dylib)
==72896== by 0x100757075: _objc_init (in /usr/lib/libobjc.A.dylib)
==72896== by 0x1001ACB34: _os_object_init (in /usr/lib/system/libdispatch.dylib)
==72896== by 0x1001ACB1B: libdispatch_init (in /usr/lib/system/libdispatch.dylib)
==72896== by 0x1000BB9C2: libSystem_initializer (in /usr/lib/libSystem.B.dylib)
==72896== by 0x100019AC5: ImageLoaderMachO::doModInitFunctions(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==72896== by 0x100019CF5: ImageLoaderMachO::doInitialization(ImageLoader::LinkContext const&) (in /usr/lib/dyld)
==72896==
==72896== 1,000 bytes in 1 blocks are definitely lost in loss record 39 of 47
==72896== at 0x1000ACC32: calloc (in /usr/local/Cellar/valgrind/HEAD-5f900ed/lib/valgrind/vgpreload_memcheck-amd64-darwin.so)
==72896== by 0x100000EF1: main (anothertest.cpp:8)
==72896==
==72896== 1,792 bytes in 28 blocks are definitely lost in loss record 44 of 47
==72896== at 0x1000ACC32: calloc (in /usr/local/Cellar/valgrind/HEAD-5f900ed/lib/valgrind/vgpreload_memcheck-amd64-darwin.so)
==72896== by 0x10075ABA4: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10075AC72: realizeClass(objc_class*) (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100759363: _read_images (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100757AC4: map_images_nolock (in /usr/lib/libobjc.A.dylib)
==72896== by 0x10076A7DA: objc_object::sidetable_retainCount() (in /usr/lib/libobjc.A.dylib)
==72896== by 0x100007C64: dyld::notifyBatchPartial(dyld_image_states, bool, char const* (*)(dyld_image_states, unsigned int, dyld_image_info const*), bool, bool) (in /usr/lib/dyld)
==72896== by 0x100007E39: dyld::registerObjCNotifiers(void (*)(unsigned int, char const* const*, mach_header const* const*), void (*)(char const*, mach_header const*), void (*)(char const*, mach_header const*)) (in /usr/lib/dyld)
==72896== by 0x10022271D: _dyld_objc_notify_register (in /usr/lib/system/libdyld.dylib)
==72896== by 0x100757075: _objc_init (in /usr/lib/libobjc.A.dylib)
==72896== by 0x1001ACB34: _os_object_init (in /usr/lib/system/libdispatch.dylib)
==72896== by 0x1001ACB1B: libdispatch_init (in /usr/lib/system/libdispatch.dylib)
==72896==
==72896== LEAK SUMMARY:
==72896== definitely lost: 2,920 bytes in 31 blocks
==72896== indirectly lost: 0 bytes in 0 blocks
==72896== possibly lost: 72 bytes in 3 blocks
==72896== still reachable: 200 bytes in 6 blocks
==72896== suppressed: 16,219 bytes in 127 blocks
==72896== Reachable blocks (those to which a pointer was found) are not shown.
==72896== To see them, rerun with: --leak-check=full --show-leak-kinds=all
==72896==
==72896== For counts of detected and suppressed errors, rerun with: -v
==72896== ERROR SUMMARY: 5 errors from 5 contexts (suppressed: 16 from 16)
Valgrind is currently pre-release software for macOS High Sierra. Because Valgrind needs to hook into the macOS kernel (Mach) at a low level, it needs continual tweaking with each major new release of macOS.
At present that means that, even with development Git versions of Valgrind, there will be false-positive errors (or at least memory errors you can do nothing about) in library code that runs within the process space of your small test executable.
Future development releases of Valgrind may address these system errors. You can comfortably ignore them as long as you do not report them as problems in your own code.

How to debug local variables in TensorFlow

I'd like to print the value of a tensor in TensorFlow, but it failed. How can I fix it?
train_vector1, train_vector2, train_vector3, train_vector4, train_vector5,train_labels = decode_records(FLAGS.record_train, FLAGS.epoch, record_params)
sess = tf.Session(config=session_conf)
print(sess.run(train_labels))
When I run tf.py, the process just hangs. Why?
2018-06-15 16:52:53.782143: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2018-06-15 16:52:54.111552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1212] Found device 0 with properties:
name: Tesla P100-PCIE-16GB major: 6 minor: 0 memoryClockRate(GHz): 1.3285
pciBusID: 0000:0d:00.0
totalMemory: 15.89GiB freeMemory: 15.60GiB
2018-06-15 16:52:54.111607: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1312] Adding visible gpu devices: 0
2018-06-15 16:52:54.408837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:993] Creating TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 15128 MB memory) -> physical GPU (device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:0d:00.0, compute capability: 6.0)
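For reference, if decode_records builds a queue-based input pipeline (e.g. a TFRecordReader feeding tf.train.shuffle_batch), then sess.run(train_labels) blocks forever until the queue runners are started, which would explain the hang. A minimal sketch of the usual TF1 pattern, assuming that is the case:
import tensorflow as tf

sess = tf.Session(config=session_conf)
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())     # needed if the pipeline uses epoch counters

# Start the threads that actually fill the input queues.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
print(sess.run(train_labels))                  # now returns a batch instead of hanging

coord.request_stop()
coord.join(threads)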

axTLS ssl_client_new double free or corruption

I have a small problem with axTLS. I'm trying to establish an SSL connection with a server, but it looks like something is double-freed in the ssl_client_new function.
The code creates a new ctx (ssl_ctx_new) and loads the certificates (ssl_obj_load) during initialization with no problem; the socket is connected, but when I call ssl_client_new, libc detects a double free:
"double free or corruption (fasttop): 0x0809f740"
I started the application under gdb, and this is the backtrace:
Program received signal SIGABRT, Aborted.
0xb7fdd424 in __kernel_vsyscall ()
(gdb) bt
#0 0xb7fdd424 in __kernel_vsyscall ()
#1 0xb7d371ef in __GI_raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
#2 0xb7d3a835 in __GI_abort () at abort.c:91
#3 0xb7d722fa in __libc_message (do_abort=2, fmt=0xb7e6a3bc "*** glibc detected *** %s: %s: 0x%s ***\n") at ../sysdeps/unix/sysv/linux/libc_fatal.c:201
#4 0xb7d7ce42 in malloc_printerr (action=<optimized out>, str=<optimized out>, ptr=0x809f740) at malloc.c:5007
#5 0x080712fd in asn1_name ()
#6 0x0806849f in x509_new ()
#7 0x080642cc in process_certificate ()
#8 0x08067caa in do_clnt_handshake ()
#9 0x08066675 in basic_read ()
#10 0x08067a07 in ssl_client_new ()
The ctx pointer and the sockets are OK, and the last two parameters are NULL and 0 (http://axtls.sourceforge.net/dox/group_c_api.html#ga4eef242a559b06d72b862c1e5ab3d0a2).
Here is the strace for my socket:
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 8
fcntl64(8, F_GETFL) = 0x2 (flags O_RDWR)
fcntl64(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0
connect(8, {sa_family=AF_INET, sin_port=htons(443), sin_addr=inet_addr("81.12.132.173")}, 16) = -1 EINPROGRESS (Operation now in progress)
select(9, NULL, [8], NULL, {10, 0}) = 1 (out [8], left {9, 996818})
getsockopt(8, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
fcntl64(8, F_SETFL, O_RDWR) = 0
fcntl64(8, F_GETFL) = 0x2 (flags O_RDWR)
fcntl64(8, F_SETFL, O_RDWR) = 0
write(8, "\26\3\1\0003\1\0\0/\3\1P#hf?\222Y\3046n\215\364\317\34-D8\311\270=\225"..., 56) = 56
read(8, "\26\3\1\0J", 5) = 5
read(8, "\2\0\0F\3\1Q\301_\210\267\200\352*}H\330\265\n;\33\253\31\24\320\377+\3\371\276g\362"..., 74) = 74
read(8, "\26\3\1\20)", 5) = 5
read(8, "\v\0\20%\0\20\"\0\5\0070\202\5\0030\202\3\353\240\3\2\1\2\2\23w\0\0~\263\3446"..., 4137) = 2632
read(8, "1\0270\25\6\3U\4\n\f\16Vodafone Group1)0'\6\3U"..., 1505) = 1505
So the SSL handshake started.
And immediately after this:
open("/dev/tty", O_RDWR|O_NOCTTY|O_NONBLOCK) = 9
writev(9, [{"*** glibc detected *** ", 23}, {"./HTTPS_TOOL", 11}, {": ", 2}, {"double free or corruption (fastt"..., 35}, {": 0x", 4}, {"082ba740", 8}, {" ***\n", 5}], 7*** glibc detected *** ./HTTPS_TOOL: double free or corruption (fasttop): 0x082ba740 ***
) = 88
Any ideas? What could be the problem, and what can I do?
Thank you.
Are you still getting the issue? I haven't had any other reports of this, and it may be related to the way the library is being used.
The library has been checked many times under Valgrind and there are no recent issues. Could you perhaps run it under Valgrind to verify?