Exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED' thrown while training ONNX's pretrained Emotion FerPlus model - cntk

I am trying to train the Emotion FerPlus emotion recognition model, but training fails with a 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED' error.
I am using an Nvidia Titan RTX GPU (24 GB) with the CNTK-GPU Docker image.
I changed the minibatch_size from 32 to 1, but the error remains.
The complete error message is:
About to throw exception 'cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))'
cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))
Traceback (most recent call last):
File "train.py", line 193, in <module>
main(args.base_folder, args.training_mode)
File "train.py", line 124, in main
trainer.train_minibatch({input_var : images, label_var : labels})
File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/train/trainer.py", line 184, in train_minibatch
device)
File "/root/anaconda3/envs/cntk-py35/lib/python3.5/site-packages/cntk/cntk_py.py", line 3065, in train_minibatch
return _cntk_py.Trainer_train_minibatch(self, *args)
RuntimeError: cuDNN failure 8: CUDNN_STATUS_EXECUTION_FAILED ; GPU=0 ; hostname=d9150da5d531 ; expr=cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out))
[CALL STACK]
[0x7fc04da7ce89] + 0x732e89
[0x7fc045a71aaf] + 0xeabaaf
[0x7fc045a7b613] Microsoft::MSR::CNTK::CuDnnConvolutionEngine<float>:: ForwardCore (Microsoft::MSR::CNTK::Matrix<float> const&, Microsoft::MSR::CNTK::Matrix<float> const&, Microsoft::MSR::CNTK::Matrix<float>&, Microsoft::MSR::CNTK::Matrix<float>&) + 0x1a3
[0x7fc04dd4f8d3] Microsoft::MSR::CNTK::ConvolutionNode<float>:: ForwardProp (Microsoft::MSR::CNTK::FrameRange const&) + 0xa3
[0x7fc04dfba654] Microsoft::MSR::CNTK::ComputationNetwork::PARTraversalFlowControlNode:: ForwardProp (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&, Microsoft::MSR::CNTK::FrameRange const&) + 0xf4
[0x7fc04dcb6e33] std::_Function_handler<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&),void Microsoft::MSR::CNTK::ComputationNetwork::ForwardProp<std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&)::{lambda(std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)#1}>:: _M_invoke (std::_Any_data const&, std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&) + 0x63
[0x7fc04dd04ed9] void Microsoft::MSR::CNTK::ComputationNetwork:: TravserseInSortedGlobalEvalOrder <std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>>>(std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>,std::allocator<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>> const&, std::function<void (std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase> const&)> const&) + 0x5b9
[0x7fc04dca64da] CNTK::CompositeFunction:: Forward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x15da
[0x7fc04dc3d603] CNTK::Function:: Forward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&, std::unordered_set<CNTK::Variable,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<CNTK::Variable>> const&) + 0x93
[0x7fc04ddbf91b] CNTK::Trainer:: ExecuteForwardBackward (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&) + 0x36b
[0x7fc04ddc06e4] CNTK::Trainer:: TrainLocalMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, bool, CNTK::DeviceDescriptor const&) + 0x94
[0x7fc04ddc178a] CNTK::Trainer:: TrainMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, bool, std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>>&, CNTK::DeviceDescriptor const&) + 0x5a
[0x7fc04ddc1852] CNTK::Trainer:: TrainMinibatch (std::unordered_map<CNTK::Variable,std::shared_ptr<CNTK::Value>,std::hash<CNTK::Variable>,std::equal_to<CNTK::Variable>,std::allocator<std::pair<CNTK::Variable const,std::shared_ptr<CNTK::Value>>>> const&, bool, CNTK::DeviceDescriptor const&) + 0x52
[0x7fc04eb2db22] + 0x229b22
[0x7fc057ea15e9] PyCFunction_Call + 0xf9
[0x7fc057f267c0] PyEval_EvalFrameEx + 0x6ba0
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f28df5] PyEval_EvalFrameEx + 0x91d5
[0x7fc057f29b49] + 0x144b49
[0x7fc057f29cd8] PyEval_EvalCodeEx + 0x48
[0x7fc057f29d1b] PyEval_EvalCode + 0x3b
[0x7fc057f4f020] PyRun_FileExFlags + 0x130
[0x7fc057f50623] PyRun_SimpleFileExFlags + 0x173
[0x7fc057f6b8c7] Py_Main + 0xca7
[0x400add] main + 0x15d
[0x7fc056f06830] __libc_start_main + 0xf0
[0x4008b9]
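For what it's worth, CNTK does detect the GPU inside the container. A minimal device check (a sketch using CNTK's device API) looks like:
import cntk

# List every device CNTK can see inside the Docker container.
print(cntk.device.all_devices())  # should include GPU[0] (Titan RTX)
# Pin computation to the GPU explicitly.
cntk.device.try_set_default_device(cntk.device.gpu(0))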

CNTK is now in maintenance mode (essentially deprecated). While CNTK exports to ONNX reasonably well, importing ONNX models is not well supported.
ONNX Runtime (https://github.com/microsoft/onnxruntime) now supports training, so please try it. ONNX Runtime training is under active development and fully supported, so if something doesn't quite work, issues are likely to be resolved quickly.
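As a starting point, the pretrained FER+ ONNX model at least loads and runs under ONNX Runtime. A minimal inference sketch; the model filename and the 1x1x64x64 grayscale input shape are assumptions taken from the FER+ model card, so check sess.get_inputs() for the actual input name and shape:
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("emotion-ferplus-8.onnx")
input_name = sess.get_inputs()[0].name
# FER+ expects a single 64x64 grayscale image: shape (1, 1, 64, 64).
dummy = np.random.rand(1, 1, 64, 64).astype(np.float32)
scores = sess.run(None, {input_name: dummy})[0]
print(scores.shape)  # one score per emotion class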

Related

CRASH gmscore::vector::GMSMarkupPolygonInstance::CreateEntities

I have never been able to reproduce this crash myself, but occasionally I see (on Crashlytics) users hitting it. The crash frequency seems to have increased with GoogleMaps version 3.3.0. I am getting this crash on iOS 12 and above.
Crashed: com.apple.main-thread
0 Someapp 0x101027058 gmscore::vector::GMSMarkupPolygonInstance::CreateEntities(gmscore::base::reffed_ptr<gmscore::vector::Camera> const&, gmscore::renderer::EntityRenderer*, id<GMSEntityResources>, gmscore::renderer::ProxySortedRenderBin<std::__1::tuple<unsigned int, unsigned long, unsigned int, gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> >, gmscore::renderer::ProxySortedRenderBin::less<gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> > >*, gmscore::renderer::Behavior*, char const* const&) + 311748
1 Someapp 0x101027044 gmscore::vector::GMSMarkupPolygonInstance::CreateEntities(gmscore::base::reffed_ptr<gmscore::vector::Camera> const&, gmscore::renderer::EntityRenderer*, id<GMSEntityResources>, gmscore::renderer::ProxySortedRenderBin<std::__1::tuple<unsigned int, unsigned long, unsigned int, gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> >, gmscore::renderer::ProxySortedRenderBin::less<gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> > >*, gmscore::renderer::Behavior*, char const* const&) + 311728
2 Someapp 0x101021c90 gmscore::vector::GMSMarkupMultiZoomLinesInstance::UpdateEntities(float, gmscore::base::reffed_ptr<gmscore::vector::Camera>, gmscore::renderer::EntityRenderer*, id<GMSEntityResources>, gmscore::renderer::ProxySortedRenderBin<std::__1::tuple<unsigned int, unsigned long, unsigned int, gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> >, gmscore::renderer::ProxySortedRenderBin::less<gmscore::base::reffed_ptr<gmscore::renderer::BaseEntity> > >*, gmscore::renderer::Behavior*, char const* const&) + 290300
3 Someapp 0x10101bb94 gmscore::vector::GMSMarkupBehavior::UpdateInstanceMap(std::__1::map<unsigned long, gmscore::base::reffed_ptr<gmscore::vector::GMSMarkupInstance>, std::__1::less<unsigned long>, std::__1::allocator<std::__1::pair<unsigned long const, gmscore::base::reffed_ptr<gmscore::vector::GMSMarkupInstance> > > > const&, gmscore::renderer::EntityRenderer*, bool) + 265472
4 Someapp 0x10101b428 gmscore::vector::GMSMarkupBehavior::Commit(gmscore::renderer::EntityRenderer*) + 263572
5 Someapp 0x100efa9e8 gmscore::renderer::EntityRenderer::Draw(bool) + 400
6 Someapp 0x100f9ded4 -[GMSPhoenixRenderer drawForced:] + 6452
7 Someapp 0x100f80bc8 -[GMSEntityRendererView draw] + 518060
8 Someapp 0x100f7f224 -[GMSEntityRendererView displayLinkFired:] + 511496
9 Someapp 0x100f7de10 -[GMSDisplayLink displayLinkFired:] + 506356
10 QuartzCore 0x1f9308f90 CA::Display::DisplayLink::dispatch_items(unsigned long long, unsigned long long, unsigned long long) + 636
11 QuartzCore 0x1f93d2b10 display_timer_callback(__CFMachPort*, void*, long, void*) + 272
12 CoreFoundation 0x1f4eeca8c __CFMachPortPerform + 188
13 CoreFoundation 0x1f4f13690 __CFRUNLOOP_IS_CALLING_OUT_TO_A_SOURCE1_PERFORM_FUNCTION__ + 56
14 CoreFoundation 0x1f4f12ddc __CFRunLoopDoSource1 + 440
15 CoreFoundation 0x1f4f0dc00 __CFRunLoopRun + 2096
16 CoreFoundation 0x1f4f0d0b0 CFRunLoopRunSpecific + 436
17 GraphicsServices 0x1f710d79c GSEventRunModal + 104
18 UIKitCore 0x221887978 UIApplicationMain + 212
19 Someapp 0x100511850 main + 16 (TripResultViewController+Validation.swift:16)
20 libdyld.dylib 0x1f49d28e0 start + 4
I am facing the same issue, but on iOS 13 devices with the GoogleMaps 3.7.0 pod.
The error only appears the first time I open the app (after installing it); subsequent runs work fine, which may give you some clues about the crash.
There is an open issue for the maps-sdk-for-ios that is worth keeping an eye on, because the crash seems to be caused by a bug in the SDK:
issue tracker link
I hope this information helps to solve the problem, or at least sheds some light on it.
The issue was originally reported here: https://github.com/googlemaps/google-maps-ios-utils/issues/236
In our app we use the Google Maps SDK, and I can reproduce this issue by zooming in on the map to the maximum level (starting with version 3.3.0).
I reported it here: https://issuetracker.google.com/issues/148238890

TensorFlow v2.0.0 C API crashes when loading "saved_model" format Keras model

TensorFlow v2.0.0 segfaults when trying to load a model using the C API.
The Keras model is absurdly simple:
import tensorflow as tf

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(28*28,)))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.summary()

# ---- Compile the network
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
The Keras model was saved using the TensorFlow v2.0.0 backend (I checked this).
model_fname = '/tmp/test1.h5'
print('Saving model to {}'.format(model_fname))
# Note: tf.saved_model.save writes a SavedModel *directory* at this path,
# despite the '.h5' suffix; it is not an HDF5 file.
tf.saved_model.save(model, model_fname)
It was then loaded via the C API:
const auto export_dir = format("{}", filename);
vector<const char*> tags = {"serve"};
TF_Graph* graph = TF_NewGraph();
TF_Session* session = TF_LoadSessionFromSavedModel(opts,
                                                   nullptr,
                                                   export_dir.c_str(),
                                                   tags.data(),
                                                   tags.size(),
                                                   graph,
                                                   nullptr,
                                                   status);
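// The call above never inspects `status`; a minimal check (a sketch using the
// standard TF C API status accessors) before using `session`:
if (TF_GetCode(status) != TF_OK)
    fprintf(stderr, "load failed: %s\n", TF_Message(status));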
This results in a segfault. Below is the stack trace from ASAN:
2020-01-09 20:21:48.066441: I tensorflow/cc/saved_model/reader.cc:31] Reading SavedModel from: /tmp/test1.h5
2020-01-09 20:21:48.069571: I tensorflow/cc/saved_model/reader.cc:54] Reading meta graph with tags { serve }
AddressSanitizer:DEADLYSIGNAL
=================================================================
==36979==ERROR: AddressSanitizer: SEGV on unknown address 0x000002000904 (pc 0x7f33b0591e7d bp 0x7ffee61acb20 sp 0x7ffee61ac960 T0)
==36979==The signal is caused by a READ memory access.
#0 0x7f33b0591e7c in tensorflow::GPUCompatibleCPUDeviceFactory::CreateDevices(tensorflow::SessionOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> >, std::allocator<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> > > >*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0xfb1e7c)
#1 0x7f33b05d4f2a in tensorflow::DeviceFactory::AddDevices(tensorflow::SessionOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> >, std::allocator<std::unique_ptr<tensorflow::Device, std::default_delete<tensorflow::Device> > > >*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0xff4f2a)
#2 0x7f33b9529172 in tensorflow::DirectSessionFactory::NewSession(tensorflow::SessionOptions const&, tensorflow::Session**) (/opt/tensorflow/v2.0.0/lib/libtensorflow.so.2+0x818e172)
#3 0x7f33b065159f in tensorflow::NewSession(tensorflow::SessionOptions const&, tensorflow::Session**) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0x107159f)
#4 0x7f33afce3df8 in tensorflow::LoadSavedModel(tensorflow::SessionOptions const&, tensorflow::RunOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, tensorflow::SavedModelBundle*) (/opt/tensorflow/v2.0.0/lib/libtensorflow_framework.so.2+0x703df8)
#5 0x7f33b1d22eb6 in TF_LoadSessionFromSavedModel (/opt/tensorflow/v2.0.0/lib/libtensorflow.so.2+0x987eb6)
#6 0x6f4812 in zero::test::load_model_test(std::basic_string_view<char, std::char_traits<char> >) src/zero/tf-tests/load-model_test.cpp:216
#7 0x713ba0 in main src/zero/main.cpp:35
#8 0x7f33aedd9b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96)
#9 0x417989 in _start (/home/amichaux/Documents/CrashPlan/Development/shinny/zero/build/gcc-10-asan/zero+0x417989)
I'm not sure what to do with this error message. The model loads and executes fine in Python 3.
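For reference, the Python-side check is straightforward. A minimal verification sketch (assuming the same /tmp/test1.h5 path, which tf.saved_model.save treats as a SavedModel directory despite the suffix):
import tensorflow as tf

loaded = tf.saved_model.load('/tmp/test1.h5')
print(list(loaded.signatures.keys()))        # expect ['serving_default']
infer = loaded.signatures['serving_default']
print(infer.structured_input_signature)      # inspect the expected input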

Tensorflow with XLA hangs with both GPU and CPU at ~0% usage

I wanted to try out the XLA backend on TensorFlow 1.1.0, which I built from source with XLA compiler support. I am using Ubuntu 16.04. My model runs fine without the XLA backend, taking about 0.8 seconds per training step on my GTX 1080 GPU. However, when I enable the XLA compiler, the program makes it to my first call to session.run and then just hangs there, with both CPU and GPU at about zero usage.
config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
sess = tf.Session(config=config)
m = model.CharacterTranslator(sess, MAX_LENGTH)
m.init_variables()
best_cost = None
m.restore('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
while True:
    # session.run is called inside of here
    m.train(random.sample(training_data, 40000), 64, False)
    c = m.train(validation_data, 64, True)[0]
    if best_cost is None or c < best_cost:
        count = 0
        best_cost = c
        print('Saving...')
        m.save('/media/chase/98d61322-9ea7-473e-b835-8739c77d1e1e/Translator/model.chk')
    else:
        count += 1
        if count == 10:
            break
...
def train(self, training_data, batch_size, validate=False, verbose=True):
    total_cost = 0
    total_acc = 0
    total_time = 0
    last_chars = 0
    total_batches = len(training_data) // batch_size
    for i, batch in enumerate(_batch(training_data, batch_size, False)):
        x, y = zip(*batch)
        x, xl = zip(*[self._vectorize_sent(s) for s in x])
        y, yl = zip(*[self._vectorize_sent(s) for s in y])
        start_time = time.time()
        c, a, g, l, _ = self.session.run((self.cost, self.accuracy, self.global_step, self.learning_rate, self.null_train_step if validate else self.train_step), {
            self.source_text: x,
            self.target_text: y,
            self.target_length: yl,
            self.teacher_forcing: True,
        })
        end_time = time.time()
        total_cost += c
        total_acc += a
        total_time += end_time - start_time
        if verbose:
            msg = '%s b(%d / %d) g(%d) c(%e) a(%0.4f) lr(%e) dt(%0.2f)' % ('Validating' if validate else 'Training', i, total_batches, g, total_cost / (i + 1), total_acc / (i + 1), l, total_time / (i + 1))
            msg += ' ' * max(0, last_chars - len(msg))
            last_chars = len(msg)
            print(msg, end='\r')
    if verbose:
        print()
    return total_cost / (i + 1), total_acc / (i + 1)
It produces the following TensorFlow output when I try to run it.
2017-04-26 05:15:58.421388: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-04-26 05:15:58.421698: I tensorflow/core/common_runtime/gpu/gpu_device.cc:887] Found device 0 with properties:
name: GeForce GTX 1080
major: 6 minor: 1 memoryClockRate (GHz) 1.7335
pciBusID 0000:01:00.0
Total memory: 7.92GiB
Free memory: 7.33GiB
2017-04-26 05:15:58.421708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:908] DMA: 0
2017-04-26 05:15:58.421711: I tensorflow/core/common_runtime/gpu/gpu_device.cc:918] 0: Y
2017-04-26 05:15:58.421719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:93: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2017-04-26 05:17:17.107616: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.107635: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108265: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xa103840 executing computations on platform Host. Devices:
2017-04-26 05:17:17.108274: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): <undefined>, <undefined>
2017-04-26 05:17:17.108393: I tensorflow/compiler/xla/service/platform_util.cc:58] platform CUDA present with 1 visible devices
2017-04-26 05:17:17.108398: I tensorflow/compiler/xla/service/platform_util.cc:58] platform Host present with 8 visible devices
2017-04-26 05:17:17.108602: I tensorflow/compiler/xla/service/service.cc:183] XLA service 0xe383100 executing computations on platform CUDA. Devices:
2017-04-26 05:17:17.108607: I tensorflow/compiler/xla/service/service.cc:191] StreamExecutor device (0): GeForce GTX 1080, Compute Capability 6.1
I then attached gdb to the process to see what it was doing. It looks like it is just sitting in a pthread condition-variable wait.
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00007f715569291c in std::condition_variable::wait(std::unique_lock<std::mutex>&) ()
from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#2 0x00007f716d85257b in tensorflow::DirectSession::WaitForNotification(tensorflow::Notification*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#3 0x00007f716d85262d in tensorflow::DirectSession::WaitForNotification(tensorflow::DirectSession::RunState*, tensorflow::CancellationManager*, long long) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#4 0x00007f716d85d287 in tensorflow::DirectSession::Run(tensorflow::RunOptions const&, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::vector<tensorflow::Tensor, std::allocator<tensorflow::Tensor> >*, tensorflow::RunMetadata*) ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#5 0x00007f716c3259d1 in TF_Run_Helper(tensorflow::Session*, char const*, TF_Buffer const*, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tensorflow::Tensor> > > const&, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Tensor**, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, TF_Buffer*, TF_Status*) [clone .constprop.554] ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#6 0x00007f716c32639a in TF_Run ()
from /usr/local/lib/python3.5/dist-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#7 0x00007f716c0ab351 in tensorflow::TF_Run_wrapper_helper(TF_DeprecatedSession*, char const*, TF_Buffer const*, _object*, tensorflow::gtl::InlinedVector<char const*, 8> const&, tensorflow::gtl::InlinedVector<char const*, 8> const&, TF_Status*, tensorflow::gtl::InlinedVector<_object*, 8>*, TF_Buffer*) ()
Does anyone have any idea why my TensorFlow model gets stuck when the XLA backend is enabled?

Xcode 7.3 keeps crashing when I enable a breakpoint while debugging

It does not crash when I debug without breakpoints, but as soon as I enable a breakpoint, Xcode runs to that line of code and then immediately crashes.
It crashes on thread 22, as below:
Thread 22 Crashed:: <DBGLLDBSessionThread (pid=838)> Dispatch queue: DVTInvalidationPreventionQueue
0 com.apple.LLDB.framework 0x0000000118e3d986 clang::ASTContext::getFunctionType(clang::QualType, llvm::ArrayRef<clang::QualType>, clang::FunctionProtoType::ExtProtoInfo const&) const + 294
1 com.apple.LLDB.framework 0x000000011a139964 lldb_private::ClangASTContext::CreateFunctionType(clang::ASTContext*, lldb_private::CompilerType const&, lldb_private::CompilerType const*, unsigned int, bool, unsigned int) + 542
2 com.apple.LLDB.framework 0x0000000119fb290a DWARFASTParserClang::ParseTypeFromDWARF(lldb_private::SymbolContext const&, DWARFDIE const&, lldb_private::Log*, bool*) + 9830
3 com.apple.LLDB.framework 0x000000011a120e68 SymbolFileDWARF::ParseType(lldb_private::SymbolContext const&, DWARFDIE const&, bool*) + 184
4 com.apple.LLDB.framework 0x000000011a11b108 SymbolFileDWARF::GetTypeForDIE(DWARFDIE const&, bool) + 368
5 com.apple.LLDB.framework 0x000000011a11ab21 SymbolFileDWARF::ResolveType(DWARFDIE const&, bool, bool) + 129
6 com.apple.LLDB.framework 0x0000000119fb7ff9 DWARFASTParserClang::CompleteTypeFromDWARF(DWARFDIE const&, lldb_private::Type*, lldb_private::CompilerType&) + 1165
7 com.apple.LLDB.framework 0x000000011a11aedc SymbolFileDWARF::CompleteType(lldb_private::CompilerType&) + 628
8 com.apple.LLDB.framework 0x000000011a1a59cb lldb_private::Type::ResolveClangType(lldb_private::Type::ResolveStateTag) + 1483
9 com.apple.LLDB.framework 0x000000011a1a5c74 lldb_private::Type::GetFullCompilerType() + 26
10 com.apple.LLDB.framework 0x000000011a0007da lldb_private::ValueObject::MaybeCalculateCompleteType() + 352
11 com.apple.LLDB.framework 0x000000011a000568 lldb_private::ValueObject::GetCompilerType() + 18
12 com.apple.LLDB.framework 0x000000011a1c680a lldb_private::Process::IsPossibleDynamicValue(lldb_private::ValueObject&) + 64
13 com.apple.LLDB.framework 0x000000011a005bca lldb_private::ValueObject::CalculateDynamicValue(lldb::DynamicValueType) + 90
14 com.apple.LLDB.framework 0x000000011a005c6f lldb_private::ValueObject::GetDynamicValue(lldb::DynamicValueType) + 65
15 com.apple.LLDB.framework 0x0000000117ff51dd ValueImpl::GetSP(lldb_private::ProcessRunLock::ProcessRunLocker&, lldb_private::Mutex::Locker&, lldb_private::Error&) + 243
16 com.apple.LLDB.framework 0x0000000117fef90a lldb::SBValue::GetSP(ValueLocker&) const + 58
17 com.apple.LLDB.framework 0x0000000117fef9f9 lldb::SBValue::GetName() + 41
18 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f3ad75 -[DBGLLDBDataValue initWithLLDBValueObject:forStackFrame:withParent:updateSummary:] + 234
19 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f3ac29 -[DBGLLDBDataValue initWithLLDBValueObject:forStackFrame:withParent:] + 92
20 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f38370 -[DBGLLDBStackFrame _findSymbolWithName:symbolKind:atLocation:] + 1070
21 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f38748 __93-[DBGLLDBStackFrame requestDataValueForSymbol:symbolKind:atLocation:onQueue:withResultBlock:]_block_invoke + 60
22 com.apple.dt.DVTFoundation 0x000000010d08dfb4 ___DVTPreventInvalidationForObjectAllowingRecursiveCallsDuringBlock_block_invoke_2 + 49
23 libdispatch.dylib 0x00007fff8491b40b _dispatch_client_callout + 8
24 libdispatch.dylib 0x00007fff8492c5a4 _dispatch_sync_f_invoke + 56
25 com.apple.dt.DVTFoundation 0x000000010d08df54 ___DVTPreventInvalidationForObjectAllowingRecursiveCallsDuringBlock_block_invoke + 150
26 libdispatch.dylib 0x00007fff8491b40b _dispatch_client_callout + 8
27 libdispatch.dylib 0x00007fff8492c5a4 _dispatch_sync_f_invoke + 56
28 com.apple.dt.DVTFoundation 0x000000010d08dda3 _DVTPreventInvalidationForObjectAllowingRecursiveCallsDuringBlock + 383
29 com.apple.dt.DVTFoundation 0x000000010d08e10d _DVTPreventInvalidationAllowingRecursiveCallsDuringBlock + 35
30 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f37b80 __45-[DBGLLDBStackFrame _addSessionThreadAction:]_block_invoke + 108
31 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f50755 -[DBGLLDBSession handleNextActionWithState:withRunPending:] + 457
32 com.apple.dt.dbg.DebuggerLLDB 0x0000000117f4aca5 DBGLLDBSessionThread(void*) + 2089
33 libsystem_pthread.dylib 0x00007fff8dc4599d _pthread_body + 131
34 libsystem_pthread.dylib 0x00007fff8dc4591a _pthread_start + 168
35 libsystem_pthread.dylib 0x00007fff8dc43351 thread_start + 13
Thread 22 crashed with X86 Thread State (64-bit):
Adding the build setting CLANG_ENABLE_MODULE_DEBUGGING = NO stopped the crash for me.
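If you prefer to apply this project-wide rather than per target, the same flag can live in an .xcconfig file. A sketch (the file name is arbitrary):
// Debug.xcconfig -- disable module debug info, which LLDB's DWARF parsing
// (visible in the crashed thread's stack) appears to choke on
CLANG_ENABLE_MODULE_DEBUGGING = NO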

How to debug a crash during SpriteKit rendering?

I'm making an iOS game using SpriteKit, and I've also integrated Crashlytics for error reporting. I'm receiving the following error report periodically; I'm unable to reproduce it and unsure how to even start going about finding the problem. Any ideas would be greatly appreciated.
Thread : Crashed: com.apple.spritekit.renderQueue
0 SpriteKit 0x2faed240 SKCShapeSprite::_NEW_copyRenderPathData(SKRenderQuad*, bool) const + 711
1 ??? 0x0061b9d0
2 SpriteKit 0x2fb12d09 SKCShapeSprite::copyRenderPathData(SKRenderQuad*, bool) + 40
3 SpriteKit 0x2fb0fdbd SKCRenderer::preprocessSpriteImp(std::__1::vector<SKCRenderer::SpriteRenderInfo, std::__1::allocator<SKCRenderer::SpriteRenderInfo> >&, SKRenderQuadPool&, SKCSprite const*, _GLKMatrix4 const&, float, unsigned int&, bool) + 428
4 SpriteKit 0x2fb10987 SKCRenderer::preprocessSpriteImp(std::__1::vector<SKCRenderer::SpriteRenderInfo, std::__1::allocator<SKCRenderer::SpriteRenderInfo> >&, SKRenderQuadPool&, SKCSprite const*, _GLKMatrix4 const&, float, unsigned int&, bool) + 3446
5 SpriteKit 0x2fb10987 SKCRenderer::preprocessSpriteImp(std::__1::vector<SKCRenderer::SpriteRenderInfo, std::__1::allocator<SKCRenderer::SpriteRenderInfo> >&, SKRenderQuadPool&, SKCSprite const*, _GLKMatrix4 const&, float, unsigned int&, bool) + 3446
6 SpriteKit 0x2fb0b0d7 SKCRenderer::preprocessAndSubmitSpriteInternal(std::__1::vector<SKCRenderer::SpriteRenderInfo const*, std::__1::allocator<SKCRenderer::SpriteRenderInfo const*> >&, std::__1::vector<SKCRenderer::SpriteRenderInfo, std::__1::allocator<SKCRenderer::SpriteRenderInfo> >&, SKRenderQuadPool&, SKCSprite const*, _GLKMatrix4 const&) + 94
7 SpriteKit 0x2fb0f94f SKCRenderer::preprocessAndSubmitSprite(SKCSprite const*, _GLKMatrix4 const&) + 154
8 SpriteKit 0x2fb0da69 SKCRenderer::submitScene(SKScene*, bool) + 524
9 SpriteKit 0x2fb11ba9 SKCRenderer::renderScene(SKScene*, bool) + 152
10 SpriteKit 0x2faa95f7 -[SKView _renderContent] + 1102
11 libdispatch.dylib 0x3a3758b7 _dispatch_client_callout + 22
12 libdispatch.dylib 0x3a37cb69 _dispatch_barrier_sync_f_invoke + 48
13 SpriteKit 0x2faa917b -[SKView renderContent] + 82
14 SpriteKit 0x2faa5f15 __29-[SKView setUpRenderCallback]_block_invoke + 116
15 SpriteKit 0x2fad7509 -[SKDisplayLink _callbackForNextFrame:] + 248
16 QuartzCore 0x2f679aa3 CA::Display::DisplayLinkItem::dispatch() + 98
17 QuartzCore 0x2f67990b CA::Display::DisplayLink::dispatch_items(unsigned long long, unsigned long long, unsigned long long) + 366
18 IOMobileFramebuffer 0x33f2e82b IOMobileFramebufferVsyncNotifyFunc + 90
19 IOKit 0x2d68c801 IODispatchCalloutFromCFMessage + 256
20 CoreFoundation 0x2c71d8e5 __CFMachPortPerform + 132
21 CoreFoundation 0x2c72ddab __CFRUNLOOP_IS_CALLING_OUT_TO_A_SOURCE1_PERFORM_FUNCTION__ + 34
22 CoreFoundation 0x2c72dd47 __CFRunLoopDoSource1 + 346
23 CoreFoundation 0x2c72c349 __CFRunLoopRun + 1608
24 CoreFoundation 0x2c67a621 CFRunLoopRunSpecific + 476
25 CoreFoundation 0x2c67a433 CFRunLoopRunInMode + 106
26 GraphicsServices 0x33a290a9 GSEventRunModal + 136
27 UIKit 0x2fc65359 UIApplicationMain + 1440