I'm using PubSubReactiveFactory fromspring-cloud-gcp-pubsub onOpenJDK 11 Debian Linux and I've observed the following exception in our application:
com.google.api.gax.rpc.InternalException: io.grpc.StatusRuntimeException: INTERNAL: http2 exception
at com.google.api.gax.rpc.ApiExceptionFactory.createException(ApiExceptionFactory.java:110)
at com.google.api.gax.rpc.ApiExceptionFactory.createException(ApiExceptionFactory.java:41)
at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:86)
at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:66)
at com.google.api.gax.grpc.GrpcExceptionCallable$ExceptionTransformingFuture.onFailure(GrpcExceptionCallable.java:97)
at com.google.api.core.ApiFutures$1.onFailure(ApiFutures.java:67)
at com.google.common.util.concurrent.Futures$CallbackListener.run(Futures.java:1132)
at com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:31)
at com.google.common.util.concurrent.AbstractFuture.executeListener(AbstractFuture.java:1270)
at com.google.common.util.concurrent.AbstractFuture.complete(AbstractFuture.java:1038)
at com.google.common.util.concurrent.AbstractFuture.setException(AbstractFuture.java:808)
at io.grpc.stub.ClientCalls$GrpcFuture.setException(ClientCalls.java:572)
at io.grpc.stub.ClientCalls$UnaryStreamToFuture.onClose(ClientCalls.java:542)
at io.grpc.PartialForwardingClientCallListener.onClose(PartialForwardingClientCallListener.java:39)
at io.grpc.ForwardingClientCallListener.onClose(ForwardingClientCallListener.java:23)
at io.grpc.ForwardingClientCallListener$SimpleForwardingClientCallListener.onClose(ForwardingClientCallListener.java:40)
at com.google.api.gax.grpc.ChannelPool$ReleasingClientCall$1.onClose(ChannelPool.java:535)
at io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:562)
at io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:70)
at io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInternal(ClientCallImpl.java:743)
at io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInContext(ClientCallImpl.java:722)
at io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: io.grpc.StatusRuntimeException: INTERNAL: http2 exception
at io.grpc.Status.asRuntimeException(Status.java:535)
... 14 common frames omitted
Caused by: io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2Exception$StreamException: Stream closed before write could take place
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2Exception.streamError(Http2Exception.java:172)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2RemoteFlowController$FlowState.cancel(DefaultHttp2RemoteFlowController.java:481)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2RemoteFlowController$1.onStreamClosed(DefaultHttp2RemoteFlowController.java:105)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection.notifyClosed(DefaultHttp2Connection.java:357)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection$ActiveStreams.removeFromActiveStreams(DefaultHttp2Connection.java:1007)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection$ActiveStreams$2.process(DefaultHttp2Connection.java:968)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection$ActiveStreams.decrementPendingIterations(DefaultHttp2Connection.java:1029)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection$ActiveStreams.forEachActiveStream(DefaultHttp2Connection.java:984)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection.forEachActiveStream(DefaultHttp2Connection.java:209)
at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler.goingAway(NettyClientHandler.java:839)
at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler.access$200(NettyClientHandler.java:91)
at io.grpc.netty.shaded.io.grpc.netty.NettyClientHandler$2.onGoAwayReceived(NettyClientHandler.java:278)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2Connection.goAwayReceived(DefaultHttp2Connection.java:237)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2ConnectionDecoder.onGoAwayRead0(DefaultHttp2ConnectionDecoder.java:217)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2ConnectionDecoder$FrameReadListener.onGoAwayRead(DefaultHttp2ConnectionDecoder.java:583)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2InboundFrameLogger$1.onGoAwayRead(Http2InboundFrameLogger.java:119)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2FrameReader.readGoAwayFrame(DefaultHttp2FrameReader.java:580)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2FrameReader.processPayloadState(DefaultHttp2FrameReader.java:271)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2FrameReader.readFrame(DefaultHttp2FrameReader.java:159)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2InboundFrameLogger.readFrame(Http2InboundFrameLogger.java:41)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.DefaultHttp2ConnectionDecoder.decodeFrame(DefaultHttp2ConnectionDecoder.java:173)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2ConnectionHandler$FrameDecoder.decode(Http2ConnectionHandler.java:378)
at io.grpc.netty.shaded.io.netty.handler.codec.http2.Http2ConnectionHandler.decode(Http2ConnectionHandler.java:438)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:507)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:446)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:276)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.grpc.netty.shaded.io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1371)
at io.grpc.netty.shaded.io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1234)
at io.grpc.netty.shaded.io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1283)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:507)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:446)
at io.grpc.netty.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:276)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at io.grpc.netty.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at io.grpc.netty.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at io.grpc.netty.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at io.grpc.netty.shaded.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:795)
at io.grpc.netty.shaded.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:480)
at io.grpc.netty.shaded.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:378)
at io.grpc.netty.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at io.grpc.netty.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at io.grpc.netty.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 common frames omitted
The "Internal" gRPC status is propagated to application code and force us to retry pull operation polluting logs with errors/warnings along the way.
To add more context this is happening when Pub/Sub PullRequest API takes long time (I'm observing p99 20-30 seconds latency when this happens). Netty closes the connection with "Stream closed before write could take place" in DefaultHttp2RemoteFlowController.java:481 and status io.netty.handler.codec.http2.Http2Error.STREAM_CLOSED(0x5) Then this status code is translated to io.grpc.internal.Http2Error.INTERNAL and propagated up the stack.
Has anybody experience this error and come up with a way to gracefully handle it?
When I am running buildindex in my Websphere application, I have this error in buildindex log:
[2021/05/10 15:41:57:590 GMT] I Data import pre-processing completed in 0.389 seconds for table TI_CAT_EXTENDED_41060.
[2021/05/10 15:41:57:591 GMT] I /opt/IBM/WebSphere/CommerceServer80/instances/auth/search/pre-processConfig/MC_41060/DB2/wc-dataimport-preprocess-catentry-metainf.xml
[2021/05/10 15:41:57:591 GMT] I
Table name: TI_X_CATENT_META_INF_410600
Fetch size: 500
Batch size: 500
[2021/05/10 15:41:58:048 GMT] I Error for batch element #415: DB2 SQL Error: SQLCODE=-302, SQLSTATE=22001, SQLERRMC=null, DRIVER=4.19.77
[2021/05/10 15:41:58:048 GMT] I SQL: SELECT CATENTRY_ID, TITLE, TITLE_KEYWORDS, SHORT_DESC, SHORT_DESC_KEYWORDS, LONG_DESC, LONG_DESC_KEYWORDS, LOCALE FROM X_CATENT_META_INF WHERE STORE_ID = 41006
[2021/05/10 15:41:58:087 GMT] I
The program exiting with exit code: 1.
Data import pre-processing was unsuccessful. An unrecoverable error has occurred.
[2021/05/10 15:41:58:091 GMT] E com.ibm.commerce.foundation.dataimport.preprocess.DataImportPreProcessorMain:handleExecutionException Exception message: CWFDIH0002: An SQL exception was caught. The following error occurred: [jcc][t4][102][10040][4.19.77] Batch failure. The batch was submitted, but at least one exception occurred on an individual member of the batch.
Use getNextException() to retrieve the exceptions for specific batched elements. ERRORCODE=-4229, SQLSTATE=null., stack trace: com.ibm.commerce.foundation.dataimport.exception.DataImportSystemException: CWFDIH0002: An SQL exception was caught. The following error occurred: [jcc][t4][102][10040][4.19.77] Batch failure. The batch was submitted, but at least one exception occurred on an individual member of the batch.
Use getNextException() to retrieve the exceptions for specific batched elements. ERRORCODE=-4229, SQLSTATE=null.
at com.ibm.commerce.foundation.dataimport.preprocess.DataImportPreProcessorMain.processDataConfig(DataImportPreProcessorMain.java:1515)
at com.ibm.commerce.foundation.dataimport.preprocess.DataImportPreProcessorMain.execute(DataImportPreProcessorMain.java:1331)
at com.ibm.commerce.foundation.dataimport.preprocess.DataImportPreProcessorMain.main(DataImportPreProcessorMain.java:534)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:95)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:56)
at java.lang.reflect.Method.invoke(Method.java:620)
at com.ibm.ws.bootstrap.WSLauncher.main(WSLauncher.java:280)
Caused by: com.ibm.db2.jcc.am.BatchUpdateException: [jcc][t4][102][10040][4.19.77] Batch failure. The batch was submitted, but at least one exception occurred on an individual member of the batch.
Use getNextException() to retrieve the exceptions for specific batched elements. ERRORCODE=-4229, SQLSTATE=null
at com.ibm.db2.jcc.am.b4.a(b4.java:475)
at com.ibm.db2.jcc.am.Agent.endBatchedReadChain(Agent.java:414)
at com.ibm.db2.jcc.am.ki.a(ki.java:5342)
at com.ibm.db2.jcc.am.ki.c(ki.java:4929)
at com.ibm.db2.jcc.am.ki.executeBatch(ki.java:3045)
at com.ibm.commerce.foundation.dataimport.preprocess.AbstractDataPreProcessor.populateTable(AbstractDataPreProcessor.java:373)
at com.ibm.commerce.foundation.dataimport.preprocess.StaticAttributeDataPreProcessor.process(StaticAttributeDataPreProcessor.java:461)
at com.ibm.commerce.foundation.dataimport.preprocess.DataImportPreProcessorMain.processDataConfig(DataImportPreProcessorMain.java:1482)
... 7 more
The exception seems to be clear, but I can't identify what is the element #415 in batch. Even the log doesn't helps, because it doesn't point to another more detailed log. Do you have any suggestion for find it?
Thanks to the comment of user #mao, I have followed this link
The failing table first must be identified. Enable more detailed tracing for di-preprocess:
Navigate to :
WC_installdir/instances/instance_name/xml/config/dataimport
and open the logging.properties file. Find all instances of INFO and
change it to FINEST. Optionally increase the size of the log file and
the number of historical log files while editing this file.
Thanks to this suggestion, I had re-run the buildindex process, and found that solr was wrongly grouping fields from original table, thus generating a too long field for the destination, and generating the error.
We have a spring-webflux application running on spring-boot-starter-webflux:jar:2.1.5.RELEASE and reactor-netty:jar:0.8.8.RELEASE. When a reactive client goes away (k8s pod killed or the client subscription is disposed off) before the request is completed, we see the server simply stops processing the request and no more application logs related to that request are seen. However, reactive-netty prints a trace log indicating that the channel is inactive and will be terminated.
Is there anything we can do to handle this termination gracefully? We would ideally like to respond to a cancellation signal from reactor.
2020-03-18T16:46:40.372Z TRACE --- [reactor-http-epoll-2] r.n.c.ChannelOperations : [id: 0x4cad974b, L:/<SERVER_IP_ADDRESS>:8080 ! R:/<SOME_OTHER_IP_ADDRESS>:55436] Disposing ChannelOperation from a channel
java.lang.Exception: ChannelOperation terminal stack
at reactor.netty.channel.ChannelOperations.terminate(ChannelOperations.java:391)
at reactor.netty.channel.ChannelOperations.onInboundClose(ChannelOperations.java:360)
at reactor.netty.channel.ChannelOperationsHandler.channelInactive(ChannelOperationsHandler.java:72)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:257)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:243)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:236)
at io.netty.channel.CombinedChannelDuplexHandler$DelegatingChannelHandlerContext.fireChannelInactive(CombinedChannelDuplexHandler.java:420)
at io.netty.handler.codec.ByteToMessageDecoder.channelInputClosed(ByteToMessageDecoder.java:393)
at io.netty.handler.codec.ByteToMessageDecoder.channelInactive(ByteToMessageDecoder.java:358)
at io.netty.channel.CombinedChannelDuplexHandler.channelInactive(CombinedChannelDuplexHandler.java:223)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:257)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:243)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:236)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelInactive(DefaultChannelPipeline.java:1416)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:257)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:243)
at io.netty.channel.DefaultChannelPipeline.fireChannelInactive(DefaultChannelPipeline.java:912)
at io.netty.channel.AbstractChannel$AbstractUnsafe$8.run(AbstractChannel.java:816)
at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:416)
at io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:331)
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:918)
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at java.base/java.lang.Thread.run(Thread.java:834)
The doOnCancel operator allows for a callback to be provided when a subscription is cancelled. This is different from the doOnEach operator which is run only when the subscription completes or errors out.
https://projectreactor.io/docs/core/release/api/reactor/core/publisher/Mono.html#doOnCancel-java.lang.Runnable-
Moderate workload on 3 node ignite cluster causes one node to fail with stripped pool startvation while archiving WAL.
This happens one or two times in week.
I already checked all IO problems which could hang WAL rollover. But this issue still persist
I am using latest ignite 2.7 as a library inside spring boot application
: >>> Possible starvation in striped pool.
Deadlock: false
Completed: 1397
Thread [name="sys-stripe-7-#8%server.node%", id=22, state=WAITING, blockCnt=3, waitCnt=757]
Lock [object=java.util.concurrent.locks.ReentrantLock$NonfairSync#b01791b, ownerName=sys-#214%server.node%, ownerId=248]
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:209)
at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285)
at o.a.i.i.processors.cache.persistence.wal.FileWriteAheadLogManager$FileWriteHandle.awaitNext(FileWriteAheadLogManager.java:2871)
at o.a.i.i.processors.cache.persistence.wal.FileWriteAheadLogManager$FileWriteHandle.access$2300(FileWriteAheadLogManager.java:2451)
at o.a.i.i.processors.cache.persistence.wal.FileWriteAheadLogManager.rollOver(FileWriteAheadLogManager.java:1205)
at o.a.i.i.processors.cache.persistence.wal.FileWriteAheadLogManager.log(FileWriteAheadLogManager.java:836)
at o.a.i.i.processors.cache.GridCacheMapEntry.logUpdate(GridCacheMapEntry.java:4267)
at o.a.i.i.processors.cache.GridCacheMapEntry$AtomicCacheUpdateClosure.update(GridCacheMapEntry.java:6333)
at o.a.i.i.processors.cache.GridCacheMapEntry$AtomicCacheUpdateClosure.call(GridCacheMapEntry.java:6082)
at o.a.i.i.processors.cache.GridCacheMapEntry$AtomicCacheUpdateClosure.call(GridCacheMapEntry.java:5782)
at o.a.i.i.processors.cache.persistence.tree.BPlusTree$Invoke.invokeClosure(BPlusTree.java:3719)
at o.a.i.i.processors.cache.persistence.tree.BPlusTree$Invoke.access$5900(BPlusTree.java:3613)
at o.a.i.i.processors.cache.persistence.tree.BPlusTree.invokeDown(BPlusTree.java:1895)
at o.a.i.i.processors.cache.persistence.tree.BPlusTree.invoke(BPlusTree.java:1779)
at o.a.i.i.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.invoke0(IgniteCacheOffheapManagerImpl.java:1638)
at o.a.i.i.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.invoke(IgniteCacheOffheapManagerImpl.java:1621)
at o.a.i.i.processors.cache.persistence.GridCacheOffheapManager$GridCacheDataStore.invoke(GridCacheOffheapManager.java:1935)
at o.a.i.i.processors.cache.IgniteCacheOffheapManagerImpl.invoke(IgniteCacheOffheapManagerImpl.java:428)
at o.a.i.i.processors.cache.GridCacheMapEntry.innerUpdate(GridCacheMapEntry.java:2295)
at o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.processDhtAtomicUpdateRequest(GridDhtAtomicCache.java:3242)
at o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.access$600(GridDhtAtomicCache.java:135)
at o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$7.apply(GridDhtAtomicCache.java:309)
at o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$7.apply(GridDhtAtomicCache.java:304)
at o.a.i.i.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1056)
at o.a.i.i.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:581)
at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:380)
at o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:306)
at o.a.i.i.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:101)
at o.a.i.i.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:295)
at o.a.i.i.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1569)
at o.a.i.i.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1197)
at o.a.i.i.managers.communication.GridIoManager.access$4200(GridIoManager.java:127)
at o.a.i.i.managers.communication.GridIoManager$9.run(GridIoManager.java:1093)
at o.a.i.i.util.StripedExecutor$Stripe.body(StripedExecutor.java:505)
at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
at java.lang.Thread.run(Thread.java:748)
ERROR --- [tcp-disco-msg-worker-#2%server.node%] [] o.a.i.i.u.t.G : Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [threadName=sys-stripe-1, blockedFor=10s]
WARN --- [tcp-disco-msg-worker-#2%server.node%] [] o.a.i.i.u.t.G : Thread [name="sys-stripe-1-#2%server.node%", id=16, state=WAITING, blockCnt=0, waitCnt=754]
Lock [object=java.util.concurrent.locks.ReentrantLock$NonfairSync#b01791b, ownerName=sys-#214%server.node%, ownerId=248]
Failure Detection feature is not very well configured in Apache Ignite 2.7 by default. You can turn it off (by setting to NoOp) or set a large failureDetectionTimeout to avoid such messages (and shutdown of nodes).
Our RabbitMQ service crashed twice with the following report in the $RABBITMQ_NODENAME-sasl.log:
=CRASH REPORT==== 7-Jun-2016::14:37:25 ===
crasher:
initial call: gen:init_it/6
pid: <0.223.0>
registered_name: []
exception exit: {{badmatch,
{[{msg_location,
<<162,171,39,113,226,229,228,92,227,253,48,186,
45,48,29,98>>,
1,357,0,583},
******************
16000 similar msg_location lines snipped
******************
1795219}},
[{rabbit_msg_store,combine_files,3,[]},
{rabbit_msg_store_gc,attempt_action,3,[]},
{rabbit_msg_store_gc,handle_cast,2,[]},
{gen_server2,handle_msg,2,[]},
{proc_lib,wake_up,3,
[{file,"proc_lib.erl"},{line,250}]}]}
in function gen_server2:terminate/3
ancestors: [msg_store_persistent,rabbit_sup,<0.159.0>]
messages: [{'$gen_cast',{combine,394,380}}]
links: [#Port<0.86370>,<0.218.0>,#Port<0.86369>]
dictionary: [{{"/var/lib/rabbitmq/mnesia/$RABBITMQ_NODENAME/msg_store_persistent/357.rdq",
fhc_file},
{file,1,false}},
{{"/var/lib/rabbitmq/mnesia/$RABBITMQ_NODENAME/msg_store_persistent/340.rdq",
fhc_file},
{file,1,true}},
{fhc_age_tree,{2,
{{1465,346244,764691},
#Ref<0.0.3145729.257998>,nil,
{{1465,346244,891543},
#Ref<0.0.3145729.258001>,nil,nil}}}},
{{#Ref<0.0.3145729.257998>,fhc_handle},
{handle,{file_descriptor,prim_file,{#Port<0.86369>,59}},
0,false,0,1048576,[],false,
"/var/lib/rabbitmq/mnesia/$RABBITMQ_NODENAME/msg_store_persistent/357.rdq",
[raw,binary,read_ahead,read],
[{write_buffer,1048576}],
false,true,
{1465,346244,764691}}},
{{#Ref<0.0.3145729.258001>,fhc_handle},
{handle,{file_descriptor,prim_file,{#Port<0.86370>,64}},
14212552,false,0,1048576,[],false,
"/var/lib/rabbitmq/mnesia/$RABBITMQ_NODENAME/msg_store_persistent/340.rdq",
[raw,binary,read_ahead,read,write],
[{write_buffer,1048576}],
true,true,
{1465,346244,891543}}}]
trap_exit: false
status: running
heap_size: 121536
stack_size: 27
reductions: 835024
neighbours:
We'd like to understand what this crash report means. Does it signify a bad message, RMQ can't find a message, or something completely different? We're using RabbitMQ 3.1.5 with Erlang 18, and while we know we're using an old version, we want to first know what's causing the crash before dedicating resources to an upgrade.
This message means that RabbitMQ message storage process has failed to combine files during garbage collection on message store. This can in theory cause message loss.
Note that 3.1.5 is not supported and has not been tested with OTP 10. This issue can be already fixed in newer versions though.