Ignite Cache operations get stuck when multiple thin clients try to perform CRUD operations in parallel with partition map exchange - ignite

It looks like running the partition map exchange (PME) process in parallel with cache CRUD operations causes a deadlock or lock-acquisition failures.
What is the recommended strategy for handling this scenario?
The Ignite server logs the following errors:
Exception stack trace 1:
WARNING: Dumping the near node thread that started transaction [xidVer=GridCacheVersion [topVer=247332659, order=1635852705217, nodeOrder=1], nodeId=2735bef0-7404-41e3-843f-7043490c9d84]
Stack trace of the transaction owner thread:
Thread [name="client-connector-#56%perf-dn1%", id=93, state=WAITING, blockCnt=5023, waitCnt=36165]
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:304)
at o.a.i.i.util.future.GridFutureAdapter.get0(GridFutureAdapter.java:178)
at o.a.i.i.util.future.GridFutureAdapter.get(GridFutureAdapter.java:141)
at o.a.i.i.processors.cache.GridCacheAdapter$41.op(GridCacheAdapter.java:3430)
at o.a.i.i.processors.cache.GridCacheAdapter$41.op(GridCacheAdapter.java:3423)
at o.a.i.i.processors.cache.GridCacheAdapter.syncOp(GridCacheAdapter.java:4480)
at o.a.i.i.processors.cache.GridCacheAdapter.remove0(GridCacheAdapter.java:3423)
at o.a.i.i.processors.cache.GridCacheAdapter.remove(GridCacheAdapter.java:3405)
at o.a.i.i.processors.cache.GridCacheAdapter.remove(GridCacheAdapter.java:3388)
at o.a.i.i.processors.cache.IgniteCacheProxyImpl.remove(IgniteCacheProxyImpl.java:1438)
at o.a.i.i.processors.cache.GatewayProtectedCacheProxy.remove(GatewayProtectedCacheProxy.java:964)
at o.a.i.i.processors.platform.client.cache.ClientCacheRemoveKeyRequest.process(ClientCacheRemoveKeyRequest.java:41)
at o.a.i.i.processors.platform.client.ClientRequestHandler.handle(ClientRequestHandler.java:77)
at o.a.i.i.processors.odbc.ClientListenerNioListener.onMessage(ClientListenerNioListener.java:204)
at o.a.i.i.processors.odbc.ClientListenerNioListener.onMessage(ClientListenerNioListener.java:55)
at o.a.i.i.util.nio.GridNioFilterChain$TailFilter.onMessageReceived(GridNioFilterChain.java:279)
at o.a.i.i.util.nio.GridNioFilterAdapter.proceedMessageReceived(GridNioFilterAdapter.java:109)
at o.a.i.i.util.nio.GridNioAsyncNotifyFilter$3.body(GridNioAsyncNotifyFilter.java:97)
at o.a.i.i.util.worker.GridWorker.run(GridWorker.java:120)
at o.a.i.i.util.worker.GridWorkerPool$1.run(GridWorkerPool.java:70)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Exception stack trace 2:
WARNING: >>> Transaction [startTime=11:39:27.214, curTime=11:40:36.277, systemTime=0, userTime=69063, tx=GridNearTxLocal [mappings=IgniteTxMappingsImpl [], nearLocallyMapped=false, colocatedLocallyMapped=false, needCheckBackup=null, hasRemoteLocks=false, trackTimeout=false, systemTime=44700, systemStartTime=0, prepareStartTime=0, prepareTime=0, commitOrRollbackStartTime=0, commitOrRollbackTime=0, lb=null, mvccOp=null, qryId=-1, crdVer=0, thread=client-connector-#57%perf-dn1%, mappings=IgniteTxMappingsImpl [], super=GridDhtTxLocalAdapter [nearOnOriginatingNode=false, span=o.a.i.i.processors.tracing.NoopSpan#4a931268, nearNodes=KeySetView [], dhtNodes=KeySetView [], explicitLock=false, super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false, depEnabled=false, txState=IgniteTxStateImpl [activeCacheIds=[], recovery=null, mvccEnabled=null, mvccCachingCacheIds=[], txMap=EmptySet []], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705226, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=95, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=SUSPENDED, timedOut=false, topVer=AffinityTopologyVersion [topVer=-1, minorTopVer=0], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69079ms, onePhaseCommit=false], size=0]]]]
Nov 2, 2021 11:40:36 AM org.apache.ignite.logger.java.JavaLogger warning
WARNING: First 10 long running cache futures [total=16]
Nov 2, 2021 11:40:36 AM org.apache.ignite.logger.java.JavaLogger warning
WARNING: >>> Future [startTime=11:39:27.324, curTime=11:40:36.277, fut=GridDhtLockFuture [span=o.a.i.i.processors.tracing.NoopSpan#4a931268, nearNodeId=2735bef0-7404-41e3-843f-7043490c9d84, nearLockVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], threadId=124, futId=be58a60ec71-1d64903c-c700-4deb-bace-cc5158713120, lockVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], read=false, err=null, timedOut=false, timeout=0, tx=GridNearTxLocal [mappings=IgniteTxMappingsImpl []dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]], nearLocallyMapped=false, colocatedLocallyMapped=true, needCheckBackup=null, hasRemoteLocks=false, trackTimeout=false, systemTime=75000, systemStartTime=971108549857700, prepareStartTime=0, 
prepareTime=0, commitOrRollbackStartTime=0, commitOrRollbackTime=0, lb=null, mvccOp=null, qryId=-1, crdVer=0, thread=client-connector-#84%perf-dn1%, mappings=IgniteTxMappingsImpl []dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]], super=GridDhtTxLocalAdapter [nearOnOriginatingNode=false, span=o.a.i.i.processors.tracing.NoopSpan#4a931268, nearNodes=KeySetView [], dhtNodes=KeySetView [], explicitLock=false, super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false, depEnabled=false, txState=IgniteTxStateImpl [activeCacheIds=[585748697], recovery=false, mvccEnabled=false, mvccCachingCacheIds=[], txMap=ArrayList [IgniteTxEntry [txKey=IgniteTxKey [key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], cacheId=585748697], val=TxEntryValueHolder 
[val=null, op=DELETE], prevVal=TxEntryValueHolder [val=null, op=NOOP], oldVal=TxEntryValueHolder [val=null, op=NOOP], entryProcessorsCol=null, ttl=-1, conflictExpireTime=-1, conflictVer=null, explicitVer=null, dhtVer=null, filters=CacheEntryPredicate[] [], filtersPassed=false, filtersSet=true, entry=GridDhtCacheEntry [rdrs=ReaderId[] [], part=244, super=GridDistributedCacheEntry [super=GridCacheMapEntry [key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], val=null, ver=GridCacheVersion [topVer=247332659, order=1635852705229, nodeOrder=1], hash=1085684290, extras=GridCacheMvccEntryExtras [mvcc=GridCacheMvcc [locs=LinkedList [GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705207, nodeOrder=1], threadId=122, id=2104, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705207, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=1|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], threadId=124, id=2102, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, 
val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705213, nodeOrder=1], threadId=122, id=2120, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705213, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705207, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705214, nodeOrder=1], threadId=123, id=2118, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705214, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705207, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705217, nodeOrder=1], threadId=93, id=2108, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, 
otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705217, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705218, nodeOrder=1], threadId=115, id=2106, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705218, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705222, nodeOrder=1], threadId=95, id=2110, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705222, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], 
masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705223, nodeOrder=1], threadId=120, id=2112, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705223, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null], GridCacheMvccCandidate [nodeId=2735bef0-7404-41e3-843f-7043490c9d84, ver=GridCacheVersion [topVer=247332659, order=1635852705227, nodeOrder=1], threadId=118, id=2114, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], reentry=null, otherNodeId=2735bef0-7404-41e3-843f-7043490c9d84, otherVer=GridCacheVersion [topVer=247332659, order=1635852705227, nodeOrder=1], mappedDhtNodes=null, mappedNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], 
masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], 
masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, 
sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, 
expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]]]dNearNodes=null, ownerVer=GridCacheVersion [topVer=247332659, order=1635852705211, nodeOrder=1], serOrder=null, key=KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true], masks=local=1|owner=0|ready=1|reentry=0|used=0|tx=1|single_implicit=0|dht_local=1|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], rmts=null]], flags=3]]], prepared=0, locked=false, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=2, partUpdateCntr=0, serReadVer=null, xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1]]]], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=247332659, order=1635852705208, nodeOrder=1], writeVer=null, implicit=false, loc=true, threadId=124, startTime=1635853167214, nodeId=2735bef0-7404-41e3-843f-7043490c9d84, isolation=REPEATABLE_READ, concurrency=PESSIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=NONE, invalidParts=null, state=ACTIVE, timedOut=false, topVer=AffinityTopologyVersion [topVer=1, minorTopVer=162], mvccSnapshot=null, skipCompletedVers=false, parentTx=null, duration=69094ms, onePhaseCommit=false], size=1]]], mapped=false, 
trackable=true, createTtl=-1, accessTtl=-1, needReturnVal=false, skipStore=false, keepBinary=true, innerFuts=EmptyList [], pendingLocks=HashSet [KeyCacheObjectImpl [part=244, val=data=6ff0c60ec71-625345be-9a91-497a-895e-abbe5df9da3d], hasValBytes=true]], super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=0, lsnrCalls=0, done=false, cancelled=false, err=null, futs=EmptyList []]]]]
Nov 2, 2021 11:40:36 AM org.apache.ignite.logger.java.JavaLogger warning
Exception stack trace 3 (observed when TxTimeoutOnPartitionMapExchange is set to a value):
Nov 2, 2021 1:11:19 PM org.apache.ignite.logger.java.JavaLogger warning
WARNING: The transaction was forcibly rolled back on partition map exchange because a timeout is reached: [tx=GridNearTxLocal[xid=e0e3db0ec71-00000000-0ebe-146c-0000-000000000001, xidVersion=GridCacheVersion [topVer=247338092, order=1635858071054, nodeOrder=1], nearXidVersion=GridCacheVersion [topVer=247338092, order=1635858071054, nodeOrder=1], concurrency=PESSIMISTIC, isolation=REPEATABLE_READ, state=ACTIVE, invalidate=false, rollbackOnly=false, nodeId=1ed7077a-5547-41c1-94ab-d246944ed4a9, timeout=0, startTime=1635858659324, duration=20140, label=null], topVer=AffinityTopologyVersion [topVer=1, minorTopVer=104]]
Nov 2, 2021 1:11:19 PM org.apache.ignite.logger.java.JavaLogger error
SEVERE: Failed to process client request [req=o.a.i.i.processors.platform.client.cache.ClientCacheRemoveKeyRequest#2fea4a3]
javax.cache.CacheException: class org.apache.ignite.transactions.TransactionRollbackException: Failed to finish transaction because it has been rolled back [timeout=0, tx=GridNearTxLocal[xid=f0e3db0ec71-00000000-0ebe-146c-0000-000000000001, xidVersion=GridCacheVersion [topVer=247338092, order=1635858071055, nodeOrder=1], nearXidVersion=GridCacheVersion [topVer=247338092, order=1635858071055, nodeOrder=1], concurrency=PESSIMISTIC, isolation=REPEATABLE_READ, state=ACTIVE, invalidate=false, rollbackOnly=false, nodeId=1ed7077a-5547-41c1-94ab-d246944ed4a9, timeout=0, startTime=1635858659324, duration=20140, label=null]]
at org.apache.ignite.internal.processors.cache.GridCacheUtils.convertToCacheException(GridCacheUtils.java:1266)
at org.apache.ignite.internal.processors.cache.IgniteCacheProxyImpl.cacheException(IgniteCacheProxyImpl.java:2084)
at org.apache.ignite.internal.processors.cache.IgniteCacheProxyImpl.remove(IgniteCacheProxyImpl.java:1441)
at org.apache.ignite.internal.processors.cache.GatewayProtectedCacheProxy.remove(GatewayProtectedCacheProxy.java:964)
at org.apache.ignite.internal.processors.platform.client.cache.ClientCacheRemoveKeyRequest.process(ClientCacheRemoveKeyRequest.java:41)
at org.apache.ignite.internal.processors.platform.client.ClientRequestHandler.handle(ClientRequestHandler.java:77)
at org.apache.ignite.internal.processors.odbc.ClientListenerNioListener.onMessage(ClientListenerNioListener.java:204)
at org.apache.ignite.internal.processors.odbc.ClientListenerNioListener.onMessage(ClientListenerNioListener.java:55)
at org.apache.ignite.internal.util.nio.GridNioFilterChain$TailFilter.onMessageReceived(GridNioFilterChain.java:279)
at org.apache.ignite.internal.util.nio.GridNioFilterAdapter.proceedMessageReceived(GridNioFilterAdapter.java:109)
at org.apache.ignite.internal.util.nio.GridNioAsyncNotifyFilter$3.body(GridNioAsyncNotifyFilter.java:97)
at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
at org.apache.ignite.internal.util.worker.GridWorkerPool$1.run(GridWorkerPool.java:70)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: class org.apache.ignite.transactions.TransactionRollbackException: Failed to finish transaction because it has been rolled back [timeout=0, tx=GridNearTxLocal[xid=f0e3db0ec71-00000000-0ebe-146c-0000-000000000001, xidVersion=GridCacheVersion [topVer=247338092, order=1635858071055, nodeOrder=1], nearXidVersion=GridCacheVersion [topVer=247338092, order=1635858071055, nodeOrder=1], concurrency=PESSIMISTIC, isolation=REPEATABLE_READ, state=ACTIVE, invalidate=false, rollbackOnly=false, nodeId=1ed7077a-5547-41c1-94ab-d246944ed4a9, timeout=0, startTime=1635858659324, duration=20140, label=null]]
at org.apache.ignite.internal.util.IgniteUtils$11.apply(IgniteUtils.java:974)
at org.apache.ignite.internal.util.IgniteUtils$11.apply(IgniteUtils.java:972)
... 16 more

This is the expected behavior:
When the partition map exchange starts, Ignite acquires a global lock
at a particular stage. The lock can’t be obtained while incomplete
transactions are running in parallel. These transactions prevent the
partition map exchange process from moving forward, thus blocking
some operations such as a new node join process.
In other words, the PME process must wait for all active transactions to finish before it can acquire the global lock. Therefore, it is recommended to:
- either set a global transaction timeout (TransactionConfiguration#DefaultTxTimeout) or configure a timeout individually for every transaction, i.e. make sure that no transaction can run forever in the worst-case scenario;
- configure TxTimeoutOnPartitionMapExchange to a non-zero value (the default is 0, which disables this timeout).
The concrete values depend on your workload, but in most cases you will want the PME timeout to be lower than the "normal" transaction timeout, so that cluster-wide updates complete more quickly.

Related

Application is unable to connect to Apache Ignite 2.12 after a couple of days

We are using Apache Ignite 2.12. Everything works fine at first, but after a few days we are unable to connect to or query the cache.
Below is the exception in the logs.
responseMessage: "Failed to execute map query on remote node [nodeId=16a76bdb-40bd-4a9c-8df5-dcbb989cfa21, errMsg=Getting affinity for too old topology version that is already out of history [locNode=TcpDiscoveryNode [id=16a76bdb-40bd-4a9c-8df5-dcbb989cfa21, consistentId=bcd6db5d-54da-4016-a185-b130e209fb43, addrs=ArrayList [10.15.150.17, 127.0.0.1], sockAddrs=HashSet [ig1la2pi7ac-2.ig1la2pi7ac.ig1la2pi7ac.svc.cluster.local/10.15.150.17:47500, /127.0.0.1:47500], discPort=47500, order=3, intOrder=3, lastExchangeTime=1667835848088, loc=true, ver=2.12.0#20220108-sha1:b1289f75, isClient=false], grp=FICO-PTO-TENANT-design.Hotlist-Service.HotlistCache, topVer=AffinityTopologyVersion [topVer=24, minorTopVer=6], lastAffChangeTopVer=AffinityTopologyVersion [topVer=24, minorTopVer=6], head=AffinityTopologyVersion [topVer=40, minorTopVer=0], history=[AffinityTopologyVersion [topVer=25, minorTopVer=2], AffinityTopologyVersion [topVer=26, minorTopVer=0], AffinityTopologyVersion [topVer=27, minorTopVer=0], AffinityTopologyVersion [topVer=28, minorTopVer=0], AffinityTopologyVersion [topVer=29, minorTopVer=0], AffinityTopologyVersion [topVer=30, minorTopVer=0], AffinityTopologyVersion [topVer=31, minorTopVer=0], AffinityTopologyVersion [topVer=32, minorTopVer=0], AffinityTopologyVersion [topVer=33, minorTopVer=0], AffinityTopologyVersion [topVer=34, minorTopVer=0], AffinityTopologyVersion [topVer=35, minorTopVer=0], AffinityTopologyVersion [topVer=36, minorTopVer=0], AffinityTopologyVersion [topVer=37, minorTopVer=0], AffinityTopologyVersion [topVer=38, minorTopVer=0], AffinityTopologyVersion [topVer=39, minorTopVer=0], AffinityTopologyVersion [topVer=40, minorTopVer=0]]]]"
Restarting the Ignite pods fixes the issue.

Ignite transaction failure not recoverable with persistence

I'm running into situations with an Ignite cache with persistence enabled where, it seems, the cache cannot be used anymore after a transaction error. Now, every write to the cache leads to the failure below.
What does this error tell me exactly?
Also, how can I recover the cache to operate correctly again now?
Any help appreciated.
javax.cache.CacheException: class org.apache.ignite.transactions.TransactionHeuristicException: Failed to locally write to cache (all transaction entries will be invalidated, however there was a window when entries for this transaction were visible to others): GridDhtTxLocal [nearNodeId=20a50b0c-5b80-4f9c-8be9-9e93260decfc, nearFutId=de1ac717861-3e67525b-9f11-4865-9603-f78250c0a5a8, nearMiniId=1, nearFinFutId=null, nearFinMiniId=0, nearXidVer=GridCacheVersion [topVer=159572191, order=1548092220488, nodeOrder=3], super=GridDhtTxLocalAdapter [nearOnOriginatingNode=false, nearNodes=[], dhtNodes=[], explicitLock=false, super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false, depEnabled=false, txState=IgniteTxImplicitSingleStateImpl [init=true, recovery=false], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=159572191, order=1548092220489, nodeOrder=1], writeVer=GridCacheVersion [topVer=159572191, order=1548092220491, nodeOrder=1], implicit=true, loc=true, threadId=86, startTime=1548092238406, nodeId=de15c5e1-f745-4fc0-a215-4ad455f16824, startVer=GridCacheVersion [topVer=159572191, order=1548092220489, nodeOrder=1], endVer=null, isolation=READ_COMMITTED, concurrency=OPTIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=USER_FINISH, invalidParts=null, state=COMMITTING, timedOut=false, topVer=AffinityTopologyVersion [topVer=3, minorTopVer=0], duration=11ms, onePhaseCommit=true], size=1]]]
at org.apache.ignite.internal.processors.cache.GridCacheUtils.convertToCacheException(GridCacheUtils.java:1302) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.IgniteCacheProxyImpl.cacheException(IgniteCacheProxyImpl.java:1734) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.IgniteCacheProxyImpl.putIfAbsent(IgniteCacheProxyImpl.java:1171) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GatewayProtectedCacheProxy.putIfAbsent(GatewayProtectedCacheProxy.java:860) ~[ignite-core-2.5.0.jar:2.5.0]
at de.kramersoft.mediafan.net.ignite.cache.IgniteCachedMediaSetStatusHistory.put(IgniteCachedMediaSetStatusHistory.java:43) ~[bin/:?]
at de.kramersoft.mediafan.net.ignite.cache.IgniteCachedMediaSetStatusHistory.put(IgniteCachedMediaSetStatusHistory.java:57) ~[bin/:?]
at de.kramersoft.mediafan.net.ignite.tasklist.TaskManager.updateTaskEntry(TaskManager.java:573) ~[bin/:?]
at de.kramersoft.mediafan.net.ignite.tasklist.TaskManager.addTask(TaskManager.java:89) ~[bin/:?]
at de.kramersoft.mediafan.net.NetMasterIgnite.createTask(NetMasterIgnite.java:1412) ~[bin/:?]
at de.kramersoft.mediafan.net.NetMasterIgnite.createNewMediaSet(NetMasterIgnite.java:1543) [bin/:?]
at de.kramersoft.mediafan.plugins.builtin.MediaProducerSingleFile.produceMedia(MediaProducerSingleFile.java:142) [bin/:?]
at de.kramersoft.mediafan.net.ignite.jobqueue.JobManager.startMediaFanJob(JobManager.java:323) [bin/:?]
at de.kramersoft.mediafan.net.ignite.jobqueue.JobManager.startNextJobOrWait(JobManager.java:254) [bin/:?]
at de.kramersoft.mediafan.net.ignite.jobqueue.JobManager.run(JobManager.java:559) [bin/:?]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_171]
Caused by: org.apache.ignite.transactions.TransactionHeuristicException: Failed to locally write to cache (all transaction entries will be invalidated, however there was a window when entries for this transaction were visible to others): GridDhtTxLocal [nearNodeId=20a50b0c-5b80-4f9c-8be9-9e93260decfc, nearFutId=de1ac717861-3e67525b-9f11-4865-9603-f78250c0a5a8, nearMiniId=1, nearFinFutId=null, nearFinMiniId=0, nearXidVer=GridCacheVersion [topVer=159572191, order=1548092220488, nodeOrder=3], super=GridDhtTxLocalAdapter [nearOnOriginatingNode=false, nearNodes=[], dhtNodes=[], explicitLock=false, super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false, depEnabled=false, txState=IgniteTxImplicitSingleStateImpl [init=true, recovery=false], super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=159572191, order=1548092220489, nodeOrder=1], writeVer=GridCacheVersion [topVer=159572191, order=1548092220491, nodeOrder=1], implicit=true, loc=true, threadId=86, startTime=1548092238406, nodeId=de15c5e1-f745-4fc0-a215-4ad455f16824, startVer=GridCacheVersion [topVer=159572191, order=1548092220489, nodeOrder=1], endVer=null, isolation=READ_COMMITTED, concurrency=OPTIMISTIC, timeout=0, sysInvalidate=false, sys=false, plc=2, commitVer=null, finalizing=USER_FINISH, invalidParts=null, state=COMMITTING, timedOut=false, topVer=AffinityTopologyVersion [topVer=3, minorTopVer=0], duration=11ms, onePhaseCommit=true], size=1]]]
at org.apache.ignite.internal.util.IgniteUtils$12.apply(IgniteUtils.java:890) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.IgniteUtils$12.apply(IgniteUtils.java:888) ~[ignite-core-2.5.0.jar:2.5.0]
... 15 more
Caused by: org.apache.ignite.IgniteCheckedException: Runtime failure on row: Row#76348cbc[ key: 43e7fea1-edbe-41a3-9942-27d782112c17, val: MediaFan_MediaSetStatusHistory [idHash=1108159155, hash=1617878958, mediaSetId=7e1a83f9-f6fb-48cf-8527-629fce810bab, id=43e7fea1-edbe-41a3-9942-27d782112c17, status=2, changed=Mon Jan 21 18:37:18 CET 2019], ver: GridCacheVersion [topVer=159572191, order=1548092220491, nodeOrder=1] ][ 43e7fea1-edbe-41a3-9942-27d782112c17, 7e1a83f9-f6fb-48cf-8527-629fce810bab, 2, 2019-01-21 18:37:18.386 ]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.doPut(BPlusTree.java:2116) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putx(BPlusTree.java:2066) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.database.H2TreeIndex.putx(H2TreeIndex.java:247) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.addToIndex(GridH2Table.java:548) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.update(GridH2Table.java:480) ~[ignite-indexing-2.5.0.jar:2.5.0]
......
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.processNearTxPrepareRequest0(IgniteTxHandler.java:157) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.processNearTxPrepareRequest(IgniteTxHandler.java:135) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.access$000(IgniteTxHandler.java:97) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler$1.apply(IgniteTxHandler.java:177) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler$1.apply(IgniteTxHandler.java:175) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1054) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:579) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:378) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:304) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:99) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:293) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1556) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1184) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.access$4200(GridIoManager.java:125) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager$9.run(GridIoManager.java:1091) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.StripedExecutor$Stripe.run(StripedExecutor.java:511) ~[ignite-core-2.5.0.jar:2.5.0]
... 1 more
Caused by: org.apache.ignite.IgniteCheckedException: Maximum of retries 1000 reached.
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree$Get.checkLockRetry(BPlusTree.java:2565) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putDown(BPlusTree.java:2330) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putDown(BPlusTree.java:2348) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putDown(BPlusTree.java:2348) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.doPut(BPlusTree.java:2086) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putx(BPlusTree.java:2066) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.database.H2TreeIndex.putx(H2TreeIndex.java:247) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.addToIndex(GridH2Table.java:548) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.update(GridH2Table.java:480) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.h2.IgniteH2Indexing.store(IgniteH2Indexing.java:659) ~[ignite-indexing-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.query.GridQueryProcessor.store(GridQueryProcessor.java:1866) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.query.GridCacheQueryManager.store(GridCacheQueryManager.java:403) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.finishUpdate(IgniteCacheOffheapManagerImpl.java:1393) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.invoke(IgniteCacheOffheapManagerImpl.java:1257) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager$GridCacheDataStore.invoke(GridCacheOffheapManager.java:1529) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl.invoke(IgniteCacheOffheapManagerImpl.java:352) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.storeValue(GridCacheMapEntry.java:3602) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.storeValue(GridCacheMapEntry.java:3578) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.innerSet(GridCacheMapEntry.java:1040) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxLocalAdapter.userCommit(IgniteTxLocalAdapter.java:652) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocalAdapter.localFinish(GridDhtTxLocalAdapter.java:795) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocal.localFinish(GridDhtTxLocal.java:583) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocal.finishTx(GridDhtTxLocal.java:464) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocal.commitDhtLocalAsync(GridDhtTxLocal.java:517) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocal.commitAsync(GridDhtTxLocal.java:526) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxPrepareFuture.onDone(GridDhtTxPrepareFuture.java:735) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxPrepareFuture.onDone(GridDhtTxPrepareFuture.java:104) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.future.GridFutureAdapter.onDone(GridFutureAdapter.java:451) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.future.GridCompoundFuture.checkComplete(GridCompoundFuture.java:285) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.future.GridCompoundFuture.markInitialized(GridCompoundFuture.java:276) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxPrepareFuture.prepare0(GridDhtTxPrepareFuture.java:1254) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxPrepareFuture.mapIfLocked(GridDhtTxPrepareFuture.java:671) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxPrepareFuture.prepare(GridDhtTxPrepareFuture.java:1048) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtTxLocal.prepareAsync(GridDhtTxLocal.java:397) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.prepareNearTx(IgniteTxHandler.java:516) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.processNearTxPrepareRequest0(IgniteTxHandler.java:157) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.processNearTxPrepareRequest(IgniteTxHandler.java:135) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler.access$000(IgniteTxHandler.java:97) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler$1.apply(IgniteTxHandler.java:177) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.transactions.IgniteTxHandler$1.apply(IgniteTxHandler.java:175) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:1054) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:579) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:378) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:304) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager.access$100(GridCacheIoManager.java:99) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:293) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1556) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1184) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager.access$4200(GridIoManager.java:125) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.managers.communication.GridIoManager$9.run(GridIoManager.java:1091) ~[ignite-core-2.5.0.jar:2.5.0]
at org.apache.ignite.internal.util.StripedExecutor$Stripe.run(StripedExecutor.java:511) ~[ignite-core-2.5.0.jar:2.5.0]
... 1 more
It looks like the H2 index is corrupted. In order to recover the node, you can remove all index.bin files from the persistence subdirectories. The index will be rebuilt on the next restart.

Ignite issue when a node in the cluster becomes unstable: new nodes are unable to join the cluster and hang indefinitely

Hi, I am facing a critical issue with Ignite on our production servers. We have 2 instances with heap sizes of 8 GB each. Sometimes, due to a long GC pause or a network issue, one of our instances gets stopped. This causes AWS auto-scaling to kick in and bring another instance up. This is fine, but we have observed that in this state the grid becomes unstable and our new Ignite instances are never able to join the topology and hang forever, causing new auto-scaled instances to be launched again and again. The workaround for this is to restart the other instances in the cluster, as doing so allows nodes to join again. But ideally, in a prod environment, this should happen automatically with auto-scaling.
I also added a longer failure detection timeout, but that doesn't solve it completely either, and we still observe this sometimes.
The logs observed on the new instances that do not come up are shown below. The Ignite version used is 2.4, and off-heap mode is used for partitioned caches. Our grid is set up using the TCP discovery service with an S3 bucket.
I also have some transactional caches that do locking based on tryLock calls.
evtLatch=0, remaining=[a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9], super=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=1272213534]]]
2018-07-18 16:34:10.534 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], node=7d5e83aa-736a-4190-8b64-7261db7382f6]. Dumping pending objects that might be the cause:
2018-07-18 16:34:20.534 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], node=7d5e83aa-736a-4190-8b64-7261db7382f6]. Dumping pending objects that might be the cause:
2018-07-18 16:34:20.534 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Ready affinity version: AffinityTopologyVersion [topVer=-1, minorTopVer=0]
2018-07-18 16:34:20.535 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Last exchange future: GridDhtPartitionsExchangeFuture [firstDiscoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931660255, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=7d5e83aa, msg=null, type=NODE_JOINED, tstamp=1531931329481], crd=TcpDiscoveryNode [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, addrs=[10.83.87.131, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-87-131.ec2.internal/10.83.87.131:47500], discPort=47500, order=26, intOrder=14, lastExchangeTime=1531931329258, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], exchId=GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], discoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931660255, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=7d5e83aa, msg=null, type=NODE_JOINED, tstamp=1531931329481], nodeId=7d5e83aa, evt=NODE_JOINED], added=true, initFut=GridFutureAdapter [ignoreInterrupts=false, state=DONE, res=true, hash=247159314], init=true, lastVer=null, partReleaseFut=PartitionReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[ExplicitLockReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]], TxReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]], AtomicUpdateReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]], DataStreamerReleaseFuture 
[topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]]]], exchActions=ExchangeActions [startCaches=null, stopCaches=null, startGrps=[], stopGrps=[], resetParts=null, stateChangeRequest=null], affChangeMsg=null, initTs=1531931329576, centralizedAff=false, changeGlobalStateE=null, done=false, state=SRV, evtLatch=0, remaining=[a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9], super=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=1272213534]]
2018-07-18 16:34:20.535 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.a.i.i.p.c.GridCachePartitionExchangeManager - First 10 pending exchange futures [total=0]
2018-07-18 16:34:20.535 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Last 10 exchange futures (total: 1):
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - >>> GridDhtPartitionsExchangeFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], evt=NODE_JOINED, evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931660255, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], done=false]
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending transactions:
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending explicit locks:
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending cache futures:
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending atomic cache futures:
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending data streamer futures:
2018-07-18 16:34:20.536 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Pending transaction deadlock detection futures:
2018-07-18 16:34:20.547 UTC [FDPS] [grid-nio-worker-tcp-comm-3-#28%fdps%] [INFO ] [,] o.apache.ignite.internal.diagnostic - Exchange future waiting for coordinator response [crd=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0]]
Remote node information:
General node info [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, client=false, discoTopVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], time=12:34:20.537]
Partitions exchange info [readyVer=AffinityTopologyVersion [topVer=29, minorTopVer=0]]
Last initialized exchange future: GridDhtPartitionsExchangeFuture [firstDiscoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=ba6aba6c-7f5d-41bf-bfcc-5eefcad36b62, addrs=[10.83.85.122, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-85-122.ec2.internal/10.83.85.122:47500], discPort=47500, order=30, intOrder=16, lastExchangeTime=1531930705943, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=30, nodeId8=a450db0b, msg=Node joined: TcpDiscoveryNode [id=ba6aba6c-7f5d-41bf-bfcc-5eefcad36b62, addrs=[10.83.85.122, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-85-122.ec2.internal/10.83.85.122:47500], discPort=47500, order=30, intOrder=16, lastExchangeTime=1531930705943, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], type=NODE_JOINED, tstamp=1531930706210], crd=TcpDiscoveryNode [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, addrs=[10.83.87.131, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-87-131.ec2.internal/10.83.87.131:47500], discPort=47500, order=26, intOrder=14, lastExchangeTime=1531931660254, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], exchId=GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], discoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=ba6aba6c-7f5d-41bf-bfcc-5eefcad36b62, addrs=[10.83.85.122, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-85-122.ec2.internal/10.83.85.122:47500], discPort=47500, order=30, intOrder=16, lastExchangeTime=1531930705943, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=30, nodeId8=a450db0b, msg=Node joined: TcpDiscoveryNode [id=ba6aba6c-7f5d-41bf-bfcc-5eefcad36b62, addrs=[10.83.85.122, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-85-122.ec2.internal/10.83.85.122:47500], discPort=47500, order=30, intOrder=16, lastExchangeTime=1531930705943, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], type=NODE_JOINED, tstamp=1531930706210], nodeId=ba6aba6c, evt=NODE_JOINED], added=true, 
initFut=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=1921954756], init=false, lastVer=GridCacheVersion [topVer=0, order=1531930704443, nodeOrder=0], partReleaseFut=PartitionReleaseFuture [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], futures=[ExplicitLockReleaseFuture [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], futures=[ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935479, nodeOrder=26], threadId=39726, id=559000, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=221, val=49583853497448469294730566354366524577617095530402283666, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547787212113, nodeOrder=26], threadId=39741, id=603904, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=288, val=49583853499611641578988037213538229804531966271996035234, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935487, 
nodeOrder=26], threadId=39740, id=558993, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=133, val=49583853497448469294730566354417299462040910024459419794, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935323, nodeOrder=26], threadId=39728, id=558949, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=1023, val=49583853497448469294730566353278491339963927967496667282, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935470, nodeOrder=26], threadId=39951, id=559009, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=556, val=49583853497448469294730566354226289182541798339977937042, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate 
[nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935497, nodeOrder=26], threadId=39683, id=558982, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=373, val=49583853497448469294730566354541818821461216966893109394, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935339, nodeOrder=26], threadId=39682, id=558941, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=156, val=49583853497448469294730566353353444740780034976328450194, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935358, nodeOrder=26], threadId=39936, id=558921, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=59, val=49583853497448469294730566353578304943228356208982229138, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan 
[topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandida... and 48550 skipped ...ead=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935486, nodeOrder=26], threadId=39894, id=558992, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=488, val=49583853497448469294730566354434224423515514832905306258, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]], ExplicitLockSpan [topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], firstCand=GridCacheMvccCandidate [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, ver=GridCacheVersion [topVer=141782290, order=1547786935331, nodeOrder=26], threadId=39893, id=558948, topVer=AffinityTopologyVersion [topVer=29, minorTopVer=0], reentry=null, otherNodeId=null, otherVer=null, mappedDhtNodes=null, mappedNearNodes=null, ownerVer=null, serOrder=null, key=KeyCacheObjectImpl [part=570, val=49583853497448469294730566353289371672340459630069022866, hasValBytes=false], masks=local=1|owner=0|ready=0|reentry=0|used=0|tx=0|single_implicit=0|dht_local=0|near_local=0|removed=0|read=0, prevVer=null, nextVer=null]]]], TxReleaseFuture [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], futures=[]], AtomicUpdateReleaseFuture [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], futures=[]], DataStreamerReleaseFuture [topVer=AffinityTopologyVersion [topVer=30, minorTopVer=0], futures=[]]]], exchActions=null, affChangeMsg=null, initTs=1531930706210, centralizedAff=false, changeGlobalStateE=null, done=false, state=CRD, evtLatch=0, 
remaining=[ba6aba6c-7f5d-41bf-bfcc-5eefcad36b62], super=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=325602672]]
Communication SPI statistics [rmtNode=7d5e83aa-736a-4190-8b64-7261db7382f6]
Communication SPI recovery descriptors:
[key=ConnectionKey [nodeId=7d5e83aa-736a-4190-8b64-7261db7382f6, idx=0, connCnt=0], msgsSent=5, msgsAckedByRmt=0, msgsRcvd=7, lastAcked=0, reserveCnt=1, descIdHash=1972345954]
Communication SPI clients:
[node=7d5e83aa-736a-4190-8b64-7261db7382f6, client=GridTcpNioCommunicationClient [ses=GridSelectorNioSessionImpl [worker=DirectNioClientWorker [super=AbstractNioClientWorker [idx=3, bytesRcvd=5740, bytesSent=77322, bytesRcvd0=853, bytesSent0=0, select=true, super=GridWorker [name=grid-nio-worker-tcp-comm-3, igniteInstanceName=fdps, finished=false, hashCode=2068348067, interrupted=false, runner=grid-nio-worker-tcp-comm-3-#28%fdps%]]], writeBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], readBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], inRecovery=GridNioRecoveryDescriptor [acked=0, resendCnt=0, rcvCnt=7, sentCnt=5, reserved=true, lastAck=0, nodeLeft=false, node=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=true, connectCnt=0, queueLimit=262144, reserveCnt=1, pairedConnections=false], outRecovery=GridNioRecoveryDescriptor [acked=0, resendCnt=0, rcvCnt=7, sentCnt=5, reserved=true, lastAck=0, nodeLeft=false, node=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=true, connectCnt=0, queueLimit=262144, reserveCnt=1, pairedConnections=false], super=GridNioSessionImpl [locAddr=/10.83.87.131:47100, rmtAddr=/10.83.89.183:34664, createTime=1531931330498, closeTime=0, bytesSent=77322, bytesRcvd=5740, bytesSent0=0, bytesRcvd0=853, sndSchedTime=1531931330498, lastSndTime=1531931500547, lastRcvTime=1531931660527, readsPaused=false, filterChain=FilterChain[filters=[GridNioCodecFilter 
[parser=org.apache.ignite.internal.util.nio.GridDirectParser#665c2413, directMode=true], GridConnectionBytesVerifyFilter], accepted=true]], super=GridAbstractCommunicationClient [lastUsed=1531931330508, closed=false, connIdx=0]]]
NIO sessions statistics:
>> Selector info [idx=3, keysCnt=1, bytesRcvd=5740, bytesRcvd0=853, bytesSent=77322, bytesSent0=0]
Connection info [in=true, rmtAddr=/10.83.89.183:34664, locAddr=/10.83.87.131:47100, msgsSent=5, msgsAckedByRmt=0, descIdHash=1972345954, unackedMsgs=[IgniteDiagnosticMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage], msgsRcvd=7, lastAcked=0, descIdHash=1972345954, bytesRcvd=5740, bytesRcvd0=853, bytesSent=77322, bytesSent0=0, opQueueSize=0]
Exchange future: GridDhtPartitionsExchangeFuture [firstDiscoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=a450db0b, msg=Node joined: TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], type=NODE_JOINED, tstamp=1531931329402], crd=null, exchId=GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], discoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=a450db0b, msg=Node joined: TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931329178, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], type=NODE_JOINED, tstamp=1531931329402], nodeId=7d5e83aa, evt=NODE_JOINED], added=true, initFut=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=980776600], init=false, lastVer=GridCacheVersion [topVer=0, order=1531931327875, nodeOrder=0], partReleaseFut=null, exchActions=null, affChangeMsg=null, initTs=0, centralizedAff=false, changeGlobalStateE=null, done=false, state=null, evtLatch=0, remaining=[], 
super=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=2138568466]]
Local communication statistics:
Communication SPI statistics [rmtNode=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9]
Communication SPI recovery descriptors:
[key=ConnectionKey [nodeId=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, idx=0, connCnt=-1], msgsSent=7, msgsAckedByRmt=0, msgsRcvd=6, lastAcked=0, reserveCnt=1, descIdHash=1891649612]
Communication SPI clients:
Communication SPI clients:
[node=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, client=GridTcpNioCommunicationClient [ses=GridSelectorNioSessionImpl [worker=DirectNioClientWorker [super=AbstractNioClientWorker [idx=0, bytesRcvd=92833, bytesSent=5698, bytesRcvd0=15539, bytesSent0=853, select=true, super=GridWorker [name=grid-nio-worker-tcp-comm-0, igniteInstanceName=fdps, finished=false, hashCode=2040212682, interrupted=false, runner=grid-nio-worker-tcp-comm-0-#25%fdps%]]], writeBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], readBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], inRecovery=GridNioRecoveryDescriptor [acked=0, resendCnt=0, rcvCnt=6, sentCnt=7, reserved=true, lastAck=0, nodeLeft=false, node=TcpDiscoveryNode [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, addrs=[10.83.87.131, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-87-131.ec2.internal/10.83.87.131:47500], discPort=47500, order=26, intOrder=14, lastExchangeTime=1531931329258, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=false, connectCnt=1, queueLimit=262144, reserveCnt=1, pairedConnections=false], outRecovery=GridNioRecoveryDescriptor [acked=0, resendCnt=0, rcvCnt=6, sentCnt=7, reserved=true, lastAck=0, nodeLeft=false, node=TcpDiscoveryNode [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, addrs=[10.83.87.131, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-87-131.ec2.internal/10.83.87.131:47500], discPort=47500, order=26, intOrder=14, lastExchangeTime=1531931329258, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=false, connectCnt=1, queueLimit=262144, reserveCnt=1, pairedConnections=false], super=GridNioSessionImpl [locAddr=/10.83.89.183:34664, rmtAddr=ip-10-83-87-131.ec2.internal/10.83.87.131:47100, createTime=1531931330468, closeTime=0, bytesSent=5698, bytesRcvd=92833, bytesSent0=853, bytesRcvd0=15539, sndSchedTime=1531931330468, lastSndTime=1531931660528, lastRcvTime=1531931660538, readsPaused=false, filterChain=FilterChain[filters=[GridNioCodecFilter 
[parser=org.apache.ignite.internal.util.nio.GridDirectParser#72024a61, directMode=true], GridConnectionBytesVerifyFilter], accepted=false]], super=GridAbstractCommunicationClient [lastUsed=1531931330468, closed=false, connIdx=0]]]
NIO sessions statistics:
>> Selector info [idx=0, keysCnt=1, bytesRcvd=92833, bytesRcvd0=15539, bytesSent=5698, bytesSent0=853]
Connection info [in=false, rmtAddr=ip-10-83-87-131.ec2.internal/10.83.87.131:47100, locAddr=/10.83.89.183:34664, msgsSent=7, msgsAckedByRmt=0, descIdHash=1891649612, unackedMsgs=[GridDhtPartitionsSingleMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage, IgniteDiagnosticMessage], msgsRcvd=6, lastAcked=0, descIdHash=1891649612, bytesRcvd=92833, bytesRcvd0=15539, bytesSent=5698, bytesSent0=853, opQueueSize=0]
2018-07-18 16:34:29.598 UTC [FDPS] [localhost-startStop-1] [WARN ] [,] o.a.i.i.p.c.GridCachePartitionExchangeManager - Still waiting for initial partition map exchange [fut=GridDhtPartitionsExchangeFuture [firstDiscoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931669507, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=7d5e83aa, msg=null, type=NODE_JOINED, tstamp=1531931329481], crd=TcpDiscoveryNode [id=a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9, addrs=[10.83.87.131, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-87-131.ec2.internal/10.83.87.131:47500], discPort=47500, order=26, intOrder=14, lastExchangeTime=1531931329258, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], exchId=GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], discoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=7d5e83aa-736a-4190-8b64-7261db7382f6, addrs=[10.83.89.183, 127.0.0.1], sockAddrs=[/127.0.0.1:47500, ip-10-83-89-183.ec2.internal/10.83.89.183:47500], discPort=47500, order=32, intOrder=17, lastExchangeTime=1531931669507, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=false], topVer=32, nodeId8=7d5e83aa, msg=null, type=NODE_JOINED, tstamp=1531931329481], nodeId=7d5e83aa, evt=NODE_JOINED], added=true, initFut=GridFutureAdapter [ignoreInterrupts=false, state=DONE, res=true, hash=247159314], init=true, lastVer=null, partReleaseFut=PartitionReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[ExplicitLockReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]], TxReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]], AtomicUpdateReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], 
futures=[]], DataStreamerReleaseFuture [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], futures=[]]]], exchActions=ExchangeActions [startCaches=null, stopCaches=null, startGrps=[], stopGrps=[], resetParts=null, stateChangeRequest=null], affChangeMsg=null, initTs=1531931329576, centralizedAff=false, changeGlobalStateE=null, done=false, state=SRV, evtLatch=0, remaining=[a450db0b-ce86-4f0b-a34b-a2f9c83bb3d9], super=GridFutureAdapter [ignoreInterrupts=false, state=INIT, res=null, hash=1272213534]]]
2018-07-18 16:34:30.537 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], node=7d5e83aa-736a-4190-8b64-7261db7382f6]. Dumping pending objects that might be the cause:
2018-07-18 16:34:40.537 UTC [FDPS] [exchange-worker-#35%fdps%] [WARN ] [,] o.apache.ignite.internal.diagnostic - Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=32, minorTopVer=0], node=7d5e83aa-736a-4190-8b64-7261db7382f6]. Dumping pending objects that might be the cause:
Info about the other node 10-83-85-122
The other joining node never started; it was stuck in the Ignite start phase. Its logs do not show the node coming up or IP discovery kicking in, and the node was eventually removed via autoscaling.
Transactional errors received
javax.cache.CacheException: Failed to acquire lock for keys (primary node left grid, retry transaction if possible) [keys=[UserKeyCacheObjectImpl [part=281,
Partition map exchange is the process by which nodes exchange information about where each piece of data is stored. It happens every time the topology changes.
Every node sends a GridDhtPartitionsSingleMessage to a coordinator. Once the coordinator collected all such messages, it sends GridDhtPartitionsFullMessage back to other nodes. These messages are sent over communication SPI.
But if some of the non-coordinator nodes don't send the SingleMessage to the coordinator, or if the coordinator doesn't send the FullMessage, then the "Failed to wait for partition map exchange" error occurs.
Judging by the piece of log that you provided, a node with ID=ba6aba6c didn't send the SingleMessage to the coordinator. This may mean that the communication SPI doesn't work properly there. Make sure that the ports required by the communication SPI are available. Usually these are 47100..47200.
Also, the joining node may be stuck on something. Look at its log to figure out what is happening there.

Ignite cache fails after Failed to process selector key...java.io.IOException: Broken pipe exception

We are running Ignite 2.4 and have 2 server nodes and 30-odd client nodes. We use ZooKeeper discovery, and the nodes are deployed in a Docker Swarm environment.
After running for a while, I saw the exception below in one of the Ignite clients, and the caches no longer seem to work:
service-be - [INFO ] 2018-06-15 02:01:52.256 [grid-timeout-worker-#55] org.apache.ignite.internal.IgniteKernal -
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
^-- Node [id=5249f20c, uptime=02:49:02.178]
^-- H/N/C [hosts=34, nodes=34, CPUs=816]
^-- CPU [cur=24.2%, avg=0.27%, GC=0%]
^-- PageMemory [pages=0]
^-- Heap [used=848MB, free=17.19%, comm=1024MB]
^-- Non heap [used=241MB, free=84.12%, comm=251MB]
^-- Outbound messages queue [size=4]
^-- Public thread pool [active=0, idle=0, qSize=0]
^-- System thread pool [active=0, idle=24, qSize=0]
service-be - [INFO ] 2018-06-15 02:01:52.432 [grid-nio-worker-tcp-comm-2-#59] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Accepted incoming communication connection [locAddr=/10.11.0.7:47100, rmtAddr=/10.11.0.75:59204]
service-be - [INFO ] 2018-06-15 02:01:52.433 [grid-nio-worker-tcp-comm-2-#59] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Received incoming connection when already connected to this node, rejecting [locNode=5249f20c-456b-4b6f-ab41-f5cd5c3c05ba, rmtNode=6739c9af-42d1-4aad-ac9c-ac738ed13534]
service-be - [INFO ] 2018-06-15 02:01:52.634 [grid-nio-worker-tcp-comm-3-#60] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Accepted incoming communication connection [locAddr=/10.11.0.7:47100, rmtAddr=/10.11.0.75:59206]
service-be - [INFO ] 2018-06-15 02:01:52.635 [grid-nio-worker-tcp-comm-3-#60] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Received incoming connection when already connected to this node, rejecting [locNode=5249f20c-456b-4b6f-ab41-f5cd5c3c05ba, rmtNode=6739c9af-42d1-4aad-ac9c-ac738ed13534]
service-be - [INFO ] 2018-06-15 02:01:52.836 [grid-nio-worker-tcp-comm-4-#61] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Accepted incoming communication connection [locAddr=/10.11.0.7:47100, rmtAddr=/10.11.0.75:59208]
service-be - [INFO ] 2018-06-15 02:01:52.837 [grid-nio-worker-tcp-comm-4-#61] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Received incoming connection when already connected to this node, rejecting [locNode=5249f20c-456b-4b6f-ab41-f5cd5c3c05ba, rmtNode=6739c9af-42d1-4aad-ac9c-ac738ed13534]
service-be - [INFO ] 2018-06-15 02:01:53.038 [grid-nio-worker-tcp-comm-5-#62] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Accepted incoming communication connection [locAddr=/10.11.0.7:47100, rmtAddr=/10.11.0.75:59210]
service-be - [INFO ] 2018-06-15 02:01:53.039 [grid-nio-worker-tcp-comm-5-#62] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Received incoming connection when already connected to this node, rejecting [locNode=5249f20c-456b-4b6f-ab41-f5cd5c3c05ba, rmtNode=6739c9af-42d1-4aad-ac9c-ac738ed13534]
service-be - [ERROR] 2018-06-15 02:01:53.231 [grid-nio-worker-tcp-comm-0-#57] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Failed to process selector key [ses=GridSelectorNioSessionImpl [worker=DirectNioClientWorker [super=AbstractNioClientWorker [idx=0, bytesRcvd=70700138, bytesSent=18478193, bytesRcvd0=0, bytesSent0=0, select=true, super=GridWorker [name=grid-nio-worker-tcp-comm-0, igniteInstanceName=null, finished=false, hashCode=30436088, interrupted=false, runner=grid-nio-worker-tcp-comm-0-#57]]], writeBuf=java.nio.DirectByteBuffer[pos=0 lim=186 cap=32768], readBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], inRecovery=GridNioRecoveryDescriptor [acked=48224, resendCnt=0, rcvCnt=111504, sentCnt=48229, reserved=true, lastAck=111488, nodeLeft=false, node=TcpDiscoveryNode [id=6739c9af-42d1-4aad-ac9c-ac738ed13534, addrs=[10.11.0.74, 10.11.0.75, 127.0.0.1, 172.18.0.22], sockAddrs=[/172.18.0.22:47500, bdd554c3dc77/10.11.0.75:47500, /10.11.0.74:47500, /127.0.0.1:47500], discPort=47500, order=1, intOrder=1, lastExchangeTime=1529039549468, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=false, connectCnt=1, queueLimit=131072, reserveCnt=2, pairedConnections=false], outRecovery=GridNioRecoveryDescriptor [acked=48224, resendCnt=0, rcvCnt=111504, sentCnt=48229, reserved=true, lastAck=111488, nodeLeft=false, node=TcpDiscoveryNode [id=6739c9af-42d1-4aad-ac9c-ac738ed13534, addrs=[10.11.0.74, 10.11.0.75, 127.0.0.1, 172.18.0.22], sockAddrs=[/172.18.0.22:47500, bdd554c3dc77/10.11.0.75:47500, /10.11.0.74:47500, /127.0.0.1:47500], discPort=47500, order=1, intOrder=1, lastExchangeTime=1529039549468, loc=false, ver=2.4.0#20180305-sha1:aa342270, isClient=false], connected=false, connectCnt=1, queueLimit=131072, reserveCnt=2, pairedConnections=false], super=GridNioSessionImpl [locAddr=/10.11.0.7:42970, rmtAddr=bdd554c3dc77/10.11.0.75:47100, createTime=1529039561958, closeTime=0, bytesSent=18478193, bytesRcvd=70700138, bytesSent0=0, 
bytesRcvd0=0, sndSchedTime=1529044007457, lastSndTime=1529049712225, lastRcvTime=1529049712225, readsPaused=false, filterChain=FilterChain[filters=[GridNioCodecFilter [parser=o.a.i.i.util.nio.GridDirectParser#7a15b36, directMode=true], GridConnectionBytesVerifyFilter], accepted=false]]]
java.io.IOException: Broken pipe
at sun.nio.ch.FileDispatcherImpl.write0(Native Method)
at sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:47)
at sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:93)
at sun.nio.ch.IOUtil.write(IOUtil.java:51)
at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:471)
at org.apache.ignite.internal.util.nio.GridNioServer$DirectNioClientWorker.processWrite0(GridNioServer.java:1636)
at org.apache.ignite.internal.util.nio.GridNioServer$DirectNioClientWorker.processWrite(GridNioServer.java:1293)
at org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.processSelectedKeysOptimized(GridNioServer.java:2307)
at org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.bodyInternal(GridNioServer.java:2080)
at org.apache.ignite.internal.util.nio.GridNioServer$AbstractNioClientWorker.body(GridNioServer.java:1749)
at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110)
at java.lang.Thread.run(Thread.java:748)
service-be - [WARN ] 2018-06-15 02:01:53.231 [grid-nio-worker-tcp-comm-0-#57] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Closing NIO session because of unhandled exception [cls=class o.a.i.i.util.nio.GridNioException, msg=Broken pipe]
service-be - [INFO ] 2018-06-15 02:01:53.240 [grid-nio-worker-tcp-comm-6-#63] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Accepted incoming communication connection [locAddr=/10.11.0.7:47100, rmtAddr=/10.11.0.75:59212]
service-be - [WARN ] 2018-06-15 02:02:03.253 [tcp-comm-worker-#1] org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Connect timed out (consider increasing 'failureDetectionTimeout' configuration property) [addr=/172.18.0.22:47100, failureDetectionTimeout=10000]
Searching for the remote node with which there seems to be trouble connecting (as mentioned in the trace above), I also see these warnings in some of the other client nodes as well.
Any obvious pointers on what could be going wrong? From what I have found, one suggestion was to use IPv4, but the Docker overlay network has enableipv6 disabled in our case, so I am not sure how much that will help.
[root#rhel743411 logs]# egrep -i "6739c9af-42d1-4aad-ac9c-ac738ed13534" *
service1-mw.log:service1-mw - [WARN ] 2018-06-16 00:27:55.884 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:26:02.991, curTime=00:27:55.876, fut=GridDhtColocatedLockFuture [threadId=39579, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=776a8520461-6a403605-a8fd-4ed1-bd45-92e648929a2a, lockVer=GridCacheVersion [topVer=140519300, order=1529059257539, nodeOrder=6], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service1-mw.log:service1-mw - [WARN ] 2018-06-16 00:27:55.884 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:25:55.893, curTime=00:27:55.876, fut=GridDhtColocatedLockFuture [threadId=297, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=f03a8520461-6a403605-a8fd-4ed1-bd45-92e648929a2a, lockVer=GridCacheVersion [topVer=140519300, order=1529059253553, nodeOrder=6], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service1-mw.log:service1-mw - [WARN ] 2018-06-16 00:27:55.884 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:26:51.661, curTime=00:27:55.876, fut=GridDhtColocatedLockFuture [threadId=38749, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false], UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=354b8520461-6a403605-a8fd-4ed1-bd45-92e648929a2a, lockVer=GridCacheVersion [topVer=140519300, order=1529059268380, nodeOrder=6], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service1-mw.log:service1-mw - [WARN ] 2018-06-16 00:27:55.885 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:26:51.772, curTime=00:27:55.876, fut=GridDhtColocatedLockFuture [threadId=343, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false]], futId=125b8520461-6a403605-a8fd-4ed1-bd45-92e648929a2a, lockVer=GridCacheVersion [topVer=140519300, order=1529059268816, nodeOrder=6], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:01:10.227 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=23:59:12.637, curTime=00:01:10.221, fut=GridDhtColocatedLockFuture [threadId=21129, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=f5216120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529058842000, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:09:10.242 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:07:30.520, curTime=00:09:10.239, fut=GridDhtColocatedLockFuture [threadId=21304, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=42176120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529058982457, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:13:10.269 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:11:32.462, curTime=00:13:10.268, fut=GridDhtColocatedLockFuture [threadId=21368, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false]], futId=c0f96120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059041395, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:15:10.281 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:13:43.800, curTime=00:15:10.279, fut=GridDhtColocatedLockFuture [threadId=172, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false], UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=49ab6120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059079186, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:17:10.289 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:15:44.860, curTime=00:17:10.287, fut=GridDhtColocatedLockFuture [threadId=172, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=a3ec6120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059106786, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:20:10.299 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:18:51.741, curTime=00:20:10.298, fut=GridDhtColocatedLockFuture [threadId=172, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false], UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=8ace6120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059136637, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:21:10.308 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:19:19.018, curTime=00:21:10.304, fut=GridDhtColocatedLockFuture [threadId=21484, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=bd7f6120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059155514, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:24:10.326 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:23:03.860, curTime=00:24:10.323, fut=GridDhtColocatedLockFuture [threadId=21544, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false]], futId=f3e17120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059200701, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:24:10.326 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:22:52.783, curTime=00:24:10.323, fut=GridDhtColocatedLockFuture [threadId=172, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false], UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=edc17120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059199113, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:26:10.330 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:24:59.321, curTime=00:26:10.328, fut=GridDhtColocatedLockFuture [threadId=172, keys=[UserKeyCacheObjectImpl [part=7, val=7, hasValBytes=false]], futId=74737120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059232146, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]
service2-mw.log:service2y-mw - [WARN ] 2018-06-16 00:29:10.349 [grid-timeout-worker-#55] org.apache.ignite.internal.diagnostic - Found long running cache future [startTime=00:27:32.480, curTime=00:29:10.347, fut=GridDhtColocatedLockFuture [threadId=21621, keys=[UserKeyCacheObjectImpl [part=8, val=8, hasValBytes=false]], futId=1fe57120461-0c4dcfda-c90b-42a3-83c4-8d2f8ecb6ab1, lockVer=GridCacheVersion [topVer=140519300, order=1529059289421, nodeOrder=17], read=false, retval=true, err=null, timeout=120000, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], done=0, trackable=true, createTtl=-1, accessTtl=-1, skipStore=false, keepBinary=false, recovery=false, miniId=1, topVer=AffinityTopologyVersion [topVer=34, minorTopVer=0], innerFuts=[[node=6739c9af-42d1-4aad-ac9c-ac738ed13534, rcvRes=false, loc=false, done=false]], inTx=false, super=GridCompoundIdentityFuture [super=GridCompoundFuture [rdc=Bool reducer: true, initFlag=1, lsnrCalls=0, done=false, cancelled=false, err=null, futs=[false]]]]]

Ignite cluster gets stuck when a new node joins or leaves

I have a 3-node cluster with 20+ clients, running in a Spark context. Initially it works fine, but at random times a problem occurs when a new node (i.e. a client) tries to connect to the cluster: the whole cluster becomes inoperative. I captured the following logs while it was stuck. If I explicitly restart any one of the Ignite servers, the cluster is released and works fine again. I am using Ignite 2.4.0; the same issue also reproduces on Ignite 2.5.0.
Client-side logs:
Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=44, minorTopVer=0], node=4d885cfd-45ed-43a2-8088-f35c9469797f]. Dumping pending objects that might be the cause:
GridDhtPartitionsExchangeFuture [topVer=AffinityTopologyVersion [topVer=44, minorTopVer=0], evt=NODE_JOINED, evtNode=TcpDiscoveryNode [id=4d885cfd-45ed-43a2-8088-f35c9469797f, addrs=[0:0:0:0:0:0:0:1%lo, 10.13.10.179, 127.0.0.1], sockAddrs=[/0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0, hdn6.mstorm.com/10.13.10.179:0], discPort=0, order=44, intOrder=0, lastExchangeTime=1527651620413, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=true], done=false]
Failed to wait for partition map exchange [topVer=AffinityTopologyVersion [topVer=44, minorTopVer=0], node=4d885cfd-45ed-43a2-8088-f35c9469797f]. Dumping pending objects that might be the cause:
GridDhtPartitionsExchangeFuture [topVer=AffinityTopologyVersion [topVer=44, minorTopVer=0], evt=NODE_JOINED, evtNode=TcpDiscoveryNode [id=4d885cfd-45ed-43a2-8088-f35c9469797f, addrs=[0:0:0:0:0:0:0:1%lo, 10.13.10.179, 127.0.0.1], sockAddrs=[/0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0, hdn6.mstorm.com/10.13.10.179:0], discPort=0, order=44, intOrder=0, lastExchangeTime=1527651620413, loc=true, ver=2.4.0#20180305-sha1:aa342270, isClient=true], done=false]
Failed to wait for initial partition map exchange. Possible reasons are:
^-- Transactions in deadlock.
^-- Long running transactions (ignore if this is the case).
^-- Unreleased explicit locks.
Still waiting for initial partition map exchange [fut=GridDhtPartitionsExchangeFuture [firstDiscoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=4d885cfd-45ed-43a2-8088-f35c9469797f, addrs=
Server-side logs:
Possible starvation in striped pool. Thread name: sys-stripe-0-#1 Queue: [Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtTxPrepareResponse [nearEvicted=null, futId=869dd4ca361-fe7e167d-4d80-4f57-b004-13359a9f2c11, miniId=1, super=GridDistributedTxPrepareResponse [txState=null, part=-1, err=null, super=GridDistributedBaseMessage [ver=GridCacheVersion [topVer=139084030, order=1527604094903, nodeOrder=1], committedVers=null, rolledbackVers=null, cnt=0, super=GridCacheIdMessage [cacheId=0]]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtAtomicSingleUpdateRequest [key=KeyCacheObjectImpl [part=984, val=null, hasValBytes=true], val=BinaryObjectImpl [arr= true, ctx=false, start=0], prevVal=null, super=GridDhtAtomicAbstractUpdateRequest [onRes=false, nearNodeId=null, nearFutId=0, flags=]]]], o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$DeferredUpdateTimeout#2735c674, Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtTxPrepareRequest [nearNodeId=628e3078-17fd-4e49-b9ae-ad94ad97a2f1, futId=6576e4ca361-6e7cdac2-d5a3-4624-9ad3-b93f25546cc3, miniId=1, topVer=AffinityTopologyVersion [topVer=20, minorTopVer=0], invalidateNearEntries={}, nearWrites=null, owned=null, nearXidVer=GridCacheVersion [topVer=139084030, order=1527604094933, nodeOrder=2], subjId=628e3078-17fd-4e49-b9ae-ad94ad97a2f1, taskNameHash=0, preloadKeys=null, super=GridDistributedTxPrepareRequest [threadId=86, concurrency=OPTIMISTIC, isolation=READ_COMMITTED, writeVer=GridCacheVersion [topVer=139084030, order=1527604094935, nodeOrder=2], timeout=0, reads=null, writes=[IgniteTxEntry [key=BinaryObjectImpl [arr= true, ctx=false, start=0], cacheId=-1755241537, txKey=null, val=[op=UPDATE, val=BinaryObjectImpl [arr= true, ctx=false, start=0]], 
prevVal=[op=NOOP, val=null], oldVal=[op=NOOP, val=null], entryProcessorsCol=null, ttl=-1, conflictExpireTime=-1, conflictVer=null, explicitVer=null, dhtVer=null, filters=null, filtersPassed=false, filtersSet=false, entry=null, prepared=0, locked=false, nodeId=null, locMapped=false, expiryPlc=null, transferExpiryPlc=false, flags=0, partUpdateCntr=0, serReadVer=null, xidVer=null]], dhtVers=null, txSize=0, plc=2, txState=null, flags=onePhase|last, super=GridDistributedBaseMessage [ver=GridCacheVersion [topVer=139084030, order=1527604094933, nodeOrder=2], committedVers=null, rolledbackVers=null, cnt=0, super=GridCacheIdMessage [cacheId=0]]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtAtomicDeferredUpdateResponse [futIds=GridLongList [idx=2, arr=[65774,65775]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridNearAtomicSingleUpdateRequest [key=KeyCacheObjectImpl [part=1016, val=null, hasValBytes=true], parent=GridNearAtomicAbstractSingleUpdateRequest [nodeId=null, futId=49328, topVer=AffinityTopologyVersion [topVer=20, minorTopVer=0], parent=GridNearAtomicAbstractUpdateRequest [res=null, flags=needRes]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtAtomicDeferredUpdateResponse [futIds=GridLongList [idx=1, arr=[98591]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridDhtAtomicDeferredUpdateResponse [futIds=GridLongList [idx=1, arr=[114926]]]]], Message closure [msg=GridIoMessage [plc=2, topic=TOPIC_CACHE, topicOrd=8, ordered=false, timeout=0, skipOnTimeout=false, msg=GridNearAtomicSingleUpdateRequest [key=KeyCacheObjectImpl [part=1016, val=null, hasValBytes=true], parent=GridNearAtomicAbstractSingleUpdateRequest [nodeId=null, 
futId=32946, topVer=AffinityTopologyVersion [topVer=20, minorTopVer=0], parent=GridNear