Using Pandas UDF with Large Broadcast object - serialization

I am trying to use a Pandas UDF of type GROUPED_MAP to do some processing. The code is below:
from pyspark.sql.functions import pandas_udf, PandasUDFType, spark_partition_id

models_broadcast = self.spark_session.sparkContext.broadcast(models)

@pandas_udf("id string, score string", PandasUDFType.GROUPED_MAP)
def _segment_partition_score(segment_partition_pd):
    my_models = models_broadcast.value  # if this line is commented out, the code runs through
    segment_partition_pd["score"] = "aaa"
    segment_score_pd = segment_partition_pd[['id', 'score']]
    return segment_score_pd

model_score = my_data_df.withColumn("pid", spark_partition_id()).groupby('pid').apply(_segment_partition_score)
model_score.show(100)
Here is the error message. It simply states "java.net.SocketException: Connection reset":
Py4JJavaError: An error occurred while calling o1525.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 31.0 failed 4 times, most recent failure: Lost task 2.3 in stage 31.0 (TID 2466, 10.139.64.43, executor 10): java.net.SocketException: Connection reset
at java.net.SocketInputStream.read(SocketInputStream.java:210)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
at java.io.DataInputStream.readInt(DataInputStream.java:387)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:181)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:144)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:494)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:537)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:543)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2362)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2350)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2349)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2349)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1102)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1102)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1102)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2582)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2529)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2517)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:897)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2280)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:270)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:280)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:80)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:86)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollectResult(limit.scala:57)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectResult(Dataset.scala:2905)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3517)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2634)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2634)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3501)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3496)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1$$anonfun$apply$1.apply(SQLExecution.scala:112)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:217)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:98)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:835)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:74)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:169)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3496)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2634)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2848)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:279)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:316)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection reset
at java.net.SocketInputStream.read(SocketInputStream.java:210)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
at java.io.DataInputStream.readInt(DataInputStream.java:387)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:181)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:144)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:494)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:62)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:159)
at org.apache.spark.sql.execution.collect.Collector$$anonfun$2.apply(Collector.scala:158)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:140)
at org.apache.spark.scheduler.Task.run(Task.scala:113)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:537)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:543)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
The broadcast object is a pickled object of around 100 MB. It is a trained scikit-learn machine learning model.
As you may notice, I don't actually use this broadcast object inside the UDF _segment_partition_score; I only retrieve it there for debugging purposes. Even so, the PySpark program crashes.
If I don't access the broadcast object in the Pandas UDF, the program runs through and generates results.
The DataFrame "my_data_df" is very small (700 MB in Parquet). Given the existing cluster size (20 workers with 64 GB each), handling it should be no problem.
I strongly suspect the problem is serialization of the broadcast object, either its size or the type of serializer, but I have no idea what I should tune.
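For what it is worth, one workaround I am considering (not verified yet) is to not broadcast the models at all and instead load the pickle from shared storage inside the UDF, so that only a path string is captured by the closure. A rough sketch; the path below is a placeholder:
import pickle

MODELS_PATH = "/dbfs/tmp/models.pkl"  # hypothetical shared path visible to every worker

@pandas_udf("id string, score string", PandasUDFType.GROUPED_MAP)
def _segment_partition_score(segment_partition_pd):
    # Load the pickled models lazily on the worker instead of shipping them via a broadcast.
    with open(MODELS_PATH, "rb") as f:
        my_models = pickle.load(f)
    segment_partition_pd["score"] = "aaa"
    return segment_partition_pd[['id', 'score']]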
Could anyone point me in the right direction?
Thanks in advance.

Related

java.io.IOException: Error closing multipart upload

I am working on PySpark code which processes terabytes of data and writes to S3.
After processing the data, I get the error below:
Caused by: org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:257)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1405)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.io.IOException: Error closing multipart upload
at com.amazon.ws.emr.hadoop.fs.s3n.MultipartUploadOutputStream.doMultiPartUpload(MultipartUploadOutputStream.java:441)
at com.amazon.ws.emr.hadoop.fs.s3n.MultipartUploadOutputStream.close(MultipartUploadOutputStream.java:421)
at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:74)
at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:108)
at org.apache.parquet.hadoop.util.HadoopPositionOutputStream.close(HadoopPositionOutputStream.java:64)
at org.apache.parquet.hadoop.ParquetFileWriter.end(ParquetFileWriter.java:685)
at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:122)
at org.apache.parquet.hadoop.ParquetRecordWriter.close(ParquetRecordWriter.java:165)
at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.close(ParquetOutputWriter.scala:42)
I tried setting the configurations below, but I still get the same error:
self._spark_session.conf.set("spark.hadoop.fs.s3a.multipart.threshold", 2097152000)
self._spark_session.conf.set("spark.hadoop.fs.s3a.multipart.size", 104857600)
self._spark_session.conf.set("spark.hadoop.fs.s3a.connection.maximum", 500)
self._spark_session.conf.set("spark.hadoop.fs.s3a.connection.timeout", 600000)
self._spark_session.conf.set("spark.hadoop.fs.s3.maxRetries", 50)
Can someone please help me resolve this issue?

Spark job gets stuck indefinitely while trying to fetch data from Ignite cluster

private static final ThreadLocal<IgniteClient> igniteClientContext = new ThreadLocal<>();

public static IgniteClient getIgniteClient(String[] address) {
    if (igniteClientContext.get() == null) {
        ClientConfiguration clientConfig = null;
        if (cfg == null) {
            clientConfig = new ClientConfiguration().setAddresses(address);
        } else {
            clientConfig = cfg;
        }
        IgniteClient igniteClient = Ignition.startClient(clientConfig);
        logger.info("igniteClient initialized");
        igniteClientContext.set(igniteClient);
    }
    return igniteClientContext.get();
}
From the Spark code, I'm trying to create an instance of the Ignite thin client and create a cache object.
val address = config.igniteServers.split(",") // config.igniteServers ="10.xx.xxx.xxx:10800,10.xx.xx.xxx:10800"
The code below is called from the Spark executors. We process a set of records in each executor, and we only read data from the cache to compare against the record currently being processed. If the record is already present in the cache, we ignore it; otherwise we consume it.
val cacheCfg = new ClientCacheConfiguration()
  .setName(PNR_CACHE)
  .setCacheMode(CacheMode.REPLICATED)
  .setWriteSynchronizationMode(CacheWriteSynchronizationMode.FULL_SYNC)
  .setDefaultLockTimeout(30000)
val igniteClient = IgniteHelper.getIgniteClient(address)
val cache: ClientCache[Long, Boolean] = igniteClient.getOrCreateCache(cacheCfg)
At the end of the job, we update the cache with all valid records.
This ran fine for a couple of runs, but at some point it gets stuck indefinitely while trying to read data from the cache.
In the executor logs, I can see a ClientConnectionException saying the Ignite cluster is unavailable.
org.apache.ignite.client.ClientConnectionException: Ignite cluster is unavailable [sock=Socket[addr=hdpct2ldap01g02.hadoop.sgdcprod.XXXX.com/10.xx.xx.xx,port=10800,localport=20214]]
at org.apache.ignite.internal.client.thin.TcpClientChannel.handleIOError(TcpClientChannel.java:499)
at org.apache.ignite.internal.client.thin.TcpClientChannel.handleIOError(TcpClientChannel.java:491)
at org.apache.ignite.internal.client.thin.TcpClientChannel.access$100(TcpClientChannel.java:92)
at org.apache.ignite.internal.client.thin.TcpClientChannel$ByteCountingDataInput.read(TcpClientChannel.java:538)
at org.apache.ignite.internal.client.thin.TcpClientChannel$ByteCountingDataInput.readInt(TcpClientChannel.java:572)
at org.apache.ignite.internal.client.thin.TcpClientChannel.processNextResponse(TcpClientChannel.java:272)
at org.apache.ignite.internal.client.thin.TcpClientChannel.receive(TcpClientChannel.java:234)
at org.apache.ignite.internal.client.thin.TcpClientChannel.service(TcpClientChannel.java:171)
at org.apache.ignite.internal.client.thin.ReliableChannel.service(ReliableChannel.java:160)
at org.apache.ignite.internal.client.thin.ReliableChannel.request(ReliableChannel.java:187)
at org.apache.ignite.internal.client.thin.TcpIgniteClient.getOrCreateCache(TcpIgniteClient.java:124)
at com.XXXX.eda.pnr.PnrApplication$$anonfun$2$$anonfun$apply$4.apply(PnrApplication.scala:305)
at com.XXXX.eda.pnr.PnrApplication$$anonfun$2$$anonfun$apply$4.apply(PnrApplication.scala:297)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:217)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1094)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1020)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:811)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:381)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection timed out (Read failed)
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at org.apache.ignite.internal.client.thin.TcpClientChannel$ByteCountingDataInput.read(TcpClientChannel.java:535)
... 24 more
20/06/21 05:49:42 INFO executor.CoarseGrainedExecutorBackend: Got assigned task 1949
20/06/21 05:49:42 INFO executor.Executor: Running task 103.1 in stage 25.0 (TID 1949)
The thread dump contains the exception below as well.
20/06/21 05:51:57 WARN hdfs.BlockReaderFactory: I/O error constructing remote block reader.
java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.93.133.157:20952 remote=host.hadoop.sgdcprod.XXXX.com/10.93.133.136:1004]
at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:118)
at java.io.FilterInputStream.read(FilterInputStream.java:83)
at org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed(PBHelper.java:2354)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil.readSaslMessage(DataTransferSaslUtil.java:212)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.doSaslHandshake(SaslDataTransferClient.java:451)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.getEncryptedStreams(SaslDataTransferClient.java:299)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.send(SaslDataTransferClient.java:242)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.checkTrustAndSend(SaslDataTransferClient.java:211)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.peerSend(SaslDataTransferClient.java:160)
at org.apache.hadoop.hdfs.net.TcpPeerServer.peerFromSocketAndKey(TcpPeerServer.java:92)
at org.apache.hadoop.hdfs.DFSClient.newConnectedPeer(DFSClient.java:3593)
at org.apache.hadoop.hdfs.BlockReaderFactory.nextTcpPeer(BlockReaderFactory.java:849)
at org.apache.hadoop.hdfs.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:764)
at org.apache.hadoop.hdfs.BlockReaderFactory.build(BlockReaderFactory.java:377)
at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:666)
at org.apache.hadoop.hdfs.DFSInputStream.seekToBlockSource(DFSInputStream.java:1663)
at org.apache.hadoop.hdfs.DFSInputStream.readBuffer(DFSInputStream.java:877)
at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:913)
at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:981)
at org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:197)
at java.io.DataInputStream.read(DataInputStream.java:149)
at org.apache.avro.mapred.FsInput.read(FsInput.java:54)
at org.apache.avro.file.DataFileReader$SeekableInputStream.read(DataFileReader.java:210)
at org.apache.avro.io.BinaryDecoder$InputStreamByteSource.readRaw(BinaryDecoder.java:824)
at org.apache.avro.io.BinaryDecoder.doReadBytes(BinaryDecoder.java:349)
at org.apache.avro.io.BinaryDecoder.readFixed(BinaryDecoder.java:302)
at org.apache.avro.file.DataFileStream.nextRawBlock(DataFileStream.java:293)
at org.apache.avro.file.DataFileStream.hasNext(DataFileStream.java:198)
at com.databricks.spark.avro.DefaultSource$$anonfun$buildReader$1$$anon$1.hasNext(DefaultSource.scala:215)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1094)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1020)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:811)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:381)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/06/21 05:51:57 WARN hdfs.DFSClient: Failed to connect to host.hadoop.sgdcprod.XXXX.com/10.93.133.136:1004 for block BP-1009813635-10.93.133.107-1555169940973:blk_1182405155_108738113, add to deadNodes and continue.
java.net.SocketTimeoutException: 60000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.93.133.157:20952 remote=host.hadoop.sgdcprod.XXXX.com/10.93.133.136:1004]
at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:118)
at java.io.FilterInputStream.read(FilterInputStream.java:83)
at org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed(PBHelper.java:2354)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil.readSaslMessage(DataTransferSaslUtil.java:212)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.doSaslHandshake(SaslDataTransferClient.java:451)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.getEncryptedStreams(SaslDataTransferClient.java:299)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.send(SaslDataTransferClient.java:242)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.checkTrustAndSend(SaslDataTransferClient.java:211)
at org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient.peerSend(SaslDataTransferClient.java:160)
at org.apache.hadoop.hdfs.net.TcpPeerServer.peerFromSocketAndKey(TcpPeerServer.java:92)
at org.apache.hadoop.hdfs.DFSClient.newConnectedPeer(DFSClient.java:3593)
at org.apache.hadoop.hdfs.BlockReaderFactory.nextTcpPeer(BlockReaderFactory.java:849)
at org.apache.hadoop.hdfs.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:764)
at org.apache.hadoop.hdfs.BlockReaderFactory.build(BlockReaderFactory.java:377)
at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:666)
at org.apache.hadoop.hdfs.DFSInputStream.seekToBlockSource(DFSInputStream.java:1663)
at org.apache.hadoop.hdfs.DFSInputStream.readBuffer(DFSInputStream.java:877)
at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:913)
at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:981)
at org.apache.hadoop.crypto.CryptoInputStream.read(CryptoInputStream.java:197)
at java.io.DataInputStream.read(DataInputStream.java:149)
at org.apache.avro.mapred.FsInput.read(FsInput.java:54)
at org.apache.avro.file.DataFileReader$SeekableInputStream.read(DataFileReader.java:210)
at org.apache.avro.io.BinaryDecoder$InputStreamByteSource.readRaw(BinaryDecoder.java:824)
at org.apache.avro.io.BinaryDecoder.doReadBytes(BinaryDecoder.java:349)
at org.apache.avro.io.BinaryDecoder.readFixed(BinaryDecoder.java:302)
at org.apache.avro.file.DataFileStream.nextRawBlock(DataFileStream.java:293)
at org.apache.avro.file.DataFileStream.hasNext(DataFileStream.java:198)
at com.databricks.spark.avro.DefaultSource$$anonfun$buildReader$1$$anon$1.hasNext(DefaultSource.scala:215)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1094)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1020)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1085)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:811)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:381)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
We set defaultReadTimeout in the spark.properties file, but the timeout does not take effect correctly.
spark.executor.extraJavaOptions=-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC -Dsun.net.client.defaultReadTimeout:300000 -Dsun.net.client.defaultConnectTimeout=300000 -DIGNITE_REST_START_ON_CLIENT=true
spark.driver.exetraJavaOptions=-Dsun.net.client.defaultReadTimeout:300000 -Dsun.net.client.defaultConnectTimeout=300000 -DIGNITE_REST_START_ON_CLIENT=true
Please help in resolving the issue.
Ignite versions used: 2.8.0 and 2.8.1
There is some connectivity problem between your Spark cluster and both HDFS and the Ignite cluster:
HDFS:
WARN hdfs.DFSClient: Failed to connect to host.hadoop.sgdcprod.XXXX.com/10.93.133.136:1004 for block BP-1009813635-10.93.133.107-1555169940973:blk_1182405155_108738113, add to deadNodes and continue.
And for Ignite:
Caused by: java.net.SocketException: Connection timed out (Read failed)
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
Given that your Spark job wasn't able to connect to either of them, I suspect you have a connectivity problem.
Please check the following for HDFS:
1) Some of your Spark workers may not be able to connect to the host.hadoop.sgdcprod.XXXX.com/10.93.133.136:1004 address because of closed ports or other connectivity issues. You can check this address with a tool like netstat (see the sketch at the end of this answer).
2) You may be using an incorrect HDFS port. Please check that the namenode HDFS URL used in your Spark job code is correct.
3) Something may be wrong with the configuration of your HDFS workers. Perhaps some of them can't connect to the namenode.
And for Ignite:
1) Check that your cluster is alive and not hanging. Presumably it was started externally and can be observed with some monitoring tool.
2) Check that the server addresses from the IP finder can be resolved from every Spark worker.
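To make the "check the address from the workers" advice concrete, here is a rough sketch of probing the Ignite port from every Spark worker. It is written in PySpark only for brevity (your job is in Scala, but the same idea applies from spark-shell), it assumes a live SparkContext named sc, and the addresses are placeholders:
import socket

def probe(host_port):
    host, port = host_port
    try:
        socket.create_connection((host, port), timeout=5).close()
        return (host, port, "ok")
    except OSError as exc:
        return (host, port, "failed: %s" % exc)

# Placeholder addresses; use the real ones from config.igniteServers.
targets = [("10.xx.xxx.xxx", 10800), ("10.xx.xx.xxx", 10800)]
# Repeat the targets so the probe runs in many tasks and is likely to hit every worker.
print(sc.parallelize(targets * 20, len(targets) * 20).map(probe).collect())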

Why is my RabbitMQ message impossible to serialize using Apache Beam?

I'm trying to read a RabbitMQ queue using Apache Beam.
I've written some transformation code to have messages written to Kafka.
I've even tested my scenario using simple text messages.
Now I am trying to deploy it with the actual messages this transformer is meant to run on. These are JSON messages of quite moderate size.
Strangely, when I try to read the "production" messages, I get this exception stack trace.
java.lang.IllegalArgumentException: Unable to encode element 'ValueWithRecordId{id=[], value=org.apache.beam.sdk.io.rabbitmq.RabbitMqMessage@f179a7f}' with coder 'ValueWithRecordId$ValueWithRecordIdCoder(org.apache.beam.sdk.coders.SerializableCoder@76190fb2)'.
org.apache.beam.sdk.coders.Coder.getEncodedElementByteSize(Coder.java:300)
org.apache.beam.sdk.coders.Coder.registerByteSizeObserver(Coder.java:291)
org.apache.beam.sdk.util.WindowedValue$FullWindowedValueCoder.registerByteSizeObserver(WindowedValue.java:564)
org.apache.beam.sdk.util.WindowedValue$FullWindowedValueCoder.registerByteSizeObserver(WindowedValue.java:480)
org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$ElementByteSizeObservableCoder.registerByteSizeObserver(IntrinsicMapTaskExecutorFactory.java:400)
org.apache.beam.runners.dataflow.worker.util.common.worker.OutputObjectAndByteCounter.update(OutputObjectAndByteCounter.java:125)
org.apache.beam.runners.dataflow.worker.DataflowOutputCounter.update(DataflowOutputCounter.java:64)
org.apache.beam.runners.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:43)
org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:201)
org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation.start(ReadOperation.java:159)
org.apache.beam.runners.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:77)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.process(StreamingDataflowWorker.java:1283)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.access$1000(StreamingDataflowWorker.java:147)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker$6.run(StreamingDataflowWorker.java:1020)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: com.rabbitmq.client.impl.LongStringHelper$ByteArrayLongString
java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1184)
java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
java.util.HashMap.internalWriteEntries(HashMap.java:1785)
java.util.HashMap.writeObject(HashMap.java:1362)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:498)
java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:1028)
java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
org.apache.beam.sdk.coders.SerializableCoder.encode(SerializableCoder.java:183)
org.apache.beam.sdk.coders.SerializableCoder.encode(SerializableCoder.java:53)
org.apache.beam.sdk.values.ValueWithRecordId$ValueWithRecordIdCoder.encode(ValueWithRecordId.java:105)
org.apache.beam.sdk.values.ValueWithRecordId$ValueWithRecordIdCoder.encode(ValueWithRecordId.java:99)
org.apache.beam.sdk.values.ValueWithRecordId$ValueWithRecordIdCoder.encode(ValueWithRecordId.java:81)
org.apache.beam.sdk.coders.Coder.getEncodedElementByteSize(Coder.java:297)
org.apache.beam.sdk.coders.Coder.registerByteSizeObserver(Coder.java:291)
org.apache.beam.sdk.util.WindowedValue$FullWindowedValueCoder.registerByteSizeObserver(WindowedValue.java:564)
org.apache.beam.sdk.util.WindowedValue$FullWindowedValueCoder.registerByteSizeObserver(WindowedValue.java:480)
org.apache.beam.runners.dataflow.worker.IntrinsicMapTaskExecutorFactory$ElementByteSizeObservableCoder.registerByteSizeObserver(IntrinsicMapTaskExecutorFactory.java:400)
org.apache.beam.runners.dataflow.worker.util.common.worker.OutputObjectAndByteCounter.update(OutputObjectAndByteCounter.java:125)
org.apache.beam.runners.dataflow.worker.DataflowOutputCounter.update(DataflowOutputCounter.java:64)
org.apache.beam.runners.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:43)
org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:201)
org.apache.beam.runners.dataflow.worker.util.common.worker.ReadOperation.start(ReadOperation.java:159)
org.apache.beam.runners.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:77)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.process(StreamingDataflowWorker.java:1283)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker.access$1000(StreamingDataflowWorker.java:147)
org.apache.beam.runners.dataflow.worker.StreamingDataflowWorker$6.run(StreamingDataflowWorker.java:1020)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
java.lang.Thread.run(Thread.java:745)
My understanding is that the RabbitMQ reader considers the messages big enough to require the use of LongString, which is not serializable.
Am I right on this point? And if so, how do I tell RabbitMQ to use a plain String (which would be enough for these messages)?
This is a known Apache Beam issue (https://issues.apache.org/jira/browse/BEAM-7414) whose fix has, out of pure laziness on my part (which is bad), not yet been merged into the Apache Beam repo. If someone wants the fix immediately, it is possible to build my branch: https://github.com/Riduidel/beam/tree/fix/rabbitmq-message-not-serializable

Spark 2.2 toPandas() is returning Py4JError

I am currently refactoring some PySpark code, and this specific snippet is causing me issues even though it ran fine before:
zip_table=spark.read.format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat").schema(schemas.zip_schema).load(path_to_file,header=True)
zip_pandas = zip_table.toPandas()
running into:
Py4JJavaError: An error occurred while calling o167.get.
: java.util.NoSuchElementException: spark.sql.execution.pandas.respectSessionTimeZone
at org.apache.spark.sql.internal.SQLConf$$anonfun$getConfString$2.apply(SQLConf.scala:1175)
at org.apache.spark.sql.internal.SQLConf$$anonfun$getConfString$2.apply(SQLConf.scala:1175)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.internal.SQLConf.getConfString(SQLConf.scala:1175)
at org.apache.spark.sql.RuntimeConfig.get(RuntimeConfig.scala:74)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Other Spark DataFrames can call toPandas() successfully, while others such as this one return the same error.
The file in question is not particularly big and should fit in the driver without a problem. Also, I know it might be better to load it into Pandas directly rather than convert from Spark, but there is further logic in the code that requires both Spark and Pandas DataFrames (a later re-architecture of this work will address that).
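One thing I still need to rule out (just a hunch, not confirmed): as far as I can tell, spark.sql.execution.pandas.respectSessionTimeZone only exists from Spark 2.3 onwards, so a newer pyspark Python package talking to a 2.2 cluster could ask the JVM for a conf entry it does not know. A quick version check:
import pyspark

# Compare the Python-side package version with the JVM-side Spark version; a mismatch
# (e.g. a 2.3.x pyspark package against a 2.2 cluster) would explain the JVM not knowing
# the spark.sql.execution.pandas.respectSessionTimeZone entry.
print("pyspark package:", pyspark.__version__)
print("spark (JVM)    :", spark.version)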

Hive error: java.io.IOException: Couldn't create proxy provider class org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider

I have a Hive query which sometimes runs successfully, but most of the time it fails with the error "java.io.IOException: Couldn't create proxy provider class org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider".
Below is my error log:
java.lang.RuntimeException: java.io.IOException: Couldn't create proxy provider class org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
    at org.apache.hadoop.mapred.lib.CombineFileInputFormat.isSplitable(CombineFileInputFormat.java:154)
    at org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.getMoreSplits(CombineFileInputFormat.java:283)
    at org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.getSplits(CombineFileInputFormat.java:239)
    at org.apache.hadoop.mapred.lib.CombineFileInputFormat.getSplits(CombineFileInputFormat.java:75)
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getSplits(HadoopShimsSecure.java:336)
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileInputFormatShim.getSplits(HadoopShimsSecure.java:302)
    at org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getSplits(CombineHiveInputFormat.java:435)
    at org.apache.hadoop.mapreduce.JobSubmitter.writeOldSplits(JobSubmitter.java:525)
    at org.apache.hadoop.mapreduce.JobSubmitter.writeSplits(JobSubmitter.java:517)
    at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:399)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1295)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1292)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
    at org.apache.hadoop.mapreduce.Job.submit(Job.java:1292)
    at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:564)
    at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:559)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1614)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:559)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:550)
    at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:420)
    at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:136)
    at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:153)
    at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:85)
    at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1516)
    at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1283)
    at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1101)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:924)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:914)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:269)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:221)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:431)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:367)
    at org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:464)
    at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:474)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:756)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:694)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:633)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:212)
Caused by: java.io.IOException: Couldn't create proxy provider class org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
    at org.apache.hadoop.hdfs.NameNodeProxies.createFailoverProxyProvider(NameNodeProxies.java:475)
    at org.apache.hadoop.hdfs.NameNodeProxies.createProxy(NameNodeProxies.java:148)
    at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:632)
    at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:570)
    at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:147)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2596)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:367)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:169)
    at org.apache.hadoop.mapred.lib.CombineFileInputFormat.isSplitable(CombineFileInputFormat.java:151)
    ... 45 more
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.GeneratedConstructorAccessor32.newInstance(Unknown Source)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
    at org.apache.hadoop.hdfs.NameNodeProxies.createFailoverProxyProvider(NameNodeProxies.java:458)
    ... 53 more
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOf(Arrays.java:2219)
    at java.util.ArrayList.grow(ArrayList.java:242)
    at java.util.ArrayList.ensureExplicitCapacity(ArrayList.java:216)
    at java.util.ArrayList.ensureCapacityInternal(ArrayList.java:208)
    at java.util.ArrayList.add(ArrayList.java:440)
    at java.lang.String.split(String.java:2288)
    at sun.net.util.IPAddressUtil.textToNumericFormatV4(IPAddressUtil.java:47)
    at java.net.InetAddress.getAllByName(InetAddress.java:1129)
    at java.net.InetAddress.getAllByName(InetAddress.java:1098)
    at java.net.InetAddress.getByName(InetAddress.java:1048)
    at org.apache.hadoop.security.SecurityUtil$StandardHostResolver.getByName(SecurityUtil.java:474)
    at org.apache.hadoop.security.SecurityUtil.getByName(SecurityUtil.java:461)
    at org.apache.hadoop.net.NetUtils.createSocketAddrForHost(NetUtils.java:235)
    at org.apache.hadoop.net.NetUtils.createSocketAddr(NetUtils.java:215)
    at org.apache.hadoop.net.NetUtils.createSocketAddr(NetUtils.java:163)
    at org.apache.hadoop.net.NetUtils.createSocketAddr(NetUtils.java:152)
    at org.apache.hadoop.hdfs.DFSUtil.getAddressesForNameserviceId(DFSUtil.java:677)
    at org.apache.hadoop.hdfs.DFSUtil.getAddressesForNsIds(DFSUtil.java:645)
    at org.apache.hadoop.hdfs.DFSUtil.getAddresses(DFSUtil.java:628)
    at org.apache.hadoop.hdfs.DFSUtil.getHaNnRpcAddresses(DFSUtil.java:727)
    at org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider.<init>(ConfiguredFailoverProxyProvider.java:88)
    at sun.reflect.GeneratedConstructorAccessor32.newInstance(Unknown Source)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
    at org.apache.hadoop.hdfs.NameNodeProxies.createFailoverProxyProvider(NameNodeProxies.java:458)
    at org.apache.hadoop.hdfs.NameNodeProxies.createProxy(NameNodeProxies.java:148)
    at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:632)
    at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:570)
    at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:147)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2596)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:367)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:169)
Job Submission failed with exception 'java.lang.RuntimeException(java.io.IOException: Couldn't create proxy provider class org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider)'
Could anyone tell me why this happens?
I just stumbled across a similar exception myself, and increasing the Hive client heap didn't help. I found I was able to clear up the OutOfMemoryError (GC overhead limit exceeded) by adding a partition column to the WHERE clause of the query, so I've concluded that having a very large number of splits is causing this exception. I haven't dug into the code, but I have seen string concatenation in a loop trigger GC thrashing before, and something similar might be happening in the CombineHiveInputFormat.getSplits method.