Spark 3 is failing when I try to execute a simple query

I have this table in Hive:
CREATE TABLE `mydb`.`raw_sales` (
`combustivel` STRING,
`regiao` STRING,
`estado` STRING,
`jan` STRING,
`fev` STRING,
`mar` STRING,
`abr` STRING,
`mai` STRING,
`jun` STRING,
`jul` STRING,
`ago` STRING,
`set` STRING,
`out` STRING,
`nov` STRING,
`dez` STRING,
`total` STRING,
`created_at` TIMESTAMP,
`ano` STRING)
USING orc
LOCATION 'hdfs://localhost:9000/jobs/etl/tables/raw_sales.orc'
TBLPROPERTIES (
'transient_lastDdlTime' = '1601322056',
'ORC.COMPRESS' = 'SNAPPY')
There is data in the table, but when I try the query below:
spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = '2000' AND combustivel like '%GASOLINA%'").show()
It's crashing!
>>> spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = 2000 AND combustivel like '%GASOLINA%'").show() [164/3679]
20/09/28 19:25:30 ERROR executor.Executor: Exception in task 0.0 in stage 61.0 (TID 133)
java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/09/28 19:25:30 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 61.0 (TID 133, 639773a482b8, executor driver): java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/09/28 19:25:30 ERROR scheduler.TaskSetManager: Task 0 in stage 61.0 failed 1 times; aborting job
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/spark/current/python/pyspark/sql/dataframe.py", line 440, in show
print(self._jdf.showString(n, 20, vertical))
File "/opt/spark/current/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/opt/spark/current/python/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/opt/spark/current/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o122.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 61.0 failed 1 times, most recent failure: Lost task 0.0 in stage 61.0 (TID 133, 639773a482b8, executor driver): java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:467)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
at sun.reflect.GeneratedMethodAccessor64.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
However, this query below works!
spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = 2000").show()
And this works too!
spark.sql("SELECT combustivel FROM mydb.raw_sales WHERE combustivel like '%GASOLINA C%' ").show()
My environment
Hadoop 2.10.0
Hive 2.3.7
Tez 0.9.2
Spark 3.0.1
The result is the same for PySpark and Scala.
I'm completely lost here! Maybe you can help me!
Thanks!

I've changed the 'ano' field's data type to 'int'. It's not an ideal solution, since even using 'cast' was not enough to resolve the issue! I have a good friend who is a monster with Spark, and he says it is possibly a bug in Spark. I'm not sure about that, but I would appreciate it if someone else could corroborate this issue!
Anyway, changing the data type to 'int' works!
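For completeness, a sketch of the workaround that avoids touching the table at all: since the crash happens while Spark builds the pushed-down ORC filter, disabling ORC predicate pushdown for the session skips that code path entirely (I have not measured what losing pushdown costs on this table):
# Disable ORC predicate pushdown so OrcFilters.createFilter is never called.
spark.conf.set("spark.sql.orc.filterPushdown", "false")
spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = '2000' AND combustivel like '%GASOLINA%'").show()
# The same key can go into spark-defaults.conf to make it permanent:
# spark.sql.orc.filterPushdown false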

Related

Convert spark.read.parquet into Pandas DataFrame

I am working on converting snappy.parquet files into a Pandas DataFrame. I was able to load in all of my parquet files, but once I tried to convert them to Pandas, it failed. Please see the code below.
from pathlib import Path
import os
import pandas as pd
# Initiate findspark instance to run pyspark
import findspark
findspark.init()
# Importing PySpark
import pyspark
# Create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
parquetFile_all = spark.read.parquet("Resources")
display(parquetFile_all)
parquetFile_all.count()
The output of the last line of code is: 12216053, which Pandas should be able to handle.
Once I ran the following code:
file_output_all = spark.sql("SELECT * FROM parquetFile_all")
all_files_df = file_output_all.select("*").toPandas()
That's where it breaks with the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-83baa41f2e6a> in <module>
1 # Convert to Pandas df
----> 2 all_files_df = file_output_all.select("*").toPandas()
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/dataframe.py in toPandas(self)
2141
2142 # Below is toPandas without Arrow optimization.
-> 2143 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
2144
2145 dtype = {}
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/dataframe.py in collect(self)
532 """
533 with SCCallSiteSync(self._sc) as css:
--> 534 sock_info = self._jdf.collectToPython()
535 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
536
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o45.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 9.0 failed 1 times, most recent failure: Lost task 2.0 in stage 9.0 (TID 335, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3236)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at java.io.DataOutputStream.write(DataOutputStream.java:107)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:258)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3236)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at java.io.DataOutputStream.write(DataOutputStream.java:107)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:258)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
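The usual mitigations for this kind of collect-side OutOfMemoryError are to give the driver more heap, switch toPandas() to the Arrow path, or avoid collecting all 12 million rows at once. A rough sketch with illustrative values only (spark.driver.memory must be set before the JVM starts, e.g. in a fresh session or via spark-submit):
from pyspark.sql import SparkSession

# Illustrative settings: more driver heap and Arrow-based toPandas (Spark 2.4 conf name).
spark = (SparkSession.builder
         .config("spark.driver.memory", "8g")
         .config("spark.sql.execution.arrow.enabled", "true")
         .getOrCreate())

parquetFile_all = spark.read.parquet("Resources")

# Collect only what pandas really needs instead of the full 12 million rows.
sample_df = parquetFile_all.limit(100000).toPandas()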

Insertion into hive ORC format table from other hive formats cannot be selected from

After creating a new Hive external table in ORC format (bucketed) and inserting into it from another table with the exact same schema but in Avro format (and non-bucketed), selecting from the new table raises many errors. I put the error stack here (some entries are repeated, and I had to delete some from the end due to lack of space):
Status: Failed
Vertex failed, vertexName=Map 1, vertexId=vertex_1572541024266_0020_1_00, diagnostics=[Task failed, taskId=task_1572541024266_0020_1_00_000000, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_0:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file: /path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 44 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0
at org.apache.orc.impl.SerializationUtils.readFully(SerializationUtils.java:119)
at org.apache.orc.impl.SerializationUtils.readLongLE(SerializationUtils.java:102)
at org.apache.orc.impl.SerializationUtils.readDouble(SerializationUtils.java:98)
at org.apache.orc.impl.TreeReaderFactory$DoubleTreeReader.nextVector(TreeReaderFactory.java:762)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextVector(TreeReaderFactory.java:1833)
at org.apache.orc.impl.TreeReaderFactory$ListTreeReader.nextVector(TreeReaderFactory.java:2001)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:1815)
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1184)
... 27 more
], TaskAttempt 1 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_1:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file: /path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 44 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0
at org.apache.orc.impl.SerializationUtils.readFully(SerializationUtils.java:119)
at org.apache.orc.impl.SerializationUtils.readLongLE(SerializationUtils.java:102)
at org.apache.orc.impl.SerializationUtils.readDouble(SerializationUtils.java:98)
at org.apache.orc.impl.TreeReaderFactory$DoubleTreeReader.nextVector(TreeReaderFactory.java:762)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextVector(TreeReaderFactory.java:1833)
at org.apache.orc.impl.TreeReaderFactory$ListTreeReader.nextVector(TreeReaderFactory.java:2001)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:1815)
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1184)
... 27 more
], TaskAttempt 2 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_2:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file: /path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 44 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0
at org.apache.orc.impl.SerializationUtils.readFully(SerializationUtils.java:119)
at org.apache.orc.impl.SerializationUtils.readLongLE(SerializationUtils.java:102)
at org.apache.orc.impl.SerializationUtils.readDouble(SerializationUtils.java:98)
at org.apache.orc.impl.TreeReaderFactory$DoubleTreeReader.nextVector(TreeReaderFactory.java:762)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextVector(TreeReaderFactory.java:1833)
at org.apache.orc.impl.TreeReaderFactory$ListTreeReader.nextVector(TreeReaderFactory.java:2001)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:1815)
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1184)
... 27 more
], TaskAttempt 3 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_3:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file: /path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 44 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0
at org.apache.orc.impl.SerializationUtils.readFully(SerializationUtils.java:119)
at org.apache.orc.impl.SerializationUtils.readLongLE(SerializationUtils.java:102)
at org.apache.orc.impl.SerializationUtils.readDouble(SerializationUtils.java:98)
at org.apache.orc.impl.TreeReaderFactory$DoubleTreeReader.nextVector(TreeReaderFactory.java:762)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextVector(TreeReaderFactory.java:1833)
at org.apache.orc.impl.TreeReaderFactory$ListTreeReader.nextVector(TreeReaderFactory.java:2001)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:1815)
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1184)
... 27 more
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_1572541024266_0020_1_00 [Map 1] killed/failed due to:OWN_TASK_FAILURE]
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, vertexId=vertex_1572541024266_0020_1_00, diagnostics=[Task failed, taskId=task_1572541024266_0020_1_00_000000, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_0:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file: /path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Caused by: java.io.EOFException: Read past EOF for compressed stream Stream for column 44 kind DATA position: 0 length: 0 range: 0 offset: 0 limit: 0
at org.apache.orc.impl.SerializationUtils.readFully(SerializationUtils.java:119)
at org.apache.orc.impl.SerializationUtils.readLongLE(SerializationUtils.java:102)
at org.apache.orc.impl.SerializationUtils.readDouble(SerializationUtils.java:98)
at org.apache.orc.impl.TreeReaderFactory$DoubleTreeReader.nextVector(TreeReaderFactory.java:762)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextVector(TreeReaderFactory.java:1833)
at org.apache.orc.impl.TreeReaderFactory$ListTreeReader.nextVector(TreeReaderFactory.java:2001)
at org.apache.orc.impl.TreeReaderFactory$StructTreeReader.nextBatch(TreeReaderFactory.java:1815)
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1184)
... 27 more
], TaskAttempt 1 failed, info=[Error: Error while running task ( failure ) : attempt_1572541024266_0020_1_00_000000_1:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:211)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:168)
at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:374)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:73)
at org.apache.tez.runtime.task.TaskRunner2Callable$1.run(TaskRunner2Callable.java:61)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1893)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:61)
at org.apache.tez.runtime.task.TaskRunner2Callable.callInternal(TaskRunner2Callable.java:37)
at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:74)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:419)
at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:185)
... 14 more
Caused by: java.io.IOException: java.io.IOException: Error reading file: /path/000003_0
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:365)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:116)
at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:151)
at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:116)
at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:62)
... 16 more
Caused by: java.io.IOException: Error reading file:/path/000003_0
at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1191)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.ensureBatch(RecordReaderImpl.java:77)
at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.hasNext(RecordReaderImpl.java:93)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:238)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.next(OrcInputFormat.java:213)
at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:360)
... 22 more
Any suggestions on how to resolve this issue?
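One way to narrow this down is to point an ORC-capable reader (Spark works) directly at the file named in the stack trace and compare the schema it actually carries with the new table's DDL; a minimal sketch, reusing the placeholder path from the trace:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# "/path/000003_0" is the placeholder path from the stack trace, not a real location.
df = spark.read.orc("/path/000003_0")
df.printSchema()     # compare column count/types with the new ORC table's DDL
df.limit(5).show()   # fails here as well if the file itself is truncated or corrupt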

Unicode error while reading data from file/rdd

I am trying to create a DataFrame with the proper schema after fetching data from a text file. In the RDD all data types are strings; however, one of the fields is an integer, and I want to ensure it is created as an integer. So I created a StructType and created the DataFrame, but it throws the error below.
Error Message:
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call
last) in ()
----> 1 df.show()
/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/dataframe.pyc
in show(self, n, truncate, vertical)
376 """
377 if isinstance(truncate, bool) and truncate:
--> 378 print(self._jdf.showString(n, 20, vertical))
379 else:
380 print(self._jdf.showString(n, int(truncate), vertical))
/Applications/anaconda2/lib/python2.7/site-packages/py4j/java_gateway.pyc
in call(self, *args) 1284 answer =
self.gateway_client.send_command(command) 1285 return_value
= get_return_value(
-> 1286 answer, self.gateway_client, self.target_id, self.name) 1287 1288 for temp_arg in temp_args:
/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.pyc
in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/Applications/anaconda2/lib/python2.7/site-packages/py4j/protocol.pyc
in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o64.showString. :
org.apache.spark.SparkException: Job aborted due to stage failure:
Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0
in stage 3.0 (TID 5, localhost, executor driver):
org.apache.spark.api.python.PythonException: Traceback (most recent
call last): File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 377, in main
process() File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile) File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py",
line 393, in dump_stream
vs = list(itertools.islice(iterator, batch)) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py",
line 99, in wrapper
return f(*args, **kwargs) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py",
line 730, in prepare
verify_func(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1370, in verify_struct
verifier(v) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1315, in verify_integer
verify_acceptable_types(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1278, in verify_acceptable_types
% (dataType, obj, type(obj)))) TypeError: field id: IntegerType can not accept object u'1' in type <type 'unicode'>
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source) at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.run(Task.scala:121) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace: at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257) at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) at
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
at
org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
at
org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at
org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
at
org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363) at
org.apache.spark.sql.Dataset.head(Dataset.scala:2544) at
org.apache.spark.sql.Dataset.take(Dataset.scala:2758) at
org.apache.spark.sql.Dataset.getRows(Dataset.scala:254) at
org.apache.spark.sql.Dataset.showString(Dataset.scala:291) at
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at
py4j.Gateway.invoke(Gateway.java:282) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:238) at
java.lang.Thread.run(Thread.java:748) Caused by:
org.apache.spark.api.python.PythonException: Traceback (most recent
call last): File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 377, in main
process() File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile) File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py",
line 393, in dump_stream
vs = list(itertools.islice(iterator, batch)) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py",
line 99, in wrapper
return f(*args, **kwargs) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py",
line 730, in prepare
verify_func(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1370, in verify_struct
verifier(v) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1315, in verify_integer
verify_acceptable_types(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1278, in verify_acceptable_types
% (dataType, obj, type(obj)))) TypeError: field id: IntegerType can not accept object u'1' in type <type 'unicode'>
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source) at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.run(Task.scala:121) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
#!/usr/bin/env python
# coding: utf-8

# In[11]:
import os
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()

# In[12]:
# Reads data from file and creates rdd
rdd = sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')

# In[13]:
type(rdd)

# In[14]:
rdd_data = rdd.map(lambda p: p.split(","))

# In[15]:
rdd_data.collect()

# In[16]:
print(rdd_data)

# In[17]:
orig_header = rdd_data.first()

# In[18]:
type(orig_header)

# In[19]:
rdd_withoutheader = rdd_data.filter(lambda p: p != orig_header)

# In[20]:
rdd_withoutheader.collect()

# In[21]:
# Create Schema
header = StructType([StructField("id", IntegerType(), True),
                     StructField("first_name", StringType(), True),
                     StructField("last_name", StringType(), True),
                     StructField("email", StringType(), True),
                     StructField("phone", StringType(), True),
                     StructField("city", StringType(), True),
                     StructField("country", StringType(), True)])

# In[22]:
header

# In[23]:
df = spark.createDataFrame(rdd_withoutheader, header)

# In[24]:
df.show()
/// Part of your code:
header = StructType([StructField("stockticker", StringType(), True),
                     StructField("tradedate", IntegerType(), True),
                     StructField("openprice", FloatType(), True),
                     StructField("highprice", FloatType(), True),
                     StructField("lowprice", FloatType(), True),
                     StructField("closeprice", FloatType(), True),
                     StructField("volume", IntegerType(), True)])
df=spark.createDataFrame(rdd_data,header)
///
My answer:
A schema is mostly used to avoid a full scan of the data when inferring types; it does not perform any type casting. Hence the above method works best for JSON/Avro/Parquet input files, not for text files. For text files the following are the better methods:
Method 1: based on your code, convert the RDD to a dataframe and name the columns as below:
rdd=sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')
df_noType=rdd.map(lambda p: p.split(",")).toDF(["id", "first_name", "last_name", "email", "phone", "city", "country"])
Now you can type cast in either of these ways:
Way 1:
df_typecast=df_noType.select(df_noType.id.cast('int'), df_noType.first_name, df_noType.last_name, df_noType.email, df_noType.phone, df_noType.city, df_noType.country)
Note: in the above line there is no need to cast the other fields to string, since they are strings by default.
Note: if the values contain decimals you can use df_noType.id.cast('float') instead.
(or)
Way 2:
from pyspark.sql.types import *
df_typecast=df_noType.select(df_noType.id.cast(IntegerType()), df_noType.first_name.cast(StringType()), df_noType.last_name.cast(StringType()), df_noType.email.cast(StringType()), df_noType.phone.cast(StringType()), df_noType.city.cast(StringType()), df_noType.country.cast(StringType()))
Method 2: this is the approach I usually use, and I find it the easiest:
rdd=sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')
from pyspark.sql import Row
df=rdd.map(lambda p: Row(id= int(p.split(",")[0]), first_name= p.split(",")[1], last_name= p.split(",")[2], email= p.split(",")[3], phone= p.split(",")[4], city= p.split(",")[5], country=p.split(",")[6])).toDF()
df.printSchema()
Note: if the values contain decimals you can use float(p.split(",")[0]) instead.
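As an additional option, here is a minimal sketch that reads the file through the DataFrame CSV reader instead of an RDD (assuming the file is plain comma-separated text with a header row; the path and column names are just the ones used above). The reader applies the explicit schema while parsing, so the id column arrives as an integer and no separate cast step is needed:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()

schema = StructType([StructField("id", IntegerType(), True),
                     StructField("first_name", StringType(), True),
                     StructField("last_name", StringType(), True),
                     StructField("email", StringType(), True),
                     StructField("phone", StringType(), True),
                     StructField("city", StringType(), True),
                     StructField("country", StringType(), True)])

# header=True drops the header line, and the schema is applied while parsing,
# so "id" is read as an integer instead of a unicode string
df = spark.read.csv('/Users/nagaraju.n/Downloads/sample_data.txt',
                    header=True, schema=schema)
df.printSchema()
df.show()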

javax.servlet.ServletException: com.acme.dao.StudentDao

I'm trying to deploy my project on different servers. It worked successfully on Tomcat 7, Tomcat 8, WildFly 8 and WildFly 10.
But once I ran it on JBoss 7.1, the following exception was displayed:
09:37:42,894 INFO [org.hibernate.tool.hbm2ddl.TableMetadata] (http-localhost-127.0.0.1-8080-1) HHH000261: Table found: JPAHibernateSpring.dbo.Employee
09:37:42,894 INFO [org.hibernate.tool.hbm2ddl.TableMetadata] (http-localhost-127.0.0.1-8080-1) HHH000037: Columns: [id, name, country]
09:37:42,897 WARN [org.hibernate.internal.SessionFactoryImpl] (http-localhost-127.0.0.1-8080-1) HHH000008: JTASessionContext being used with JDBCTransactionFactory; auto-flush will not operate correctly with getCurrentSession()
09:37:43,465 INFO [stdout] (http-localhost-127.0.0.1-8080-1) Hibernate: select employee0_.id as id2_, employee0_.country as country2_, employee0_.name as name2_ from Employee employee0_
09:37:43,543 INFO [stdout] (http-localhost-127.0.0.1-8080-1) ---------------------------------------List size= 3
09:37:49,665 SEVERE [javax.enterprise.resource.webcontainer.jsf.application] (http-localhost-127.0.0.1-8080-1) com.acme.dao.EmpDao from [Module "com.sun.jsf-impl:main" from local module loader #1f3eab7 (roots: D:\jboss-as-7.1.1.Final\modules)]: java.lang.ClassNotFoundException: com.acme.dao.EmpDao from [Module "com.sun.jsf-impl:main" from local module loader #1f3eab7 (roots: D:\jboss-as-7.1.1.Final\modules)]
at org.jboss.modules.ModuleClassLoader.findClass(ModuleClassLoader.java:190)
at org.jboss.modules.ConcurrentClassLoader.performLoadClassUnchecked(ConcurrentClassLoader.java:468)
at org.jboss.modules.ConcurrentClassLoader.performLoadClassChecked(ConcurrentClassLoader.java:456)
at org.jboss.modules.ConcurrentClassLoader.performLoadClass(ConcurrentClassLoader.java:398)
at org.jboss.modules.ConcurrentClassLoader.loadClass(ConcurrentClassLoader.java:120)
at java.lang.Class.forName0(Native Method) [rt.jar:1.7.0_55]
at java.lang.Class.forName(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.resolveProxyClass(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readProxyDesc(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readClassDesc(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readOrdinaryObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.defaultReadFields(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readSerialData(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readOrdinaryObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject(Unknown Source) [rt.jar:1.7.0_55]
at java.util.HashMap.readObject(Unknown Source) [rt.jar:1.7.0_55]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) [rt.jar:1.7.0_55]
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) [rt.jar:1.7.0_55]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) [rt.jar:1.7.0_55]
at java.lang.reflect.Method.invoke(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectStreamClass.invokeReadObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readSerialData(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readOrdinaryObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.defaultReadFields(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readSerialData(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readOrdinaryObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readArray(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject(Unknown Source) [rt.jar:1.7.0_55]
at java.util.HashMap.readObject(Unknown Source) [rt.jar:1.7.0_55]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) [rt.jar:1.7.0_55]
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) [rt.jar:1.7.0_55]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) [rt.jar:1.7.0_55]
at java.lang.reflect.Method.invoke(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectStreamClass.invokeReadObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readSerialData(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readOrdinaryObject(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject0(Unknown Source) [rt.jar:1.7.0_55]
at java.io.ObjectInputStream.readObject(Unknown Source) [rt.jar:1.7.0_55]
at com.sun.faces.renderkit.ClientSideStateHelper.doGetState(ClientSideStateHelper.java:255) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.renderkit.ClientSideStateHelper.getState(ClientSideStateHelper.java:198) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.renderkit.ResponseStateManagerImpl.getState(ResponseStateManagerImpl.java:100) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.application.view.StateManagementStrategyImpl.restoreView(StateManagementStrategyImpl.java:224) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.application.StateManagerImpl.restoreView(StateManagerImpl.java:188) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.application.view.ViewHandlingStrategy.restoreView(ViewHandlingStrategy.java:123) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.application.view.FaceletViewHandlingStrategy.restoreView(FaceletViewHandlingStrategy.java:453) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.application.view.MultiViewHandler.restoreView(MultiViewHandler.java:142) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.lifecycle.RestoreViewPhase.execute(RestoreViewPhase.java:192) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.lifecycle.Phase.doPhase(Phase.java:101) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.lifecycle.RestoreViewPhase.doPhase(RestoreViewPhase.java:116) [jsf-impl-2.1.7-jbossorg-2.jar:]
at com.sun.faces.lifecycle.LifecycleImpl.execute(LifecycleImpl.java:118) [jsf-impl-2.1.7-jbossorg-2.jar:]
at javax.faces.webapp.FacesServlet.service(FacesServlet.java:593) [jboss-jsf-api_2.1_spec-2.0.1.Final.jar:2.0.1.Final]
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:329) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:248) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:275) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:161) [jbossweb-7.0.13.Final.jar:]
at org.jboss.as.jpa.interceptor.WebNonTxEmCloserValve.invoke(WebNonTxEmCloserValve.java:50) [jboss-as-jpa-7.1.1.Final.jar:7.1.1.Final]
at org.jboss.as.web.security.SecurityContextAssociationValve.invoke(SecurityContextAssociationValve.java:153) [jboss-as-web-7.1.1.Final.jar:7.1.1.Final]
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:155) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109) [jbossweb-7.0.13.Final.jar:]
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:368) [jbossweb-7.0.13.Final.jar:]
at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:877) [jbossweb-7.0.13.Final.jar:]
at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:671) [jbossweb-7.0.13.Final.jar:]
at org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:930) [jbossweb-7.0.13.Final.jar:]
at java.lang.Thread.run(Unknown Source) [rt.jar:1.7.0_55]
I'm looking for a solution to this issue. Could you please help me?

How to delete from table using not equal to

I have a table ClientParentQuestion, and it has a column qid.
+------+
| qid |
+------+
| 1 |
| 2 |
| 3 |
| 4 |
+------+
I want to delete all the rows whose qid is neither 1 nor 2, so the SQL query is
DELETE FROM client_parent_question WHERE qid <> 1 AND qid <> 2
But when I try to execute it using JPA,
@Transactional
public void deleteFromClientParentQuestion(String qids) {
    String queryString = " DELETE FROM client_parent_question WHERE " + qids;
    System.out.println("deleteFromClientParentQuestion " + queryString);
    int delete = entityManagerUtil.getEntityManager().createQuery(queryString).executeUpdate();
}
it gives me the error
org.springframework.dao.InvalidDataAccessApiUsageException: org.hibernate.hql.ast.QuerySyntaxException: client_parent_question is not mapped [ DELETE FROM client_parent_question WHERE qid<> 1]; nested exception is java.lang.IllegalArgumentException: org.hibernate.hql.ast.QuerySyntaxException: client_parent_question is not mapped [ DELETE FROM client_parent_question WHERE qid<> 1]] with root cause
org.hibernate.hql.ast.QuerySyntaxException: client_parent_question is not mapped [ DELETE FROM client_parent_question WHERE qid<> 1]
at org.hibernate.hql.ast.util.SessionFactoryHelper.requireClassPersister(SessionFactoryHelper.java:180)
at org.hibernate.hql.ast.tree.FromElementFactory.addFromElement(FromElementFactory.java:111)
at org.hibernate.hql.ast.tree.FromClause.addFromElement(FromClause.java:93)
at org.hibernate.hql.ast.HqlSqlWalker.createFromElement(HqlSqlWalker.java:327)
at org.hibernate.hql.antlr.HqlSqlBaseWalker.fromElement(HqlSqlBaseWalker.java:3441)
at org.hibernate.hql.antlr.HqlSqlBaseWalker.fromElementList(HqlSqlBaseWalker.java:3325)
at org.hibernate.hql.antlr.HqlSqlBaseWalker.fromClause(HqlSqlBaseWalker.java:733)
at org.hibernate.hql.antlr.HqlSqlBaseWalker.deleteStatement(HqlSqlBaseWalker.java:447)
at org.hibernate.hql.antlr.HqlSqlBaseWalker.statement(HqlSqlBaseWalker.java:260)
at org.hibernate.hql.ast.QueryTranslatorImpl.analyze(QueryTranslatorImpl.java:254)
at org.hibernate.hql.ast.QueryTranslatorImpl.doCompile(QueryTranslatorImpl.java:185)
at org.hibernate.hql.ast.QueryTranslatorImpl.compile(QueryTranslatorImpl.java:136)
at org.hibernate.engine.query.HQLQueryPlan.<init>(HQLQueryPlan.java:101)
at org.hibernate.engine.query.HQLQueryPlan.<init>(HQLQueryPlan.java:80)
at org.hibernate.engine.query.QueryPlanCache.getHQLQueryPlan(QueryPlanCache.java:124)
at org.hibernate.impl.AbstractSessionImpl.getHQLQueryPlan(AbstractSessionImpl.java:156)
at org.hibernate.impl.AbstractSessionImpl.createQuery(AbstractSessionImpl.java:135)
at org.hibernate.impl.SessionImpl.createQuery(SessionImpl.java:1770)
at org.hibernate.ejb.AbstractEntityManagerImpl.createQuery(AbstractEntityManagerImpl.java:272)
at sun.reflect.GeneratedMethodAccessor1826.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.springframework.orm.jpa.SharedEntityManagerCreator$SharedEntityManagerInvocationHandler.invoke(SharedEntityManagerCreator.java:240)
at com.sun.proxy.$Proxy1592.createQuery(Unknown Source)
at com.iconma.surveytab.dao.implementations.ServiceClientDaoImpl.deleteFromClientParentQuestion(ServiceClientDaoImpl.java:738)
at com.iconma.surveytab.service.implementations.ServiceClientServiceImpl.deleteFromClientParentQuestion(ServiceClientServiceImpl.java:1644)
at com.iconma.surveytab.controllers.ClientController.addOptions(ClientController.java:409)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.springframework.web.bind.annotation.support.HandlerMethodInvoker.invokeHandlerMethod(HandlerMethodInvoker.java:176)
at org.springframework.web.servlet.mvc.annotation.AnnotationMethodHandlerAdapter.invokeHandlerMethod(AnnotationMethodHandlerAdapter.java:426)
at org.springframework.web.servlet.mvc.annotation.AnnotationMethodHandlerAdapter.handle(AnnotationMethodHandlerAdapter.java:414)
at org.springframework.web.servlet.DispatcherServlet.doDispatch(DispatcherServlet.java:790)
at org.springframework.web.servlet.DispatcherServlet.doService(DispatcherServlet.java:719)
at org.springframework.web.servlet.FrameworkServlet.processRequest(FrameworkServlet.java:644)
at org.springframework.web.servlet.FrameworkServlet.doPost(FrameworkServlet.java:560)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:641)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:722)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:304)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:210)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:368)
at org.springframework.security.web.access.intercept.FilterSecurityInterceptor.invoke(FilterSecurityInterceptor.java:109)
at org.springframework.security.web.access.intercept.FilterSecurityInterceptor.doFilter(FilterSecurityInterceptor.java:83)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.access.ExceptionTranslationFilter.doFilter(ExceptionTranslationFilter.java:97)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.session.SessionManagementFilter.doFilter(SessionManagementFilter.java:100)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.authentication.AnonymousAuthenticationFilter.doFilter(AnonymousAuthenticationFilter.java:78)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.servletapi.SecurityContextHolderAwareRequestFilter.doFilter(SecurityContextHolderAwareRequestFilter.java:54)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.savedrequest.RequestCacheAwareFilter.doFilter(RequestCacheAwareFilter.java:35)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.authentication.www.BasicAuthenticationFilter.doFilter(BasicAuthenticationFilter.java:177)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.authentication.AbstractAuthenticationProcessingFilter.doFilter(AbstractAuthenticationProcessingFilter.java:187)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.authentication.logout.LogoutFilter.doFilter(LogoutFilter.java:105)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.context.SecurityContextPersistenceFilter.doFilter(SecurityContextPersistenceFilter.java:79)
at org.springframework.security.web.FilterChainProxy$VirtualFilterChain.doFilter(FilterChainProxy.java:380)
at org.springframework.security.web.FilterChainProxy.doFilter(FilterChainProxy.java:169)
at org.springframework.web.filter.DelegatingFilterProxy.invokeDelegate(DelegatingFilterProxy.java:237)
at org.springframework.web.filter.DelegatingFilterProxy.doFilter(DelegatingFilterProxy.java:167)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:243)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:210)
at org.springframework.web.filter.RequestContextFilter.doFilterInternal(RequestContextFilter.java:83)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:76)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:243)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:210)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:240)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:164)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:462)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:164)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:100)
at org.apache.catalina.valves.AccessLogValve.invoke(AccessLogValve.java:562)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:118)
at com.springsource.insight.collection.tcserver.request.HttpRequestOperationCollectionValve.invoke(HttpRequestOperationCollectionValve.java:84)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:395)
at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:250)
at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:188)
at org.apache.tomcat.util.net.JIoEndpoint$SocketProcessor.run(JIoEndpoint.java:302)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Note: qids contains qid <> 1 AND qid <> 2,
so queryString becomes DELETE FROM client_parent_question WHERE qid <> 1 AND qid <> 2.
You are trying to execute a SQL query as an HQL query.
getEntityManager().createQuery(queryString)
This is for creating HQL queries, NOT SQL queries, and HQL only understands mapped entity names, which is why Hibernate complains that client_parent_question is not mapped. For SQL you have to use createNativeQuery:
getEntityManager().createNativeQuery(queryString)
and build your query string as before:
"DELETE FROM client_parent_question WHERE " + qids
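For completeness, a minimal sketch of the native-query variant of the method from the question (assuming entityManagerUtil exposes the EntityManager exactly as shown there, and that qids arrives as something like "qid <> 1 AND qid <> 2"):
@Transactional
public void deleteFromClientParentQuestion(String qids) {
    // Build the same SQL string as before; with a native query the plain
    // table name client_parent_question is fine, because Hibernate does not
    // try to resolve it as a mapped entity.
    String queryString = "DELETE FROM client_parent_question WHERE " + qids;
    int deleted = entityManagerUtil.getEntityManager()
            .createNativeQuery(queryString)
            .executeUpdate();
    System.out.println("deleteFromClientParentQuestion removed " + deleted + " rows");
}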