Convert spark.read.parquet into Pandas DataFrame - pandas

I am working on converting snappy.parquet files into Pandas dataframe. I was able to load in all of my parquet files, but once I tried to convert it to Pandas, it failed. Please see the code below.
from pathlib import Path
import os
import pandas as pd
# Initiate findspark instance to run pyspark
import findspark
findspark.init()
# Importing PySpark
import pyspark
# Create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
parquetFile_all = spark.read.parquet("Resources")
display(parquetFile_all)
parquetFile_all.count()
The output of the last line of code is: 12216053, which Pandas should be able to handle.
Once I ran the following code:
file_output_all = spark.sql("SELECT * FROM parquetFile_all")
all_files_df = file_output_all.select("*").toPandas()
That's where it breaks with the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-11-83baa41f2e6a> in <module>
1 # Convert to Pandas df
----> 2 all_files_df = file_output_all.select("*").toPandas()
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/dataframe.py in toPandas(self)
2141
2142 # Below is toPandas without Arrow optimization.
-> 2143 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
2144
2145 dtype = {}
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/dataframe.py in collect(self)
532 """
533 with SCCallSiteSync(self._sc) as css:
--> 534 sock_info = self._jdf.collectToPython()
535 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
536
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/2.4.4/libexec/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o45.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 9.0 failed 1 times, most recent failure: Lost task 2.0 in stage 9.0 (TID 335, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3236)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at java.io.DataOutputStream.write(DataOutputStream.java:107)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:258)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3236)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at java.io.DataOutputStream.write(DataOutputStream.java:107)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.writeToStream(UnsafeRow.java:554)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:258)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more```

Related

There is already an object named ""; in the database

I am using pyspark in databricks to append data to a sql table via ADF pipelines. Code is sown below:
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()
On some days I am get the error message:
com.microsoft.sqlserver.jdbc.SQLServerException: There is already an object named '<table_name>' in the database.
Upon simply re-running the pipeline the table is updated with no issues; therefore the code is working. How can I prevent this from happening again? Is it an error when multiple pipelines try writing to the same table at the same time?
The rest of the error message is shown below:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-3143827225825384> in <module>
8
9 log_collection_df = spark.createDataFrame(log_collection_df)
---> 10 write_df_sql(log_collection_df, 'Logs_Collection_Status', 'overwrite')
11
<command-1421348210166948> in write_df_sql(df, table, write_mode)
14
15
---> 16 spark_df.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", table_name).option("user", username).option("password", password).save()
17
18 #backup table
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
735 self.format(format)
736 if path is None:
--> 737 self._jwrite.save()
738 else:
739 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args) 1255 answer = self.gateway_client.send_command(command) 1256 return_value
= get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name) 1258 1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o661.save. : com.microsoft.sqlserver.jdbc.SQLServerException: There is already an object named 'Logs_Collection_Status' in the database. at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:258) at com.microsoft.sqlserver.jdbc.SQLServerStatement.getNextResult(SQLServerStatement.java:1535) at com.microsoft.sqlserver.jdbc.SQLServerStatement.doExecuteStatement(SQLServerStatement.java:845) at com.microsoft.sqlserver.jdbc.SQLServerStatement$StmtExecCmd.doExecute(SQLServerStatement.java:752) at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7151) at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:2478) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeCommand(SQLServerStatement.java:219) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeStatement(SQLServerStatement.java:199) at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeUpdate(SQLServerStatement.java:680) at com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils$.executeUpdate(BulkCopyUtils.scala:456) at com.microsoft.sqlserver.jdbc.spark.BulkCopyUtils$.mssqlCreateTable(BulkCopyUtils.scala:495) at com.microsoft.sqlserver.jdbc.spark.SingleInstanceConnector$.createTable(SingleInstanceConnector.scala:33) at com.microsoft.sqlserver.jdbc.spark.Connector.write(Connector.scala:60) at com.microsoft.sqlserver.jdbc.spark.DefaultSource.createRelation(DefaultSource.scala:51) at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70) at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68) at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:152) at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:140) at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$5.apply(SparkPlan.scala:193) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:189) at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:140) at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:117) at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:115) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:711) at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:711) at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:113) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:243) at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:99) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:173) at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:711) at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:307) at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293) at sun.reflect.GeneratedMethodAccessor841.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at py4j.Gateway.invoke(Gateway.java:295) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:251) at java.lang.Thread.run(Thread.java:748)
The issue has now been resolved.
Previous code:
if condition1==True:
write_mode = 'overwrite'
elif condition2==True:
write_mode = 'append'
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()
Current code:
write_mode = 'append'
log_status_df_all = spark.createDataFrame(log_status_df_all)
log_status_df_all.write.format("com.microsoft.sqlserver.jdbc.spark").mode(write_mode).option("url", url).option("dbtable", 'Logs_Collection_Status').option("user", username).option("password", password).save()
log_status_df_all.show()

Spark 3 is failing when I try to execute a simple query

I have this table on Hive:
CREATE TABLE `mydb`.`raw_sales` (
`combustivel` STRING,
`regiao` STRING,
`estado` STRING,
`jan` STRING,
`fev` STRING,
`mar` STRING,
`abr` STRING,
`mai` STRING,
`jun` STRING,
`jul` STRING,
`ago` STRING,
`set` STRING,
`out` STRING,
`nov` STRING,
`dez` STRING,
`total` STRING,
`created_at` TIMESTAMP,
`ano` STRING)
USING orc
LOCATION 'hdfs://localhost:9000/jobs/etl/tables/raw_sales.orc'
TBLPROPERTIES (
'transient_lastDdlTime' = '1601322056',
'ORC.COMPRESS' = 'SNAPPY')
There is data on table, but when I try this query at bellow:
spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = '2000' AND combustivel like '%GASOLINA%'").show()
It's crashing!
>>> spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = 2000 AND combustivel like '%GASOLINA%'").show() [164/3679]
20/09/28 19:25:30 ERROR executor.Executor: Exception in task 0.0 in stage 61.0 (TID 133)
java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/09/28 19:25:30 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 61.0 (TID 133, 639773a482b8, executor driver): java.lang.ClassCastException: java.lang.String cannot be cast to java.l
ang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75) [112/3679]
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
20/09/28 19:25:30 ERROR scheduler.TaskSetManager: Task 0 in stage 61.0 failed 1 times; aborting job
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/opt/spark/current/python/pyspark/sql/dataframe.py", line 440, in show
print(self._jdf.showString(n, 20, vertical))
File "/opt/spark/current/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/opt/spark/current/python/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/opt/spark/current/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o122.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 61.0 failed 1 times, most recent failure: Lost task 0.0 in stage 61.0 (TID 133, 639773a482b8, executor dr
iver): java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at scala.Option.map(Option.scala:230) [59/3679]
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:467)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3627)
at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160) [7/3679]
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2697)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2904)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:300)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:337)
at sun.reflect.GeneratedMethodAccessor64.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: java.lang.String cannot be cast to java.lang.Number
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.castLiteralValue(OrcFilters.scala:163)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.buildLeafSearchArgument(OrcFilters.scala:235)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFiltersHelper$1(OrcFilters.scala:134)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.$anonfun$convertibleFilters$4(OrcFilters.scala:137)
at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245)
at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.convertibleFilters(OrcFilters.scala:136)
at org.apache.spark.sql.execution.datasources.orc.OrcFilters$.createFilter(OrcFilters.scala:75)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4(OrcFileFormat.scala:189)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$4$adapted(OrcFileFormat.scala:188)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.$anonfun$buildReaderWithPartitionValues$1(OrcFileFormat.scala:188)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:491)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:340)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:872)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:872)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
However, this query at bellow works!
spark.sql("SELECT * FROM mydb.raw_sales WHERE ano = 2000").show()
And this works too!
spark.sql("SELECT combustivel FROM mydb.raw_sales WHERE combustivel like '%GASOLINA C%' ").show()
My environment
Hadoop 2.10.0
Hive 2.3.7
Tez 0.9.2
Spark 3.0.1
The results is the same for pyspark and Scala.
I'm completely lost here! Maybe you can help me!
Thanks!
I've changed the data of 'ano' field type to 'int'. Is not a ideal solution since even using 'cast' was not enough to resolve this issue! I have a good friend that is a monster on Spark that says that is possibly a bug on Spark. I'm not sure about that, but I would like that someone else corroborates this isue if possible!
Anyways, change data type to 'int' works!

Unicode error while reading data from file/rdd

I am trying to create dataframe with proper schema after fetching data from text file. in RDD, all data types are strings however one of the field data type is interger, which i want to ensure that created as integer. So i created Structtype and created dataframe. but it throws an error as below.
Error Message:
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call
last) in ()
----> 1 df.show()
/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/dataframe.pyc
in show(self, n, truncate, vertical)
376 """
377 if isinstance(truncate, bool) and truncate:
--> 378 print(self._jdf.showString(n, 20, vertical))
379 else:
380 print(self._jdf.showString(n, int(truncate), vertical))
/Applications/anaconda2/lib/python2.7/site-packages/py4j/java_gateway.pyc
in call(self, *args) 1284 answer =
self.gateway_client.send_command(command) 1285 return_value
= get_return_value(
-> 1286 answer, self.gateway_client, self.target_id, self.name) 1287 1288 for temp_arg in temp_args:
/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.pyc
in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/Applications/anaconda2/lib/python2.7/site-packages/py4j/protocol.pyc
in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o64.showString. :
org.apache.spark.SparkException: Job aborted due to stage failure:
Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0
in stage 3.0 (TID 5, localhost, executor driver):
org.apache.spark.api.python.PythonException: Traceback (most recent
call last): File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 377, in main
process() File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile) File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py",
line 393, in dump_stream
vs = list(itertools.islice(iterator, batch)) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py",
line 99, in wrapper
return f(*args, **kwargs) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py",
line 730, in prepare
verify_func(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1370, in verify_struct
verifier(v) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1315, in verify_integer
verify_acceptable_types(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1278, in verify_acceptable_types
% (dataType, obj, type(obj)))) TypeError: field id: IntegerType can not accept object u'1' in type
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source) at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.run(Task.scala:121) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace: at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257) at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2082) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2101) at
org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
at
org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at
org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
at
org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at
org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
at
org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363) at
org.apache.spark.sql.Dataset.head(Dataset.scala:2544) at
org.apache.spark.sql.Dataset.take(Dataset.scala:2758) at
org.apache.spark.sql.Dataset.getRows(Dataset.scala:254) at
org.apache.spark.sql.Dataset.showString(Dataset.scala:291) at
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at
py4j.Gateway.invoke(Gateway.java:282) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:238) at
java.lang.Thread.run(Thread.java:748) Caused by:
org.apache.spark.api.python.PythonException: Traceback (most recent
call last): File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 377, in main
process() File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py",
line 372, in process
serializer.dump_stream(func(split_index, iterator), outfile) File
"/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py",
line 393, in dump_stream
vs = list(itertools.islice(iterator, batch)) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py",
line 99, in wrapper
return f(*args, **kwargs) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/session.py",
line 730, in prepare
verify_func(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1370, in verify_struct
verifier(v) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1389, in verify
verify_value(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1315, in verify_integer
verify_acceptable_types(obj) File "/Users/nagaraju.n/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/types.py",
line 1278, in verify_acceptable_types
% (dataType, obj, type(obj)))) TypeError: field id: IntegerType can not accept object u'1' in type
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at
org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at
org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409) at
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown
Source) at
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.run(Task.scala:121) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
#!/usr/bin/env python
coding: utf-8
In[11]:
import os import sys from pyspark import SparkContext from pyspark.sql
import SparkSession from pyspark.sql.types import *
spark=SparkSession.builder.getOrCreate() sc =
SparkContext.getOrCreate()
In[12]:
Reads data from file and creates rdd rdd=sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')
In[13]:
type(rdd)
In[14]:
rdd_data=rdd.map(lambda p: p.split(","))
In[15]:
rdd_data.collect()
In[16]:
print(rdd_data)
In[17]:
orig_header=rdd_data.first()
In[18]:
type(orig_header)
In[19]:
rdd_withoutheader=rdd_data.filter(lambda p:p != orig_header)
In[20]:
rdd_withoutheader.collect()
In[21]:
Create Schema header = StructType([StructField("id", IntegerType(), True),StructField("first_name", StringType(),
True),StructField("last_name", StringType(),
True),StructField("email", StringType(), True),StructField("phone",
StringType(), True),StructField("city", StringType(),
True),StructField("country", StringType(), True)])
In[22]:
header
In[23]:
df=spark.createDataFrame(rdd_withoutheader,header)
In[24]:
df.show()
/// Part of your code:
header = StructType([StructField("stockticker", StringType(), True),StructField("tradedate", IntegerType(), True),StructField("openprice", FloatType(), True),StructField("highprice", FloatType(), True),StructField("lowprice", FloatType(), True),StructField("closeprice", FloatType(), True),StructField("volume", IntegerType(), True)])
df=spark.createDataFrame(rdd_data,header)
///
My answer:
Schema is used most to avoid a full table scan to infer types and doesn't perform any type casting. Hence above method best works for Json/avro/parquet input files not for text files. For textfiles following are the best methods:
Method 1 based on your code, convert rdd to dataframe and define schema as below:
rdd=sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')
df_noType=data.map(lambda p: p.split(",")).toDF(["id", "first_name", "last_name", "email", "phone", "city", "country"])
Now you can type cast either of these ways:
Way1:
df_typecast=df_noType.select(df_noType.id.cast('int'), df_noType.first_name, df_noType.last_name, df_noType.email, df_noType.phone, df_noType.city, df_noType.country)
Note: in above line no need to type cast other fields to string as they are bydefault string
Note: if decimals are there then you can use df_noType.id.cast('float')
(or)
way2:
from pyspark.sql.types import *
df_typecast=df_noType.select(df_noType.id.cast(IntegerType()), df_noType.first_name.cast(StringType()), df_noType.last_name.cast(StringType()), df_noType.email.cast(StringType()), df_noType.phone.cast(StringType()), df_noType.city.cast(StringType()), df_noType.country.cast(StringType()))
Method 2: I usually use this always which I feel best and easy
rdd=sc.textFile('/Users/nagaraju.n/Downloads/sample_data.txt')
from pyspark.sql import Row
df=rdd.map(lambda p: Row(id= int(p.split(",")[0]), first_name= p.split(",")[1], last_name= p.split(",")[2], email= p.split(",")[3], phone= p.split(",")[4], city= p.split(",")[5], country=p.split(",")[6])).toDF()
df.printSchema()
Note: if decimals are there then you can use float(p.split(",")[0])

not able to convert RDD to DF using customSchema

I am not able to convert a rdd to data frame using a custom schema. The details are below with the code:
It works when I use a customSchema as below:
>>> customSchema = StructType([
... StructField("EID",StringType()),\
... StructField("Name",StringType()),\
... StructField("email",StringType()),\
... StructField("Salary",StringType()),\
... StructField("PlaceName",StringType()),\
... StructField("County",StringType()),\
... StructField("City",StringType()),\
... StructField("Gender",StringType())\
... ])
>>>
>>> myDF = spark.createDataFrame(emp1,customSchema)
>>> myDF1 = myDF.withColumn("EID",col("EID").cast("integer")).withColumn("Salary",col("Salary").cast("integer"))
>>> myDF1.show()
+------+--------------------+--------------------+------+--------------+--------------------+--------------+------+
| EID| Name| email|Salary| PlaceName| County| City|Gender|
+------+--------------------+--------------------+------+--------------+--------------------+--------------+------+
|111135| Darell T Grizzle|darell.grizzle#ya...|196416| Tallahassee| Leon| Tallahassee| M|
|111159| Deanna Z Nestor|deanna.nestor#gma...|184760| Collegeport| Matagorda| Collegeport| F|
|111160| Marion G Mcqueary|marion.mcqueary#y...|189506| Flensburg| Morrison| Flensburg| M|
|111175| Monserrate D Bentz|monserrate.bentz#...|184412|South Freeport| Cumberland|South Freeport| F|
|111214| Jamie E Spataro|jamie.spataro#gma...|189926| Gilliam| Saline| Gilliam| M|
|111228| Ernest J Woolbright|ernest.woolbright...|194929| Tacoma| Tacoma| Tacoma| M|
|111243| Ivette F Manzanares|ivette.manzanares...|189834| Lemasters| Franklin| Lemasters| F|
|111274| Erwin F Bouchard|erwin.bouchard#ao...|184390| Bessemer City| Gaston| Bessemer City| M|
|111293| Walton E Garza|walton.garza#comc...|198280| Suncook| Merrimack| Suncook| M|
|111316| Jospeh E Holle|jospeh.holle#gmai...|181878| Wagon Mound| Mora| Wagon Mound| M|
|111327| Angelo S Fizer|angelo.fizer#ibm.com|199654| Zelienople| Butler| Zelienople| M|
|111350| Numbers H Luo| numbers.luo#aol.com|198095| Eva| Benton| Eva| M|
|111359| Jim Z Jewett|jim.jewett#gmail.com|198956| Hatchechubbee| Russell| Hatchechubbee| M|
|111396| Edward M Pentecost|edward.pentecost#...|194979| Dayhoit| Harlan| Dayhoit| M|
|111403| Henry F Lawyer|henry.lawyer#appl...|198515| Washington|District of Columbia| Washington| M|
|111442| Manual X Meany|manual.meany#yaho...|196608| Hunter| Cass| Hunter| M|
|111446| Ethan V Folmar|ethan.folmar#yaho...|188581| Ridgeview| Boone| Ridgeview| M|
|111449| Tanja J Sparrow|tanja.sparrow#yah...|195398| Tower City| Cass| Tower City| F|
|111478|Leigha K Courtema...|leigha.courtemanc...|195306| Sun Valley| Blaine| Sun Valley| F|
|111514| Rob F Struck|rob.struck#gmail.com|198750| Centertown| Cole| Centertown| M|
+------+--------------------+--------------------+------+--------------+--------------------+--------------+------+
only showing top 20 rows
But it fails when I use a Schema (where I directly define EID and Salary as IntegerType) as below:
>>> customSchema = StructType([
... StructField("EID",IntegerType()),\
... StructField("Name",StringType()),\
... StructField("email",StringType()),\
... StructField("Salary",IntegerType()),\
... StructField("PlaceName",StringType()),\
... StructField("County",StringType()),\
... StructField("City",StringType()),\
... StructField("Gender",StringType())\
... ])
Full code below:
>>> rdd = sc.textFile("C:/sparkCourse/filetext/part-00000-646a1d36-8f75-4eee-b937-135e933ede7f-c000.csv").map(lambda row: row.split(','))
>>> rdd.take(1)
[['EID', 'Name', 'email', 'Salary', 'PlaceName', 'County', 'City', 'Gender']]
>>> header = rdd.first()
>>> emp = rdd.filter(lambda row: row != header)
>>> emp.take(1)
[['111135', 'Darell T Grizzle', 'darell.grizzle#yahoo.ca', '196416', 'Tallahassee', 'Leon', 'Tallahassee', 'M']]
>>> emp1 = emp.map(lambda fields:[fields[0],fields[1],fields[2],fields[3],fields[4],fields[5],fields[6],fields[7]])
>>> emp1.take(1)
[['111135', 'Darell T Grizzle', 'darell.grizzle#yahoo.ca', '196416', 'Tallahassee', 'Leon', 'Tallahassee', 'M']]
>>>
>>> customSchema = StructType([
... StructField("EID",IntegerType()),\
... StructField("Name",StringType()),\
... StructField("email",StringType()),\
... StructField("Salary",IntegerType()),\
... StructField("PlaceName",StringType()),\
... StructField("County",StringType()),\
... StructField("City",StringType()),\
... StructField("Gender",StringType())\
... ])
>>> myDF = spark.createDataFrame(emp1,customSchema)
I get the below error:
IntegerType can not accept object '111135' in type <class 'str'>
But, why does it allow the column to be casted as an integer later and not at the time of defining Schema.
Where am I going wrong?
>>> myDF.show()
[Stage 47:> (0 + 1) / 1]19/02/08 19:54:21 ERROR Executor: Exception in task 0.0 in stage 47.0 (TID 55)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 229, in main
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 224, in process
File "C:\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 372, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:\spark\python\pyspark\sql\session.py", line 671, in prepare
verify_func(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1402, in verify_struct
verifier(v)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1347, in verify_integer
verify_acceptable_types(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1310, in verify_acceptable_types
% (dataType, obj, type(obj))))
TypeError: field EID: IntegerType can not accept object '111135' in type <class 'str'>
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/02/08 19:54:21 WARN TaskSetManager: Lost task 0.0 in stage 47.0 (TID 55, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 229, in main
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 224, in process
File "C:\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 372, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:\spark\python\pyspark\sql\session.py", line 671, in prepare
verify_func(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1402, in verify_struct
verifier(v)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1347, in verify_integer
verify_acceptable_types(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1310, in verify_acceptable_types
% (dataType, obj, type(obj))))
TypeError: field EID: IntegerType can not accept object '111135' in type <class 'str'>
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/02/08 19:54:21 ERROR TaskSetManager: Task 0 in stage 47.0 failed 1 times; aborting job
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\spark\python\pyspark\sql\dataframe.py", line 350, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\spark\python\lib\py4j-0.10.6-src.zip\py4j\java_gateway.py", line 1160, in __call__
File "C:\spark\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "C:\spark\python\lib\py4j-0.10.6-src.zip\py4j\protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o1148.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 47.0 failed 1 times, most recent failure: Lost task 0.0 in stage 47.0 (TID 55, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 229, in main
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 224, in process
File "C:\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 372, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:\spark\python\pyspark\sql\session.py", line 671, in prepare
verify_func(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1402, in verify_struct
verifier(v)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1347, in verify_integer
verify_acceptable_types(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1310, in verify_acceptable_types
% (dataType, obj, type(obj))))
TypeError: field EID: IntegerType can not accept object '111135' in type <class 'str'>
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 229, in main
File "C:\spark\python\lib\pyspark.zip\pyspark\worker.py", line 224, in process
File "C:\spark\python\lib\pyspark.zip\pyspark\serializers.py", line 372, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "C:\spark\python\pyspark\sql\session.py", line 671, in prepare
verify_func(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1402, in verify_struct
verifier(v)
File "C:\spark\python\pyspark\sql\types.py", line 1421, in verify
verify_value(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1347, in verify_integer
verify_acceptable_types(obj)
File "C:\spark\python\pyspark\sql\types.py", line 1310, in verify_acceptable_types
% (dataType, obj, type(obj))))
TypeError: field EID: IntegerType can not accept object '111135' in type <class 'str'>
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
>>>
It could work if you don't define the schema in the beginning just read the csv with spark.read.csv(....) and then transform the columns with cast.
so if you just want to convert this columns from string to integer, you could use the following code:
from pyspark.sql.functions import *
df1= sqlContext.createDataFrame([('111135', 'Darell T Grizzle', 'darell.grizzle#yahoo.ca', '196416', 'Tallahassee', 'Leon', 'Tallahassee', 'M'),\
('111136', 'Darell X Xrizzle', 'darell.Xrizzle#yahoo.ca', '206416', 'Example', 'Leroy', 'Example', 'W')],\
['EID', 'Name', 'email', 'Salary', 'PlaceName', 'County', 'City', 'Gender'])
#above code is only used to create some dataframe with a similar format
#and the functions are used to access the columns with col()
df1 = df1.withColumn("EID", col("EID").cast("int")).withColumn("Salary", col("Salary").cast("int"))
#this line transforms your string columns to integer
df1.printSchema()
df1.show(truncate=False)
Output:
root
|-- EID: integer (nullable = true)
|-- Name: string (nullable = true)
|-- email: string (nullable = true)
|-- Salary: integer (nullable = true)
|-- PlaceName: string (nullable = true)
|-- County: string (nullable = true)
|-- City: string (nullable = true)
|-- Gender: string (nullable = true)
+------+----------------+-----------------------+------+-----------+------+-----------+------+
|EID |Name |email |Salary|PlaceName |County|City |Gender|
+------+----------------+-----------------------+------+-----------+------+-----------+------+
|111135|Darell T Grizzle|darell.grizzle#yahoo.ca|196416|Tallahassee|Leon |Tallahassee|M |
|111136|Darell X Xrizzle|darell.Xrizzle#yahoo.ca|206416|Example |Leroy |Example |W |
+------+----------------+-----------------------+------+-----------+------+-----------+------+
If you want to work with rdd you can use the following code and apply a map function that converts the respective columns:
x = sc.parallelize([['111135', 'Darell T Grizzle', 'darell.grizzle#yahoo.ca', '196416', 'Tallahassee', 'Leon', 'Tallahassee', 'M']])
customSchema = StructType([
StructField("EID",IntegerType()),\
StructField("Name",StringType()),\
StructField("email",StringType()),\
StructField("Salary",IntegerType()),\
StructField("PlaceName",StringType()),\
StructField("County",StringType()),\
StructField("City",StringType()),\
StructField("Gender",StringType())\
])
x = x.map(lambda fields: [int(fields[0]),fields[1],fields[2],int(fields[3]),fields[4],fields[5],fields[6],fields[7]]).collect()
myDF = spark.createDataFrame(x,customSchema)
myDF.show()
Output:
+------+----------------+--------------------+------+-----------+------+-----------+------+
| EID| Name| email|Salary| PlaceName|County| City|Gender|
+------+----------------+--------------------+------+-----------+------+-----------+------+
|111135|Darell T Grizzle|darell.grizzle#ya...|196416|Tallahassee| Leon|Tallahassee| M|
+------+----------------+--------------------+------+-----------+------+-----------+------+
In case any one wants to do the same task using SparkSession, below is the code:
df = spark.read.option("header","true").schema(customSchema).csv("C:/sparkCourse/filetext/part-00000-646a1d36-8f75-4eee-b937-135e933ede7f-c000.csv")
But, any help using the sparkContext would really be appreciated.

Numpy error in printing a RDD in Spark with Ipython

I am trying to print a RDD using Spark in Ipython and when I do that I get this error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-77015cd18335> in <module>()
---> 24 print inputData.collect()
25
26
/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/pyspark/rdd.pyc in collect(self)
771 """
772 with SCCallSiteSync(self.context) as css:
--> 773 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
774 return list(_load_from_socket(port, self._jrdd_deserializer))
775
/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 1 times, most recent failure: Lost task 0.0 in stage 7.0 (TID 56, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 421, in loads
return pickle.loads(obj)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/__init__.py", line 25, in <module>
ImportError: No module named numpy
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1268)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1267)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1267)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1493)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1826)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1839)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1910)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:905)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.collect(RDD.scala:904)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:373)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 421, in loads
return pickle.loads(obj)
File "/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/mllib/__init__.py", line 25, in <module>
ImportError: No module named numpy
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:138)
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:179)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:97)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
... 1 more
My current code is:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import os.path
import numpy as np
print np.version.version
def extract(line):
return (line[1])
inputPath = os.path.join('file1.csv')
fileName = os.path.join(inputPath)
Data = sc.textFile(fileName).zipWithIndex().filter(lambda (line,rownum): rownum>0).map(lambda (line, rownum): line)
inputData = (Data
.map(lambda line: line.split(";"))
.filter(lambda line: len(line) >1 )
.map(extract)) # Map to tuples
# error comes a this line
print inputData.collect()
I have numpy already installed (sudo apt-get install python-numpy) and can print numpy version in Ipython using numpy.version.version
Why is this error coming and how to resolve it?
NOTE 1: My current bash_profile:
# Set the Spark Home as an environment variable.
export SPARK_HOME="$HOME/spark-1.5.0-bin-hadoop2.6"
# Define your Spark arguments for when running Spark.
export PYSPARK_SUBMIT_ARGS="--master local[2]"
# IPython alias for the use with SPARK.
alias IPYSPARK='PYSPARK_DRIVER_PYTHON=ipython PYSPARK_DRIVER_PYTHON_OPTS="notebook --profile=pyspark --ip=0.0.0.0" $SPARK_HOME/bin/pyspark'
I have also added following to my spark-env.sh.template file:
#!/usr/bin/env bash
# This file is sourced when running various Spark programs.
export PYSPARK_PYTHON=/usr/bin/python2.7
export PYSPARK_DRIVER_PYTHON=/usr/bin/ipython
NOTE 2: I am launching the Ipython notebook from inside a virtual environment.
NOTE 3: I have Spark 1.5.0 and numpy 1.8.2
UPDATE: output from sc.parallelize([],1).mapPartitions(lambda _: [(sys.executable, sys.path)]).first()
('/home/vagrant/pyEnv/bin/python2.7', ['', u'/tmp/spark-dbbcfd0b-413e-4406-8bd5-37de29d3fcc5/userFiles-6296ba2d-4ec5-4956-9904-828bda0c6424', '/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/pyspark.zip', '/home/vagrant/spark-1.5.0-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip', '/home/vagrant/spark-1.5.0-bin-hadoop2.6/lib/spark-assembly-1.5.0-hadoop2.6.0.jar', '/home/vagrant/spark-1.5.0-bin-hadoop2.6/python', '/home/vagrant', '/home/vagrant/pyEnv/lib/python2.7', '/home/vagrant/pyEnv/lib/python2.7/plat-x86_64-linux-gnu', '/home/vagrant/pyEnv/lib/python2.7/lib-tk', '/home/vagrant/pyEnv/lib/python2.7/lib-old', '/home/vagrant/pyEnv/lib/python2.7/lib-dynload', '/usr/lib/python2.7', '/usr/lib/python2.7/plat-x86_64-linux-gnu', '/usr/lib/python2.7/lib-tk', '/home/vagrant/pyEnv/local/lib/python2.7/site-packages', '/home/vagrant/pyEnv/lib/python2.7/site-packages'])