Cannot read parquet files in s3 bucket with Pyspark 2.4.4 - amazon-s3

I am using Pyspark 2.4.4.
I want to load some parquet files that are in an S3 bucket into a Spark dataframe, and I want to read all these files at once.
I have been looking at how to do it in these links:
How to read parquet data from S3 to spark dataframe Python?
Unable to read from s3 bucket using spark
https://gist.github.com/asmaier/5768c7cda3620901440a62248614bbd0
I have tried multiple ways but I cannot load the files. For example, I have tried:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import databricks.koalas as ks
import boto3
from boto3.session import Session
import botocore
from zipfile import ZipFile
import urllib
import datetime
import os
from s3fs import S3FileSystem
import dask.dataframe as dd
aws_region = 'ap-southeast-1'
# Create Spark config for our Kubernetes based cluster manager
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.set("spark.kubernetes.container.image", "<myimage>")
sparkConf.set("spark.kubernetes.container.image.pullSecrets", "<secret>")
sparkConf.set("spark.kubernetes.namespace", "spark")
sparkConf.set("spark.executor.instances", "3")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.memory", "512m")
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
sparkConf.set("spark.driver.port", "29413")
sparkConf.set("spark.driver.host", "<HOST>")
sparkConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sparkConf.set("com.amazonaws.services.s3.enableV4", "true")
sparkConf.set("fs.s3a.access.key", "<mykey>")
sparkConf.set("fs.s3a.secret.key", "<mysecret>")
sparkConf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
sparkConf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")
# Initialize our Spark cluster, this will actually
# generate the worker nodes.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext
df = spark.read.parquet(f"s3a://<path>")
Also I have tried:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
import databricks.koalas as ks
import boto3
from boto3.session import Session
import botocore
from zipfile import ZipFile
import urllib
import datetime
import os
from s3fs import S3FileSystem
import dask.dataframe as dd
aws_region = 'ap-southeast-1'
# Create Spark config for our Kubernetes based cluster manager
sparkConf = SparkConf()
sparkConf.setMaster("k8s://https://kubernetes.default.svc.cluster.local:443")
sparkConf.setAppName("spark")
sparkConf.set("spark.kubernetes.container.image", "<myimage>")
sparkConf.set("spark.kubernetes.container.image.pullSecrets", "<secret>")
sparkConf.set("spark.kubernetes.namespace", "spark")
sparkConf.set("spark.executor.instances", "3")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.memory", "512m")
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
sparkConf.set("spark.driver.port", "29413")
sparkConf.set("spark.driver.host", "<HOST>")
# Initialize our Spark cluster, this will actually
# generate the worker nodes.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf=sc._jsc.hadoopConfiguration()
aws_region = 'ap-southeast-1'
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", "<KEY>")
hadoop_conf.set("fs.s3a.secret.key", "<SECRET>")
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")
import pyspark
date = datetime.datetime.today() - datetime.timedelta(days=2)
path = '<path>'
sql=pyspark.sql.SparkSession(sc)
df = sql.read.parquet("s3a://" + path)
But I get this error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-14c1e166e21f> in <module>
1 date = datetime.datetime.today() - datetime.timedelta(days=2)
----> 2 df = spark.read.parquet(f"s3a://cp-datadumps/MCF/2020/10/17/advances/advances.parquet_0_0_0.snappy.parquet")
/usr/local/spark/python/pyspark/sql/readwriter.py in parquet(self, *paths)
314 [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
315 """
--> 316 return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
317
318 #ignore_unicode_prefix
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o209.parquet.
: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:392)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:355)
at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:644)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
... 30 more
I know that the path is correct because using dask I am able to load the data:
storage_options = {
    "key": "<MYKEY>",
    "secret": "<MYSECRET>",
}
s3 = S3FileSystem(**storage_options)
s3.invalidate_cache()
df1 = dd.read_parquet(f"s3://<path>", storage_options=storage_options)

The issue is hidden at the end of the Java stack trace and is independent of the file being Parquet: the libraries needed for the S3A filesystem are not available.
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found
You need to make sure that the hadoop-aws JAR is on the classpath. This JAR contains the class org.apache.hadoop.fs.s3a.S3AFileSystem, which could not be found when the code above ran.
More information about these JARs can be found at https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html#Getting_Started
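For example, with a Spark 2.4.4 build that bundles Hadoop 2.7.x, one way to get the missing classes onto both the driver and the executors is spark.jars.packages, which also pulls in the matching AWS SDK transitively. This is only a minimal sketch: the hadoop-aws version below is an assumption and must match the Hadoop version of your Spark distribution, and the key, secret, endpoint region and path are placeholders as in the question.
from pyspark import SparkConf
from pyspark.sql import SparkSession

sparkConf = SparkConf()
sparkConf.setAppName("spark")
# Download hadoop-aws (and its aws-java-sdk dependency) at startup;
# the version must match the Hadoop version bundled with your Spark build.
sparkConf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")
# Prefix Hadoop options with "spark.hadoop." so they reach the Hadoop configuration.
sparkConf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sparkConf.set("spark.hadoop.fs.s3a.access.key", "<mykey>")
sparkConf.set("spark.hadoop.fs.s3a.secret.key", "<mysecret>")
sparkConf.set("spark.hadoop.fs.s3a.endpoint", "s3.ap-southeast-1.amazonaws.com")

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
df = spark.read.parquet("s3a://<path>")
Alternatively, if you build your own container image for the Kubernetes executors, you can copy a matching hadoop-aws JAR and its AWS SDK dependency into $SPARK_HOME/jars of that image instead.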

Related

boto3 waiter to check the file availability in s3 bucket

from __future__ import print_function
import urllib.parse
import boto3
import json

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # TODO implement
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    object_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
    try:
        # Wait for the object to appear: poll every 2 seconds, up to 5 attempts
        waiter = s3.get_waiter('object_exists')
        waiter.wait(Bucket=source_bucket, Key="<dirname>" + str(object_key),
                    WaiterConfig={'Delay': 2, 'MaxAttempts': 5})
        print(f"Object s3://{source_bucket}/{object_key} arrived!")
    except Exception as e:
        print(e)
        print('Error getting object')
        raise e

Error when joining dataframes using AWS Glue Container

I tried joining two sample dataframes using the code below:
from pyspark import SparkContext
from awsglue.context import GlueContext
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
inputDF = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": ["s3://bayu-wbi-test/customers.csv"]}, format = "csv")
DF1 = inputDF.toDF()
DF2 = inputDF.toDF()
DoubleDF = DF1.join(DF2,DF1.col0 == DF2.col0)
DoubleDF.show()
However, I encounter this error when I run it in my Glue container:
An error was encountered:
An error occurred while calling o135.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: ResultStage 4 ($anonfun$withThreadLocalCaptured$1 at FutureTask.java:266) has failed the maximum allowable number of times: 4. Most recent failure reason: org.apache.spark.shuffle.FetchFailedException: Stream is corrupted
at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:772)
at org.apache.spark.storage.BufferReleasingInputStream.read(ShuffleBlockFetcherIterator.scala:845)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
at java.io.DataInputStream.readInt(DataInputStream.java:387)
at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.readSize(UnsafeRowSerializer.scala:113)
at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.next(UnsafeRowSerializer.scala:129)
at org.apache.spark.sql.execution.UnsafeRowSerializerInstance$$anon$2$$anon$3.next(UnsafeRowSerializer.scala:110)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:494)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:351)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) at
This container is running on my local machine, and I've tried increasing the Spark driver memory.
Thanks for the help.
I think it might be related to this issue:
https://issues.apache.org/jira/browse/SPARK-34790
According to this reported issue: https://github.com/delta-io/delta/issues/841
A possible workaround is to set
sparkConf.set("spark.sql.adaptive.fetchShuffleBlocksInBatch", "false")

delete s3 object using pyspark

I need to delete an object:
import logging
import boto3
from botocore.exceptions import ClientError
def delete_object(bucket_name, object_name):
    # Delete the object
    s3 = boto3.client('s3')
    try:
        s3.delete_object(Bucket=bucket_name, Key=object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

a = delete_object("dgaray-bucket", "consolidado.dat")
It generates this error:
Command failed with exit code 1
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
def delete_object(bucket_name, object_name):
    # Delete the object
    s3 = boto3.client('s3')
    s3.delete_object(Bucket=bucket_name, Key=object_name)

a = delete_object("name-bucket", "directory/file.dat")
It was failing because of the Spark session.

To read csv in Google Colaboratory

from google.colab import files
uploaded = files.upload()
import io
def ls(ruta=uploaded):
    return [arch.name for arch in io.StringIO((ruta)) if arch.is_file()]

divisas = ls()
I have this error:
TypeError: initial_value must be str or None, not dict
from google.colab import files
uploaded = files.upload()
Import the google.colab library for file upload, then upload the file and pass the file name to the pandas read_csv function:
import io
import pandas as pd
df2 = pd.read_csv(io.BytesIO(uploaded['heart.csv']))
df2.head()
I need to include the file names in the divisas list.
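Since files.upload() returns a dict keyed by file name, a minimal sketch for filling divisas is just to take the keys (no further filtering is assumed):
from google.colab import files
import io
import pandas as pd

uploaded = files.upload()        # dict: {file name: file content as bytes}
divisas = list(uploaded.keys())  # the uploaded file names
print(divisas)

# e.g. read the first uploaded csv into a dataframe
df2 = pd.read_csv(io.BytesIO(uploaded[divisas[0]]))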

"No space left on device" error and SIGTERM signal when using Spark LSH in EMR

Spark version-2.3.2
EMR - 5.19.0
8 Executors
Each executor - 5 core
What I am trying to do:
1. Millions of numpy feature vectors (.npy files) are stored in S3.
2. Using Spark, download all the numpy feature vectors.
3. Convert them to a Spark Dataframe whose schema is feature-name (string) and feature-vector (VectorUDT).
4. Then find the LSH between the dataframes using Spark.
Code:
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql import SparkSession
from pyspark import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.ml.linalg import VectorUDT
spark = SparkSession \
    .builder \
    .appName('test-app') \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

features_rdd = sc. \
    binaryFiles("s3://input_path/"). \
    map(
        lambda x: (
            x[0].split("/")[-1].split(".")[0],
            Vectors.dense(np.fromstring(x[1], dtype='<f4'))
        )
    )

FEATURES_RAW_SCHEMA = StructType([StructField("image_id", StringType(), True),
                                  StructField("feature", VectorUDT(), True)
                                  ])
features_df = sqlContext.createDataFrame(features_rdd, FEATURES_RAW_SCHEMA)

brp = BucketedRandomProjectionLSH(inputCol="feature", outputCol="hashes", seed=12345,
                                  numHashTables=30, bucketLength=1000)
model = brp.fit(features_df)

all_pairs_df = model.approxSimilarityJoin(features_df, features_df, 50.0, distCol="similarity_score").select(
    F.col("datasetA.image_id").alias("image_id1"),
    F.col("datasetB.image_id").alias("image_id2"),
    F.col("similarity_score"))

all_pairs_df.write.mode("overwrite").json("s3a://output")
Error:
19/01/10 11:20:11 WARN TaskSetManager: Lost task 120.0 in stage 3.0 (TID 11, ip-ip1.ec2.internal, executor 5): java.io.IOException: No space left on device
at java.io.FileOutputStream.writeBytes(Native Method)
at java.io.FileOutputStream.write(FileOutputStream.java:326)
at org.apache.spark.storage.TimeTrackingOutputStream.write(TimeTrackingOutputStream.java:58)
at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:252)
at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.write(UnsafeSorterSpillWriter.java:133)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spillIterator(UnsafeExternalSorter.java:498)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spill(UnsafeExternalSorter.java:222)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.createWithExistingInMemorySorter(UnsafeExternalSorter.java:111)
at org.apache.spark.sql.execution.UnsafeKVExternalSorter.<init>(UnsafeKVExternalSorter.java:156)
at org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap.destructAndCreateExternalSorter(UnsafeFixedWidthAggregationMap.java:248)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.agg_doConsume_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage7.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$12$$anon$2.hasNext(WholeStageCodegenExec.scala:633)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Submit script:
sudo -i spark-submit --verbose --deploy-mode cluster --num-executors 11 --executor-cores 5 --executor-memory 8G --driver-cores 5 --driver-memory 8G --conf spark.dynamicAllocation.enabled=false --conf spark.yarn.executor.memoryOverhead=2048 /home/hadoop/filename.py
In some cases I can see in the log files that the executors are dying with a SIGTERM signal. What am I doing wrong here?