Unable to fetch data from Presto SQL (Trino) using PySpark over SSL

I have a PySpark job that I run on AWS Glue.
The code runs fine when I run it from my local machine, but when I try to run the same code from AWS Glue I am not able to fetch data.
Below is my code and the error message.
From the output you will notice that I am able to get the schema information; the error happens as soon as I try to fetch the data.
Update:
The issue is happening because the worker nodes are not able to access the keystore, which is present only on the master node. Can someone help with how to copy the file to the worker nodes, or how to make it accessible to them?
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
import ssl
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType,StructField, StringType
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
sqlCont = SQLContext(spark)
job.init(args['JOB_NAME'], args)
query = "SELECT * FROM test.employee where id='1001'"
s3_client = boto3.client('s3')
# Download the keystore from S3 to local disk (this lands on the node running the driver)
s3_client.download_file('bucket1', 'rootca_ca.jks', '/tmp/rootca_ca.jks')
conparms_r = glueContext.extract_jdbc_conf("presto_test", catalog_id=None)
source_df = (
    sqlCont.read.format("jdbc")
    .option("driver", "io.prestosql.jdbc.PrestoDriver")
    .option("url", "jdbc:presto://test-db.net:18000/hive")
    .option("query", query)
    .option("user", conparms_r['user'])
    .option("password", conparms_r['password'])
    .option("SSL", True)
    .option("SSLKeyStorePath", "/tmp/rootca_ca.jks")
    .option("SSLKeyStorePassword", "test12")
    .load()
)
print("************************************source_df SUCCESSFULLY CREATED !!!!!!!!!!!!!!!!!*****************************************")
source_df.printSchema()
source_df.show(5)
Output:
************************************source_df SUCCESSFULLY CREATED !!!!!!!!!!!!!!!!!*****************************************
root
|-- lineage_key: long (nullable = true)
|-- agreement_id: string (nullable = true)
|-- termination_date: timestamp (nullable = true)
Traceback (most recent call last):
File "/tmp/pytest", line 51, in <module>
source_df.show(5)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 378, in show
print(self._jdf.showString(n, 20, vertical))
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o91.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, 100.64.188.253, executor 1): java.sql.SQLException: Error setting up SSL: /tmp/rootca_ca.jks (No such file or directory)
at io.prestosql.jdbc.PrestoDriverUri.setupClient(PrestoDriverUri.java:235)
at io.prestosql.jdbc.PrestoDriver.connect(PrestoDriver.java:88)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:63)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:54)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:272)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: io.prestosql.jdbc.$internal.client.ClientException: Error setting up SSL: /tmp/rootca_ca.jks (No such file or directory)
at io.prestosql.jdbc.$internal.client.OkHttpUtil.setupSsl(OkHttpUtil.java:241)
at io.prestosql.jdbc.PrestoDriverUri.setupClient(PrestoDriverUri.java:203)
... 23 more

I was able to solve this issue by passing the certificate location through the --extra-files job parameter and referring to it in the code as below:
source_df = (
    sqlCont.read.format("jdbc")
    .option("driver", "io.prestosql.jdbc.PrestoDriver")
    .option("url", "jdbc:presto://test-db.net:18000/hive")
    .option("query", query)
    .option("user", conparms_r['user'])
    .option("password", conparms_r['password'])
    .option("SSL", True)
    .option("SSLKeyStorePath", "./rootca_ca.jks")
    .option("SSLKeyStorePassword", "test12")
    .load()
)
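For reference, --extra-files takes a comma-separated list of S3 paths that Glue copies into the script's working directory before the job runs, which (per the resolution above) is what lets the relative path ./rootca_ca.jks resolve on the executors. The parameter can be set under the job parameters in the Glue console, or passed per run; a minimal sketch using boto3, where the job name is hypothetical:

import boto3

glue = boto3.client("glue")
glue.start_job_run(
    JobName="presto_test_job",  # hypothetical job name
    Arguments={"--extra-files": "s3://bucket1/rootca_ca.jks"},
)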

Related

Error in AWS Lambda function while reading from S3

I am trying to read an Excel file from an S3 bucket. Here is my Lambda function code, but it throws a syntax error for any statement after the one where I read the byte stream into a dataframe using pd.read_excel.
I am unable to figure out the issue, as the syntax looks fine to me. Is there an issue with reading the data? Kindly help.
import json
import boto3
import pandas as pd
import io
def lambda_handler(event, context):
    s3 = boto3.client("s3")
    s3_resource = boto3.resource("s3")
    if event:
        s3_records = event["Records"][0]
        bucket_name = str(s3_records["s3"]["bucket"]["name"])
        file_name = str(s3_records["s3"]["object"]["key"])
        file_obj = s3.get_object(Bucket=bucket_name, Key=file_name)
        file_content = file_obj["Body"].read()
        df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
Here is the log:
[ERROR] Runtime.UserCodeSyntaxError: Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 23)
Traceback (most recent call last):
  File "/var/task/lambda_function.py" Line 23
        return {
It seems you're missing a closing parenthesis just before the return statement; it should be this:
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd'))
instead of this
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')
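Separate from the missing parenthesis, note that engine='xlrd' is being passed to io.BytesIO rather than to pd.read_excel, so the line would still fail with a TypeError once the parenthesis is added. The call presumably needs to be:
df = pd.read_excel(io.BytesIO(file_content), engine='xlrd')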

From a Python AWS Lambda, how to read files in an S3 bucket using the GeoPandas reader?

There is a lot of literature on the topic, but none of what I could find uses a GeoPandas reader.
My code's purpose is to identify whether a point is located inside a polygon described in a .shp file stored in S3; it is then expected to return a boolean True or False.
I use the python-lambda-local Python module to test my Python script from PyCharm.
import geopandas as gpd
from geopandas.geoseries import *
import boto3
from io import BytesIO
def search(event, context):
    dep = event['Dep']
    arr = event['Arr']
    point_1 = GeoSeries(dep)
    point_2 = GeoSeries(arr)
    s3 = boto3.client("s3")
    bucket = "mybucket"
    obj_key = "filename.shp"
    # bytes_buffer = BytesIO()
    # client.download_fileobj(Bucket=bucket, Key=obj_key, Fileobj=bytes_buffer)
    obj = s3.download_file(Bucket=bucket, Key="filename.shp", Filename=obj_key)
    geo = obj['body'].read().decode('ISO-8859-9')
    # geo = bytes_buffer.get_key(obj_key).get_contents_as_string()
    answer = gpd.read_file(geo)
    print(answer)
As you can see in the code, I tried a few different lines to use IO and the reader in different ways, always unsuccessfully though.
And this is the error message:
MacBook-Pro:IdPolygons me$ python-lambda-local -l lib/ -f search -t 4 IdAircraft.py event.json
This is the point I'm trying to identify as inside or outside the polygon:
[root - INFO - 2019-12-24 07:33:54,388] Event: {'Dep': '(40.7128, 74.0060)', 'Arr': '(48.8566, 2.3522)'}
[root - INFO - 2019-12-24 07:33:54,388] START RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Version:
[botocore.credentials - INFO - 2019-12-24 07:33:54,923] Found credentials in shared credentials file: ~/.aws/credentials stored
[root - INFO - 2019-12-24 07:33:55,576] END RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125
[root - INFO - 2019-12-24 07:33:55,577] REPORT RequestId: 629c76c7-1008-40cc-8c09-6c5dd3877125 Duration: 663.91 ms
[root - INFO - 2019-12-24 07:33:55,577] RESULT:
{
"errorMessage": "'NoneType' object has no attribute 'startswith'",
"stackTrace": [
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/lambda_local/main.py\", line 153, in execute\n result = func(event, context._activate())\n",
" File \"IdAircraft.py\", line 30, in search\n df1 = gpd.read_file(obj)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/geopandas/io/file.py\", line 77, in read_file\n with reader(path_or_bytes, **kwargs) as features:\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/env.py\", line 397, in wrapper\n return f(*args, **kwargs)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/__init__.py\", line 249, in open\n path = parse_path(fp)\n",
" File \"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fiona/path.py\", line 132, in parse_path\n elif path.startswith('/vsi'):\n"
],
"errorType": "AttributeError"
}
Thank you for taking the time.
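No answer was posted here, but the traceback shows gpd.read_file(obj) being called with the return value of s3.download_file(), which is always None (the method writes to the Filename you give it), hence the 'NoneType' object has no attribute 'startswith' error inside fiona. A minimal sketch of one common approach, assuming the .shx and .dbf sidecar files sit next to the .shp in the same bucket (the bucket and file names are the placeholders from the question):

import boto3
import geopandas as gpd
from shapely.geometry import Point

def search(event, context):
    s3 = boto3.client("s3")
    bucket = "mybucket"
    # A shapefile is a bundle of files; the reader needs the sidecars too.
    for ext in (".shp", ".shx", ".dbf"):
        s3.download_file(bucket, "filename" + ext, "/tmp/filename" + ext)
    polygons = gpd.read_file("/tmp/filename.shp")  # read from the local copy
    # Coordinate parsing from the event is omitted; order must match the shapefile's CRS.
    dep = Point(40.7128, 74.0060)
    return bool(polygons.contains(dep).any())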

Getting a sum by grouping on another column

I have a dataframe as follows
Occupation, Genre, Rating
I have taken the sum of all ratings as totalRating. Now I want to create a new column, w_rating, which takes (rating > 3)/totalRating for a particular Occupation, Genre combination. My dataframe is named joinedDF3, so I am writing the query below:
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).withColumn(wa_rating, sum(Rating>3)/totalRating).collect()
but it shows the error
AttributeError: 'GroupedData' object has no attribute 'withColumn'
So it is clear from the error that withColumn cannot be used with groupby.
So my question is: how do I do it?
Below is my updated code.
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField,StructType,IntegerType,StringType)
from pyspark.sql import Row
from pyspark.sql.functions import sum
import pyspark.sql.functions as F
from pyspark.sql.functions import lit
spark = SparkSession.builder.appName("Movielens Analysis").getOrCreate()
def refineMovieDF(row):
    genre = []
    movieData = row[0].split("|")
    for i in range(len(movieData) - 5):
        if int(movieData[i + 5]) == 1:
            genre.append((int(movieData[0]), i))
    return genre
ratingSchema =StructType(fields=[StructField("UserId",IntegerType(),True),StructField("MovieId",IntegerType(),True),StructField("Rating",IntegerType(),True),StructField("TimeStamp",IntegerType(),True)])
ratingsDF = spark.read.load("ml-100k/u.data", format="csv",sep="\t", inferSchema=True, header=False,schema=ratingSchema)
genreSchema =StructType(fields=[StructField("Genre",StringType(),True),StructField("GenreId",IntegerType(),True)])
genreDF = spark.read.load("ml-100k/u.genre",format="csv",sep="|",inferSchema=True, header=False,schema=genreSchema)
userSchema =StructType(fields=[StructField("UserId",IntegerType(),True),StructField("Age",IntegerType(),True),StructField("Gender",StringType(),True),StructField("Occupation",StringType(),True),StructField("ZipCode",IntegerType(),True)])
usersDF = spark.read.load("ml-100k/u.user",format="csv",sep="|",inferSchema=True, header=False,schema=userSchema)
movieSchema =StructType(fields=[StructField("MovieRow",StringType(),True)])
movieDF = spark.read.load("ml-100k/u.item",format="csv",inferSchema=True, header=False,schema=movieSchema)
movieRefinedRDD = movieDF.rdd.flatMap(refineMovieDF)
movieSchema =StructType(fields=[StructField("MovieId",IntegerType(),True),StructField("GenreId",IntegerType(),True)])
movieRefinedDf = spark.createDataFrame(movieRefinedRDD, movieSchema)
joinedDF1 = ratingsDF.join(usersDF,ratingsDF.UserId==usersDF.UserId).select(usersDF["Occupation"],ratingsDF["Rating"],ratingsDF["MovieId"])
joinedDF3 = joinedDF1.join(joinedDF2,joinedDF1.MovieId == joinedDF2.MovieId).select(joinedDF1["Occupation"],joinedDF1["Rating"],joinedDF2["Genre"])
totalRating = joinedDF3.groupBy().sum("Rating").collect()
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"]>3)/totalRating).alias(wa_rating)).collect()
print(resultDF)
Now I am getting the error below.
2019-08-06 22:24:20 INFO BlockManagerInfo:54 - Removed broadcast_11_piece0 on 10.0.2.15:58903 in memory (size: 4.3 KB, free: 413.8 MB)
Traceback (most recent call last):
File "/home/cloudera/workspace/MovielensAnalysis.py", line 59, in <module>
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"]>3)/totalRating).alias(wa_rating)).collect()
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/column.py", line 116, in _
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o129.divide.: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [[572536]]
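There is no answer posted, but the last error is self-describing: totalRating comes from .collect(), so it is a list of Row objects, and Spark cannot turn a java.util.ArrayList into the literal it needs for the division. A sketch of one way to express this, assuming wa_rating is meant to be the count of ratings above 3 in each (Occupation, Genre) group divided by the overall rating total:

from pyspark.sql import functions as F

# Pull the grand total out as a plain number, not a list of Rows.
total_rating = joinedDF3.agg(F.sum("Rating")).collect()[0][0]

resultDF = (
    joinedDF3
    .groupBy("Occupation", "Genre")
    .agg((F.sum(F.when(F.col("Rating") > 3, 1).otherwise(0)) / F.lit(total_rating))
         .alias("wa_rating"))
)
resultDF.show()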

Twisted error in server with mqtt client

I'm running a Twisted TCP server. Each protocol instance has an mqtt pub/sub client. I've reduced the actual production code to the simplest possible form below. I've stripped out a lot of irrelevant complexity to simplify the bug-finding process. The server works for multiple client connections and produces the data received from the mqtt client, but after any client connects/disconnects/reconnects a few times I get an exception that I haven't been able to understand or trap. I'm hoping that Jean-Paul or someone can point me at the error of my ways.
Once the exception occurs, no new clients can connect to the server; each new connection attempt produces the exception.
Clients that are already connected continue to receive data OK.
The exception is
Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/python/log.py", line 73, in callWithContext
return context.call({ILogContext: newCtx}, func, *args, **kw)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/posixbase.py", line 614, in _doReadOrWrite
why = selectable.doRead()
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 1069, in doRead
transport = self.transport(skt, protocol, addr, self, s, self.reactor)
File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 786, in __init__
self.startReading()
File "/usr/lib/python2.7/dist-packages/twisted/internet/abstract.py", line 429, in startReading
self.reactor.addReader(self)
File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 256, in addReader
_epoll.EPOLLIN, _epoll.EPOLLOUT)
File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 240, in _add
self._poller.modify(fd, flags)
exceptions.IOError: [Errno 2] No such file or directory
The basic server code is:
(this example will run and does generate the exception)
from twisted.internet import reactor
from twisted.internet.protocol import Factory
from twisted.protocols import basic
from paho.mqtt import client  # The most recent version of the legacy Mosquitto client
from random import randint

class MsgReceiver(basic.LineReceiver):
    def __init__(self, factory):  # new (factory)
        self.factory = factory  # new (factory)

    def connectionMade(self):
        self.mqClient = self.startMQ()
        if self.mqClient:
            self.factory.clients.append(self)
        else:
            self.transport.loseConnection()

    def connectionLost(self, reason):
        pass

    def lineReceived(self, line):
        pass

    def on_message(self, mosq, obj, msg):
        try:
            self.sendLine(msg.payload)
        except Exception, err:
            print(err.message)

    def startMQ(self):
        mqName = "-".join(["myAppName", str(randint(0, 99999))])
        mqClient = client.Client(mqName)
        if mqClient.connect("localhost", 1883, 60) != 0:
            print('Could not connect to mq server')
            return False
        mqClient.on_message = self.on_message
        (success, mid) = mqClient.subscribe("myTopic", 0)
        if success != 0:
            return False
        mqClient.loop_start()
        return mqClient

class MsgReceiverFactory(Factory):
    allow_reuse_address = True

    def __init__(self, clients):
        self.clients = clients

    def buildProtocol(self, addr):
        return MsgReceiver(self)

if __name__ == "__main__":
    try:
        clients = []
        reactor.listenTCP(43217, MsgReceiverFactory(clients))
        reactor.run()
    except Exception, err:
        print(err.message)
        if reactor.running:
            reactor.stop()
A simple client that will induce the error when run twice (the first time it runs fine).
Interestingly, if I enable the time.sleep(3) it runs fine and doesn't seem to induce the error:
#!/usr/bin/python
from __future__ import print_function
import socket
import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(("localhost", 43217))
data = s.recv(1024)
print(data)
#time.sleep(3)
s.close()
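One thing worth tidying while chasing this, though the thread does not confirm it as the root cause: every accepted connection calls loop_start() on a new paho client, and connectionLost never stops it, so each connect/disconnect cycle leaves an extra network-loop thread and socket behind. A sketch of a cleanup for the MsgReceiver class above:

    def connectionLost(self, reason):
        # Stop the paho network loop and disconnect so the client's
        # thread and socket do not outlive this protocol instance.
        if getattr(self, "mqClient", None):
            self.mqClient.loop_stop()
            self.mqClient.disconnect()
        if self in self.factory.clients:
            self.factory.clients.remove(self)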

weblogic wlst findService NameError

I have the following WLST script:
import wlstModule
from com.bea.wli.sb.management.configuration import SessionManagementMBean
from com.bea.wli.sb.management.configuration import ALSBConfigurationMBean
from com.bea.wli.config import Ref

#=======================================================================================
# Utility function to read a binary file
#=======================================================================================
def readBinaryFile(fileName):
    file = open(fileName, 'rb')
    bytes = file.read()
    return bytes

#=======================================================================================
# Utility function to create an arbitrary session name
#=======================================================================================
def createSessionName():
    sessionName = String("SessionScript"+Long(System.currentTimeMillis()).toString())
    return sessionName

def getSessionManagementMBean(sessionName):
    SessionMBean = findService("SessionManagement", "com.bea.wli.sb.management.configuration.SessionManagementMBean")
    SessionMBean.createSession(sessionName)
    return SessionMBean

SessionMBean = None
importJar = 'C:\\OSB_PROJECT.jar'
theBytes = readBinaryFile(importJar)
sessionName = createSessionName()
SessionMBean = getSessionManagementMBean(sessionName)
The result is an error:
wls:/offline> execfile('C:\script.py')
Traceback (innermost last):
  File "", line 1, in ?
  File "C:\script.py", line 31, in ?
  File "C:\script.py", line 22, in getSessionManagementMBean
NameError: findService
How can I fix this?
Are you ever connecting to your server and accessing the domain runtime? You should be doing something like the following:
connect("weblogic", "weblogic", "t3://localhost:7001")
domainRuntime()
# obtain session management mbean to create a session.
# This mbean instance can be used more than once to
# create/discard/commit many sessions
sessionMBean = findService(SessionManagementMBean.NAME,SessionManagementMBean.TYPE)
See more here:
http://docs.oracle.com/cd/E13171_01/alsb/docs25/javadoc/com/bea/wli/sb/management/configuration/SessionManagementMBean.html
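A side note on the posted script, in case it is the next thing to bite: String, Long and System in createSessionName are java.lang classes, and a WLST/Jython script generally needs to import them explicitly, for example:

from java.lang import String, Long, System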