I am trying to read an Excel file from an S3 bucket. Here is my Lambda function code, but it throws a syntax error for any statement after I read the byte stream into a DataFrame using pd.read_excel.
I am unable to figure out the issue, as the syntax looks fine to me. Is there an issue with reading the data? Kindly help.
import json
import boto3
import pandas as pd
import io

def lambda_handler(event, context):
    s3 = boto3.client("s3")
    s3_resource = boto3.resource("s3")
    if event:
        s3_records = event["Records"][0]
        bucket_name = str(s3_records["s3"]["bucket"]["name"])
        file_name = str(s3_records["s3"]["object"]["key"])
        file_obj = s3.get_object(Bucket=bucket_name, Key=file_name)
        file_content = file_obj["Body"].read()
        df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }
Here is the log:
[ERROR] Runtime.UserCodeSyntaxError: Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 23)
Traceback (most recent call last):
File "/var/task/lambda_function.py" Line 23
return {
It seems you're missing a closing parenthesis just before the return statement. Note also that engine is a keyword argument of pd.read_excel, not of io.BytesIO, so the line should be this:
df = pd.read_excel(io.BytesIO(file_content), engine='xlrd')
instead of this:
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')
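For reference, a minimal sketch of the corrected handler (assuming an .xls file; recent pandas versions no longer read .xlsx with xlrd and would need the openpyxl engine instead):

import json
import io
import boto3
import pandas as pd

def lambda_handler(event, context):
    s3 = boto3.client("s3")
    if event:
        record = event["Records"][0]
        bucket_name = str(record["s3"]["bucket"]["name"])
        file_name = str(record["s3"]["object"]["key"])
        file_obj = s3.get_object(Bucket=bucket_name, Key=file_name)
        file_content = file_obj["Body"].read()
        # engine belongs to read_excel; BytesIO only wraps the raw bytes
        df = pd.read_excel(io.BytesIO(file_content), engine='xlrd')
    return {
        'statusCode': 200,
        'body': json.dumps('Hello from Lambda!')
    }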
I am trying to gather historical prices/data from Deribit using PyCharm and Spyder, but I keep getting errors. I used the code below from the following website:
https://www.codearmo.com/python-tutorial/crypto-algo-trading-historical-data1
If anyone has a suggested fix, that would be a huge help. I am relatively new to coding.
Thanks.
import asyncio
import websockets
import json
import pandas as pd
import datetime as dt

async def call_api(msg):
    async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
        await websocket.send(msg)
        while websocket.open:
            response = await websocket.recv()
            return response

def async_loop(api, message):
    return asyncio.get_event_loop().run_until_complete(api(message))

def retrieve_historic_data(start, end, instrument, timeframe):
    msg = \
        {
            "jsonrpc": "2.0",
            "id": 833,
            "method": "public/get_tradingview_chart_data",
            "params": {
                "instrument_name": instrument,
                "start_timestamp": start,
                "end_timestamp": end,
                "resolution": timeframe
            }
        }
    resp = async_loop(call_api, json.dumps(msg))
    return resp

def json_to_dataframe(json_resp):
    res = json.loads(json_resp)
    df = pd.DataFrame(res['result'])
    df['ticks'] = df.ticks / 1000
    df['timestamp'] = [dt.datetime.fromtimestamp(date) for date in df.ticks]
    return df

if __name__ == '__main__':
    start = 1554373800000
    end = 1554376800000
    instrument = "BTC-PERPETUAL"
    timeframe = '1'
    json_resp = retrieve_historic_data(start, end, instrument, timeframe)
    df = json_to_dataframe(json_resp)
    print(df.head())
Console Message:
/Users/macbookair/PycharmProjects/untitled/venv/bin/python /Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py
Traceback (most recent call last):
File "/Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py", line 2, in <module>
import websockets
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/__init__.py", line 4, in <module>
from .client import * # noqa
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 20, in <module>
asyncio.get_event_loop().run_until_complete(call_api(json.dumps(msg)))
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 13, in call_api
async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
AttributeError: partially initialized module 'websockets' has no attribute 'connect' (most likely due to a circular import)
Process finished with exit code 1
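Note that the traceback shows your own call_api code running from inside the installed package (site-packages/websockets/client.py, lines 13 and 20), which is the usual signature of this "partially initialized module" circular-import error: a local file is shadowing the real library, or the library file was edited in place. A hedged first check, from a fresh interpreter, is simply to see where Python resolves the module:

# Minimal check: confirm 'websockets' resolves to the unmodified installed
# package rather than a local script or an edited site-packages file
import websockets
print(websockets.__file__)  # should point into site-packages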
I have a dataframe with the following columns:
Occupation, Genre, Rating
I have taken the sum of all ratings as totalRating. Now I want to create a new column, w_rating, which takes (Rating > 3) / totalRating for a particular Occupation, Genre combination. My dataframe is joinedDF3, so I am writing the query below:
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).withColumn(wa_rating, sum(Rating>3)/totalRating).collect()
but it is showing this error:
AttributeError: 'GroupedData' object has no attribute 'withColumn'
So it is clear from the error that we cannot use withColumn with groupby.
So my question is: how do I do it?
Below is my updated code.
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StructType, IntegerType, StringType)
from pyspark.sql import Row
from pyspark.sql.functions import sum
import pyspark.sql.functions as F
from pyspark.sql.functions import lit

spark = SparkSession.builder.appName("Movielens Analysis").getOrCreate()

def refineMovieDF(row):
    genre = []
    movieData = row[0].split("|")
    for i in range(len(movieData) - 5):
        if int(movieData[i + 5]) == 1:
            genre.append((int(movieData[0]), i))
    return genre

ratingSchema = StructType(fields=[StructField("UserId", IntegerType(), True), StructField("MovieId", IntegerType(), True), StructField("Rating", IntegerType(), True), StructField("TimeStamp", IntegerType(), True)])
ratingsDF = spark.read.load("ml-100k/u.data", format="csv", sep="\t", inferSchema=True, header=False, schema=ratingSchema)
genreSchema = StructType(fields=[StructField("Genre", StringType(), True), StructField("GenreId", IntegerType(), True)])
genreDF = spark.read.load("ml-100k/u.genre", format="csv", sep="|", inferSchema=True, header=False, schema=genreSchema)
userSchema = StructType(fields=[StructField("UserId", IntegerType(), True), StructField("Age", IntegerType(), True), StructField("Gender", StringType(), True), StructField("Occupation", StringType(), True), StructField("ZipCode", IntegerType(), True)])
usersDF = spark.read.load("ml-100k/u.user", format="csv", sep="|", inferSchema=True, header=False, schema=userSchema)
movieSchema = StructType(fields=[StructField("MovieRow", StringType(), True)])
movieDF = spark.read.load("ml-100k/u.item", format="csv", inferSchema=True, header=False, schema=movieSchema)
movieRefinedRDD = movieDF.rdd.flatMap(refineMovieDF)
movieSchema = StructType(fields=[StructField("MovieId", IntegerType(), True), StructField("GenreId", IntegerType(), True)])
movieRefinedDf = spark.createDataFrame(movieRefinedRDD, movieSchema)
joinedDF1 = ratingsDF.join(usersDF, ratingsDF.UserId == usersDF.UserId).select(usersDF["Occupation"], ratingsDF["Rating"], ratingsDF["MovieId"])
joinedDF3 = joinedDF1.join(joinedDF2, joinedDF1.MovieId == joinedDF2.MovieId).select(joinedDF1["Occupation"], joinedDF1["Rating"], joinedDF2["Genre"])
totalRating = joinedDF3.groupBy().sum("Rating").collect()
resultDF = joinedDF3.groupby([joinedDF3["Occupation"], joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"] > 3) / totalRating).alias(wa_rating)).collect()
print(resultDF)
Now I am getting the error below.
2019-08-06 22:24:20 INFO BlockManagerInfo:54 - Removed broadcast_11_piece0 on 10.0.2.15:58903 in memory (size: 4.3 KB, free: 413.8 MB)
Traceback (most recent call last):
File "/home/cloudera/workspace/MovielensAnalysis.py", line 59, in <module>
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"]>3)/totalRating).alias(wa_rating)).collect()
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/column.py", line 116, in _
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o129.divide.: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [[572536]]
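Two things stand out here, for what it's worth: totalRating is the raw result of .collect(), i.e. a list of Row objects rather than a number (which is why Spark complains about dividing by a java.util.ArrayList), and the alias must be a quoted string, not a bare name. A hedged sketch of the aggregation, reusing the names from the question (untested against this exact schema):

# .collect() on the grouped sum returns [Row(sum(Rating)=...)]; pull out the scalar
total_rating = joinedDF3.groupBy().sum("Rating").collect()[0][0]

# Count ratings > 3 per (Occupation, Genre) and divide by the scalar total;
# the alias has to be the string "wa_rating"
resultDF = (joinedDF3
            .groupBy("Occupation", "Genre")
            .agg((F.sum(F.when(joinedDF3["Rating"] > 3, 1).otherwise(0)) / total_rating)
                 .alias("wa_rating")))
resultDF.show()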
On Flow tutorial_8, I tried another scenario: SimpleGridScenario.
I already checked grid.py and followed the rules for the parameters.
My original program is:
from flow.envs import Env
from gym.spaces.box import Box
from gym.spaces.tuple_space import Tuple
from flow.controllers import IDMController, ContinuousRouter, GridRouter
from flow.core.experiment import Experiment
from flow.core.params import SumoParams, EnvParams, \
    InitialConfig, NetParams
from flow.core.params import VehicleParams
from flow.scenarios.loop import LoopScenario, ADDITIONAL_NET_PARAMS
from flow.scenarios.grid import SimpleGridScenario
from flow.scenarios import SimpleGridScenario
import numpy as np
from flow.envs import myEnv

ADDITIONAL_ENV_PARAMS = {
    "max_accel": 1,
    "max_decel": 1,
}

sumo_params = SumoParams(sim_step=0.1, render=True)

vehicles = VehicleParams()
vehicles.add(
    veh_id="idm",
    acceleration_controller=(IDMController, {}),
    routing_controller=(GridRouter, {}),
    num_vehicles=22
)

env_params = EnvParams(additional_params=ADDITIONAL_ENV_PARAMS)

additional_net_params = ADDITIONAL_NET_PARAMS.copy()
net_params = NetParams(additional_params=additional_net_params)

initial_config = InitialConfig(bunching=20)

scenario = SimpleGridScenario(
    name='grid',
    vehicles=vehicles,
    net_params=NetParams(
        additional_params={
            'grid_array': {
                'row_num': 3,
                'col_num': 2,
                'inner_length': 500,
                'short_length': 500,
                'long_length': 500,
                'cars_top': 20,
                'cars_bot': 20,
                'cars_left': 20,
                'cars_right': 20,
            },
            'horizontal_lanes': 1,
            'vertical_lanes': 1,
            'speed_limit': {
                'vertical': 35,
                'horizontal': 35
            }
        },
        no_internal_links=False
    ),
    initial_config=initial_config
)

env = myEnv(env_params, sumo_params, scenario)
exp = Experiment(env)
_ = exp.run(1, 1500)
Then I ran it and got an error; the log is:
(flow) dnl#dnl-Iiyama:~/flow$ python Tutorial_9_1.py
Loading configuration... done.
Success.
Loading configuration... done.
Error in edge length with key bot3_2
Error in edge length with key bot3_2
Traceback (most recent call last):
File "Tutorial_9_1.py", line 72, in <module>
_ = exp.run(1, 1500)
File "/home/dnl/flow/flow/core/experiment.py", line 118, in run
state = self.env.reset()
File "/home/dnl/flow/flow/envs/base_env.py", line 483, in reset
speed=speed)
File "/home/dnl/flow/flow/core/kernel/vehicle/traci.py", line 990, in add
[i for i in range(num_routes)], size=1, p=frac)[0])
File "mtrand.pyx", line 1126, in mtrand.RandomState.choice
ValueError: a must be non-empty
I want to know why I am wrong.
I am trying to test a scenario other than the LoopScenario, but it doesn't work.
Please help.
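As a side note on the traceback itself: the final ValueError is raised by numpy.random.choice when it is handed an empty list of candidate routes, suggesting the scenario ended up generating no routes for the vehicles. A minimal reproduction of just that numpy error (not of the Flow setup) looks like this:

import numpy as np

num_routes = 0   # what the Flow vehicle kernel apparently ended up with
frac = []        # the per-route probabilities, also empty
# Mirrors the failing call in flow/core/kernel/vehicle/traci.py;
# raises "ValueError: a must be non-empty" (wording may differ on newer numpy)
np.random.choice([i for i in range(num_routes)], size=1, p=frac)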
I'm running a Python script that queries an AWS S3 bucket using S3 Select. I'm importing a variable from a txt file and want to pass it into the S3 Select query. I also want to count all recurrences of the imported variable (within a specified column) by querying the entire S3 directory instead of just a single file.
This is what I have so far:
import boto3
from boto3.session import Session

with open('txtfile.txt', 'r') as myfile:
    variable = myfile.read()

ACCESS_KEY = 'accessKey'
SECRET_KEY = 'secredtKey'

session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3b = session.client('s3')

r = s3b.select_object_content(
    Bucket='s3BucketName',
    Key='directory/fileName',
    ExpressionType='SQL',
    Expression="'select count(*)from S3Object s where s.columnName = %s;', [variable]",
    InputSerialization={'CSV': {"FileHeaderInfo": "Use"}},
    OutputSerialization={'CSV': {}},
)

for event in r['Payload']:
    if 'Records' in event:
        records = event['Records']['Payload'].decode('utf-8')
        print(records)
    elif 'Stats' in event:
        statsDetails = event['Stats']['Details']
        print("Stats details bytesScanned: ")
When I run this script I get back the following error:
Traceback (most recent call last):
File "s3_query.py", line 20, in <module>
OutputSerialization={'CSV': {}},
File "/root/anaconda3/lib/python3.6/site-packages/botocore/client.py", line 314, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/root/anaconda3/lib/python3.6/site-packages/botocore/client.py", line 612, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (ParseUnexpectedToken) when calling the SelectObjectContent operation: Unexpected token found COMMA:',' at line 1, column 67.
This line looks quite strange:
Expression="'select count(*)from S3Object s where s.columnName = %s;', [variable]"
That is not normal SQL or Python syntax.
You should probably build a plain SQL string instead, quoting the value as a SQL string literal:
Expression="select count(*) from S3Object s where s.columnName = '%s';" % variable
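On the other half of the question: select_object_content runs against a single object, so counting occurrences across a whole "directory" means listing the keys under the prefix and running the query once per object. A rough sketch reusing the placeholder names from the question (untested assumptions):

# Sum count(*) results across every object under the prefix
paginator = s3b.get_paginator('list_objects_v2')
expression = "select count(*) from S3Object s where s.columnName = '%s'" % variable
total = 0

for page in paginator.paginate(Bucket='s3BucketName', Prefix='directory/'):
    for obj in page.get('Contents', []):
        resp = s3b.select_object_content(
            Bucket='s3BucketName',
            Key=obj['Key'],
            ExpressionType='SQL',
            Expression=expression,
            InputSerialization={'CSV': {"FileHeaderInfo": "Use"}},
            OutputSerialization={'CSV': {}},
        )
        for event in resp['Payload']:
            if 'Records' in event:
                # each object returns its own count(*) as a CSV line
                total += int(event['Records']['Payload'].decode('utf-8').strip() or 0)

print(total)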
I am trying to get the result of a search query from Splunk. But when I try to get the session key, I am getting the following error.
Traceback (most recent call last):
File "splunkenter.py", line 18, in <module>
sessionkey = minidom.parseString(servercontent).getElementsByTagName('sessionKey')[0].childNodes[0].nodeValue
IndexError: list index out of range
splunkenter.py:
import urllib
import httplib2
import time
import re
from time import localtime, strftime
from xml.dom import minidom
import json

baseurl = 'abc.haihd.com:8000'
username = 'xxxxx'
password = 'xxxxx'

myhttp = httplib2.Http()

# Step 1: Get a session key
servercontent = myhttp.request(baseurl + '/services/auth/login', 'POST', headers={}, body=urllib.urlencode({'username': username, 'password': password}))[1]
sessionkey = minidom.parseString(servercontent).getElementsByTagName('sessionKey')[0].childNodes[0].nodeValue
print "====>sessionkey: %s <====" % sessionkey
Can anybody tell me where the problem lies? I am very new to APIs.
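One hedged debugging step: the IndexError means getElementsByTagName('sessionKey') returned an empty list, i.e. the login response was not the XML you expected (port 8000 is typically Splunk Web; the REST API usually listens on the management port, often 8089). Printing the raw response before parsing makes this visible:

# Inspect the raw login response; an empty node list means this is not the
# expected <response><sessionKey>...</sessionKey></response> XML
print servercontent

nodes = minidom.parseString(servercontent).getElementsByTagName('sessionKey')
if not nodes:
    print "No sessionKey in response - check the endpoint/port (REST API is usually 8089, not 8000)"
else:
    sessionkey = nodes[0].childNodes[0].nodeValue
    print "====>sessionkey: %s <====" % sessionkey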