How to pass columns as a JSON record to the API method using Pyspark? - api

I have a dataset where I segregate the records in 4K batch for a DF and then try to hit the API. The first part is working fine. The problem occurs when I try to hit the API. I want to hit the API in the form of JSON records. Therefore, I am passing a struct of values converted as JSON to the call_to_cust_bulk_api method. Inside that method, I pass to a custRequestBody and then send that data to the API. The code is below:
def call_to_cust_bulk_api(url, payload):
print("Calling Bulk API")
try:
print(payload)
token = get_token(tokenUrl, tokenBody)
custRequestBody = {
"Token": token,
"CustomerName": "",
"Object": "",
"Data": payload
}
# TODO: write code...
headers = {'content-type': 'application/json'}
print(":::::::jsn load::::")
response = requests.post(url, data=custRequestBody, headers=headers)
print(":::Response::::", response)
data = response.json()
return data
except Exception as e:
print('ExceptionInPushing ' + str(e))
df = spark.read.option("header", "true").csv(
".csv",
sep="~")
df = df.withColumn("uniqueID", lit("1"))
df = df.withColumn("row_num", row_number().over(
Window.partitionBy(col("uniqueID")).orderBy(col("uniqueID"))
))
total_count = df.count()
i = 1
while i < total_count:
rangeNum = i + 3999
print("Range Num:::")
print(rangeNum)
df1 = df.filter((col("row_num") >= i) & (col("row_num") <= rangeNum))
finalDF = df1.drop("row_num", "edl_timestamp", "uniqueID")
colsListToBePassed = finalDF.columns
print("finalDF count:::", finalDF.count())
finalDF = finalDF.repartition(finalDF.rdd.getNumPartitions()).withColumn("status_for_batch",
call_to_cust_bulk_api(policyUrl,
to_json(
struct(*colsListToBePassed))))
It gives me the below error:
Traceback (most recent call last):
File "/home/lumiq/IdeaProjects/pyspark_python_test/com/apitest3.py", line 116, in <module>
finalDF = finalDF.repartition(finalDF.rdd.getNumPartitions()).withColumn("status_for_batch",
File "/home/lumiq/Downloads/spark-3.2.1-bin-hadoop3.2/python/pyspark/sql/dataframe.py", line 2477, in withColumn
raise TypeError("col should be Column")
TypeError: col should be Column
So, I want to track the response as a new column and write this data frame later.
So, where am I going wrong?

Related

Error in AWS Lambda function while reading from S3

I am trying to read an excel file from S3 bucket. Here is my Lambda function code but it throws syntax error for any statement after I read the byte stream into a dataframe using pd.read_excel.
I am unable to figure out the issue as syntax looks fine to me. Is there an issue with reading the data? Kindly help.
import json
import boto3
import pandas as pd
import io
def lambda_handler(event, context):
s3 = boto3.client("s3")
s3_resource = boto3.resource("s3")
if event:
s3_records = event["Records"][0]
bucket_name = str(s3_records["s3"]["bucket"]["name"])
file_name = str(s3_records["s3"]["object"]["key"])
file_obj = s3.get_object(Bucket=bucket_name, Key=file_name)
file_content = file_obj["Body"].read()
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')
return {
'statusCode': 200,
'body': json.dumps('Hello from Lambda!')
}
Here is the log:
[ERROR] Runtime.UserCodeSyntaxError: Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 23)
Traceback (most recent call last):
  File "/var/task/lambda_function.py" Line 23
        return {
It seems you're missing closing parenthesis just before the return statement, it should be this:
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd'))
instead of this
df = pd.read_excel(io.BytesIO(file_content, engine='xlrd')

Deribit code not working for me.. Does anyone have some suggestions?

I am trying to gather historical prices/data from Deribit using Pycharm and Spyder but I keep getting errors. I used the code below from the following website:
https://www.codearmo.com/python-tutorial/crypto-algo-trading-historical-data1
If anyone has a suggested fix that would be a huge help. I am relatively new to coding.
Thanks.
import asyncio
import websockets
import json
import pandas as pd
import datetime as dt
async def call_api(msg):
async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
await websocket.send(msg)
while websocket.open:
response = await websocket.recv()
return response
def async_loop(api, message):
return asyncio.get_event_loop().run_until_complete(api(message))
def retrieve_historic_data(start, end, instrument, timeframe):
msg = \
{
"jsonrpc": "2.0",
"id": 833,
"method": "public/get_tradingview_chart_data",
"params": {
"instrument_name": instrument,
"start_timestamp": start,
"end_timestamp": end,
"resolution": timeframe
}
}
resp = async_loop(call_api, json.dumps(msg))
return resp
def json_to_dataframe(json_resp):
res = json.loads(json_resp)
df = pd.DataFrame(res['result'])
df['ticks'] = df.ticks / 1000
df['timestamp'] = [dt.datetime.fromtimestamp(date) for date in df.ticks]
return df
if __name__ == '__main__':
start = 1554373800000
end = 1554376800000
instrument = "BTC-PERPETUAL"
timeframe = '1'
json_resp = retrieve_historic_data(start, end, instrument, timeframe)
df = json_to_dataframe(json_resp)
print(df.head())
Console Message:
/Users/macbookair/PycharmProjects/untitled/venv/bin/python /Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py
Traceback (most recent call last):
File "/Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py", line 2, in <module>
import websockets
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/__init__.py", line 4, in <module>
from .client import * # noqa
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 20, in <module>
asyncio.get_event_loop().run_until_complete(call_api(json.dumps(msg)))
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 13, in call_api
async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
AttributeError: partially initialized module 'websockets' has no attribute 'connect' (most likely due to a circular import)
Process finished with exit code 1

Pandas Making multiple HTTP requests

I have below code that reads from a csv file a number of ticker symbols into a dataframe.
Each ticker calls the Web Api returning a dafaframe df which is then attached to the last one until complete. The code works , but when a large number of tickers is used the code slows down tremendously. I understand I can use multiprocessing and threads to speed up my code but dont know where to start and what would be the most suited in my particular case.
What code should I use to get my data into a combined daframe in the fastest possible manner?
import pandas as pd
import numpy as np
import json
tickers=pd.read_csv("D:/verhuizen/pensioen/MULTI.csv",names=['symbol','company'])
read_str='https://financialmodelingprep.com/api/v3/income-statement/AAPL?limit=120&apikey=demo'
df = pd.read_json (read_str)
df = pd.DataFrame(columns=df.columns)
for ind in range(len(tickers)):
read_str='https://financialmodelingprep.com/api/v3/income-statement/'+ tickers['symbol'][ind] +'?limit=120&apikey=demo'
df1 = pd.read_json (read_str)
df=pd.concat([df,df1], ignore_index=True)
df.set_index(['date','symbol'], inplace=True)
df.sort_index(inplace=True)
df.to_csv('D:/verhuizen/pensioen/MULTI_out.csv')
The code provided works fine for smaller data sets, but when I use a large number of tickers (>4,000) at some point I get the below error. Is this because the web api gets overloaded or is there another problem?
Traceback (most recent call last):
File "D:/Verhuizen/Pensioen/Equity_Extractor_2021.py", line 43, in <module>
data = pool.starmap(download_data, enumerate(TICKERS, start=1))
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 276, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x00C33E30>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object")'
Process finished with exit code 1
It keeps giving the same error (for a larger amount of tickers)
code is exactly as provided:
def download_data(pool_id, symbols):
df = []
for symbol in symbols:
print("[{:02}]: {}".format(pool_id, symbol))
#do stuff here
read_str = BASEURL.format(symbol)
df.append(pd.read_json(read_str))
#df.append(pd.read_json(fake_data(symbol)))
return pd.concat(df, ignore_index=True)
It failed again with the pool.map, but one strange thing I noticed. Each time it fails it does so around 12,500 tickers (total is around 23,000 tickers) Similar error:
Traceback (most recent call last):
File "C:/Users/MLUY/AppData/Roaming/JetBrains/PyCharmCE2020.1/scratches/Equity_naive.py", line 21, in <module>
data = pool.map(download_data, TICKERS)
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x078D1BF0>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object")'
Process finished with exit code 1
I get the tickers also from a API call https://financialmodelingprep.com/api/v3/financial-statement-symbol-lists?apikey=demo (I noticed it does not work without subscription), I wanted to attach the data it as a csv file but I dont have sufficient rights. I dont think its a good idea to paste the returned data here...
I tried adding time.sleep(0.2) before return as suggested, but again I ge the same error at ticker 12,510. Strange everytime its around the same location. As there are multiple processes going on I cannot see at what point its breaking
Traceback (most recent call last):
File "C:/Users/MLUY/AppData/Roaming/JetBrains/PyCharmCE2020.1/scratches/Equity_naive.py", line 24, in <module>
data = pool.map(download_data, TICKERS)
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x00F32C90>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object")'
Process finished with exit code 1
Something very very strange is going on , I have split the data in chunks of 10,000 / 5,000 / 4,000 and 2,000 and each time the code breaks approx 100 tickers from the end. Clearly there is something going on that not right
import time
import pandas as pd
import multiprocessing
# get tickers from your csv
df=pd.read_csv('D:/Verhuizen/Pensioen/All_Symbols.csv',header=None)
# setting the Dataframe to a list (in total 23,000 tickers)
df=df[0]
TICKERS=df.tolist()
#Select how many tickers I want
TICKERS=TICKERS[0:2000]
BASEURL = "https://financialmodelingprep.com/api/v3/income-statement/{}?limit=120&apikey=demo"
def download_data(symbol):
print(symbol)
# do stuff here
read_str = BASEURL.format(symbol)
df = pd.read_json(read_str)
#time.sleep(0.2)
return df
if __name__ == "__main__":
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
data = pool.map(download_data, TICKERS)
df = pd.concat(data).set_index(["date", "symbol"]).sort_index()
df.to_csv('D:/verhuizen/pensioen/Income_2000.csv')
In this particular example the code breaks at position 1,903
RPAI
Traceback (most recent call last):
File "C:/Users/MLUY/AppData/Roaming/JetBrains/PyCharmCE2020.1/scratches/Equity_testing.py", line 27, in <module>
data = pool.map(download_data, TICKERS)
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\MLUY\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x0793EAF0>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object")'
First optimization is to avoid concatenate your dataframe at each iteration.
You can try something like that:
url = "https://financialmodelingprep.com/api/v3/income-statement/{}?limit=120&apikey=demo"
df = []
for symbol in tickers["symbol"]:
read_str = url.format(symbol)
df.append(pd.read_json(read_str))
df = pd.concat(df, ignore_index=True)
If it's not sufficient, we will see to use async, threading or multiprocessing.
Edit:
The code below can do the job:
import pandas as pd
import numpy as np
import multiprocessing
import time
import random
PROCESSES = 4 # number of parallel process
CHUNKS = 6 # one process handle n symbols
# get tickers from your csv
TICKERS = ["BCDA", "WBAI", "NM", "ZKIN", "TNXP", "FLY", "MYSZ", "GASX", "SAVA", "GCE",
"XNET", "SRAX", "SINO", "LPCN", "XYF", "SNSS", "DRAD", "WLFC", "OILD", "JFIN",
"TAOP", "PIC", "DIVC", "MKGI", "CCNC", "AEI", "ZCMD", "YVR", "OCG", "IMTE",
"AZRX", "LIZI", "ORSN", "ASPU", "SHLL", "INOD", "NEXI", "INR", "SLN", "RHE-PA",
"MAX", "ARRY", "BDGE", "TOTA", "PFMT", "AMRH", "IDN", "OIS", "RMG", "IMV",
"CHFS", "SUMR", "NRG", "ULBR", "SJI", "HOML", "AMJL", "RUBY", "KBLMU", "ELP"]
# create a list of n sublist
TICKERS = [TICKERS[i:i + CHUNKS] for i in range(0, len(TICKERS), CHUNKS)]
BASEURL = "https://financialmodelingprep.com/api/v3/income-statement/{}?limit=120&apikey=demo"
def fake_data(symbol):
dti = pd.date_range("1985", "2020", freq="Y")
df = pd.DataFrame({"date": dti, "symbol": symbol,
"A": np.random.randint(0, 100, size=len(dti)),
"B": np.random.randint(0, 100, size=len(dti))})
time.sleep(random.random()) # to simulate network delay
return df.to_json()
def download_data(pool_id, symbols):
df = []
for symbol in symbols:
print("[{:02}]: {}".format(pool_id, symbol))
# do stuff here
# read_str = BASEURL.format(symbol)
# df.append(pd.read_json(read_str))
df.append(pd.read_json(fake_data(symbol)))
return pd.concat(df, ignore_index=True)
if __name__ == "__main__":
with multiprocessing.Pool(PROCESSES) as pool:
data = pool.starmap(download_data, enumerate(TICKERS, start=1))
df = pd.concat(data).set_index(["date", "symbol"]).sort_index()
In this example, I split the list of tickers into sublists for each process retrieves data for multiple symbols and limits overhead due to create and destroy processes.
The delay is to simulate the response time from the network connection and highlight the multiprocess behaviour.
Edit 2: simpler but naive version for your needs
import pandas as pd
import multiprocessing
# get tickers from your csv
TICKERS = ["BCDA", "WBAI", "NM", "ZKIN", "TNXP", "FLY", "MYSZ", "GASX", "SAVA", "GCE",
"XNET", "SRAX", "SINO", "LPCN", "XYF", "SNSS", "DRAD", "WLFC", "OILD", "JFIN",
"TAOP", "PIC", "DIVC", "MKGI", "CCNC", "AEI", "ZCMD", "YVR", "OCG", "IMTE",
"AZRX", "LIZI", "ORSN", "ASPU", "SHLL", "INOD", "NEXI", "INR", "SLN", "RHE-PA",
"MAX", "ARRY", "BDGE", "TOTA", "PFMT", "AMRH", "IDN", "OIS", "RMG", "IMV",
"CHFS", "SUMR", "NRG", "ULBR", "SJI", "HOML", "AMJL", "RUBY", "KBLMU", "ELP"]
BASEURL = "https://financialmodelingprep.com/api/v3/income-statement/{}?limit=120&apikey=demo"
def download_data(symbol):
print(symbol)
# do stuff here
read_str = BASEURL.format(symbol)
df = pd.read_json(read_str)
return df
if __name__ == "__main__":
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
data = pool.map(download_data, TICKERS)
df = pd.concat(data).set_index(["date", "symbol"]).sort_index()
Note about pool.map: for each symbol in TICKERS, create a process and call function download_data.

TypeError: POST data should be bytes or an iterable of bytes. It cannot be of type str

My Code.
#!/usr/bin/env python
#coding: utf-8
userid="NicoNicoCreate#gmail.com"
passwd="********"
import sys, re, cgi, urllib, urllib.request, urllib.error, http.cookiejar, xml.dom.minidom, time, urllib.parse
import simplejson as json
def getToken():
html = urllib.request.urlopen("http://www.nicovideo.jp/my/mylist").read()
for line in html.splitlines():
mo = re.match(r'^\s*NicoAPI\.token = "(?P<token>[\d\w-]+)";\s*',line)
if mo:
token = mo.group('token')
break
assert token
return token
def mylist_create(name):
cmdurl = "http://www.nicovideo.jp/api/mylistgroup/add"
q = {}
q['name'] = name.encode("utf-8")
q['description'] = ""
q['public'] = 0
q['default_sort'] = 0
q['icon_id'] = 0
q['token'] = token
cmdurl += "?" + urllib.parse.urlencode(q).encode("utf-8")
j = json.load( urllib.request.urlopen(cmdurl), encoding='utf-8')
return j['id']
def addvideo_tomylist(mid,smids):
for smid in smids:
cmdurl = "http://www.nicovideo.jp/api/mylist/add"
q = {}
q['group_id'] = mid
q['item_type'] = 0
q['item_id'] = smid
q['description'] = u""
q['token'] = token
cmdurl += "?" + urllib.parse.urlencode(q).encode("utf-8")
j = json.load( urllib.request.urlopen(cmdurl), encoding='utf-8')
time.sleep(0.5)
#Login
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()))
urllib.request.install_opener(opener)
urllib.request.urlopen("https://secure.nicovideo.jp/secure/login",
urllib.parse.urlencode( {"mail":userid, "password":passwd}) ).encode("utf-8")
#GetToken
token = getToken()
#MakeMylist&AddMylist
mid = mylist_create(u"Testlist")
addvideo_tomylist(mid, ["sm9","sm1097445", "sm1715919" ] )
MyError.
Traceback (most recent call last):
File "Nico3.py", line 48, in <module>
urllib.parse.urlencode( {"mail":userid, "password":passwd}) ).encode("utf-8")
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 463, in open
req = meth(req)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 1170, in do_request_
raise TypeError(msg)
TypeError: POST data should be bytes or an iterable of bytes. It cannot be of type str.
I've tried encode but it did not help.
I'm japanese accademic students.
It was not able to be settled by my knowledge.
I am aware of this similar question, TypeError: POST data should be bytes or an iterable of bytes. It cannot be str, but am too new for the answer to be much help.
You paren is in the wrong place so you are not actually encoding:
.urlencode({"mail":userid, "password":passwd}).encode("utf-8")) # <- move inside

python TypeError: expected string or buffer when parsing JSON from a file

I realize this problem has been answered for other folks but none of the threads are helping me solve it. I'm trying to parse a JSON structure and add all values in the sent_file when the keys match with the tweet_file. The error I'm getting
import sys
import json
def main():
sent_file = open(sys.argv[1])
tweet_file = open(sys.argv[2])
scores = {}
#tweet = {}
#tweet_text = {}
#hw()
#lines(sent_file)
#lines(tweet_file)
for line in sent_file:
term,score = line.split("\t")
scores[term] = int(score)
#print scores.items()
for tweets in tweet_file:
current_sent_value = 0
tweet = {} #this is a dict
#print type(tweets) str
tweet = json.loads(tweets)#[0] #this assignment changes tweet to a list. Why?
if 'text' in tweet:
tweet_text = {}
unicode_string = tweet['text']
encoded_string = unicode_string.encode('utf-8')
tweet_text = encoded_string.split()
for key in tweet_text:
for key in scores:
#print type(tweet_text) -- list
#print type(scores) --dict
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
current_sent_value += scores(value)
print current_sent_value
if name == 'main':
main()
The error is here \assignment1\tweet_sentiment2.py", line 42, in main
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
AttributeError: 'list' object has no attribute 'get'