Only the string 'symbols' is supported for Nasdaq - pandas

import pandas_datareader as web
import datetime
# Date range for the request.
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime(2018, 7, 30)
# This call raises the ValueError quoted below: with the "nasdaq" source,
# DataReader only accepts the literal name 'symbols', not a ticker such as "AAPL".
f = web.DataReader("AAPL", "nasdaq", start, end)
When I run this code, I get the following error:
File "/anaconda3/lib/python3.6/site-packages/pandas_datareader/data.py", line 377, in DataReader
"Nasdaq, not %r" % (name,))
ValueError: Only the string 'symbols' is supported for Nasdaq, not 'AAPL'
How can I fix this? Please help.

Related

Deribit code not working for me.. Does anyone have some suggestions?

I am trying to gather historical prices/data from Deribit using Pycharm and Spyder but I keep getting errors. I used the code below from the following website:
https://www.codearmo.com/python-tutorial/crypto-algo-trading-historical-data1
If anyone has a suggested fix that would be a huge help. I am relatively new to coding.
Thanks.
import asyncio
import websockets
import json
import pandas as pd
import datetime as dt
async def call_api(msg):
    """Send ``msg`` to the Deribit test websocket endpoint and return one reply.

    Opens a websocket connection, sends the message, and returns the first
    message received. Returns ``None`` if the connection is no longer open
    after sending.
    """
    async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
        await websocket.send(msg)
        # Only a single reply is ever read, so a guard clause is enough here.
        if not websocket.open:
            return None
        return await websocket.recv()
def async_loop(api, message):
    """Run ``api(message)`` to completion and return its result.

    Args:
        api: Callable that takes ``message`` and returns an awaitable.
        message: Argument passed through to ``api``.

    Returns:
        Whatever the awaitable produced by ``api(message)`` resolves to.

    Raises:
        RuntimeError: If called while an event loop is already running in
            this thread (for example inside a notebook or IPython kernel).
    """
    async def _invoke():
        # Awaiting inside a wrapper coroutine keeps support for any awaitable,
        # not only coroutine objects, which asyncio.run() would otherwise require.
        return await api(message)

    # asyncio.run() creates and closes a fresh loop per call; this avoids the
    # deprecated asyncio.get_event_loop() path when no loop is running.
    return asyncio.run(_invoke())
def retrieve_historic_data(start, end, instrument, timeframe):
    """Request chart data for ``instrument`` and return the raw response.

    Builds a JSON-RPC 2.0 request for ``public/get_tradingview_chart_data``
    from the given start/end timestamps and resolution, serialises it to JSON,
    and sends it through ``async_loop(call_api, ...)``.
    """
    params = {
        "instrument_name": instrument,
        "start_timestamp": start,
        "end_timestamp": end,
        "resolution": timeframe,
    }
    request = {
        "jsonrpc": "2.0",
        "id": 833,
        "method": "public/get_tradingview_chart_data",
        "params": params,
    }
    return async_loop(call_api, json.dumps(request))
def json_to_dataframe(json_resp):
    """Convert a JSON response string into a DataFrame.

    The ``result`` member of the decoded response becomes the frame. The
    ``ticks`` column is divided by 1000, and a ``timestamp`` column is added
    by passing each scaled tick to ``datetime.fromtimestamp``.
    """
    payload = json.loads(json_resp)
    frame = pd.DataFrame(payload['result'])
    frame['ticks'] = frame['ticks'].div(1000)
    frame['timestamp'] = list(map(dt.datetime.fromtimestamp, frame['ticks']))
    return frame
if __name__ == '__main__':
    # Request window and instrument for the example query.
    start, end = 1554373800000, 1554376800000
    instrument, timeframe = "BTC-PERPETUAL", '1'

    # Fetch the raw JSON, convert it, and show the first rows.
    json_resp = retrieve_historic_data(start, end, instrument, timeframe)
    df = json_to_dataframe(json_resp)
    print(df.head())
Console Message:
/Users/macbookair/PycharmProjects/untitled/venv/bin/python /Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py
Traceback (most recent call last):
File "/Users/macbookair/PycharmProjects/Deribit01/Deribit_Options_01.py", line 2, in <module>
import websockets
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/__init__.py", line 4, in <module>
from .client import * # noqa
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 20, in <module>
asyncio.get_event_loop().run_until_complete(call_api(json.dumps(msg)))
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/Users/macbookair/PycharmProjects/untitled/venv/lib/python3.8/site-packages/websockets/client.py", line 13, in call_api
async with websockets.connect('wss://test.deribit.com/ws/api/v2') as websocket:
AttributeError: partially initialized module 'websockets' has no attribute 'connect' (most likely due to a circular import)
Process finished with exit code 1

wrong format for time data pandas

What is the correct format string for parsing a timestamp like the following?
2020-02-24T18:00:00
I have tried using dates_cc = dt.datetime.strptime(dates_c, '%Y-%m-%d%H:%M:%S')
but I get a wrong date format error.
Adding a literal T between the date and time parts of the format string should work:
import datetime as dt
dates_c = "2020-02-24T18:00:00"
# The literal T in the format string matches the T that separates date and time in the input.
dates_cc = dt.datetime.strptime(dates_c, '%Y-%m-%dT%H:%M:%S')
dates_cc
Output:
datetime.datetime(2020, 2, 24, 18, 0)

Pandas HDFStore: append fails when min_itemsize is set to the maximum of the string column

I'm detecting the maximum lengths of all string columns of multiple dataframes, then attempting to build a HDFStore:
import pandas as pd
# Detect max string length for each column across all DataFrames.
# NOTE(review): map(len) counts characters, not UTF-8 encoded bytes.
max_lens = {}
for df_path in paths:
df = pd.read_pickle(df_path)
for col in df.columns:
ser = df[col]
# Treat a column as a string column only if it has object dtype and its
# first non-null value is a str.
if ser.dtype == 'object' and isinstance(
ser.loc[ser.first_valid_index()], str
):
# Keep the running maximum across files; setdefault seeds unseen columns with 0.
max_lens[col] = max(
ser.dropna().map(len).max(), max_lens.setdefault(col, 0)
)
print('Setting min itemsizes:', max_lens)
hdf_path.unlink() # Delete the existing file for a clean retry
store = pd.HDFStore(hdf_path, complevel=9)
for df_path in paths:
df = pd.read_pickle(df_path)
# Append each DataFrame under the same key, passing the detected lengths as min_itemsize.
store.append(hdf_key, df, min_itemsize=max_lens, data_columns=True)
store.close()
The detected maximum string lengths are as follows:
max_lens = {'hashtags': 139,
'id': 19,
'source': 157,
'text': 233,
'urls': 2352,
'user_mentions_user_ids': 199,
'in_reply_to_screen_name': 17,
'in_reply_to_status_id': 19,
'in_reply_to_user_id': 19,
'media': 286,
'place': 56,
'quoted_status_id': 19,
'user_id': 19}
Yet still I'm getting this error:
ValueError: Trying to store a string with len [220] in [hashtags] column but
this column has a limit of [194]!
Consider using min_itemsize to preset the sizes on these columns
Which is weird, because the detected maximum length of hashtags is 139.
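HDF stores strings as UTF-8 encoded bytes, so the size limit is counted in bytes rather than characters, and any non-ASCII character takes more than one byte. You therefore need to encode the strings as UTF-8 first and then find the maximum length: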
a_pandas_string_series.str.encode('utf-8').str.len().max()

Getting sum by grouping other column

I have a dataframe as follows
Occupation, Genre, Rating
I have taken the sum of all ratings as totalRating. Now I want to create a new column w_rating which takes (rating > 3)/totalRating for each Occupation, Genre combination. My dataframe is named joinedRDD (joinedDF3 in the code), so I am writing the query below:
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).withColumn(wa_rating, sum(Rating>3)/totalRating).collect()
but it is showing error
AttributeError: 'GroupedData' object has no attribute 'withColumn'
So it is clear from the error that withColumn cannot be called on the result of groupby.
So my question is: how should I do this?
Below is my updated code.
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField,StructType,IntegerType,StringType)
from pyspark.sql import Row
from pyspark.sql.functions import sum
import pyspark.sql.functions as F
from pyspark.sql.functions import lit
# Get or create the Spark session used by the rest of the script.
spark = SparkSession.builder.appName("Movielens Analysis").getOrCreate()
def refineMovieDF(row):
    """Expand one pipe-delimited movie row into (movie id, flag index) pairs.

    ``row[0]`` is split on ``|``. The first field is the movie id, and every
    field from position 5 onward is a 0/1 flag. One pair is produced for each
    flag equal to 1, with the index counted from the first flag field.
    """
    fields = row[0].split("|")
    return [
        (int(fields[0]), position)
        for position, flag in enumerate(fields[5:])
        if int(flag) == 1
    ]
# Load the ratings file (tab-separated) with an explicit schema.
ratingSchema =StructType(fields=[StructField("UserId",IntegerType(),True),StructField("MovieId",IntegerType(),True),StructField("Rating",IntegerType(),True),StructField("TimeStamp",IntegerType(),True)])
ratingsDF = spark.read.load("ml-100k/u.data", format="csv",sep="\t", inferSchema=True, header=False,schema=ratingSchema)
# Load the genre lookup (pipe-separated).
genreSchema =StructType(fields=[StructField("Genre",StringType(),True),StructField("GenreId",IntegerType(),True)])
genreDF = spark.read.load("ml-100k/u.genre",format="csv",sep="|",inferSchema=True, header=False,schema=genreSchema)
# Load the users file (pipe-separated).
userSchema =StructType(fields=[StructField("UserId",IntegerType(),True),StructField("Age",IntegerType(),True),StructField("Gender",StringType(),True),StructField("Occupation",StringType(),True),StructField("ZipCode",IntegerType(),True)])
usersDF = spark.read.load("ml-100k/u.user",format="csv",sep="|",inferSchema=True, header=False,schema=userSchema)
# Load u.item with a one-column schema (MovieRow); refineMovieDF then splits that column on '|'.
movieSchema =StructType(fields=[StructField("MovieRow",StringType(),True)])
movieDF = spark.read.load("ml-100k/u.item",format="csv",inferSchema=True, header=False,schema=movieSchema)
# Flatten each movie row into (MovieId, GenreId) pairs and build a DataFrame from them.
movieRefinedRDD = movieDF.rdd.flatMap(refineMovieDF)
movieSchema =StructType(fields=[StructField("MovieId",IntegerType(),True),StructField("GenreId",IntegerType(),True)])
movieRefinedDf = spark.createDataFrame(movieRefinedRDD, movieSchema)
# Join ratings with users on UserId, keeping Occupation, Rating and MovieId.
joinedDF1 = ratingsDF.join(usersDF,ratingsDF.UserId==usersDF.UserId).select(usersDF["Occupation"],ratingsDF["Rating"],ratingsDF["MovieId"])
# NOTE(review): joinedDF2 is not defined anywhere in this snippet.
joinedDF3 = joinedDF1.join(joinedDF2,joinedDF1.MovieId == joinedDF2.MovieId).select(joinedDF1["Occupation"],joinedDF1["Rating"],joinedDF2["Genre"])
# collect() yields a list of rows here, not a plain number; using it as the divisor below is
# what the "Unsupported literal type class java.util.ArrayList" error quoted afterwards reports.
totalRating = joinedDF3.groupBy().sum("Rating").collect()
# NOTE(review): wa_rating is used as a bare name here and is not defined in this snippet.
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"]>3)/totalRating).alias(wa_rating)).collect()
print(resultDF)
Now I am getting below error.
2019-08-06 22:24:20 INFO BlockManagerInfo:54 - Removed broadcast_11_piece0 on 10.0.2.15:58903 in memory (size: 4.3 KB, free: 413.8 MB)
Traceback (most recent call last):
File "/home/cloudera/workspace/MovielensAnalysis.py", line 59, in <module>
resultDF = joinedDF3.groupby([joinedDF3["Occupation"],joinedDF3["Genre"]]).agg((sum(joinedDF3["Rating"]>3)/totalRating).alias(wa_rating)).collect()
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/column.py", line 116, in _
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o129.divide.: java.lang.RuntimeException: Unsupported literal type class java.util.ArrayList [[572536]]

Spacy phrasematcher does not get matcher name

I am new to PhraseMatcher and want to extract some keywords from my emails.
Everything works well, except that I can't get the name of the matcher I added.
This is my code below:
# Match month phrases read from a gazetteer file against texts and print each match with its label.
def main():
patterns_months = 'phraseMatcher/months.txt'
text_loc = 'phraseMatcher/text.txt'
nlp = spacy.blank('en')
nlp.vocab.lex_attr_getters ={}
phrases_months = read_gazetter(patterns_months)
# NOTE(review): n is not defined in this snippet.
txts = read_text(text_loc, n=n)
# Turn each gazetteer phrase into a Doc so it can be used as a PhraseMatcher pattern.
months = [nlp(text) for text in phrases_months]
matcher = PhraseMatcher(nlp.vocab)
# Register all month patterns under the single key 'MONTHS'.
matcher.add('MONTHS', None, *months)
# Prints the hash stored for 'MONTHS' (the first line of the output shown below).
print(nlp.vocab.strings['MONTHS'])
for txt in txts:
doc = nlp(txt)
matches = matcher(doc)
for match_id ,start, end in matches:
span = doc[start: end]
# Look the match id up in the string store to recover the pattern name;
# this is the line that raises the KeyError quoted below.
label = nlp.vocab.strings[match_id]
print(label, span.text, start, end)
The result:
12298211501233906429 <--- this is from print(nlp.vocab.strings['MONTHS'])
Traceback (most recent call last):
File "D:/workspace/phraseMatcher/venv/phraseMatcher.py", line 71, in <module>
plac.call(main)
File "D:\workspace\phraseMatcher\venv\lib\site-packages\plac_core.py", line 328, in call
cmd, result = parser.consume(arglist)
File "D:\workspace\phraseMatcher\venv\lib\site-packages\plac_core.py", line 207, in consume
return cmd, self.func(*(args + varargs + extraopts), **kwargs)
File "D:/workspace/phraseMatcher/venv/phraseMatcher.py", line 47, in main
label = nlp.vocab.strings[match_id]
File "strings.pyx", line 117, in spacy.strings.StringStore.__getitem__
KeyError: "[E018] Can't retrieve string for hash '18446744072093410045'."
spaCy version: 2.0.12
Platform: Windows-7-6.1.7601-SP1
Python version: 3.7.0
I can't find what I did wrong. It is simple and I read these already:
Using PhraseMatcher in SpaCy to find multiple match types
Help me, thanks in advance.