The function below is part of the Google Sheets quickstart.py, which lets people read a Google Sheet from its URL.
I am able to run it and get the print statement to work. See the print statement in the function below:
print('%s, %s,%s,%s,%s,%s,%s,%s' % (row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
My ultimate goal is to capture the printed data in a pandas DataFrame instead. None of my attempts have worked.
def main():
    """Shows basic usage of the Sheets API.
    Prints values from a sample spreadsheet.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('sheets', 'v4', credentials=creds)

    # Call the Sheets API
    sheet = service.spreadsheets()
    result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                                range=SAMPLE_RANGE_NAME).execute()
    values = result.get('values', [])

    if not values:
        print('No data found.')
    else:
        # print('Name, Major:')
        for row in values:
            # d = {'Case_Type': row.Case_Type,
            #      'Date': row.Date,
            #      'Cases': row.Cases,
            #      'Country_Region': row.Country_Region,
            #      'Lat': Lat,
            #      'Long': Long}
            # L.append(d)
            # df = pd.DataFrame(L)
            # Print columns A through H, which correspond to indices 0 through 7.
            print('%s, %s,%s,%s,%s,%s,%s,%s' % (row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))


if __name__ == '__main__':
    main()
Since values appears to be a 2D list (a list of rows), try doing:
pd.DataFrame.from_records(values, columns=['Date', 'Cases', 'Country_Region', 'Lat', 'Long'])
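If the first row of your sheet contains the column headers, you can also build the DataFrame directly from values. A minimal sketch, assuming the quickstart setup above plus import pandas as pd, and that every row has the same number of cells (the Sheets API can drop trailing empty cells):

import pandas as pd

values = result.get('values', [])
if values:
    # first row becomes the header, remaining rows become the data
    df = pd.DataFrame(values[1:], columns=values[0])
    print(df.head())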
I coded a simple AI chatbot with TensorFlow and tflearn, and it runs just fine. The issue is that when the user inputs something the bot shouldn't recognize, the bot is supposed to say it doesn't understand if the prediction confidence is less than 70%. However, the bot always scores above that, even if the user types gibberish like "rjrigrejfr"; it assumes they're greeting it. The patterns it is supposed to learn from in the JSON are "patterns": ["Hi", "How are you", "Wassup", "Hello", "Good day", "Waddup", "Yo"]. I can share the JSON file if needed; it's short. Anyway, this is the Python code:
import numpy as np
import nltk
import tensorflow
import tflearn
import random
import json
import pickle

# Some extra configuration:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')

# Load the data from the json file into a variable.
with open("intents.json") as file:
    data = json.load(file)

# If we already have saved data, we do not need to redo the preparation and waste time
# (this could become an issue in more complex programs). Save in pickle.
try:
    with open("data.pickle", "rb") as f:  # rb stands for read bytes.
        words, labels, training, output = pickle.load(f)
# --- Pre-training data preparation ---
except:
    words = []
    docsx = []  # Stores patterns
    docsy = []  # Stores intents
    labels = []  # All the specific tag values such as greeting, contact, etc.

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            w = nltk.word_tokenize(pattern)  # nltk function that splits each pattern sentence into a list of words.
            words.extend(w)  # Add the tokenized list to the words list.
            docsx.append(w)
            docsy.append(intent["tag"])  # Append the classification of the sentence.
        if intent["tag"] not in labels:
            labels.append(intent["tag"])

    words = [stemmer.stem(w.lower()) for w in words if w not in ".?!"]  # Stem the words down to their roots and convert them to lowercase.
    words = sorted(list(set(words)))  # set() removes duplicates; convert back to a list and sort it.
    labels = sorted(labels)

    training = []
    output = []
    out_empty = [0 for i in range(len(labels))]  # A list of 0s, one per tag. Used later when binarizing.

    # One-hot encode the intent categories. One-hot ("binarized") data improves the efficiency of the ML model.
    # In this case we get a list of 0s and 1s: if a word appears it is assigned a 1, otherwise a 0.
    for x, doc in enumerate(docsx):
        bag = []  # Bag of words, i.e. the one-hot encoded data for the ML model.
        docx_word_stemmed = [stemmer.stem(word) for word in doc]  # Stem the data in docsx.
        # Now build the one-hot encoded list / bag-of-words data.
        for i in words:
            if i in docx_word_stemmed:  # Checking against stemmed words:
                # Word exists
                bag.append(1)
            else:
                bag.append(0)

        output_row = out_empty[:]  # Copy out_empty.
        # Find the position of this doc's tag in the labels list and set that slot to 1.
        output_row[labels.index(docsy[x])] = 1

        training.append(bag)
        output.append(output_row)

    # tflearn requires numpy arrays; they are also faster.
    training = np.array(training)
    output = np.array(output)

    # Save the data so we do not need to redo the data preparation every time.
    with open("data.pickle", "wb") as f:
        pickle.dump((words, labels, training, output), f)

# Build the network first so that model.load() has a model object to load into.
tensorflow.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)
model = tflearn.DNN(net)

try:
    model.load('model.tflearn')
except:
    model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
    model.save("model.tflearn")


def bagofwords(sentence, words):
    bag = [0 for _ in range(len(words))]  # Blank bag of words.
    # Tokenize the sentence and then stem it.
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    for string in sentence_words:
        for i, word in enumerate(words):
            if word == string:
                bag[i] = 1
    return np.array(bag)


def chat():
    print("Hello there! I'm the SRO AI Virtual Assistant. How can I help you?")
    # Figure out the error slime!
    while True:
        user_input = input("Type here:")
        if user_input == "quit":
            break
        result = model.predict([bagofwords(user_input, words)])[0]  # bagofwords plus predict gives the tag probabilities for what the user is saying.
        best_result = np.argmax(result)  # We only want the best result.
        tag = labels[best_result]
        print(result[best_result])
        # Look up the matching intent in the JSON data and pick a response.
        if result[best_result] > 0.7:
            for tg in data["intents"]:
                if tg['tag'] == tag:
                    responses = tg['responses']
            print(random.choice(responses))
        else:
            print("I don't quite understand")


chat()
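One way to see why gibberish still gets a confident prediction is to look at the bag-of-words vector the model actually receives: for input made of unknown words it is all zeros, yet softmax still has to distribute 100% of the probability across the tags, so the top tag can easily exceed 0.7. A small diagnostic sketch, reusing the words, labels, model and bagofwords defined above (just a probe, not a fix):

bag = bagofwords("rjrigrejfr", words)
print(bag.sum())  # 0 -> none of the known words were recognized in the input
probs = model.predict([bag])[0]
print(dict(zip(labels, probs)))  # the probabilities still sum to 1, so one tag can easily score above 0.7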
I have a function that creates a data frame by doing multiprocessing on a df:
Suppose my df has 10 rows; the processor function will process all 10 rows separately. What I want is to concatenate all of the processor function's outputs into one data frame.
def processor(dff):
    """
    Reads data from a data frame and does all sorts of data manipulation,
    for multiprocessing.
    """
    return dff


def main(infile, mdebug):
    global debug
    debug = mdebug
    try:
        lines = sum(1 for line in open(infile))
    except Exception as err:
        print("Error {} opening file: {}".format(err, infile))
        sys.exit(2000)
    if debug >= 2:
        print(infile)
    try:
        dff = pd.read_csv(infile)
    except Exception as err:
        print("Error {}, opening file: {}".format(err, infile))
        sys.exit(2000)
    df_split = np.array_split(dff, (lines + 1))
    cores = multiprocessing.cpu_count()
    cores = 64
    # pool = Pool(cores)
    pool = Pool(lines - 1)
    for n, frame in enumerate(pool.imap(processor, df_split), start=1):
        if frame is not None:
            frame.to_csv('{}'.format(n))
    pool.close()
    pool.join()


if __name__ == "__main__":
    args = parse_args()
    # print("Debug is: {}".format(args.debug))
    if args.debug >= 1:
        print("Running in debug mode:", args.debug)
    main(infile=args.infile, mdebug=args.debug)
You can use either the DataFrame constructor or pd.concat to solve your problem; the appropriate one depends on details of your code that you haven't included.
Here's a more complete example:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def processor(chunk):
    # stand-in for your real processor; just passes the chunk through
    return chunk

# create dummy dataset
dff = pd.DataFrame(np.random.rand(101, 5), columns=list('abcde'))

# process data
with Pool() as pool:
    result = pool.map(processor, np.array_split(dff, 7))

# put it all back together in one dataframe
result = pd.concat(result)
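If you would rather keep the imap loop from your main(), the same idea applies: collect the frames in a list and concatenate them once at the end, instead of writing one CSV per chunk. A sketch against your own variables (the combined.csv file name is just an example):

frames = []
for n, frame in enumerate(pool.imap(processor, df_split), start=1):
    if frame is not None:
        frames.append(frame)
pool.close()
pool.join()

combined = pd.concat(frames, ignore_index=True)
combined.to_csv('combined.csv', index=False)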
I would like to extract data from a KDB database and place it into a DataFrame. My query runs fine in QPad with no issues; I just need to get the result into my pandas DataFrame. My code:
from qpython import qconnection

# Create the connection and save the handle to a variable
q = qconnection.QConnection(host='wokplpaxvj003', port=11503, username='pelucas', password='Dive2600', timeout=3.0)

try:
    # initialize connection
    q.open()
    print(q)
    print('IPC version: %s. Is connected: %s' % (q.protocol_version, q.is_connected()))

    df = q.sendSync('{select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id}')
    df.info()
finally:
    q.close()
It fails on the df.info() call, raising AttributeError: 'QLambda' object has no attribute 'info', so I guess the call is not successful.
It looks like you've sent only a lambda but with no instruction to execute that lambda. Two options:
Don't make it a lambda
df = q.sendSync('select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id')
Execute the lambda
df = q.sendSync('{select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id}[]')
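Separately, since the end goal is a pandas DataFrame: qPython can deserialize q tables straight into DataFrames if you enable its pandas mode on the connection. A sketch of that, assuming your qPython version supports the pandas flag (same connection details as in your code):

from qpython import qconnection

# pandas=True asks qPython to return q tables as pandas DataFrames
q = qconnection.QConnection(host='wokplpaxvj003', port=11503, username='pelucas',
                            password='Dive2600', timeout=3.0, pandas=True)
try:
    q.open()
    df = q.sendSync('select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id')
    df.info()
finally:
    q.close()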
I recently upgraded my Celery installation to 4.0. After a few days of wrestling with the upgrade process, I finally got it to work... sort of. Some tasks will return, but the final task will not.
I have a class, SFF, that takes in and parses a file:
# Constructor with I/O file
def __init__(self, file):
    # File data that's gonna get used a lot
    sffDescriptor = file.fileno()
    fileName = abspath(file.name)

    # Get the pointer to the file
    filePtr = mmap.mmap(sffDescriptor, 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)

    # Get the header info
    hdr = filePtr.read(HEADER_SIZE)
    self.header = SFFHeader._make(unpack(HEADER_FMT, hdr))

    # Read in the palette maps
    print self.header.onDemandDataSize
    print self.header.onLoadDataSize
    palMapsResult = getPalettes.delay(fileName, self.header.palBankOff - HEADER_SIZE, self.header.onDemandDataSize, self.header.numPals)

    # Read the sprite list nodes
    nodesStart = self.header.sprListOff
    nodesEnd = self.header.palBankOff
    print nodesEnd - nodesStart
    sprNodesResult = getSprNodes.delay(fileName, nodesStart, nodesEnd, self.header.numSprites)

    # Get palette data
    self.palettes = palMapsResult.get()
    # Get sprite data
    spriteNodes = sprNodesResult.get()

    # TESTING
    spritesResultSet = ResultSet([])
    numSpriteNodes = len(spriteNodes)
    # Split the nodes into chunks of size 32 elements
    for x in xrange(0, numSpriteNodes, 32):
        spritesResult = getSprites.delay(spriteNodes, x, x+32, fileName, self.palettes, self.header.palBankOff, self.header.onDemandDataSizeTotal)
        spritesResultSet.add(spritesResult)
        break # REMEMBER TO REMOVE FOR ENTIRE SFF
    self.sprites = spritesResultSet.join_native()
It doesn't matter whether it's a single task that returns the entire spritesResult or whether I split it using a ResultSet; the outcome is always the same: the Python console I'm using just hangs at either spritesResultSet.join_native() or spritesResult.get() (depending on how I structure it).
Here is the task in question:
@task
def getSprites(nodes, start, end, fileName, palettes, palBankOff, onDemandDataSizeTotal):
    sprites = []
    with open(fileName, "rb") as file:
        sffDescriptor = file.fileno()
        sffData = mmap.mmap(sffDescriptor, 0, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
        for node in nodes[start:end]:
            sprListNode = dict(SprListNode._make(node)._asdict()) # Need to convert it to a dict since values may change.
            #print node
            #print sprListNode
            # If it's a linked sprite, the data length is 0, so get the linked index.
            if sprListNode['dataLen'] == 0:
                sprListNodeTemp = SprListNode._make(nodes[sprListNode['index']])
                sprListNode['dataLen'] = sprListNodeTemp.dataLen
                sprListNode['dataOffset'] = sprListNodeTemp.dataOffset
                sprListNode['compression'] = sprListNodeTemp.compression
            # What does the offset need to be?
            dataOffset = sprListNode['dataOffset']
            if sprListNode['loadMode'] == 0:
                dataOffset += palBankOff #- HEADER_SIZE
            elif sprListNode['loadMode'] == 1:
                dataOffset += onDemandDataSizeTotal #- HEADER_SIZE
            #print sprListNode
            # Seek to the data location and "read" it in. The first 4 bytes are just the image length.
            start = dataOffset + 4
            end = dataOffset + sprListNode['dataLen']
            #sffData.seek(start)
            compressedSprite = sffData[start:end]
            # Create the sprite
            sprite = Sprite(sprListNode, palettes[sprListNode['palNo']], np.fromstring(compressedSprite, dtype=np.uint8))
            sprites.append(sprite)
    return json.dumps(sprites, cls=SpriteJSONEncoder)
I know it reaches the return statement, because if I put a print right above it, it will print in the Celery window. I also know that the task is running to completion because I get the following message from the worker:
[2016-11-16 00:03:33,639: INFO/PoolWorker-4] Task framedatabase.tasks.getSprites[285ac9b1-09b4-4cf1-a251-da6212863832] succeeded in 0.137236133218s: '[{"width": 120, "palNo": 30, "group": 9000, "xAxis": 0, "yAxis": 0, "data":...'
Here are my celery settings in settings.py:
# Celery settings
BROKER_URL='redis://localhost:1717/1'
CELERY_RESULT_BACKEND='redis://localhost:1717/0'
CELERY_IGNORE_RESULT=False
CELERY_IMPORTS = ("framedatabase.tasks", )
... and my celery.py:
from __future__ import absolute_import
import os
from celery import Celery

# set the default Django settings module for the 'celery' program.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'framedatabase.settings')

from django.conf import settings  # noqa

app = Celery('framedatabase', backend='redis://localhost:1717/1', broker="redis://localhost:1717/0",
             include=['framedatabase.tasks'])

# Using a string here means the worker will not have to
# pickle the object when using Windows.
app.config_from_object('django.conf:settings', namespace='CELERY')
app.autodiscover_tasks()

@app.task(bind=True)
def debug_task(self):
    print('Request: {0!r}'.format(self.request))
Found the problem. Apparently it was leading to deadlock as mentioned in the section "Avoid launching synchronous subtasks" in the Celery documentation here: http://docs.celeryproject.org/en/latest/userguide/tasks.html#tips-and-best-practices
So I got rid of the line:
sprNodesResult.get()
And changed the final result to a chain:
self.sprites = chain(getSprNodes.s(fileName, nodesStart, nodesEnd, self.header.numSprites),
                     getSprites.s(0, 32, fileName, self.palettes, self.header.palBankOff, self.header.onDemandDataSizeTotal))().get()
And it works! Now I just have to find a way to split this the way I want!
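For the splitting, one option (a hedged sketch, not part of the original fix) is a Celery chord: a group of getSprites signatures that run in parallel over 32-node chunks, plus a callback that merges the partial results. This assumes spriteNodes is available on the caller's side, and that combine_sprites is a small hypothetical task you add, decorated the same way as your existing tasks:

import json
from celery import chord, group

@task  # same decorator style as getSprites
def combine_sprites(chunks):
    # each chunk is the JSON string returned by getSprites; merge them into one list
    merged = []
    for chunk in chunks:
        merged.extend(json.loads(chunk))
    return merged

# Fan out one getSprites signature per 32-node chunk, then merge in the callback.
header = group(
    getSprites.s(spriteNodes, x, x + 32, fileName, self.palettes,
                 self.header.palBankOff, self.header.onDemandDataSizeTotal)
    for x in range(0, len(spriteNodes), 32)
)
self.sprites = chord(header)(combine_sprites.s()).get()

As with the chain above, the final .get() is only safe because it is called from the console rather than from inside another task.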
I have the following IPython Notebook, in which I am trying to access a database of movies from the Rotten Tomatoes website.
But Rotten Tomatoes limits you to 10,000 API requests a day,
so I don't want to re-run this function every time I restart the notebook; instead, I am trying to save and reload the data as a CSV file. When I convert the data to a CSV file, the cell just shows the processing symbol [*] in the IPython Notebook, and after some time I get the following error:
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to a slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122' # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id
#these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
print json.dumps(data['reviews'][0], indent=2) # dump an object into a json string
from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file,delimiter='\t')
movies
#print the first row
movies[['id', 'title', 'imdbID', 'year']]
def base_url():
    return 'http://api.rottentomatoes.com/api/public/v1.0/'

def rt_id_by_imdb(imdb):
    """
    Queries the RT movie_alias API. Returns the RT id associated with an IMDB ID,
    or raises a KeyError if no match was found
    """
    url = base_url() + 'movie_alias.json'
    imdb = "%7.7i" % imdb
    params = dict(id=imdb, type='imdb', apikey=api_key)
    r = requests.get(url, params=params).text
    r = json.loads(r)
    return r['id']

def _imdb_review(imdb):
    """
    Query the RT reviews API, to return the first page of reviews
    for a movie specified by its IMDB ID
    Returns a list of dicts
    """
    rtid = rt_id_by_imdb(imdb)
    url = base_url() + 'movies/{0}/reviews.json'.format(rtid)
    params = dict(review_type='top_critic',
                  page_limit=20,
                  page=1,
                  country='us',
                  apikey=api_key)
    data = json.loads(requests.get(url, params=params).text)
    data = data['reviews']
    data = [dict(fresh=r['freshness'],
                 quote=r['quote'],
                 critic=r['critic'],
                 publication=r['publication'],
                 review_date=r['date'],
                 imdb=imdb, rtid=rtid
                 ) for r in data]
    return data

def fetch_reviews(movies, row):
    m = movies.irow(row)
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result

def build_table(movies, rows):
    dfs = [fetch_reviews(movies, r) for r in range(rows)]
    dfs = [d for d in dfs if d is not None]
    return pd.concat(dfs, ignore_index=True)

critics = build_table(movies, 3000)
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')
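To make sure the API is only hit when there is no cached copy, one option (not part of the notebook above, just a sketch) is to guard the expensive build with a check for the saved file, so build_table only runs when critics.csv does not exist yet:

import os

if os.path.exists('critics.csv'):
    # reuse the cached reviews from the previous run
    critics = pd.read_csv('critics.csv')
else:
    # only call the Rotten Tomatoes API when there is no cached copy yet
    critics = build_table(movies, 3000)
    critics.to_csv('critics.csv', index=False)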