Why doesn't gzip.open() recognize my compressed file? - gzip

I have been trying to extract a large set of images that are in a .pkl.gz file. Here is my code for doing so.
import gzip
import pickle
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import os
for f in os.listdir("W:\Code\Machine Learning\digit-recognition-dnn\data"):
print(f)
name = 'mnist.pkl.gz'
with gzip.open(name, 'rb') as f:
train_set, valid_set, test_set = pickle.load(f)
train_x, train_y = train_set
plt.imshow(train_x[0].reshape((28, 28)), cmap=cm.Greys_r)
plt.show()
Unfortunately, the output is like this:
getData.py
mnist.pkl.gz
Traceback (most recent call last):
File "w:\Code\Machine Learning\digit-recognition-dnn\data\getData.py", line 12, in <module>
with gzip.open(name, 'rb') as f:
File "C:\Users\trexx\AppData\Local\Programs\Python\Python39\lib\gzip.py", line 58, in open
binary_file = GzipFile(filename, gz_mode, compresslevel)
File "C:\Users\trexx\AppData\Local\Programs\Python\Python39\lib\gzip.py", line 173, in __init__
fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
FileNotFoundError: [Errno 2] No such file or directory: 'mnist.pkl.gz'
My filetree looks like this:
data
├─ getData.py
└─ mnist.pkl.gz
Any suggestions?
I tried changing name to
name = 'mnist.pkl'
and
name = 'mnist.gz'
The result is the same. FileNotFound.

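Try specifying a full path instead of just the name. A bare filename is resolved relative to the current working directory of the Python process, which is not necessarily the directory that contains the script.
Replace mnist.pkl.gz with something like r"W:\my_dir\mnist.pkl.gz"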

Related

Loading .txt file from Google Cloud Storage into a Pandas DF

I'm trying to load a .txt file from a GCS bucket into a pandas DataFrame via pd.read_csv. When I run this code on my local machine (sourcing the .txt file from a local directory), it works perfectly. However, when I try to run the code in a Cloud Function, accessing the same .txt file but from a GCS bucket, I get a 'TypeError: cannot use a string pattern on a bytes-like object'.
The only thing that's different is the fact that I'm accessing the .txt file via the GCS bucket, so it's a bucket object (Blob) instead of a normal file. Would I need to download the blob as a string or as a file-like object first before calling pd.read_csv? The code is below:
def stage1_cogs_vfc(data, context):
from google.cloud import storage
import pandas as pd
import dask.dataframe as dd
import io
import numpy as np
start_bucket = 'my_bucket'
storage_client = storage.Client()
source_bucket = storage_client.bucket(start_bucket)
df = pd.DataFrame()
file_path = 'gs://my_bucket/SCE_Var_Fact_Costs.txt'
df = pd.read_csv(file_path,skiprows=12, encoding ='utf-8', error_bad_lines= False, warn_bad_lines= False , header = None ,sep = '\s+|\^+',engine='python')
Traceback (most recent call last):
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 383, in run_background_function _function_handler.invoke_user_function(event_object) File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function return call_user_function(request_or_event) File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 214, in call_user_function event_context.Context(**request_or_event.context)) File "/user_code/main.py", line 20, in stage1_cogs_vfc df = pd.read_csv(file_path,skiprows=12, encoding ='utf-8', error_bad_lines= False, warn_bad_lines= False , header = None ,sep = '\s+|\^+',engine='python') File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 702, in parser_f return _read(filepath_or_buffer, kwds) File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 429, in _read parser = TextFileReader(filepath_or_buffer, **kwds) File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 895, in __init__ self._make_engine(self.engine) File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 1132, in _make_engine self._engine = klass(self.f, **self.options) File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2238, in __init__ self.unnamed_cols) = self._infer_columns() File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2614, in _infer_columns line = self._buffered_line() File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2689, in _buffered_line return self._next_line() File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2791, in _next_line next(self.data) File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2379, in _read yield pat.split(line.strip()) TypeError: cannot use a string pattern on a bytes-like object
I found a similar situation here.
I also noticed that on the line:
source_bucket = storage_client.bucket(source_bucket)
you are using "source_bucket" for both your variable name and the parameter. I would suggest changing one of those.
However, I think you'd like to see this doc for any further question related to the API itself: Storage Client - Google Cloud Storage API
Building on points from @K_immer, here is my updated code, which includes reading into a Dask df...
def stage1_cogs_vfc(data, context):
from google.cloud import storage
import pandas as pd
import dask.dataframe as dd
import io
import numpy as np
import datetime as dt
start_bucket = 'my_bucket'
destination_path = 'gs://my_bucket/ddf-*_cogs_vfc.csv'
storage_client = storage.Client()
bucket = storage_client.get_bucket(start_bucket)
blob = bucket.get_blob('SCE_Var_Fact_Costs.txt')
df0 = pd.DataFrame()
file_path = 'gs://my_bucket/SCE_Var_Fact_Costs.txt'
df0 = dd.read_csv(file_path,skiprows=12, dtype=object ,encoding ='utf-8', error_bad_lines= False, warn_bad_lines= False , header = None ,sep = '\s+|\^+',engine='python')
df7 = df7.compute() # converts dask df to pandas df
# then do your heavy ETL stuff here using pandas...

Jython ValueError: chr() arg not in range(256)

I am using Jython (jython2.7.0) to send a string value from a Java program to a Python method and then return the value to the Java program, but I get this error: ValueError: chr() arg not in range(256). Do you know what the cause of the problem is and how I can solve it?
Exception in thread "main" Traceback (most recent call last):
File "PageRanking.py", line 9, in <module>
from bs4 import BeautifulSoup
File "C:\jython2.7.0\Lib\bs4\__init__.py", line 35, in <module>
from .builder import builder_registry, ParserRejectedMarkup
File "C:\jython2.7.0\Lib\bs4\builder\__init__.py", line 7, in <module>
from bs4.element import (
File "C:\jython2.7.0\Lib\bs4\element.py", line 10, in <module>
from bs4.dammit import EntitySubstitution
File "C:\jython2.7.0\Lib\bs4\dammit.py", line 14, in <module>
from html.entities import codepoint2name
File "C:\jython2.7.0\Lib\html\__init__.py", line 6, in <module>
from html.entities import html5 as _html5
File "C:\jython2.7.0\Lib\html\entities.py", line 2507, in <module>
entitydefs[name] = chr(codepoint)
This is my Python code
from __future__ import with_statement
from bs4 import BeautifulSoup
import requests
def pageRank(link):
url = "https://checkpagerank.net/"
payload = {'name':link}
r = requests.post(url, payload)
with open("requests_results.html", "wb") as f:
f.write(r.content)
with open(r'requests_results.html', "r", encoding='utf-8') as f:
text= f.read()
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('h2')
SResult = results[1]
first= SResult.contents[0]
rankerName = first.find('b').text
second= SResult.contents[2]
rankervalue = second.find('b').text
x = rankervalue[:1]
x = int(x)
x= x*100/10
return x

Keras flask API not giving me output

I am very new to Flask. I developed a document classification model using a CNN in Keras with Python 3. Below is the code I am using for the app.py file on a Windows machine.
I got the code example from here and adapted it to suit my needs.
import os
from flask import jsonify
from flask import request
from flask import Flask
import numpy as np
from keras.models import model_from_json
from keras.models import load_model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
#star Flask application
app = Flask(__name__)
path = 'C:/Users/user/Model/'
json_file = open(path+'/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
keras_model_loaded = model_from_json(loaded_model_json)
keras_model_loaded.load_weights(path+'/model.h5')
print('Model loaded...')
def preprocess_text(text,num_max = 1000,max_review_length = 100):
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
cnn_texts_seq = tok.texts_to_sequences(texts)
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_review_length)
return cnn_texts_mat
# URL that we'll use to make predictions using get and post
#app.route('/predict',methods=['GET','POST'])
def predict():
try:
text = request.args.get('text')
x = preprocess_text(text)
y = int(np.round(keras_model_loaded.predict(x)))
#print(y)
return jsonify({'prediction': str(y)})
except:
response = jsonify({'error': 'problem predicting'})
response.status_code = 400
return response
if __name__ == "__main__":
port = int(os.environ.get('PORT', 5000))
# Run locally
app.run(host='0.0.0.0', port=port)
On my Windows machine I navigate in the console to the path where I have saved the app.py file and execute the command py -3.6 app.py
When I go to the URL http://localhost:5000/predict and type the following in the browser:
http://localhost:5000/predict?text=I've had my Fire HD 8 two weeks now and I love it. This tablet is a great value. We are Prime Members and that is where this tablet SHINES.
it does not give me any class as output; instead I get this as output: {"error":"problem predicting"}.
Any help on how to fix this?
Edit: I removed the try/except block in the predict function. Below is what the predict function looks like now:
def predict():
text = request.args.get('text')
x = preprocess_text(text)
y = int(np.round(keras_model_loaded.predict(x)))
return jsonify({'prediction': str(y)})
Now I am getting an exception. The error message is:
[2018-05-28 18:33:59,008] ERROR in app: Exception on /predict [GET]
Traceback (most recent call last):
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\app.py", line 2292, in wsgi_app
response = self.full_dispatch_request()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\app.py", line 1815, in full_dispatch_request
rv = self.handle_user_exception(e)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\app.py", line 1718, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\_compat.py", line 35, in reraise
raise value
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\app.py", line 1813, in full_dispatch_request
rv = self.dispatch_request()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\flask\app.py", line 1799, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "app.py", line 59, in predict
x = preprocess_text(text)
File "app.py", line 37, in preprocess_text
tok.fit_on_texts(texts)
NameError: name 'texts' is not defined
127.0.0.1 - - [28/May/2018 18:33:59] "GET /predict?text=I%27ve%20had%20my%20Fire%20HD%208%20two%20weeks%20now%20and%20I%20love%20it.%20This%20tablet%20is%20a%20great%20value.%20We%20are%20Prime%20Members%20and%20that%20is%20where%20this%20tablet%20SHINES. HTTP/1.1" 500 -
Edit 2: I have edited the code to:
def preprocess_text(texts,num_max = 1000,max_review_length = 100):
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
cnn_texts_seq = tok.texts_to_sequences(texts)
cnn_texts_mat = pad_sequences(cnn_texts_seq,maxlen=max_review_length)
return cnn_texts_mat
# URL that we'll use to make predictions using get and post
#app.route('/predict',methods=['GET','POST'])
def predict():
text = request.args.get('text')
x = preprocess_text(text)
y = keras_model_loaded.predict(x)
return jsonify({'prediction': str(y)})
and now the error message is
packages\tensorflow\python\framework\ops.py", line 3402, in _as_graph_element_locked
raise ValueError("Tensor %s is not an element of this graph." % obj)
ValueError: Tensor Tensor("output/Sigmoid:0", shape=(?, 1), dtype=float32) is not an element of this graph.
127.0.0.1 - - [28/May/2018 19:39:11] "GET /predict?text=I%27ve%20had%20my%20Fire%20HD%208%20two%20weeks%20now%20and%20I%20love%20it.%20This%20tablet%20is%20a%20great%20value.%20We%20are%20Prime%20Members%20and%20that%20is%20where%20this%20tablet%20SHINES. HTTP/1.1" 500 -
I am unable to understand and debug this error, and I am not sure what it means. Can anyone help me understand this error and suggest a solution for it?
Also, I am unable to post the entire error message on Stack Overflow, as most of my question already appears to be code.
Thanks!!
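Now it is what I guessed. There is a problem when using Flask and TensorFlow across threads: the model is loaded into the default graph of the main thread, but Flask handles each request in a different thread, where TensorFlow's default graph is not the one the model belongs to. Here is a fix for it: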
import tensorflow as tf
# ...
graph = tf.get_default_graph()
def predict():
text = request.args.get('text')
x = preprocess_text(text)
with graph.as_default():
y = int(np.round(keras_model_loaded.predict(x)))
return jsonify({'prediction': str(y)})
This works by wrapping the prediction so that it explicitly uses the graph that was the default when the model was loaded.

Write Pandas Dataframe to CSV with a variable name in pathway

I've written a python script that takes in a file and matches some columns in another file. I would like to write this to a csv with the name "[original file name]_matched". E.g. I have a bunch of files (xaa, xab, ...) and after running the script on each file I would also have (xaa_matched, xab_matched, etc...) This is what I've tried based on this solution: Set File_Path for to_csv() in Pandas
import sys
import os
filename = sys.argv[1]
# some code
path = r'/Users/mdong/dataScience/movie_representation/fuzzy_match_dir/'
input_file.to_csv(os.path.join(path,'match_' + filename), index = False)
However, I get back this error
Traceback (most recent call last):
File "movie_matching.py", line 29, in <module>
input_file.to_csv(os.path.join(path,filename), index = False)
File "/Users/mdong/anaconda/lib/python3.6/site-packages/pandas/core/frame.py", line 1413, in to_csv
formatter.save()
File "/Users/mdong/anaconda/lib/python3.6/site-packages/pandas/io/formats/format.py", line 1568, in save
compression=self.compression)
File "/Users/mdong/anaconda/lib/python3.6/site-packages/pandas/io/common.py", line 382, in _get_handle
f = open(path_or_buf, mode, errors='replace')
FileNotFoundError: [Errno 2] No such file or directory: '/Users/mdong/dataScience/movie_representation/fuzzy_match_dir/fuzzy_match_dir/xaa.csv'
I'm not sure what's going wrong or how to troubleshoot it; any pointers would be appreciated!
I would use pathlib in this situation.
from pathlib import Path
p = Path('/Path/to/your/folder/')
input_file.to_csv(Path(p, 'match_' + filename + '.csv')), index=False)
I would also check that your filename variable is what you expect it to be. Which you can do with Pathlib as well.
>>> p = Path('/Path/To/Thing.csv')
>>> p.stem
'Thing'
>>> p.name
'Thing.csv'

Pandas HDF5 append time series fails

Going through the documentation of pandas HDF5 usability (http://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5) the given example raises an error:
import pandas as pd
import numpy as np
store = pd.HDFStore('store.h5')
np.random.seed(1234)
index = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 3), index=index)
store['df'] = df
df1 = df[0:4]
df2 = df[4:]
store.append('df', df1)
store.append('df', df2)
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2885, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-225-ef7f2e059c6a>", line 1, in <module>
store.append('df', df1)
File "C:\Anaconda3\lib\site-packages\pandas\io\pytables.py", line 919, in append
**kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\io\pytables.py", line 1252, in _write_to_group
raise ValueError('Can only append to Tables')
ValueError: Can only append to Tables
Has something changed here? Or am I doing something wrong?
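You can only append to a store that is in the table format. Assigning with store['df'] = df writes the DataFrame in the default fixed format, which cannot be appended to. Set the following option at the beginning so that the store uses the table format by default: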
pd.set_option('io.hdf.default_format','table')
Docs