The iris tutorial on TensorFlow's website does not work - tensorflow

The code is shown below, and the error message is also shown below:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import urllib.request
import tensorflow as tf
import numpy as np
IRIS_TRAINING = "iris_training.csv"
IRIS_TRAINING_URL = "http://download.tensorflow.org/data/iris_training.csv"
IRIS_TEST = "iris_test.csv"
IRIS_TEST_RRL = "http://download.tensorflow.org/data/iris_test.csv"
if not os.path.exists(IRIS_TRAINING):
    raw = urllib.request.urlopen(IRIS_TRAINING_URL).read()
    with open(IRIS_TRAINING, 'w') as f:
        f.write(raw)
if not os.path.exists(IRIS_TEST):
    raw = urllib.request.urlopen(IRIS_TEST_RRL).read()
    with open(IRIS_TEST, 'w') as f:
        f.write(raw)
# load datasets.
training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename=IRIS_TRAINING,
    target_dtype=np.int,
    features_dtype=np.float32)
test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
    filename=IRIS_TEST,
    target_dtype=np.int,
    features_dtype=np.float32
)
# Specify that all features have real_valued data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
# Build 3 layers DNN with 10, 20, 10 units respectively.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 30],
                                            n_class=3,
                                            model_dir="/tem/iris_model")
# Define the training inputs
def get_train_inputs():
    x = tf.constant(training_set.data)
    y = tf.constant(training_set.target)
    return x, y
# Fit model
classifier.fit(input_fn=get_train_inputs(), steps=2000)
# Define the test inputs
def get_test_inputs():
    x = tf.constant(test_set.data)
    y = tf.constant(test_set.target)
    return x, y
# Evaluate accuracy
accuracy_score = classifier.evaluate(input_fn=get_test_inputs(), steps=1)["accuracy"]
print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
This prints the following stack-trace:
Traceback (most recent call last):
File "/home/skyfacon/PycharmProjects/LinearFitting/IrisClassification.py", line 35, in <module>
features_dtype=np.float32
File "/home/skyfacon/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py", line 69, in load_csv_without_header
data.append(np.asarray(row, dtype=features_dtype))
File "/home/skyfacon/anaconda3/envs/tensorflow/lib/python3.6/site-packages/numpy/core/numeric.py", line 531, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'setosa'
Process finished with exit code 1

I would like to know which page you are using as the tutorial for this, because the first page that comes up when searching on Google is this:
https://www.tensorflow.org/get_started/tflearn
The difference between this and what you posted is tf.contrib.learn.datasets.base.load_csv_without_header versus tf.contrib.learn.datasets.base.load_csv_with_header.
The iris data at the URL you specified contains a header row, and you are trying to load it as a file without a header. Hence the strings in the header cannot be converted to float, which causes the error.
Change your code to:
training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
    filename=IRIS_TRAINING,
    target_dtype=np.int,
    features_dtype=np.float32)
test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
    filename=IRIS_TEST,
    target_dtype=np.int,
    features_dtype=np.float32)
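To confirm that the file really does start with a header, a minimal check (a sketch, assuming the file has already been downloaded by the question's code) is to print its first line:

# Sketch: inspect the first line of the downloaded training file. For the
# TensorFlow-hosted iris CSVs this is a small metadata/header row containing
# the class names (e.g. 'setosa'), which is exactly the string that
# load_csv_without_header fails to convert to float.
with open(IRIS_TRAINING) as f:
    print(f.readline().strip())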

Related

AttributeError: 'DataFrame' object has no attribute '_data' [Not a duplicate]

I was trying to run main.py but it threw an AttributeError.
My Python version is 3.5. I am using the CNTK Docker release 2.6-cpu-python3.5. I cannot update the Python version because of CNTK; it only supports Python 3.5 and will only run on Ubuntu 16.04.
Pandas version: pandas==0.25.3
The Error
Traceback (most recent call last):
File "/workspace/main.py", line 5, in <module>
from model import extract_patches, score_patch, del_cache
File "/workspace/model.py", line 2, in <module>
from regressionModel import extract_features, predict_label
File "/workspace/regressionModel.py", line 26, in <module>
regression_model = read_model['model'][0]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py", line 2898, in __getitem__
if self.columns.is_unique and key in self.columns:
File "/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py", line 5063, in __getattr__
return object.__getattribute__(self, name)
File "pandas/_libs/properties.pyx", line 65, in pandas._libs.properties.AxisProperty.__get__
File "/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py", line 5063, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute '_data'
main.py
import os
import flask
import numpy as np
from flask import jsonify, request
from model import extract_patches, score_patch, del_cache
app = flask.Flask(__name__)
#app.route('/url/<path:argument>')
def url(argument):
    # create a patch folder
    patch_path = './patches'
    if not os.path.exists(patch_path):
        os.mkdir(patch_path)
    # get image url from the query string
    imageURL = request.url.split('=', 1)[1]
    # extract patches from imageURL
    dimension, face_loc, image_dim = extract_patches(imageURL)
    # score each patch
    patch_score = score_patch(patch_path)
    # delete the downloaded image and the patches from local
    del_cache(patch_path)
    if os.path.exists('temp.jpg'):
        os.remove('temp.jpg')
    data = dict()
    data['patch_score'] = []
    for key in dimension:
        tmp = []
        tmp[:] = dimension[key]
        tmp.append(patch_score[key])
        data['patch_score'].append(tmp)
    data['image_score'] = round(np.mean(list(patch_score.values())), 2)
    data['face_loc'] = face_loc['face_loc']
    data['img_dim'] = image_dim
    return jsonify(patch_score=str(data['patch_score']), image_score=str(data['image_score']), face_loc=str(data['face_loc']), image_dim=str(data['img_dim']))

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=9580)  # port number can be changed in your case
model.py
import getPatches
from regressionModel import extract_features, predict_label
import os
import shutil
def extract_patches(imageURL):
    patch_path = './patches'
    dimension_dict = dict()
    face_dict = dict()
    image_dim = []
    try:
        dim, face, img = getPatches.extract_patches(imageURL, dimension_dict, face_dict, image_dim, patch_path)
        print("extract patches pass")
    except:
        print('cannot extract patches from the image')
    return dim, face, img

def score_patch(patch_path):
    patch_score = dict()
    for file in next(os.walk(patch_path))[2]:
        file_path = os.path.join(patch_path, file)
        score_features = extract_features(file_path)[0].flatten()  # extract features from CNTK pretrained model
        pred_score_label = predict_label(score_features)  # score the extracted features using trained regression model
        patch_score[file.split('.')[0]] = float("{0:.2f}".format(pred_score_label[0]))
    return patch_score

def infer_label(patch_score, label_mapping):
    max_score_name, max_score_value = max(patch_score.items(), key=lambda x: x[1])
    pred_label = label_mapping[round(max_score_value) - 1]
    return pred_label

def del_cache(patch_folder):
    shutil.rmtree(patch_folder)
    return
regressionModel.py
import numpy as np
import pandas as pd
import cntk as C
from PIL import Image
import pickle
from cntk import load_model, combine
import cntk.io.transforms as xforms
from cntk.logging import graph
from cntk.logging.graph import get_node_outputs
pretrained_model = 'ResNet152_ImageNet_Caffe.model'
pretrained_node_name = 'pool5'
regression_model = 'cntk_regression.dat'
image_width = 224
image_height = 224
# load CNTK pretrained model
#model_file = os.path.join(pretrained_model_path, pretrained_model_name)
loaded_model = load_model(pretrained_model) # a full path is required
node_in_graph = loaded_model.find_by_name(pretrained_node_name)
output_nodes = combine([node_in_graph.owner])
# load the stored regression model
read_model = pd.read_pickle(regression_model)
regression_model = read_model['model'][0]
train_regression = pickle.loads(regression_model)
def extract_features(image_path):
    img = Image.open(image_path)
    resized = img.resize((image_width, image_height), Image.ANTIALIAS)
    bgr_image = np.asarray(resized, dtype=np.float32)[..., [2, 1, 0]]
    hwc_format = np.ascontiguousarray(np.rollaxis(bgr_image, 2))
    arguments = {loaded_model.arguments[0]: [hwc_format]}
    output = output_nodes.eval(arguments)
    return output

def predict_label(features):
    return train_regression.predict(features.reshape(1, -1))
https://pypi.org/project/cntk/#files has CNTK 2.7 for Python 3.6. Still an obsolete version, but not quite as obsolete.

"im2col_out_cpu" not implemented for 'Byte'

I am trying to generate overlapping patches from an image of size (112, 112) but I am unable to do so. I have already tried a lot but it didn't work.
Code
import torch
import numpy as np
import torch.nn as nn
from torch import nn
from PIL import Image
import cv2
import os
import math
import torch.nn.functional as F
import torchvision.transforms as T
from timm import create_model
from typing import List
import matplotlib.pyplot as plt
from torchvision import io, transforms
from utils_torch import Image, ImageDraw
from torchvision.transforms.functional import to_pil_image
IMG_SIZE = 112
# PATCH_SIZE = 64
resize = transforms.Resize((IMG_SIZE, IMG_SIZE))
img = resize(io.read_image("Adam_Brody_233.png"))
img = img.to(torch.float32)
image_size = 112
patch_size = 28
ac_patch_size = 12
pad = 4
img = img.unsqueeze(0)
soft_split = nn.Unfold(kernel_size=(ac_patch_size, ac_patch_size), stride=(patch_size, patch_size), padding=(pad, pad))
patches = soft_split(img).transpose(1, 2)
fig, ax = plt.subplots(16, 16)
for i in range(16):
    for j in range(16):
        sub_img = patches[:, i, j]
        ax[i][j].imshow(to_pil_image(sub_img))
        ax[i][j].axis('off')
plt.show()
Traceback
Traceback (most recent call last):
File "/home/cvpr/Documents/OPVT/unfold_ours.py", line 32, in <module>
patches = soft_split(img).transpose(1, 2)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/modules/fold.py", line 295, in forward
self.padding, self.stride)
File "/home/cvpr/anaconda3/envs/OPVT/lib/python3.7/site-packages/torch/nn/functional.py", line 3831, in unfold
_pair(dilation), _pair(padding), _pair(stride))
RuntimeError: "im2col_out_cpu" not implemented for 'Byte'
Yes, this is an open issue in PyTorch. A simple fix is to convert your image tensor from integers to floats; you can do it like this:
img = img.to(torch.float32)
This should solve your problem.
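For reference, here is a minimal self-contained sketch of that workaround (a random uint8 tensor stands in for the image read with torchvision.io.read_image, and the unfold parameters are taken from the question):

import torch
import torch.nn as nn

# A byte (uint8) image-like tensor, as torchvision.io.read_image would return
img = torch.randint(0, 256, (1, 3, 112, 112), dtype=torch.uint8)

unfold = nn.Unfold(kernel_size=(12, 12), stride=(28, 28), padding=(4, 4))

# Casting to float before unfolding avoids the "im2col_out_cpu" Byte error
patches = unfold(img.to(torch.float32)).transpose(1, 2)
print(patches.shape)  # torch.Size([1, 16, 432]): 16 patches of 3*12*12 values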

Error in Keras Lambda layer when wrapped function expects non-float argument

I want to wrap a tensorflow function in a Keras Lambda layer as per the docs. However, my inputs are complex64. Here is a more complete example of the code I am using to replicate this behavior:
import numpy as np
from keras.models import Model
from keras.layers import Input, Lambda
import tensorflow as tf
np.set_printoptions(precision=3, threshold=3, edgeitems=3)
def layer0(inp):
    z = inp[0] + inp[1]
    num = tf.cast(tf.real(z), tf.complex64)
    return z/num

if __name__ == "__main__":
    shape = (1, 10, 5)
    z1 = Input(shape=shape[1:], dtype=np.complex64)
    z2 = Input(shape=shape[1:], dtype=np.complex64)
    #s = Lambda(layer0, output_shape=shape)([z1, z2])
    s = Lambda(layer0)([z1, z2])
    model = Model(inputs=[z1, z2], outputs=s)
    z1_in = np.asarray(np.random.normal(size=shape) + np.random.normal(size=shape)*1j, 'complex64')
    z2_in = np.asarray(np.random.normal(size=shape) + np.random.normal(size=shape)*1j, 'complex64')
    s_out = model.predict([z1_in, z2_in])
    print(s_out)
which gives the following error:
Traceback (most recent call last):
File "complex_lambda.py", line 32, in <module>
s = Lambda(layer0)([z1, z2])
File "complex_lambda.py", line 18, in layer0
return z/num
TypeError: x and y must have the same dtype, got tf.float32 != tf.complex64
However, if I use the commented line instead:
s = Lambda(layer0, output_shape=shape)([z1, z2])
The code runs just fine. It seems that "output_shape=(...)" is necessary to make the division in the lambda function work. While this solution solves the problem for a single output variable, it doesn't work when having multiple outputs.
I cannot replicate your issue. Which version of tensorflow are you using? Are you using the keras package, or the tensorflow.keras submodule?
At any rate, I think you can fix your issue by specifying the dtype of the Lambda layer: s = Lambda(lambda x: tf.math.real(x[0] + x[1]), dtype='complex64')([z1, z2])
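For completeness, a minimal sketch of how specifying the dtype could slot into the original model from the question (an assumption on my part, not verified against a specific Keras/TensorFlow version):

# Sketch: declare the Lambda layer's dtype explicitly so Keras does not
# assume float32 for the layer's inputs and outputs
s = Lambda(layer0, dtype='complex64')([z1, z2])
model = Model(inputs=[z1, z2], outputs=s)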

ValueError: Cannot Convert String to Float With Pandas and Amazon Sagemaker

I'm trying to deploy a simple ML model on SageMaker to get the hang of it, and I am not having any luck because I get the following error:
ValueError: could not convert string to float: '6.320000000000000097e-03 1.800000000000000000e+01 2.310000000000000053e+00 0.000000000000000000e+00 5.380000000000000338e-01 6.575000000000000178e+00 6.520000000000000284e+01 4.089999999999999858e+00 1.000000000000000000e+00 2.960000000000000000e+02 1.530000000000000071e+01 3.968999999999999773e+02 4.980000000000000426e+00 2.400000000000000000e+01'
This is the first row of my dataframe.
This is the code in my notebook that I'm using right now:
from sagemaker import get_execution_role, Session
from sagemaker.sklearn.estimator import SKLearn
work_dir = 'data'
session = Session()
role = get_execution_role()
train_input = session.upload_data('data')
script = 'boston_housing_prep.py'
model = SKLearn(
    entry_point=script,
    train_instance_type='ml.c4.xlarge',
    role=role,
    sagemaker_session=session,
    hyperparameters={'alpha': 10}
)
model.fit({'train': train_input})
My script for boston_housing_prep.py looks like this:
import argparse
import pandas as pd
import os
from sklearn.linear_model import Ridge
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
import numpy as np
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--alpha', type=int, default=1)
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args = parser.parse_args()

    input_files = [os.path.join(args.train, file) for file in os.listdir(args.train)]
    if len(input_files) == 0:
        raise ValueError(('There are no files in {}.\n' +
                          'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                          'the data specification in S3 was incorrectly specified or the role specified\n' +
                          'does not have permission to access the data.').format(args.train, "train"))

    raw_data = [pd.read_csv(file, header=None, engine="python") for file in input_files]
    df = pd.concat(raw_data)

    y_train = df.iloc[:, -1]
    X_train = df.iloc[:, :5]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    alpha = args.alpha
    clf = Ridge(alpha=alpha)
    clf = clf.fit(X_train, y_train)

    joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
The line that's giving the problem is this one:
X_train = scaler.fit_transform(X_train)
I tried df = df.astype(np.float) after I loaded in the df, but that didn't work either.
This file loads in without a problem when I'm not in SageMaker.
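Since the error message shows the whole first row as a single space-separated string, one diagnostic step (a sketch to run locally, with an assumed file path) is to load the file the same way the script does and look at what pandas actually parsed:

import pandas as pd

# Diagnostic sketch: read the file with the same arguments the training script
# uses. If the result has a single object-dtype column, the file is
# whitespace-delimited rather than comma-delimited, and StandardScaler then
# receives strings it cannot convert to float.
df = pd.read_csv('data/boston_train.csv', header=None, engine='python')  # path assumed
print(df.shape)
print(df.dtypes)
print(repr(df.iloc[0, 0]))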

small test_set xgb predict

I would like to ask a question about a problem that I have had for the last couple of days.
First of all, I am a beginner in machine learning and this is my first time using the XGBoost algorithm, so excuse me for any mistakes I have made.
I trained my model to predict whether a log file is malicious or not. After I save and reload my model in a different session, I use the predict function, which seems to work normally (with a few deviations in probabilities, but that is another topic; I have seen it discussed in another thread).
The problem is this: sometimes when I try to predict a "small" CSV file after loading it, the model seems to be broken, predicting only the zero label, even for indexes that were categorized correctly previously.
For example, I load a dataset containing 20,000 values and predict() works. I keep only the first 5 of these values using pandas drop, and again it works. If I save the 5 values to a different CSV and reload it, it does not work. The same error happens if I just remove all 19,995 other indexes by hand and save the file with only the 5 remaining.
I would bet it is a file-size problem, but when I drop the indexes on the dataframe through pandas it seems to work.
Also, the number 5 (of indexes) is just for example purposes; the same happens if I delete a large portion of the dataset.
I first ran into this problem after trying to verify by hand some completely new logs, which are classified correctly when appended to the big CSV file but not in a new file on their own.
Here is my load and predict code:
##IMPORTS
import os
import pandas as pd
from pandas.compat import StringIO
from datetime import datetime
from langid.langid import LanguageIdentifier, model
import langid
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.externals import joblib
from ggplot import ggplot, aes, geom_line
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import pickle
df = pd.read_csv('big_test.csv')
df3 = pd.read_csv('small_test.csv')
#This one is necessary for the loaded_model
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict(df)
print(result)
df2=df[:5]
result2 = loaded_model.predict(df2)
print(result2)
result3 = loaded_model.predict(df3)
print(result3)
The results I get are these:
[1 0 1 ... 0 0 0]
[1 0 1 0 1]
[0 0 0 0 0]
I can provide any code, even from training, or my dataset if necessary.
EDIT: I use a pipeline for my data. I tried to reproduce the error after using XGBoost to fit the iris data, and I could not. Maybe there is something wrong with my pipeline? The code is below:
df = pd.read_csv('big_test.csv')
# df.info()

# Split Dataset
attributes = ['uri', 'code', 'r_size', 'DT_sec', 'Method', 'http_version', 'PenTool', 'has_referer', 'Lang', 'LangProb', 'GibberFlag']
x_train, x_test, y_train, y_test = train_test_split(df[attributes], df['Scan'], test_size=0.2,
                                                    stratify=df['Scan'], random_state=0)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=0.2,
                                                  stratify=y_train, random_state=0)
# print('Train:', len(y_train), 'Dev:', len(y_dev), 'Test:', len(y_test))

# set up graph function
def plot_precision_recall_curve(y_true, y_pred_scores):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_scores)
    return ggplot(aes(x='recall', y='precision'),
                  data=pd.DataFrame({"precision": precision, "recall": recall})) + geom_line()

# XGBClassifier
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column_list):
        self.column_list = column_list

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        if len(self.column_list) == 1:
            return x[self.column_list[0]].values
        else:
            return x[self.column_list].to_dict(orient='records')

count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10)
dict_vectorizer = DictVectorizer()
xgb = XGBClassifier(seed=0)

pipeline = Pipeline([
    ("feature_union", FeatureUnion([
        ('text_features', Pipeline([
            ('selector', ColumnSelector(['uri'])),
            ('count_vectorizer', count_vectorizer)
        ])),
        ('categorical_features', Pipeline([
            ('selector', ColumnSelector(['code', 'r_size', 'DT_sec', 'Method', 'http_version', 'PenTool', 'has_referer', 'Lang', 'LangProb', 'GibberFlag'])),
            ('dict_vectorizer', dict_vectorizer)
        ]))
    ])),
    ('xgb', xgb)
])

pipeline.fit(x_train, y_train)

filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)
That's due to different dtypes in the big and small files.
When you do:
df = pd.read_csv('big_test.csv')
The dtypes are these:
print(df.dtypes)
# Output
uri object
code object # <== Observe this
r_size object # <== Observe this
Scan int64
...
...
...
Now when you do:
df3 = pd.read_csv('small_test.csv')
the dtypes are changed:
print(df3.dtypes)
# Output
uri object
code int64 # <== Now this has changed
r_size int64 # <== Now this has changed
Scan int64
...
...
You see, pandas will try to determine the dtypes of the columns by itself. When you load big_test.csv, there are some values in the code and r_size columns which are of string type; because of this the dtype of each of those whole columns becomes string, which does not happen with small_test.csv.
Due to this change, the DictVectorizer encodes the data in a different way than before, the features are changed, and hence the results are also changed.
If you do this:
df3[['code', 'r_size']] = df3[['code', 'r_size']].astype(str)
and then call predict(), the results are the same again.
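Alternatively (a sketch assuming code and r_size are the only columns affected), you can force the dtypes at read time so the small file is parsed the same way as the big one:

# Read the mixed-type columns as strings up front, so small_test.csv gets the
# same dtypes that big_test.csv ends up with after pandas' type inference
df3 = pd.read_csv('small_test.csv', dtype={'code': str, 'r_size': str})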