pandas histogram plot error: ValueError: num must be 1 <= num <= 0, not 1 - pandas

I am drawing a histogram of a column from pandas data frame:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib
df.hist(column='column_A', bins = 100)
but got the following errors:
62 raise ValueError(
63 "num must be 1 <= num <= {maxn}, not {num}".format(
---> 64 maxn=rows*cols, num=num))
65 self._subplotspec = GridSpec(rows, cols)[int(num) - 1]
66 # num - 1 for converting from MATLAB to python indexing
ValueError: num must be 1 <= num <= 0, not 1
Does anyone know what this error mean? Thanks!

Problem
The problem you encounter arises when column_A does not contain numeric data. As you can see in the excerpt from pandas.plotting._core below, the numeric data is essential to make the function hist_frame (which you call by DataFrame.hist()) work correctly.
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
# skipping part of the code
# ...
if column is not None:
if not isinstance(column, (list, np.ndarray, Index)):
column = [column]
data = data[column]
data = data._get_numeric_data() # there is no numeric data in the column
naxes = len(data.columns) # so the number of axes becomes 0
# naxes is passed to the subplot generating function as 0 and later determines the number of columns as 0
fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
sharex=sharex, sharey=sharey, figsize=figsize,
layout=layout)
# skipping the rest of the code
# ...
Solution
If your problem is to represent numeric data (but not of numeric dtype yet) with a histogram, you need to cast your data to numeric, either with pd.to_numeric or df.astype(a_selected_numeric_dtype), e.g. 'float64', and then proceed with your code.
If your problem is to represent non-numeric data in one column with a histogram, you can call the function hist_series with the following line: df['column_A'].hist(bins=100).
If your problem is to represent non-numeric data in many columns with a histogram, you may resort to a handful options:
Use matplotlib and create subplots and histograms directly
Update pandas at least to version 0.25

usually is 0
mta['penn'] = [mta_bystation[mta_bystation.STATION == "34 ST-PENN STA"], 'Penn Station']
mta['grdcntrl'] = [mta_bystation[mta_bystation.STATION == "GRD CNTRL-42 ST"], 'Grand Central']
mta['heraldsq'] = [mta_bystation[mta_bystation.STATION == "34 ST-HERALD SQ"], 'Herald Sq']
mta['23rd'] = [mta_bystation[mta_bystation.STATION == "23 ST"], '23rd St']
#mta['portauth'] = [mta_bystation[mta_bystation.STATION == "42 ST-PORT AUTH"], 'Port Auth']
#mta['unionsq'] = [mta_bystation[mta_bystation.STATION == "14 ST-UNION SQ"], 'Union Sq']
mta['timessq'] = [mta_bystation[mta_bystation.STATION == "TIMES SQ-42 ST"], 'Ti

Related

passing panda dataframe data to functions and its not outputting the results

In my code, I am trying to extract data from csv file to use in the function, but it doesnt output anything, and gives no error. My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
import numpy as np
import pandas as pd
import os
# change the current directory to the directory where the running script file is
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# finding best fit line for y=mx+b by iteration
def gradient_descent(x,y):
m_iter = b_iter = 1 #starting point
iteration = 10000
n = len(x)
learning_rate = 0.05
last_mse = 10000
#take baby steps to reach global minima
for i in range(iteration):
y_predicted = m_iter*x + b_iter
#mse = 1/n*sum([value**2 for value in (y-y_predicted)]) # cost function to minimize
mse = 1/n*sum((y-y_predicted)**2) # cost function to minimize
if (last_mse - mse)/mse < 0.001:
break
# recall MSE formula is 1/n*sum((yi-y_predicted)^2), where y_predicted = m*x+b
# using partial deriv of MSE formula, d/dm and d/db
dm = -(2/n)*sum(x*(y-y_predicted))
db = -(2/n)*sum((y-y_predicted))
# use current predicted value to get the next value for prediction
# by using learning rate
m_iter = m_iter - learning_rate*dm
b_iter = b_iter - learning_rate*db
print('m is {}, b is {}, cost is {}, iteration {}'.format(m_iter,b_iter,mse,i))
last_mse = mse
#x = np.array([1,2,3,4,5])
#y = np.array([5,7,8,10,13])
#gradient_descent(x,y)
df = pd.read_csv('Linear_Data.csv')
x = df['Area']
y = df['Price']
gradient_descent(x,y)
My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
Well no, your code also works with pandas dataframes:
df = pd.DataFrame({'Area': [1,2,3,4,5], 'Price': [5,7,8,10,13]})
x = df['Area']
y = df['Price']
gradient_descent(x,y)
Above will give you the same output as with numpy arrays.
Try to check what's in Linear_Data.csv and/or add some print statements in the gradient_descent function just to check your assumptions. I would suggest to first of all add a print statement before the condition with the break statement:
print(last_mse, mse)
if (last_mse - mse)/mse < 0.001:
break

Pd['Column1] > Pd['Column2'] Key Error: 0

I am trying to write a function that loops through a pandas dataframe and compares if column1 > column2, if so appends 1 to a list, which is then returned.
Importing finance data from yahoo finance, calculating 2 Std and assigning to column upper and lower, and the moving average.
import pandas as pd
import pandas_datareader as web
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
import numpy as np
end = datetime(2021, 1, 16)
start = datetime(2020 , 1, 16)
symbols = ['ETH-USD']
stock_df = web.get_data_yahoo(symbols, start, end)
period = 20
# Simple Moving Average
stock_df['SMA'] = stock_df['Close'].rolling(window=period).mean()
# Standard deviation
stock_df['STD'] = stock_df['Close'].rolling(window=period).std()
# Upper Bollinger Band
stock_df['Upper'] = stock_df['SMA'] + (stock_df['STD'] * 2)
# Lower Bollinger Band
stock_df['Lower'] = stock_df['SMA'] - (stock_df['STD'] * 2)
# List of columns
column_list = ['Close', 'SMA', 'Upper', 'Lower']
stock_df[column_list].plot(figsize=(12.2,6.4)) #Plot the data
plt.title('ETH-USD')
plt.ylabel('USD Price ($)')
plt.show();
#Create a new data frame, Period for calculation, removes NAN's
bolldf = stock_df[period-1:]
#Show the new data frame
bolldf
Function, Loop through column rows and compare, append df['Close'][0] to buy/sell signal if condition is met.
def signal(df):
buy_signal = []
sell_signal = []
for i in range(len(df['Close'])):
if df['Close'][i] > df['Upper'][i]:
buy_signal.append(1)
return buy_signal
buy_signal = signal(bolldf)
buy_signal
Info about the Error:
KeyError: 0
During handling of the above exception, another exception occurred:
---> 12 buy_signal = greaterthan(stock_df)
KeyError: 0
During handling of the above exception, another exception occurred:
11
---> 12 buy_signal = greaterthan(stock_df)
13
----> 8 if bolldf['Close'][i] > bolldf['Open'][i]:
KeyError: 0
When i attempt this function on the columns df['Upper'] > df['Lower], or df['SMA'] < df['Lower'] for example it works as expected, its only when using the columns from the original data that it does not work.
Any help would be amazing. Thank you.
Since it is a multi-index, the columns must also be specified in the multi-index format. You can check the contents of the columns with bolldf.columns, and you can get them by modifying as follows.
bolldf.columns
MultiIndex([('Adj Close', 'ETH-USD'),
( 'Close', 'ETH-USD'),
( 'High', 'ETH-USD'),
( 'Low', 'ETH-USD'),
( 'Open', 'ETH-USD'),
( 'Volume', 'ETH-USD'),
( 'SMA', ''),
( 'STD', ''),
( 'Upper', ''),
( 'Lower', '')],
names=['Attributes', 'Symbols'])
def signal(df):
buy_signal = []
sell_signal = []
for i in range(len(df[('Close','ETH-USD')])):
if df[('Close','ETH-USD')][i] > df[('Upper','')][i]:
buy_signal.append(1)
return buy_signal
buy_signal = signal(bolldf)

Iterating over columns in data frame by skipping first column and drawing multiple plots

I have a data frame as following,
df.head()
ID AS_FP AC_FP RP11_FP RP11_be AC_be AS_be Info
AE02 0.060233 0 0.682884 0.817115 0.591182 0.129252 SAP
AE03 0 0 0 0.889181 0.670113 0.766243 SAP
AE04 0 0 0.033256 0.726193 0.171861 0.103839 others
AE05 0 0 0.034988 0.451329 0.431836 0.219843 others
What I am aiming is to plot each column starting from AS_FP til RP11_beta as lmplot, each x axis is column ending with FP and y axis is its corresponding column ending with be.
And I wanted to save it as separate files so I strated iterating through the columns by skipping first column ID, like this,
for ind, column in enumerate(df.columns):
if column.split('_')[0] == column.split('_')[0]:
But I got lost how to continue, I need to plot
sns.lmplot(x, y, data=df, hue='Info',palette=colors, fit_reg=False,
size=10,scatter_kws={"s": 700},markers=["o", "v"])
and save each image as seperate file
Straightforward solution:
1) Toy data:
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns
dct = OrderedDict()
dct["ID"] = ["AE02", "AE03", "AE04", "AE05"]
dct["AS_FP"] = [0.060233, 0, 0, 0]
dct["AC_FP"] = [0, 0,0, 0]
dct["RP11_FP"] = [0.682884, 0, 0.033256, 0.034988]
dct["AS_be"] = [0.129252, 0.766243, 0.103839, 0.219843]
dct["AC_be"] = [0.591182, 0.670113, 0.171861, 0.431836]
dct["RP11_be"] = [0.817115, 0.889181, 0.726193, 0.451329]
dct["Info"] = ["SAP", "SAP", "others", "others"]
df = pd.DataFrame(dct)
2) Iterating through pairs, saving each figure with unique filename:
graph_cols = [col for col in df.columns if ("_FP" in col) or ("_be" in col)]
fps = sorted([col for col in graph_cols if "_FP" in col])
bes = sorted([col for col in graph_cols if "_be" in col])
for x, y in zip(fps, bes):
snsplot = sns.lmplot(x, y, data=df, fit_reg=False, hue='Info',
size=10, scatter_kws={"s": 700})
snsplot.savefig(x.split("_")[0] + ".png")
You can add needed params in lmlplot as you need.

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U1') dtype('<U1') dtype('<U1')

Strange error from numpy via matplotlib when trying to get a histogram of a tiny toy dataset. I'm just not sure how to interpret the error, which makes it hard to see what to do next.
Didn't find much related, though this nltk question and this gdsCAD question are superficially similar.
I intend the debugging info at bottom to be more helpful than the driver code, but if I've missed something, please ask. This is reproducible as part of an existing test suite.
if n > 1:
return diff(a[slice1]-a[slice2], n-1, axis=axis)
else:
> return a[slice1]-a[slice2]
E TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U1') dtype('<U1') dtype('<U1')
../py2.7.11-venv/lib/python2.7/site-packages/numpy/lib/function_base.py:1567: TypeError
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> entering PDB >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> py2.7.11-venv/lib/python2.7/site-packages/numpy/lib/function_base.py(1567)diff()
-> return a[slice1]-a[slice2]
(Pdb) bt
[...]
py2.7.11-venv/lib/python2.7/site-packages/matplotlib/axes/_axes.py(5678)hist()
-> m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
py2.7.11-venv/lib/python2.7/site-packages/numpy/lib/function_base.py(606)histogram()
-> if (np.diff(bins) < 0).any():
> py2.7.11-venv/lib/python2.7/site-packages/numpy/lib/function_base.py(1567)diff()
-> return a[slice1]-a[slice2]
(Pdb) p numpy.__version__
'1.11.0'
(Pdb) p matplotlib.__version__
'1.4.3'
(Pdb) a
a = [u'A' u'B' u'C' u'D' u'E']
n = 1
axis = -1
(Pdb) p slice1
(slice(1, None, None),)
(Pdb) p slice2
(slice(None, -1, None),)
(Pdb)
I got the same error, but in my case I am subtracting dict.key from dict.value. I have fixed this by subtracting dict.value for corresponding key from other dict.value.
cosine_sim = cosine_similarity(e_b-e_a, w-e_c)
here I got error because e_b, e_a and e_c are embedding vector for word a,b,c respectively. I didn't know that 'w' is string, when I sought out w is string then I fix this by following line:
cosine_sim = cosine_similarity(e_b-e_a, word_to_vec_map[w]-e_c)
Instead of subtracting dict.key, now I have subtracted corresponding value for key
I had a similar issue where an integer in a row of a DataFrame I was iterating over was of type numpy.int64. I got the
TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U1') dtype('<U1') dtype('<U1')
error when trying to subtract a float from it.
The easiest fix for me was to convert the row using pd.to_numeric(row).
Why is it applying diff to an array of strings.
I get an error at the same point, though with a different message
In [23]: a=np.array([u'A' u'B' u'C' u'D' u'E'])
In [24]: np.diff(a)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-24-9d5a62fc3ff0> in <module>()
----> 1 np.diff(a)
C:\Users\paul\AppData\Local\Enthought\Canopy\User\lib\site-packages\numpy\lib\function_base.pyc in diff(a, n, axis)
1112 return diff(a[slice1]-a[slice2], n-1, axis=axis)
1113 else:
-> 1114 return a[slice1]-a[slice2]
1115
1116
TypeError: unsupported operand type(s) for -: 'numpy.ndarray' and 'numpy.ndarray'
Is this a array the bins parameter? What does the docs say bins should be?
I am fairly new to this myself, but I had a similar error and found that it is due to a type casting issue. I was trying to concatenate rather than take the difference but I think the principle is the same here. I provided a similar answer on another question so I hope that is OK.
In essence you need to use a different data type cast, in my case I needed str not float, I suspect yours is the same so my suggested solution is. I am sorry I cannot test it before suggesting but I am unclear from your example what you were doing.
return diff(str(a[slice1])-str(a[slice2]), n-1, axis=axis)
Please see my example code below for the fix to my code, the change occurs on the third to last line. The code is to produce a basic random forest model.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
This leads to an error of;
Traceback (most recent call last):
File "min_example.py", line 40, in <module>
fpred.write(RFpreds[i]+",,"+yTest[i]+",\n")
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
The solution is to make each variable a str() type on the third to last line then write to file. No other changes to then code have been made from the above.
import scipy
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, metrics, cross_validation
Data = pd.read_csv("Free_Energy_exp.csv", sep=",")
Data = Data.fillna(Data.mean()) # replace the NA values with the mean of the descriptor
header = Data.columns.values # Ues the column headers as the descriptor labels
Data.head()
test_name = "Test.csv"
npArray = np.array(Data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(X,y, random_state=0)
# Predictions results initialised
RFpredictions = []
RF = RandomForestRegressor(n_estimators = 10, max_features = 5, max_depth = 5, random_state=0)
RF.fit(XTrain, yTrain) # Train the model
print("Training R2 = %5.2f" % RF.score(XTrain,yTrain))
RFpreds = RF.predict(XTest)
with open(test_name,'a') as fpred :
lenpredictions = len(RFpreds)
lentrue = yTest.shape[0]
if lenpredictions == lentrue :
fpred.write("Names/Label,, Prediction Random Forest,, True Value,\n")
for i in range(0,lenpredictions) :
fpred.write(str(RFpreds[i])+",,"+str(yTest[i])+",\n")
else :
print "ERROR - names, prediction and true value array size mismatch."
These examples are from a larger code so I hope the examples are clear enough.
I think #James is right. I got stuck by same error while working on Polyval(). And yeah solution is to use the same type of variabes. You can use typecast to cast all variables in the same type.
BELOW IS A EXAMPLE CODE
import numpy
P = numpy.array(input().split(), float)
x = float(input())
print(numpy.polyval(P,x))
here I used float as an output type. so even the user inputs the INT value (whole number). the final answer will be typecasted to float.
I ran into the same issue, but in my case it was just a Python list instead of a Numpy array used. Using two Numpy arrays solved the issue for me.

live plotting using pyserial and matplotlib

I can capture data from serial device via pyserial, at this time I can only export data to text file, the text file has format like below, it's have 3 columns
>21 21 0
>
>41 41 0.5
>
>73 73 1
>
....
>2053 2053 5
>
>2084 2084 5.5
>
>2125 2125 6
Now I want to use matplotlib to generate live graph has 2 figure (x,y) x,y are second and third column, first comlumn,'>', and lines don't have data can be remove
thank folks!
============================
Update : today, after follow these guides from
http://www.blendedtechnologies.com/realtime-plot-of-arduino-serial-data-using-python/231
http://eli.thegreenplace.net/2008/08/01/matplotlib-with-wxpython-guis
pyserial - How to read the last line sent from a serial device
now I can live plot with threading but eliben said that this Guis only plot single value each time, that lead to me the very big limitation, beacause my purpose is plotting 2 or 3 column, here is the code was modified from blendedtechnologies
Here is serial handler :
from threading import Thread
import time
import serial
last_received = ''
def receiving(ser):
global last_received
buffer = ''
while True:
buffer = buffer + ser.read(ser.inWaiting())
if '\n' in buffer:
lines = buffer.split('\n') # Guaranteed to have at least 2 entries
last_received = lines[-2]
#If the Arduino sends lots of empty lines, you'll lose the
#last filled line, so you could make the above statement conditional
#like so: if lines[-2]: last_received = lines[-2]
buffer = lines[-1]
class SerialData(object):
def __init__(self, init=50):
try:
self.ser = ser = serial.Serial(
port='/dev/ttyS0',
baudrate=9600,
bytesize=serial.EIGHTBITS,
parity=serial.PARITY_NONE,
stopbits=serial.STOPBITS_ONE,
timeout=0.1,
xonxoff=0,
rtscts=0,
interCharTimeout=None
)
except serial.serialutil.SerialException:
#no serial connection
self.ser = None
else:
Thread(target=receiving, args=(self.ser,)).start()
def next(self):
if not self.ser:
return 100 #return anything so we can test when Arduino isn't connected
#return a float value or try a few times until we get one
for i in range(40):
raw_line = last_received[1:].split(' ').pop(0)
try:
return float(raw_line.strip())
except ValueError:
print 'bogus data',raw_line
time.sleep(.005)
return 0.
def __del__(self):
if self.ser:
self.ser.close()
if __name__=='__main__':
s = SerialData()
for i in range(500):
time.sleep(.015)
print s.next()
For me I modified this segment so it can grab my 1st column data
for i in range(40):
raw_line = last_received[1:].split(' ').pop(0)
try:
return float(raw_line.strip())
except ValueError:
print 'bogus data',raw_line
time.sleep(.005)
return 0.
and generate graph base on these function on the GUI file
from Arduino_Monitor import SerialData as DataGen
def __init__(self):
wx.Frame.__init__(self, None, -1, self.title)
self.datagen = DataGen()
self.data = [self.datagen.next()]
................................................
def init_plot(self):
self.dpi = 100
self.fig = Figure((3.0, 3.0), dpi=self.dpi)
self.axes = self.fig.add_subplot(111)
self.axes.set_axis_bgcolor('black')
self.axes.set_title('Arduino Serial Data', size=12)
pylab.setp(self.axes.get_xticklabels(), fontsize=8)
pylab.setp(self.axes.get_yticklabels(), fontsize=8)
# plot the data as a line series, and save the reference
# to the plotted line series
#
self.plot_data = self.axes.plot(
self.data,
linewidth=1,
color=(1, 1, 0),
)[0]
So my next question is how to realtime grab at least 2 column and passing 2 columns'data to the GUIs that it can generate graph with 2 axis.
self.plot_data.set_xdata(np.arange(len(self.data))) #my 3rd column data
self.plot_data.set_ydata(np.array(self.data)) #my 2nd column data
Well, this reads your string and converts the numbers to floats. I assume you'll be able to adapt this as needed.
import numpy as np
import pylab as plt
str = '''>21 21 0
>
>41 41 0.5
>
>73 73 1
>
>2053 2053 5
>
>2084 2084 5.5
>
>2125 2125 6'''
nums = np.array([[float(n) for n in sub[1:].split(' ') if len(n)>0] for sub in str.splitlines() if len(sub)>1])
fig = plt.figure(0)
ax = plt.subplot(2,1,1)
ax.plot(nums[:,0], nums[:,1], 'k.')
ax = plt.subplot(2,1,2)
ax.plot(nums[:,0], nums[:,2], 'r+')
plt.show()
Here you have an Eli Bendersky's example of how plotting data arriving from a serial port
some time back I had the same problem. I wasted a lot of writing same ting over and over again. so I wrote a python package for it.
https://github.com/girish946/plot-cat
you just have to write the logic for obtaining the data from serial port.
the example is here: https://github.com/girish946/plot-cat/blob/master/examples/test-ser.py