Failing to find nan values but program is detecting them in data - pandas

import pandas as pd
import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel
import warnings
warnings.simplefilter("ignore")
data = pd.read_csv("filtered aggregated data.csv")
mask = data["Fit"].str.contains("Storefront").astype(bool)
data = data[~mask]
# Replace missing growth counts with the sentinel string "NaN" so that the
# comparison on the following line can filter those rows out.
data["6 Mo. Growth Count"] = data["6 Mo. Growth Count"].fillna("NaN")
sixMoData = data[data["6 Mo. Growth Count"] != "NaN"]
sixMoData["Fit"] = sixMoData["Fit"].astype("category")
sixMoData = pd.get_dummies(sixMoData, columns=
["Category","Country"])
sixMoData.dropna(inplace=True)
sixMoData.reset_index(inplace=True)
X = sixMoData[sixMoData.columns[3:]]
y = sixMoData["Fit"]
# Print the name of every feature column that holds NaN or +/-inf values.
for col in X.columns:
    containsNa = X[col].isna().any()
    containsInf = X[col].isin([np.inf, -np.inf]).any()
    if containsNa or containsInf:
        print(col)
    # NOTE(review): the original indentation was lost; the cast is assumed to
    # apply to every column rather than only to the flagged ones -- confirm.
    X[col] = X[col].astype(float)
model = OrderedModel(y, X)
When I run this code, the checks above report no columns that contain a NaN value. However, when I try to build the model, I get an error stating: "exog contains inf or nans", where exog refers to the X DataFrame. What am I doing incorrectly when checking for NaNs?

Related

passing panda dataframe data to functions and its not outputting the results

In my code, I am trying to read data from a CSV file and pass it to the function, but it doesn't output anything and gives no error. I know the function itself works because I tried it with plain NumPy arrays as inputs; I'm not sure why it doesn't work with pandas.
import numpy as np
import pandas as pd
import os
# change the current directory to the directory where the running script file is
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# finding best fit line for y=mx+b by iteration
def gradient_descent(x, y):
    """Fit ``y = m*x + b`` by batch gradient descent on the mean squared error.

    Starts from ``m = b = 1`` and takes at most 10000 steps with a learning
    rate of 0.05.  Iteration stops early once the relative improvement of the
    cost between two consecutive steps drops below 0.001 (which also covers a
    growing cost, i.e. divergence) or once the fit is exact.  Every completed
    step is reported with ``print``.

    Args:
        x: 1-D numeric array-like of inputs (NumPy array, pandas Series, list).
        y: 1-D numeric array-like of targets, same length as ``x``.

    Returns:
        Tuple ``(m, b)`` holding the slope and intercept after the last step.

    Raises:
        ValueError: if ``x`` and ``y`` are empty or differ in length.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n == 0 or len(y) != n:
        raise ValueError('x and y must be non-empty and of equal length')

    m_iter = b_iter = 1.0  # starting point
    iterations = 10000
    learning_rate = 0.05
    # Seed with +inf so the very first step is never mistaken for "no
    # progress".  A finite seed (10000) made the loop exit before printing
    # anything whenever the initial cost was already larger than the seed.
    last_mse = float('inf')

    # take baby steps to reach the minimum
    for i in range(iterations):
        residual = y - (m_iter * x + b_iter)
        mse = (residual ** 2).mean()  # cost function to minimize
        # Same criterion as (last_mse - mse) / mse < 0.001, written without
        # the division so that an exact fit (mse == 0) cannot divide by zero.
        if mse == 0 or last_mse - mse < 0.001 * mse:
            break
        # recall MSE is 1/n*sum((yi - y_predicted)^2) with y_predicted = m*x+b;
        # these are its partial derivatives d/dm and d/db
        dm = -(2 / n) * (x * residual).sum()
        db = -(2 / n) * residual.sum()
        # move against the gradient, scaled by the learning rate
        m_iter = m_iter - learning_rate * dm
        b_iter = b_iter - learning_rate * db
        print('m is {}, b is {}, cost is {}, iteration {}'.format(m_iter,b_iter,mse,i))
        last_mse = mse
    return m_iter, b_iter
#x = np.array([1,2,3,4,5])
#y = np.array([5,7,8,10,13])
#gradient_descent(x,y)
df = pd.read_csv('Linear_Data.csv')
x = df['Area']
y = df['Price']
gradient_descent(x,y)
My code works because I tried it with just NumPy arrays as inputs. Not sure why it doesn't work with pandas.
Well no, your code also works with pandas dataframes:
df = pd.DataFrame({'Area': [1,2,3,4,5], 'Price': [5,7,8,10,13]})
x = df['Area']
y = df['Price']
gradient_descent(x,y)
Above will give you the same output as with numpy arrays.
Try to check what's in Linear_Data.csv and/or add some print statements in the gradient_descent function just to check your assumptions. I would suggest to first of all add a print statement before the condition with the break statement:
print(last_mse, mse)
if (last_mse - mse)/mse < 0.001:
break

TypeError: ufunc add cannot use operands with types dtype('<M8[ns]') and dtype('<M8[ns]')

I am trying to fit an ARIMA model to some data; for this, I called 'autocorrelation_plot()' on my time series. However, it generates the error in the title.
I have an attribute table composed of, among others, a Date field and a Time field.
I extracted them (after converting the attribute table into a NumPy array), combined them into a 'datetime' value, and appended each one to a list:
O,A = [],[]
dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
A.append(dt)
I tried then to create time series and printed them to be sure of the results:
data2 = pd.Series(A, O)
print data2
The results were satisfying, until I decided to auto-correlate :
Auto-correlation command :
autocorrelation_plot(data2)
After this command, it returns:
TypeError: ufunc add cannot use operands with types dtype('M8[ns]') and dtype('M8[ns]')
I guess it's due to the conversion of the datetime.strptime to a numpy ?
I tried to follow some suggestions from previous questions
(index.to_pydatetime(), dtype, M8[ns] error, ...), in vain.
Minimal reproducible example:
from pandas import datetime
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
# Read the attribute table.  With the field order requested below, each record
# holds the zone code at index 1, the mean value at index 2, the date at
# index 3 and the time slot at index 4.
arr = arcpy.da.TableToNumPyArray(inTable ,("PROVINCE","ZONE_CODE","MEAN", "Datetime","Time"))

for j in range(1, 56):  # I have 55 provinces
    O, A = [], []  # O: mean values, A: the matching timestamps
    for row in arr:
        if row[1] != j:
            continue
        O.append(row[2])
        c = str(row[3])
        d = c[0:4] + "/" + c[5:7] + "/" + c[8:10]
        # Time slot "10" maps to 10:00; any other value is treated as 14:00.
        dt1 = d + (" 10:00" if str(row[4]) == "10" else " 14:00")
        A.append(datetime.strptime(dt1, "%Y/%m/%d %H:%M"))
    # Values first, timestamps as the index.  pd.Series(A, O) built a series
    # whose *values* were datetimes, and autocorrelation_plot cannot do
    # arithmetic on those (the "ufunc add cannot use operands with types
    # dtype('<M8[ns]')" error).
    data2 = pd.Series(O, index=A)
    print(data2)
    autocorrelation_plot(data2)
Screenshot of the results:
results
I used this to solve my issue:
import matplotlib.dates as mpl_dates
df.reset_index(inplace=True)
df['Date']=df['Date'].apply(mpl_dates.date2num)
df = df.astype(float)
I found a solution, it can look barbaric, but it works!
I've just "recreated" pd.Series() with the pd.Series I had:
data2 = pd.Series(O, A)
autocorrelation_plot(pd.Series(data2))
plt.show()

Set Multiple Restrictions for Rows Called to Print in Pandas

import pandas as pd
import numpy as np
#load data
#data file and py file must be in same file path
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
df = pd.read_csv('cbp15st.txt', delimiter=',', encoding='utf-8-sig')
#define load data DataFrame columns
state = df['FIPSTATE']
industry = df['NAICS']
legal_form_of_organization = df['LFO']
suppression_flag = df['EMPFLAG']
total_establishment = df['EST']
establishment_1_4 = df['N1_4']
establishment_5_9 = df['N5_9']
establishment_10_19 = df['N10_19']
establishment_20_49 = df['N20_49']
establishment_50_99 = df['N50_99']
establishment_100_249 = df['N100_249']
establishment_250_499 = df['N250_499']
establishment_500_999 = df['N500_999']
establishment_1000_more = df['N1000']
#use df.loc to filter the dataset for particular values
print(df.loc[df['EMPFLAG']=='A'], df.loc[df['FIPSTATE']==1],
df.loc[df['NAICS']=='------'])
I am currently using df.loc to select specific values from the df columns, but this prints the rows matching each condition separately rather than only the rows matching all of them (like an "or" instead of an "and").
I am trying to find a way to apply multiple restrictions at once, so that only rows meeting criteria x, y and z are returned.
Current Readout from above:
enter image description here
You can use & operator while specifying multiple filtering criteria, something like:
# Combine the three conditions with & so only rows matching all of them remain.
# Each comparison needs its own parentheses because & binds tighter than ==.
df1 = df.loc[(df['EMPFLAG'] == 'A') & (df['FIPSTATE'] == 1) & (df['NAICS'] == '------')]
print(df1)

Scikit-pandas, cross val score number of features

I am getting familiar with scikit and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some prediction. I can do it calling a pipeline fit and transform - unfortunately I get an error trying to do the same with cross_val_score.
I am using the sklearn-pandas cross_val_score
The code is as follows:
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline([
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a Pandas dataframe containing all my training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
Work fine and prediction returns me an array with predicted outcomes. Using the same with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
Full code below, can be run with the Titanic CSV files on Kaggle (https://www.kaggle.com/c/titanic/data)
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocesing and Cleanup
#Create new columns with the name (to identify individuals part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
'Master' : 'kid',
'Mlle' : 'unmarriedWoman',
'Miss' : 'unmarriedWoman',
'Ms' : 'unmarriedWoman',
'Jonkheer' : 'noble',
'Don' : 'noble',
'Dona' : 'noble',
'Sir' : 'noble',
'Lady' : 'noble',
'the Countess' : 'noble',
'Capt' : 'ranked',
'Major' : 'ranked',
'Col' : 'ranked',
'Mr' : 'standard',
'Mme' : 'standard',
'Mrs' : 'standard',
'Dr' : 'academic',
'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
[1,2,3,5,np.inf],
labels = ['single','couple','sFamily','bFamily'],
right = False)
#%% Filling empty values
#Fill empty values with the mean or mode for the column
#Fill the missing values with mean for age per title, class and gender. Store value in AgeFull variable
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
# Drop the helper columns.  The axis is named explicitly because pandas 2.x no
# longer accepts `axis` as a positional argument of DataFrame.drop.
df = df.drop(columns=['AgeFull', 'embarkFull'])
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
('Age', None),
('Embarked',LabelBinarizer()),
('Fare',None),
('Pclass',LabelBinarizer()),
('Sex',LabelBinarizer()),
('Group',LabelBinarizer()),
('familySize',None),
('familyType',LabelBinarizer()),
('Title',LabelBinarizer())
]
pipe = Pipeline(steps = [
('featurize', DataFrameMapper(mapping)),
('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I have solved the issue simply by sorting the DataFrame by its index before passing it to cross_val_score with the pipeline.
df_train = df_train.sort_index()
Could anyone explain to me why this would have an impact on how scikit-learn works?

Numpy slogdet computation error

There appears to be a major difference between NumPy's slogdet and the exact result when computing the log-determinant of a Vandermonde matrix.
I compare against the exact log determinant, see eg here for proof.
The minimal code to see this is:
A = np.power.outer(np.linspace(0,1,50),range(50))
print np.linalg.slogdet(A)[1]
s = 0
for v1 in np.linspace(0,1,50):
for v2 in np.linspace(0,1,50):
if v1>v2:
s+= np.log(v1-v2)
print s
Which yields:
-1191.88408998
-1706.99560647
I was wondering whether there is a more accurate log-determinant implementation that I could use in this situation, and also for non-Vandermonde matrices.
You can use sympy and mpmath like this:
import numpy as np
import sympy as smp
import mpmath as mp
mp.mp.dps = 50  # mpmath working precision, in significant decimal digits
# mpmath is used through its own import: the `sympy.mpmath` alias is not
# available in current SymPy releases.
linspace1 = [mp.mpf(v) for v in np.linspace(0, 1, 50)]
A = np.power.outer([float(v) for v in linspace1], range(50))
# NumPy's value is still computed in double precision; it is only wrapped in
# an mpf here.
first_print = mp.mpf(np.linalg.slogdet(A)[1])
print(first_print)
# Reference value: sum of log(v1 - v2) over all node pairs with v1 > v2,
# accumulated with mpmath.
s = 0
for v1 in linspace1:
    for v2 in linspace1:
        if v1 > v2:
            s += mp.log(v1 - v2)
print(s)
RESULTS
first_print = -1178.272517342130186079884879291057586669921875
s = -1706.9956064674289001970168329846189154212781094939