PYTHON: Logistic Regression p values - pandas

I am able to print the p-values of my regression but I would like my output to have the X2 value as the key and the p-value next to it.
I want the output to look like this:
attr1_1: 3.73178531e-01
sinc1_1: 4.97942222e-06
the code:
from sklearn.linear_model import LogisticRegression
from scipy import stats
X2 = dating[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1', 'attr_o','sinc_o','intel_o','fun_o','amb_o','shar_o','age', 'race',]]
y = dating['match']
dating_log_model = LogisticRegression(solver='liblinear')
dating_log_model.fit(X2,y)
dating_log_model.score(X2,y)
# getting the p-values
from sklearn.feature_selection import chi2
scores, pvalues = chi2(X2, y)
print(pvalues)
# current output
[3.73178531e-01 4.97942222e-06 3.49411284e-02 1.14925100e-11
6.40544454e-02 7.46131800e-10 3.52640714e-58 1.31669842e-17
5.15620104e-15 1.42543106e-62 6.60005884e-15 1.52260795e-81
7.41356400e-02 8.19087227e-01]

try this instead of directly print the pvalues
com_dic = {'X2':X2.columns, 'pvalues':pvalues}
result = pd.DataFrame(com_dic)
print(result)

Related

passing panda dataframe data to functions and its not outputting the results

In my code, I am trying to extract data from csv file to use in the function, but it doesnt output anything, and gives no error. My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
import numpy as np
import pandas as pd
import os
# change the current directory to the directory where the running script file is
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# finding best fit line for y=mx+b by iteration
def gradient_descent(x,y):
m_iter = b_iter = 1 #starting point
iteration = 10000
n = len(x)
learning_rate = 0.05
last_mse = 10000
#take baby steps to reach global minima
for i in range(iteration):
y_predicted = m_iter*x + b_iter
#mse = 1/n*sum([value**2 for value in (y-y_predicted)]) # cost function to minimize
mse = 1/n*sum((y-y_predicted)**2) # cost function to minimize
if (last_mse - mse)/mse < 0.001:
break
# recall MSE formula is 1/n*sum((yi-y_predicted)^2), where y_predicted = m*x+b
# using partial deriv of MSE formula, d/dm and d/db
dm = -(2/n)*sum(x*(y-y_predicted))
db = -(2/n)*sum((y-y_predicted))
# use current predicted value to get the next value for prediction
# by using learning rate
m_iter = m_iter - learning_rate*dm
b_iter = b_iter - learning_rate*db
print('m is {}, b is {}, cost is {}, iteration {}'.format(m_iter,b_iter,mse,i))
last_mse = mse
#x = np.array([1,2,3,4,5])
#y = np.array([5,7,8,10,13])
#gradient_descent(x,y)
df = pd.read_csv('Linear_Data.csv')
x = df['Area']
y = df['Price']
gradient_descent(x,y)
My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
Well no, your code also works with pandas dataframes:
df = pd.DataFrame({'Area': [1,2,3,4,5], 'Price': [5,7,8,10,13]})
x = df['Area']
y = df['Price']
gradient_descent(x,y)
Above will give you the same output as with numpy arrays.
Try to check what's in Linear_Data.csv and/or add some print statements in the gradient_descent function just to check your assumptions. I would suggest to first of all add a print statement before the condition with the break statement:
print(last_mse, mse)
if (last_mse - mse)/mse < 0.001:
break

Failing to find nan values but program is detecting them in data

import pandas as pd
import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel
import warnings
warnings.simplefilter("ignore")
data = pd.read_csv("filtered aggregated data.csv")
mask = data["Fit"].str.contains("Storefront").astype(bool)
data = data[~mask]
data["6 Mo. Growth Count"] = data["6 Mo. Growth
Count"].fillna("NaN")
sixMoData = data[data["6 Mo. Growth Count"] != "NaN"]
sixMoData["Fit"] = sixMoData["Fit"].astype("category")
sixMoData = pd.get_dummies(sixMoData, columns=
["Category","Country"])
sixMoData.dropna(inplace=True)
sixMoData.reset_index(inplace=True)
X = sixMoData[sixMoData.columns[3:]]
y = sixMoData["Fit"]
for col in X.columns:
containsNa = any(X[col].isna())
containsInf = any(X[col].isin([np.inf, -np.inf]))
if containsNa or containsInf:
print(col)
X[col] = X[col].astype(float)
model = OrderedModel(y, X)
When I run this code, I find that there are no columns that contain an nan value based on the checks that I run here. However, when I try to run the model, I get an error stating: "exog contains inf or nans", where exog refers to the X dataframe. What am I doing incorrectly when checking for nans?

can i use OR-tools for TSP with partial distance matrix (for a huge set of nodes)?

i'm trying to solve tsp with OR-tools for a problem of something like 80,000 nodes, the problem is, I need a huge distance matrix that takes to much memory ,so its infeasible and i don't get a solution.
so:
is there an option to work with partial distance matrix in or-tools?
if not is there a way to improve my code?
is there another external solver that can work for this task in python?
import math
from collections import namedtuple
import random
import time
from collections import namedtuple
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import numba
from scipy.spatial import distance_matrix
from sklearn.metrics.pairwise import euclidean_distances
from math import sqrt
Point = namedtuple("Point", ['x', 'y'])
def solve_it(input_data):
# Modify this code to run your optimization algorithm
global POINTS
# parse the input
lines = input_data.split('\n')
nodeCount = int(lines[0])
points = []
for i in range(1, nodeCount+1):
line = lines[i]
parts = line.split()
points.append(Point(float(parts[0]), float(parts[1])))
#2.routing with or tools
def dist_matrix(nodeCount,points):
data=[]
for k in range(len(points)):
data.append([int(points[k].x),int(points[k].y)])
D=euclidean_distances(data, data)
return D
def create_data_model(D):
"""Stores the data for the problem."""
data = {}
data['distance_matrix'] = D # yapf: disable
data['num_vehicles'] = 1
data['depot'] = 0
return data
def print_solution(manager, routing, solution):
index = routing.Start(0)
plan_output = []#Route for vehicle 0:\n'
route_distance = 0
while not routing.IsEnd(index):
plan_output.append(manager.IndexToNode(index))
index = solution.Value(routing.NextVar(index))
return plan_output
def or_main(nodeCount,points):
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
"""Entry point of the program."""
# Instantiate the data problem.
global sol
D=dist_matrix(nodeCount,points)
data = create_data_model(D)
# Create the routing index manager.
manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
data['num_vehicles'], data['depot'])
# Create Routing Model.
routing = pywrapcp.RoutingModel(manager)
def distance_callback(from_index, to_index):
"""Returns the distance between the two nodes."""
# Convert from routing variable Index to distance matrix NodeIndex.
from_node = manager.IndexToNode(from_index)
to_node = manager.IndexToNode(to_index)
return data['distance_matrix'][from_node][to_node]
transit_callback_index = routing.RegisterTransitCallback(distance_callback)
# Define cost of each arc.
routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
# Setting first solution heuristic.
search_parameters = pywrapcp.DefaultRoutingSearchParameters()
search_parameters.local_search_metaheuristic = (
routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
k = 100
if nodeCount <= 100:
k = 30
elif 100 <= nodeCount <= 1000:
k = 300
elif nodeCount > 1000:
k = 17000
search_parameters.time_limit.seconds =k
search_parameters.log_search = True
# Solve the problem.
solution = routing.SolveWithParameters(search_parameters)
# #print solution on console.
if solution:
sol=print_solution(manager, routing, solution)
return sol
######################################################################
solution=or_main(nodeCount,points)
# calculate the length of the tour
obj = length(points[solution[-1]], points[solution[0]])
for index in range(0, nodeCount-1):
obj += length(points[solution[index]], points[solution[index+1]])
# prepare the solution in the specified output format
output_data = '%.2f' % obj + ' ' + str(0) + '\n'
output_data += ' '.join(map(str, solution))
return output_data
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
file_location = sys.argv[1].strip()
with open(file_location, 'r') as input_data_file:
input_data = input_data_file.read()
#print(solve_it(input_data))
else:
print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')

Numpy slogdet computation error

There appears to be a major difference between numpy's slogdet and the exact result when computing the log determinant of Vanermonde matrix.
I compare against the exact log determinant, see eg here for proof.
The minimal code to see this is:
A = np.power.outer(np.linspace(0,1,50),range(50))
print np.linalg.slogdet(A)[1]
s = 0
for v1 in np.linspace(0,1,50):
for v2 in np.linspace(0,1,50):
if v1>v2:
s+= np.log(v1-v2)
print s
Which yeilds:
-1191.88408998
-1706.99560647
I was wondering if there was a more accurate log determinant implementation which I could use in this situation but also in non-Vandermonde matrix situation.
You can use sympy and mpmath like this:
import numpy as np
import sympy as smp
import mpmath as mp
mp.mp.dps = 50
linspace1 = list(map(smp.mpmath.mpf,np.linspace(0,1,50)))
A = np.power.outer(list(map(float,linspace1)),range(50))
first_print = smp.mpmath.mpf(np.linalg.slogdet(A)[1])
print(first_print)
s = 0
linspace2 = list(map(smp.mpmath.mpf,np.linspace(0,1,50)))
linspace3 = list(map(smp.mpmath.mpf,np.linspace(0,1,50)))
for v1 in linspace1:
for v2 in linspace2:
if v1>v2:
s+= mp.log(v1-v2)
print(s)
RESULTS
first_print = -1178.272517342130186079884879291057586669921875
s = -1706.9956064674289001970168329846189154212781094939

word2vec, sum or average word embeddings?

I'm using word2vec to represent a small phrase (3 to 4 words) as a unique vector, either by adding each individual word embedding or by calculating the average of word embeddings.
From the experiments I've done I always get the same cosine similarity. I suspect it has to do with the word vectors generated by word2vec being normed to unit length (Euclidean norm) after training? or either I have a BUG in the code, or I'm missing something.
Here is the code:
import numpy as np
from nltk import PunktWordTokenizer
from gensim.models import Word2Vec
from numpy.linalg import norm
from scipy.spatial.distance import cosine
def pattern2vector(tokens, word2vec, AVG=False):
pattern_vector = np.zeros(word2vec.layer1_size)
n_words = 0
if len(tokens) > 1:
for t in tokens:
try:
vector = word2vec[t.strip()]
pattern_vector = np.add(pattern_vector,vector)
n_words += 1
except KeyError, e:
continue
if AVG is True:
pattern_vector = np.divide(pattern_vector,n_words)
elif len(tokens) == 1:
try:
pattern_vector = word2vec[tokens[0].strip()]
except KeyError:
pass
return pattern_vector
def main():
print "Loading word2vec model ...\n"
word2vecmodelpath = "/data/word2vec/vectors_200.bin"
word2vec = Word2Vec.load_word2vec_format(word2vecmodelpath, binary=True)
pattern_1 = 'founder and ceo'
pattern_2 = 'co-founder and former chairman'
tokens_1 = PunktWordTokenizer().tokenize(pattern_1)
tokens_2 = PunktWordTokenizer().tokenize(pattern_2)
print "vec1", tokens_1
print "vec2", tokens_2
p1 = pattern2vector(tokens_1, word2vec, False)
p2 = pattern2vector(tokens_2, word2vec, False)
print "\nSUM"
print "dot(vec1,vec2)", np.dot(p1,p2)
print "norm(p1)", norm(p1)
print "norm(p2)", norm(p2)
print "dot((norm)vec1,norm(vec2))", np.dot(norm(p1),norm(p2))
print "cosine(vec1,vec2)", np.divide(np.dot(p1,p2),np.dot(norm(p1),norm(p2)))
print "\n"
print "AVG"
p1 = pattern2vector(tokens_1, word2vec, True)
p2 = pattern2vector(tokens_2, word2vec, True)
print "dot(vec1,vec2)", np.dot(p1,p2)
print "norm(p1)", norm(p1)
print "norm(p2)", norm(p2)
print "dot(norm(vec1),norm(vec2))", np.dot(norm(p1),norm(p2))
print "cosine(vec1,vec2)", np.divide(np.dot(p1,p2),np.dot(norm(p1),norm(p2)))
if __name__ == "__main__":
main()
and here is the output:
Loading word2vec model ...
Dimensions 200
vec1 ['founder', 'and', 'ceo']
vec2 ['co-founder', 'and', 'former', 'chairman']
SUM
dot(vec1,vec2) 5.4008677771
norm(p1) 2.19382594282
norm(p2) 2.87226958166
dot((norm)vec1,norm(vec2)) 6.30125952303
cosine(vec1,vec2) 0.857109242583
AVG
dot(vec1,vec2) 0.450072314758
norm(p1) 0.731275314273
norm(p2) 0.718067395416
dot(norm(vec1),norm(vec2)) 0.525104960252
cosine(vec1,vec2) 0.857109242583
I'm using the cosine similarity as defined here Cosine Similarity (Wikipedia). The values for the norms and dot products are indeed different.
Can anyone explain why the cosine is the same?
Thank you,
David
Cosine measures the angle between two vectors and does not take the length of either vector into account. When you divide by the length of the phrase, you are just shortening the vector, not changing its angular position. So your results look correct to me.