Using pdist with string value in python scipy - numpy

I have the following code and I want to calculate the Hamming distances between the strings:
from pandas import DataFrame
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
df = pd.read_csv("3d_printing.csv", encoding='utf-8', error_bad_lines=False, low_memory=False, names=['file_name', 'phash', 'dhash', 'file_date'])
def hamming_distance(s1, s2):
    """Return the number of positions at which ``s1`` and ``s2`` differ.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence, which must have the same length as ``s1``.

    Returns:
        The count of index positions whose elements compare unequal.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    # Each mismatch contributes True (== 1) to the sum.
    return sum(el1 != el2 for el1, el2 in zip(s1, s2))
df.sort_values(by='file_date', ascending=0)
x = pd.DataFrame(np.triu(squareform(pdist(df[['phash']], hamming_distance))),
columns=df.file_name.str.split('_').str[0],
index=df.file_name.str.split('_').str[0]).replace(0, np.nan)
z = x[x.apply(lambda col: col.index != col.name)].max(1).max(level=0)
z.to_csv("3d_printing_x.csv", mode='a')
When I run the code I get
ValueError: could not convert string to float: '002889898888b8a9'
I know that pdist requires float values, but at this point I don't know what to do

Related

I get TypeError: only size-1 arrays can be converted to Python scalars, how to fix that

import numpy as np from math import * from matplotlib.pyplot import* def f(x,y) : return np.exp(-2*x)((sqrt(3)/6)*sin(2*sqrt(3)*x)+(1/2)*cos(2*sqrt(3)*x)) dx=0.1 a=0 b=6 N= int((b-a)/dx) x=np.linspace(a,b,N+1) y=np.zeros(N+1) y[0] = 6 y[1] = y[0] for i in range (N-1): y[i+2]=(2-4*dx)*y[i+1]+(-1+4*dx-16*dx**2)*y[i] Y2 = f(x,y) Err = abs (Y2-y) plot(x,y,'r',x,Y2,'b') show () plot(x,Err) show()
How can I fix that? I keep getting TypeError: only size-1 arrays can be converted to Python scalars
First of all don't give code examples like that please.
I cleaned up your code and I think you can use this:
import numpy as np
from math import *
from matplotlib.pyplot import*
def f(x, y):
    """Evaluate exp(-2x) * (sqrt(3)/6 * sin(2*sqrt(3)*x) + 1/2 * cos(2*sqrt(3)*x)).

    Uses NumPy ufuncs throughout so that ``x`` may be a scalar or an array;
    the ``math`` module versions only accept scalars, which is what raised
    "only size-1 arrays can be converted to Python scalars".

    Args:
        x: Scalar or NumPy array of evaluation points.
        y: Unused; kept so existing ``f(x, y)`` calls keep working.

    Returns:
        The function value(s), with the same shape as ``x``.
    """
    root3 = np.sqrt(3)
    return np.exp(-2 * x) * (
        (root3 / 6) * np.sin(2 * root3 * x) + 0.5 * np.cos(2 * root3 * x)
    )
# Grid: N steps of width dx on [a, b], i.e. N + 1 sample points.
dx = 0.1
a = 0
b = 6
N = int((b - a) / dx)
x = np.linspace(a, b, N + 1)

# Numerical solution: seed the first two samples, then advance with the
# two-step recurrence (each new value depends on the previous two).
y = np.zeros(N + 1)
y[0] = 6
y[1] = y[0]
for i in range(N - 1):
    y[i + 2] = (2 - 4 * dx) * y[i + 1] + (-1 + 4 * dx - 16 * dx**2) * y[i]

# Closed-form values on the same grid and the pointwise absolute error.
Y2 = f(x, y)
Err = abs(Y2 - y)

# Numerical (red) against closed-form (blue); show() is needed for the
# figure to be rendered when this runs as a plain script.
plot(x, y, 'r', x, Y2, 'b')
show()

# Absolute error, drawn in its own figure.
plot(x, Err)
show()

scipy convert coo string directly to numpy matrix

I already have a string in coo matrix format(row, col, value):
0 0 -1627.761282
0 1 342.811259
0 2 342.811259
0 3 171.372276
0 4 342.744553
0 5 342.744553
Now I want to convert my string directly to numpy matrix. Currently I have to write my string to file, then create a numpy matrix from file:
from scipy.sparse import coo_matrix
import numpy as np
# Round-trip through a text file: write the "row col value" string out,
# then read it back as a table of strings.
with open("Output.txt", "w") as text_file:
    text_file.write(matrix_str)
# The file must be closed (end of the with-block) before it is read back.
text = np.loadtxt('Output.txt', delimiter=' ', dtype=str)
# One column each for row indices, column indices and values.
rows, cols, data = text.T
matrix = coo_matrix((data.astype(float), (rows.astype(int), cols.astype(int)))).todense()
How can I convert my string directly to numpy matrix without writing to file ? Please help
You could use StringIO as follows.
import numpy as np
from scipy.sparse import coo_matrix
import io
# np.loadtxt accepts any file-like object, so wrap the string in an
# in-memory text buffer instead of writing it to disk first.
with io.StringIO(matrix_str) as ss:
    # Transpose so the three columns unpack as row indices, column
    # indices and values (all parsed as floats by default).
    rows, cols, data = np.loadtxt(ss).T
matrix = coo_matrix((data.astype(float), (rows.astype(int), cols.astype(int)))).todense()

I am trying to convert the blank values of my csv file to the mean of the columns but it is giving "could not convert string to float: '-' " error

import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
data = pd.read_csv("austin_weather.csv")
data = data.drop(['Events', 'Date'], axis = 1)
X = data.iloc[:, :-1].values
Y = data.iloc[:, 18].values
data = data.replace('T', 0.0)
imputer = Imputer(missing_values="-", strategy="mean", axis = 0)
imputer.fit(X[:])
the imputer function is not able to convert the "-" blank value to the mean of the respective column
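For the deprecated class sklearn.preprocessing.Imputer, the parameter missing_values must be either the string "NaN" or a number.
So first replace all '-' values with np.nan on the DataFrame, e.g. data = data.replace('-', np.nan), before extracting X (X is a NumPy array and has no replace method), and then call the imputer.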

Stacked Bar Graph with Errorbars in Pandas / Matplotlib

I want to show my data in two (or more) stacked bar graphs including error bars. My code is based on a working example, but uses DataFrames as input instead of arrays.
I tried to set the df-output to an array, but this will not work
from uncertain_panda import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
raw_data = {'': ['Error', 'Value'],'Stars': [3, 18],'Cats': [2,15],'Planets': [1,12],'Dogs': [2,16]}
df = pd.DataFrame(raw_data)
df.set_index('', inplace=True)
print(df)
N = 2
ind = np.arange(N)
width = 0.35
first_Value = df.loc[['Value'],['Cats','Dogs']]
second_Value = df.loc[['Value'],['Stars','Planets']]
first_Error = df.loc[['Error'],['Cats','Dogs']]
second_Error = df.loc[['Error'],['Stars','Planets']]
p1 = plt.bar(ind, first_Value, width, yerr=first_Error)
p2 = plt.bar(ind, second_Value, width, yerr=second_Error, bottom=first_Value)
plt.xticks(ind, ('Pets', 'Universe'))
plt.legend((p1[0], p2[0]), ('Cats', 'Dogs', 'Stars', 'Planets'))
plt.show()
I expect an output like this:
https://matplotlib.org/3.1.0/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py
Instead i get this error:
TypeError: only size-1 arrays can be converted to Python scalars

How do I enable the REFS_OK flag in nditer in numpy in Python 3.3?

Does anyone know how one goes about enabling the REFS_OK flag in numpy? I cannot seem to find a clear explanation online.
My code is:
import sys
import string
import numpy as np
import pandas as pd
SNP_df = pd.read_csv('SNPs.txt',sep='\t',index_col = None ,header = None,nrows = 101)
output = open('100 SNPs.fa','a')
for i in SNP_df:
data = SNP_df[i]
data = np.array(data)
for j in np.nditer(data):
if j == 0:
output.write(("\n>%s\n")%(str(data(j))))
else:
output.write(data(j))
I keep getting the error message: Iterator operand or requested dtype holds references, but the REFS_OK was not enabled.
I cannot work out how to enable the REFS_OK flag so the program can continue...
I have isolated the problem. There is no need to use np.nditer. The main problem was with me misinterpreting how Python would read iterator variables in a for loop. The corrected code is below.
import sys
import string
import fileinput
import numpy as np
import pandas as pd  # needed for pd.read_csv below; missing from the original snippet

# Read the first 5000 rows of the tab-separated, header-less table.
SNP_df = pd.read_csv('datafile.txt', sep='\t', index_col=None, header=None, nrows=5000)

# Append one record per column 1..50: the column's first cell becomes a
# ">"-prefixed header line and the remaining cells are written back to
# back with no separator. The with-block guarantees the file is flushed
# and closed even if a write fails.
with open('outputFile.fa', 'a') as output:
    for i in range(1, 51):
        data = np.array(SNP_df[i])
        output.write(("\n>%s\n") % (str(data[0])))
        output.write("".join(str(value) for value in data[1:]))
If you really want to enable the flag, I have a working example.
Python 2.7, numpy 1.14.2, pandas 0.22.0
import pandas as pd
import numpy as np
# get all data as panda DataFrame
data = pd.read_csv("./monthdata.csv")
print(data)
# get values as numpy array
data_ar = data.values # numpy.ndarray, every element is a row
for row in data_ar:
print(row)
sum = 0
count = 0
for month in np.nditer(row, flags=["refs_OK"], op_flags=["readwrite"]):
print month