SMOTE adds many rows with 0 values to dataframe

Please help me, I cannot understand why X_synthetic_df returns hundreds of rows with 0 values. All of the rows have normal values until row 1745; from that row on, every row contains nothing but zeros.
import numpy as np
import pandas as pd

def nearest_neighbors(nominal_columns, numeric_columns, df, row, k):
    def distance(row1, row2):
        distance = 0
        for col in nominal_columns:
            if row1[col] != row2[col]:
                distance += 1
        for col in numeric_columns:
            distance += (row1[col] - row2[col])**2
        return distance**0.5
    distances = []
    for i in range(len(df)):
        r = df.iloc[i]
        if r.equals(row):
            continue
        d = distance(row, r)
        if d != 0:
            distances.append((d, i))
    distances.sort()
    nearest = [i for d, i in distances[:k]]
    return nearest
def smotenc(X, y, nominal_cols, numeric_cols, k=5, seed=None):
    minority_class = y[y == 1]
    majority_class = y[y == 0]
    minority_samples = X[y == 1]
    minority_target = y[y == 1]
    n_synthetic_samples = len(majority_class) - len(minority_class)
    synthetic_samples = np.zeros((n_synthetic_samples, X.shape[1]))
    if seed is not None:
        np.random.seed(seed)
    for i in range(len(minority_samples)):
        nn = nearest_neighbors(nominal_cols, numeric_cols, minority_samples, minority_samples.iloc[i], k=k)
        for j in range(min(k, n_synthetic_samples - i*k)):
            nn_idx = int(np.random.choice(a=nn))
            diff = minority_samples.iloc[nn_idx] - minority_samples.iloc[i]
            print(diff)
            if (diff == 0).all():
                continue
            synthetic_sample = minority_samples.iloc[i] + np.random.rand() * diff
            synthetic_samples[(i*k)+j, :] = synthetic_sample
    X_resampled = pd.concat([X[y == 1], pd.DataFrame(synthetic_samples, columns=X.columns)], axis=0)
    y_resampled = np.concatenate((y[y == 1], [1] * n_synthetic_samples))
    return X_resampled, y_resampled
minority_features = df_nominal.columns.get_indexer(df_nominal.columns)
synthetic = smotenc(check_x.head(3000),check_y.head(3000),nominal_cols,numeric_cols,seed = None)
X_synthetic_df = synthetic[0]
X_synthetic_df = pd.DataFrame(X_synthetic_df,columns = X.columns)
I want a dataframe with n synthetic samples, where n is the difference between the number of majority-class samples and the number of minority-class samples.
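To show what I mean, here is a quick check I run after the code above (assuming check_y holds 0/1 labels); the number of all-zero rows is what I can't explain:
# Quick check of what I expect vs. what I get (assumes check_y holds 0/1 labels).
n_majority = (check_y.head(3000) == 0).sum()
n_minority = (check_y.head(3000) == 1).sum()
print("expected synthetic rows:", n_majority - n_minority)
print("rows returned:", len(X_synthetic_df))
# These are the rows that come back as nothing but zeros (starting around row 1745).
print("all-zero rows:", (X_synthetic_df == 0).all(axis=1).sum())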


Apply my created function to a data frame

I have the following function to calculate the Black-Scholes model, where I paste the necessary data for S0, K, T, r and market_price into the function manually.
I would like to apply this same function to a pandas data frame, where I have the values needed to perform the calculation.
Data frame example
import pandas as pd

data = {'Name': ['BOVAE115', 'BOVAE119', 'BBDCE251', 'BBDCE246'],
        'Valor': [110.050003, 110.050003, 19.500000, 19.500000],
        'Strike Value': [15.00, 19.00, 24.67, 25.19],
        'Temp': [0.119048, 0.119048, 0.119048, 0.119048],
        'Taxa': [11.65, 11.65, 11.65, 11.65],
        'Market Price': [0.391968, 0.391968, 0.391968, 0.391968],
        'Order': ['c', 'c', 'c', 'c']}

# Create DataFrame
df = pd.DataFrame(data)
df
How do I apply the created function to this list of values?
Function
See the code
S0 = df['Valor']
K = df['Strike Value']
T = df['Temp']
r = df['Taxa']
market_price = df['Market Price']
flag = df['Order']
from py_vollib.black_scholes import black_scholes as bs
from py_vollib.black_scholes.greeks.analytical import vega

def implied_vol(S0, K, T, r, market_price, flag='c', tol=0.00001):
    """Calculate the implied volatility of a European option.
    S0: stock price
    K: strike price
    T: time to maturity
    r: risk-free rate
    market_price: option price in market
    """
    max_iter = 500  # max no. of iterations
    vol_old = 0.3   # initial guess
    for k in range(max_iter):
        bs_price = bs(flag, S0, K, T, r, vol_old)
        Cprime = vega(flag, S0, K, T, r, vol_old) * 100
        C = bs_price - market_price
        vol_new = vol_old - C / Cprime
        new_bs_price = bs(flag, S0, K, T, r, vol_new)
        if abs(vol_old - vol_new) < tol or abs(new_bs_price - market_price) < tol:
            break
        vol_old = vol_new
    implied_vol = vol_new
    return implied_vol
S0 = 14.73
K = 15.04
T = 20/252
r = 0.1165
market_price = 0.41
print(implied_vol(S0, K, T, r, market_price)*100)
I want to return the implied vol value in a data frame column.
How can I apply this function to my dataframe?
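Something along these lines is what I have in mind (just a sketch, and I'm assuming 'Taxa' is a percentage that has to be divided by 100), but I'm not sure it's the right way:
# Sketch: apply implied_vol row by row over the example frame above.
# Assumes 'Taxa' is in percent (11.65 -> 0.1165) and 'Temp' is already in years.
df['Implied Vol'] = df.apply(
    lambda row: implied_vol(row['Valor'], row['Strike Value'], row['Temp'],
                            row['Taxa'] / 100, row['Market Price'], row['Order']),
    axis=1)
print(df[['Name', 'Implied Vol']])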

RGB to HSV in numpy

I'm trying to implement the RGB to HSV conversion from opencv in pure numpy, using the formula from here:
import cv2
import numpy as np

def rgb2hsv_opencv(img_rgb):
    img_hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
    return img_hsv

def rgb2hsv_np(img_rgb):
    assert img_rgb.dtype == np.float32
    height, width, c = img_rgb.shape
    r, g, b = img_rgb[:,:,0], img_rgb[:,:,1], img_rgb[:,:,2]
    t = np.min(img_rgb, axis=-1)
    v = np.max(img_rgb, axis=-1)
    s = (v - t) / (v + 1e-6)
    s[v==0] = 0
    # v==r
    hr = 60 * (g - b) / (v - t + 1e-6)
    # v==g
    hg = 120 + 60 * (b - r) / (v - t + 1e-6)
    # v==b
    hb = 240 + 60 * (r - g) / (v - t + 1e-6)
    h = np.zeros((height, width), np.float32)
    h = h.flatten()
    hr = hr.flatten()
    hg = hg.flatten()
    hb = hb.flatten()
    h[(v==r).flatten()] = hr[(v==r).flatten()]
    h[(v==g).flatten()] = hg[(v==g).flatten()]
    h[(v==b).flatten()] = hb[(v==b).flatten()]
    h[h<0] += 360
    h = h.reshape((height, width))
    img_hsv = np.stack([h, s, v], axis=-1)
    return img_hsv

img_bgr = cv2.imread('00000.png')
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
img_rgb = img_rgb / 255.0
img_rgb = img_rgb.astype(np.float32)

img_hsv1 = rgb2hsv_np(img_rgb)
img_hsv2 = rgb2hsv_opencv(img_rgb)

print('max diff:', np.max(np.fabs(img_hsv1 - img_hsv2)))
print('min diff:', np.min(np.fabs(img_hsv1 - img_hsv2)))
print('mean diff:', np.mean(np.fabs(img_hsv1 - img_hsv2)))
But I get a big diff:
max diff: 240.0
min diff: 0.0
mean diff: 0.18085355
Am I missing something?
Also, maybe it's possible to write the numpy code more efficiently, for example without flatten?
Also, I have a hard time finding the original C++ code for the cvtColor function; as I understand it, it should actually be the cvCvtColor function from the C code, but I can't find the actual source code with the formula.
From the fact that the max difference is exactly 240, I'm pretty sure that what's happening is the case where either or both of v==r and v==g are true at the same time as v==b, which gets executed last.
If you change the order from:
h[(v==r).flatten()] = hr[(v==r).flatten()]
h[(v==g).flatten()] = hg[(v==g).flatten()]
h[(v==b).flatten()] = hb[(v==b).flatten()]
To:
h[(v==r).flatten()] = hr[(v==r).flatten()]
h[(v==b).flatten()] = hb[(v==b).flatten()]
h[(v==g).flatten()] = hg[(v==g).flatten()]
The max difference may start showing up as 120, because of the added 120 in that equation. So ideally, you would want to execute these three lines in the order b -> g -> r. The difference should be negligible then (I still notice a max difference of about 0.01, which I chalk up to some round-off somewhere).
h[(v==b).flatten()] = hb[(v==b).flatten()]
h[(v==g).flatten()] = hg[(v==g).flatten()]
h[(v==r).flatten()] = hr[(v==r).flatten()]
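As for doing it without flatten: boolean masks index 2D arrays directly, so inside rgb2hsv_np the whole flatten/reshape block could be replaced by something like this sketch (same b -> g -> r priority, untested for speed):
# Drop-in replacement for the flatten/reshape section of rgb2hsv_np.
h = np.zeros((height, width), np.float32)
h[v == b] = hb[v == b]
h[v == g] = hg[v == g]
h[v == r] = hr[v == r]
h[h < 0] += 360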

Why is setting the expiry time ineffective in Redis

Now I use a redis pipeline to set keys in Redis, and I set a timeout on each key. After the code runs and the timeout has passed, the keys are still in Redis. It doesn't seem to work. Is there something wrong with my code?
x[1] is a dict, such as dict["a"] = b, dict["c"] = d
import time
import redis

redis_pool = redis.ConnectionPool(host=redis_ip, port=redis_port, decode_responses=False)
redis_connection = redis.StrictRedis(connection_pool=redis_pool)
redis_pipeline = redis_connection.pipeline(transaction=False)

sync_count = 0
for x in iterator:
    key = suffix + x[0]
    #value = str(x['tagid'])+"\t"+x['tag_name']
    value = x[1]
    redis_pipeline.hmset(key, value)
    redis_pipeline.expire(key, 60)
    sync_count += 1
    if (sync_count % 100 == 0):
        result = redis_pipeline.execute()
        print(result)
        time.sleep(0.001)
        break
    if sync_count == 10000:
        break
redis_pipeline.execute()
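For what it's worth, I'm assuming a check like this (using redis-py's ttl and exists) is the right way to see whether the expiry was actually applied after execute():
# Sanity check right after redis_pipeline.execute():
# ttl returns the remaining seconds, or a negative value if the key
# has no expiry or does not exist.
print(redis_connection.ttl(key))
print(redis_connection.exists(key))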

Multiplying a specific cell value based on if statement in a Pandas dataframe

So I am trying to associate a specific value ('Property Damage') with every row in my dataset but I am having some trouble with this. Specifically, I want to multiply the value in the 'MD' column for each row by a number (0.02, 0.15, etc.) if it meets the conditions specified in the for loop (e.g. if i >= 0.8062, print etc.). I have included my code below:
df['RAND'] = np.random.uniform(0, 1, size=df.index.size)
dfRAND = list(df['RAND'])

def sim_1():
    for i in dfRAND:
        result = []
        if i >= 0.8062:
            df['Property Damage'] = df['MD'].apply(lambda x: x * 0.02)
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF0', i, ':', df['Property Damage'])
        elif 0.01 <= i < 0.89062:
            df['Property Damage'] = list(df['MD'].apply(lambda x: x * 0.15))
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF1', i, ':', df['Property Damage'])
        elif 0.05 <= i < 0.01:
            df['Property Damage'] = list(df['MD'].apply(lambda x: x * 0.20))
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF2', i, ':', df['Property Damage'])
        elif 0.025 <= i < 0.05:
            df['Property Damage'] = list(df['MD'].apply(lambda x: x * 0.50))
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF3', i, ':', df['Property Damage'])
        elif 0.0125 <= i < 0.025:
            df['Property Damage'] = list(df['MD'].apply(lambda x: x * 1))
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF4', i, ':', df['Property Damage'])
        elif 0.0063 <= i < 0.0125:
            df['Property Damage'] = list(df['MD'].apply(lambda x: x * 1))
            print(list(val for x, val in enumerate(df['Count']) if
                       x == dfRAND.index(i)), 'LF5', i, ':', df['Property Damage'])
The problem I am having at the moment is that the code prints all the 'Property Damage' values for each row. I want it to give me the 'Property Damage' value for a specific row based on whichever condition is met in the for loop.
Any help is appreciated. Thanks in advance.
Are you looking for something like this?
my_bins = {pd.Series.max(df['RAND']) - 1: 1,
           .01: .15,
           .0125: 1,
           .025: .5,
           .05: .2,
           pd.Series.max(df['RAND']) + 1: .02}

df['rand_multiplier'] = pd.cut(df['RAND'], bins=sorted(my_bins.keys()),
                               labels=list(range(len(my_bins) - 1))
                               ).apply(lambda x: my_bins[sorted(my_bins.keys())[x]])

df.apply(lambda row: row['MD'] * row['rand_multiplier'], axis=1)
I'm in a bit of a hurry so it's not the prettiest thing. Basically, I created bins based on the criteria you had and created a "multiplier" column which associates each entry in df['RAND'] with a multiplying factor. Then we can apply the multiplying factor to your 'MD' column row by row.
Of course, I can't show the produced results without the 'MD' data.
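As a side note, if the bin dictionary feels opaque, np.select expresses the same per-row choice more directly. This sketch just mirrors your if/elif chain literally (including the overlapping ranges, which you would probably want to tidy up), with df['MD'] assumed to be numeric:
import numpy as np

# np.select takes the first matching condition per row, like an if/elif chain.
conditions = [df['RAND'] >= 0.8062,
              (df['RAND'] >= 0.01) & (df['RAND'] < 0.89062),
              (df['RAND'] >= 0.05) & (df['RAND'] < 0.01),
              (df['RAND'] >= 0.025) & (df['RAND'] < 0.05),
              (df['RAND'] >= 0.0125) & (df['RAND'] < 0.025),
              (df['RAND'] >= 0.0063) & (df['RAND'] < 0.0125)]
multipliers = [0.02, 0.15, 0.20, 0.50, 1, 1]
df['Property Damage'] = df['MD'] * np.select(conditions, multipliers, default=np.nan)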

My tensorflow code does not work well - bank analysis

I'm studying tensorflow and I wrote some code, but it does not work well.
The data was downloaded from UCI: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
I want to find the clients who will subscribe to a term deposit, but the number of matches in the result is 0.
I use a 3-layer neural network and sigmoid for the output.
My code is like this, please help me.
import tensorflow as tf

hidden_layer1 = 200
hidden_layer2 = 200

x = tf.placeholder(tf.float32, [None, 16])
y = tf.placeholder(tf.float32, [None, 1])

Weights_L1 = tf.Variable(tf.random_normal([16, hidden_layer1]))
biases_L1 = tf.Variable(tf.random_normal([1, hidden_layer1]))
Wx_plus_b_L1 = tf.matmul(x, Weights_L1) + biases_L1
L1 = tf.nn.relu(Wx_plus_b_L1)

Weights_L2 = tf.Variable(tf.random_normal([hidden_layer1, 1]))
biases_L2 = tf.Variable(tf.random_normal([1, 1]))
Wx_plus_b_L2 = tf.matmul(L1, Weights_L2) + biases_L2
pred = tf.nn.sigmoid(Wx_plus_b_L2)

loss = tf.reduce_mean(tf.square(y - pred))
learning_rate = 0.05
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

pred_correct = tf.equal(y, pred)
accuracy = tf.reduce_mean(tf.cast(pred_correct, tf.float32))

batch_num = 0
with tf.Session() as ss:
    ss.run(tf.global_variables_initializer())
    for step in range(500):
        ss.run(train_step, feed_dict={x: bank_train_x, y: bank_train_y})
        if step % 100 == 0:
            batch_num = batch_num + 1
            acc1 = ss.run(accuracy, feed_dict={x: bank_train_x, y: bank_train_y})
            print("train acc" + str(step) + ", " + str(acc1) + " , batch_num:" + str(batch_num))
            #print(ss.run(learning_rate,feed_dict={global_:step}))
    p = ss.run(pred, feed_dict={x: bank_train_x, y: bank_train_y})
    acc2 = ss.run(accuracy, feed_dict={x: bank_test_x, y: bank_test_y})
    print("test acc" + str(acc2))

def calc(pred, y):
    l = y.shape[0]
    a = 0
    b = 0
    c = 0
    d = 0
    for i in range(l):
        if (p[i] > 0.5 and y[i] == 0):
            a = a + 1
        elif (p[i] > 0.5 and y[i] == 1):
            b = b + 1
        elif (p[i] < 0.5 and y[i] == 0):
            c = c + 1
        elif (p[i] < 0.5 and y[i] == 1):
            d = d + 1
    print(a, b, c, d)

calc(p, bank_train_y)
# the result is 169 0 34959 4629
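For reference, this is the accuracy variant I'm now experimenting with (only a sketch): since pred is a float from the sigmoid, I threshold it at 0.5 before comparing with y instead of calling tf.equal on the raw output.
# Sketch: threshold the sigmoid output at 0.5 before comparing with the labels.
pred_label = tf.cast(pred > 0.5, tf.float32)
pred_correct = tf.equal(y, pred_label)
accuracy = tf.reduce_mean(tf.cast(pred_correct, tf.float32))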