PuLP, groupby columns to see if sum for group is larger than group's threshold - optimization

import numpy as np
import pandas as pd
d = {'id': ['6G', '4F', '2W', '1H', '7P', '3L'],
     'contract': ['contract_1', 'contract_1', 'contract_1', 'contract_2', 'contract_2', 'contract_3'],
     'thresholds': [.1, .1, .1, .02, .02, .03],
     'performance': [.05, .09, .02, .04, .025, .01],
     'cost': [10, 3, 2, 15, 4, 1]}
df = pd.DataFrame(data=d)
df
Given the above data, I want to minimize the cost, with a constraint that within each contract we only pick ids whose summed performance is greater than or equal to that contract's threshold.
I am trying to get an output that would then pick the following IDs: ['4F', '2W', '7P']
from pulp import *
from collections import defaultdict
#list of unique identifiers
contracts_id = list(df['id'])
contract_list = list(df['contract'])
#dictionaries with each unique identifier and its corresponding value for each column
#thresholds = dict(zip(contracts_id,df['thresholds']))
#performance = dict(zip(contracts_id,df['performance']))
#cost = dict(zip(contracts_id,df['cost']))
#contract = dict(zip(contracts_id,df['contract']))
#nested dictionaries with each unique identifier, contract and its corresponding value for each column
thresholds = defaultdict(dict)
for x, y, z in zip(contracts_id, contract_list, df['thresholds']):
    thresholds[x][y] = z
thresholds
performance = defaultdict(dict)
for x, y, z in zip(contracts_id, contract_list, df['performance']):
    performance[x][y] = z
performance
cost = defaultdict(dict)
for x, y, z in zip(contracts_id, contract_list, df['cost']):
    cost[x][y] = z
cost
#find unique contracts
def unique(list1):
    x = np.array(list1)
    return np.unique(x)
unique_contract_list = unique(contract_list)
unique_contract_list
contracts_id_chosen = LpVariable.dicts('ID',contracts_id,lowBound=0,upBound=1,cat="Integer")
#setup problem to minimize
total_cost = LpProblem('Min_Cost_Performance_Against_Threshold', LpMinimize)
#objective function
for c in cost:
    total_cost += lpSum([cost[i]*contracts_id_chosen[i] for i in contracts_id_chosen])
#constraint
#I am trying to group by contract and then find which combination of id's performance gets equal to or greater than threshold
for u in list(unique_contract_list):
    for contract_performance in performance.values():
        for c in contract_performance:
            if c == u:
                total_cost += lpSum([contract_performance.values()[i]*contracts_id_chosen[i] - thresholds[i] for i in contracts_id_chosen]) >= 0
I'm struggling to write the objective function and the constraint clearly for this problem. How would you adjust them so that the ids are grouped by contract and we find which combination of ids has a summed performance greater than or equal to the threshold?

If I understand the problem correctly, you wish to select some ids such that the total cost is minimised, subject to the constraint that the summed performance of the selected ids belonging to each chosen contract at least meets that contract's threshold, while keeping some freedom over which contracts are selected (I've assumed at least 2 contracts have to be selected, as that is the case in the example solution you gave).
If that is the problem, then this solves it:
import numpy as np
import pandas as pd
import pulp as lp
d = {'id': ['6G', '4F', '2W', '1H', '7P', '3L'],
     'contract': ['contr_1', 'contr_1', 'contr_1', 'contr_2', 'contr_2', 'contr_3'],
     'thresholds': [.1, .1, .1, .02, .02, .03],
     'performance': [.05, .09, .02, .04, .025, .01],
     'cost': [10, 3, 2, 15, 4, 1]}
df = pd.DataFrame(data=d)
df.set_index('id', inplace=True)
print(df)
# Declare Problem
model = lp.LpProblem('contract_sel', lp.LpMinimize)
# Which items are picked? (0=no, 1=yes)
items = list(df.index)
x = lp.LpVariable.dicts('x', items, cat=lp.LpBinary)
# Dictionary of contract thresholds:
contracts = {i: v for i, v in zip(df.contract, df.thresholds)}
# Which contracts are picked? (0=no, 1=yes)
y = lp.LpVariable.dicts('y', contracts.keys(), cat=lp.LpBinary)
# Objective - total cost of selected items
model += lp.lpSum(df.loc[i, 'cost']*x[i] for i in items)
# Constraints - items selected from within contract
# have to meet threshold
for contract in contracts.keys():
    contr_items = df[df.contract==contract].index
    model += lp.lpSum(df.loc[i, 'performance']*x[i] for i in contr_items) >= \
             contracts[contract]*y[contract]
# At least two contracts must be selected?
model += lp.lpSum(y[i] for i in contracts.keys()) >= 2
#print(model)
model.solve()
print('Solution status:',lp.LpStatus[model.status])
for v in model.variables():
    print(v.name, ': ', v.varValue)
print('Cost is:',lp.value(model.objective))
Which returns (for me at least):
Solution status: Optimal
x_1H : 0.0
x_2W : 1.0
x_3L : 0.0
x_4F : 1.0
x_6G : 0.0
x_7P : 1.0
y_contr_1 : 1.0
y_contr_2 : 1.0
y_contr_3 : 0.0
Cost is: 9.0
Which matches your expected solution: ['4F', '2W', '7P'].
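If you want the selected ids back as a plain list, a small follow-up sketch using the x and items names defined above (the 0.5 cutoff just guards against floating-point solver values):
# Collect the ids whose binary variable was set to 1 in the solution
chosen_ids = [i for i in items if x[i].varValue > 0.5]
print(chosen_ids)  # expected: ['4F', '2W', '7P'] given the output above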

Related

Pandas get max delta in a timeseries for a specified period

Given a dataframe with an irregular time series as its index, I'd like to find the max delta between the values within any period of 10 seconds. Here is some code that does this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
xs = np.cumsum(np.random.rand(200))
# Create a general situation where the max is not always at the end or beginning
ys = xs**1.2 + 10 * np.sin(xs)
plt.plot(xs, ys, '+-')
threshold = 10
xs_thresh_ind = np.zeros_like(xs, dtype=int)
deltas = np.zeros_like(ys)
for i, x in enumerate(xs):
    # Find indices that lie within the time threshold
    period_end_ind = np.argmax(xs > x + threshold)
    # Only operate when the window is wide enough (this can be treated differently)
    if period_end_ind > 0:
        xs_thresh_ind[i] = period_end_ind
        # Find extrema in the period
        period_min = np.min(ys[i:period_end_ind + 1])
        period_max = np.max(ys[i:period_end_ind + 1])
        deltas[i] = period_max - period_min
max_ind_low = np.argmax(deltas)
max_ind_high = xs_thresh_ind[max_ind_low]
max_delta = deltas[max_ind_low]
print(
    'Max delta {:.2f} is in period x[{}]={:.2f},{:.2f} and x[{}]={:.2f},{:.2f}'
    .format(max_delta, max_ind_low, xs[max_ind_low], ys[max_ind_low],
            max_ind_high, xs[max_ind_high], ys[max_ind_high]))
df = pd.DataFrame(ys, index=xs)
OUTPUT:
Max delta 48.76 is in period x[167]=86.10,200.32 and x[189]=96.14,249.09
Is there an efficient, idiomatic pandas way to achieve something similar?
Create a Series from ys values, indexed by xs - but convert xs to be actual timedelta elements, rather than the float equivalent.
ts = pd.Series(ys, index=pd.to_timedelta(xs, unit="s"))
We want to apply a leading, 10 second window in which we calculate the difference between max and min. Because we want it to be leading, we'll sort the Series in descending order and apply a trailing window.
deltas = ts.sort_index(ascending=False).rolling("10s").agg(lambda s: s.max() - s.min())
Find the maximum delta with deltas[deltas == deltas.max()], which gives
0 days 00:01:26.104797298 48.354851
meaning a delta of 48.35 was found in the interval [86.1, 96.1)
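Putting those pieces together, a minimal end-to-end sketch reusing the xs and ys construction from the question could look like this:
import numpy as np
import pandas as pd

np.random.seed(0)
xs = np.cumsum(np.random.rand(200))
ys = xs**1.2 + 10 * np.sin(xs)

# Index by real timedeltas so a time-based rolling window can be used
ts = pd.Series(ys, index=pd.to_timedelta(xs, unit="s"))

# Leading 10 s window: sort descending and apply a trailing window
deltas = ts.sort_index(ascending=False).rolling("10s").agg(lambda s: s.max() - s.min())

# Start of the window with the largest max-min spread
print(deltas[deltas == deltas.max()])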

Stratified Cross Validation or Sampling for train-test split based on multiple features in python

sklearn's train_test_split, StratifiedShuffleSplit and StratifiedKFold all stratify based on class labels (the y-variable or target column). What if we want to sample based on feature columns (x-variables) and not on the target column? If it were just one feature it would be easy to stratify on that single column, but what if there are many feature columns and we want to preserve the population's proportions in the selected sample?
Below I created a df which has a skewed population: more people with low income, more females, the fewest people from CA and the most from MA. I want the selected sample to preserve these characteristics.
import random
import string
import pandas as pd
N = 20000 # Total rows in data
names = [''.join(random.choices(string.ascii_uppercase, k = 5)) for _ in range(N)]
incomes = [random.choices(['High','Low'], weights=(30, 70))[0] for _ in range(N)]
genders = [random.choices(['M','F'], weights=(40, 60))[0] for _ in range(N)]
states = [random.choices(['CA','IL','FL','MA'], weights=(10,20,30,40))[0] for _ in range(N)]
targets_y= [random.choice([0,1]) for _ in range(N)]
df = pd.DataFrame(dict(
    name = names,
    income = incomes,
    gender = genders,
    state = states,
    target_y = targets_y
))
One more complexity arises when, for some of the characteristics, we have very few examples and we want to include at least n examples in the selected sample. Consider this example:
single_row = {'name': 'ABC',
              'income': 'High',
              'gender': 'F',
              'state': 'NY',
              'target_y': 1}
df = pd.concat([df, pd.DataFrame([single_row])], ignore_index=True)
df
I want this single added row to be always included in test-split (n=1 here).
This can be achieved using pandas groupby:
Let us first check the population characteristics:
grps = df.groupby(['state','income','gender'], group_keys=False)
grps.count()
Next let's create a test set with 20% of original data
test_proportion = 0.2
at_least = 1
test = grps.apply(lambda x: x.sample(max(round(len(x)*test_proportion), at_least)))
test
test-set characteristics:
test.groupby(['state','income','gender']).count()
Next we create the train-set as a difference of original df and test-set
print('No. of samples in test =', len(test))
train = set(df.name) - set(test.name)
print('No. of samples in train =', len(train))
No. of samples in test = 4000
No. of samples in train = 16001
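If you need the train split as a DataFrame rather than a set of names (assuming name is unique per row), one way is to keep every row that was not sampled into the test set:
# Rows whose name does not appear in the sampled test set
train_df = df[~df['name'].isin(test['name'])]
print('No. of samples in train =', len(train_df))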

How to quickly normalise data in pandas dataframe?

I have a pandas dataframe as follows.
import pandas as pd
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [100, 300, 500],
    'C': list('abc')
})
print(df)
   A    B  C
0  1  100  a
1  2  300  b
2  3  500  c
I want to normalise the entire dataframe. Since column C is not a numeric column, what I do is as follows (i.e. remove C first, normalise the data, and add the column back).
df_new = df.drop('C', axis=1)
df_concept = df[['C']]
from sklearn import preprocessing
x = df_new.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_new = pd.DataFrame(x_scaled)
df_new['C'] = df_concept
However, I am sure that there is more easy way of doing this in pandas (given the column names that I do not need to normalise, then do the normalisation straightforward).
I am happy to provide more details if needed.
Use DataFrame.select_dtypes to select the numeric columns, normalise them by subtracting the minimum and dividing by the range (max - min), and then assign back only the normalised columns:
import numpy as np

df1 = df.select_dtypes(np.number)
df[df1.columns] = (df1 - df1.min()) / (df1.max() - df1.min())
print (df)
     A    B  C
0  0.0  0.0  a
1  0.5  0.5  b
2  1.0  1.0  c
In case you want to apply any other function to the data frame, you can use df[columns] = df[columns].apply(func).
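For instance, a z-score standardisation of only the numeric columns could be written that way (a small self-contained sketch along those lines):
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [100, 300, 500], 'C': list('abc')})
num_cols = df.select_dtypes(np.number).columns
# Apply the standardisation column by column to the numeric columns only
df[num_cols] = df[num_cols].apply(lambda s: (s - s.mean()) / s.std())
print(df)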

Can we do row operation using pandas in pyspark?

import numpy as np
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import DoubleType

def charging_funct(rating, duration):
    lst = rating.values
    for x in np.nditer(lst):
        if x > 3.5:  # x contains a single value
            return duration * rating

charging = pandas_udf(charging_funct, returnType=DoubleType())
df_2.select(charging(col("Rating"), col("Duration"))).show()
The above function is incorrect, but I have included the code to help explain my question.
Suppose I have to return duration * rating if the rating of a movie is greater than 3.5, and just return duration otherwise.
Is this possible with pandas_udf in pyspark? Comparing each row individually would not be efficient, as pandas works on batches of data.
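One way this could be expressed with a scalar pandas_udf is sketched below; the comparison is vectorised over each batch, so no per-row Python loop is needed. The df_2 DataFrame and the Rating/Duration column names are taken from the question and assumed to be numeric.
import pandas as pd
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import DoubleType

@pandas_udf(DoubleType())
def charging_func(rating: pd.Series, duration: pd.Series) -> pd.Series:
    duration = duration.astype("float64")
    # Keep duration where rating <= 3.5, otherwise use duration * rating,
    # computed element-wise on the whole batch at once
    return duration.where(rating <= 3.5, duration * rating)

# Assumed usage, mirroring the question:
# df_2.select(charging_func(col("Rating"), col("Duration"))).show()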

get less correlated variable names

I have a dataset (50 columns, 100 rows).
I also have 50 variable names, 0, 1, 2, ..., 49, for the 50 columns.
I have to find the less correlated variables, say those with correlation < 0.7.
I tried as follows:
import os, glob, time, numpy as np, pandas as pd
data = np.random.randint(1,99,size=(100, 50))
dataframe = pd.DataFrame(data)
print (dataframe.shape)
codes = np.arange(50).astype(str)
dataframe.columns = codes
corr = dataframe.corr()
corr = corr.unstack().sort_values()
print (corr)
corr = corr.values
indices = np.where(corr < 0.7)
print (indices)
res = codes[indices[0]].tolist() + codes[indices[1]].tolist()
print (len(res))
res = list(set(res))
print (len(res))
The result is 50 (all variables!), which is unexpected.
How can I solve this problem?
As mentioned in the comments, your question is somewhat ambiguous. First, there is the possibility that no column pair is correlated. Second, the unstacking doesn't make sense, because it creates an index array that you can't use directly on your 2D array. Third (which should really come first, but I overlooked it): as @AmiTavory mentioned, there is no point in "correlating names".
The correlation procedure per se works, as you can see in the following example:
import numpy as np
import pandas as pd
A = np.arange(100).reshape(25, 4)
#random order in column 2, i.e. a low correlation to the first columns
np.random.shuffle(A[:,2])
#flip column 3 to create a negative correlation with the first columns
A[:,3] = np.flipud(A[:,3])
#column 1 is unchanged, therefore positively correlated to column 0
df = pd.DataFrame(A)
print(df)
#establish a correlation matrix
corr = df.corr()
#retrieve the index pairs below a certain value
#use only the upper triangle with np.triu to filter out symmetric duplicates
#use np.abs to take negative correlation into account as well
res = np.argwhere(np.triu(np.abs(corr.values) < 0.7))
print(res)
Output:
[[0 2]
 [1 2]
 [2 3]]
As expected, column 2 is the only one that is not correlated with any other column, meaning that all the other columns are correlated with each other.
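If you want the column labels rather than positional indices, the index pairs can be mapped back through the correlation matrix, for example with a small follow-up like this (using the corr and res objects defined above):
# Translate positional index pairs into the DataFrame's column labels
low_corr_pairs = [(corr.columns[i], corr.columns[j]) for i, j in res]
print(low_corr_pairs)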