Two Pandas dataframes, how to interpolate row-wise using scipy - pandas

How can I use scipy interpolate on two dataframes, interpolating row-wise?
For example, if I have:
dfx = pd.DataFrame({"a": [0.1, 0.2, 0.5, 0.6], "b": [3.2, 4.1, 1.1, 2.8]})
dfy = pd.DataFrame({"a": [0.8, 0.2, 1.1, 0.1], "b": [0.5, 1.3, 1.3, 2.8]})
display(dfx)
display(dfy)
And say I want to interpolate for y(x=0.5), how can I get the results into an array that I can put in a new dataframe?
Expected result is: [0.761290323 0.284615385 1.1 -0.022727273]
For example, for first row, you can see the expected value is 0.761290323:
x = [0.1, 3.2] # from dfx, row 0
y = [0.8,0.5] # from dfy, row 0
fig, ax = plt.subplots(1,1)
ax.plot(x,y)
f = scipy.interpolate.interp1d(x,y)
out = f(0.5)
print(out)
I tried the following but received ValueError: x and y arrays must be equal in length along interpolation axis.
f = scipy.interpolate.interp1d(dfx, dfy)
out = np.exp(f(0.5))
print(out)

Since you are looking for linear interpolation, you can do:
def interpolate(val, dfx, dfy):
    """Linearly interpolate y at x = val, independently for every row.

    Each row defines a straight line through the two points
    (dfx['a'], dfy['a']) and (dfx['b'], dfy['b']).

    Parameters
    ----------
    val : scalar x position at which every row's line is evaluated.
    dfx : frame with columns 'a' and 'b' holding the x coordinates.
    dfy : frame with columns 'a' and 'b' holding the y coordinates.

    Returns
    -------
    The row-wise result of ``dfy['a'] * t + dfy['b'] * (1 - t)``.

    Notes
    -----
    No clipping is applied, so a ``val`` outside a row's [a, b] range is
    extrapolated along the same line. Rows where dfx['a'] == dfx['b'] have
    a zero denominator.
    """
    # t is the weight of the 'a' point: 1 when val == dfx['a'], 0 when val == dfx['b'].
    t = (dfx['b'] - val) / (dfx['b'] - dfx['a'])
    return dfy['a'] * t + dfy['b'] * (1 - t)
interpolate(0.5, dfx, dfy)
Output:
0 0.761290
1 0.284615
2 1.100000
3 -0.022727
dtype: float64

Related

Expand matrix based on vector

I want to turn matrix A into matrix B.
Is there a better/more efficient approach with NumPy than the following?
import numpy as np
a = np.array([[0.02, 0.05, 0.05],
[0.35, 0.10, 0.45],
[0.08, 0.25, 0.15]])
w = np.array([0.75, 0.25])
B = np.insert(a, 9, a[2, :]).reshape(4, 3)
B = np.insert(B.T, 12, B[:, 2]).reshape(4, 4).T
B[2:4, :] = np.multiply(B[2:4, :].T, w).T
.insert isn't a good choice here because numpy needs to allocate memory to create a whole new array every time you do so. Instead, just pre-allocate the size of array you need, and then assign to its slices.
a = np.array([[0.02, 0.05, 0.05],
[0.35, 0.10, 0.45],
[0.08, 0.25, 0.15]])
w = np.array([0.75, 0.25])
b_shape = tuple(s + 1 for s in a.shape) # We need one more row and column than a
b = np.zeros(b_shape) # Create zero array of required shape
b[:a.shape[0], :a.shape[1]] = a # Set a in the top left corner
b[:, -1] = b[:, -2] # Set last column from second-last column
b[-1, :] = b[-2, :] # Set last row from second-last row
b[-w.shape[0]:, :] = b[-w.shape[0]:, :] * w[:, None] # Multiply last two rows with `w`
w[:, None] makes w a column vector (a 2x1 matrix), and numpy broadcasts the shapes to do the correct elementwise multiplication.
This gives us the required b:
array([[0.02 , 0.05 , 0.05 , 0.05 ],
[0.35 , 0.1 , 0.45 , 0.45 ],
[0.06 , 0.1875, 0.1125, 0.1125],
[0.02 , 0.0625, 0.0375, 0.0375]])
Putting this in a function to compare runtimes against your approach:
import numpy as np
import timeit
from matplotlib import pyplot as plt
#% Define functions
def func_insert(a, w):
    """Expand 2-D array ``a`` by one row and one column using ``np.insert``.

    The last row of ``a`` is duplicated below it, then the last column of
    that result is duplicated to its right. Finally the last ``len(w)``
    rows are scaled row-by-row by the entries of ``w``.

    Parameters
    ----------
    a : 2-D array of shape (rows, cols). It is not modified.
    w : 1-D array of per-row weights for the trailing rows.

    Returns
    -------
    Array of shape (rows + 1, cols + 1).
    """
    n_rows, n_cols = a.shape
    # np.insert without an axis flattens its input, so append the last row
    # to the flat data and fold it back into (rows + 1, cols).
    B = np.insert(a, a.size, a[-1, :]).reshape(n_rows + 1, n_cols)
    # Same trick on the transpose to append the last column. The flat data
    # is laid out column by column here, so it must be folded back as
    # (cols + 1, rows + 1) before transposing; using (rows + 1, cols + 1)
    # is only correct when the input is square.
    B = np.insert(B.T, B.size, B[:, -1]).reshape(n_cols + 1, n_rows + 1).T
    # Transposing lets the 1-D ``w`` broadcast across the trailing rows.
    B[-w.shape[0]:, :] = np.multiply(B[-w.shape[0]:, :].T, w).T
    return B
def func_prealloc(a, w):
    """Expand 2-D array ``a`` by one row and one column via pre-allocation.

    A zero array one larger than ``a`` in each dimension is allocated once,
    ``a`` is copied into its top-left corner, the last column and then the
    last row are filled from their neighbours, and the last ``len(w)`` rows
    are scaled row-by-row by the entries of ``w``.

    Parameters
    ----------
    a : 2-D array of shape (rows, cols). It is not modified.
    w : 1-D array of per-row weights for the trailing rows.

    Returns
    -------
    Array of shape (rows + 1, cols + 1).
    """
    b_shape = tuple(s + 1 for s in a.shape)  # one more row and column than a
    b = np.zeros(b_shape)
    b[:a.shape[0], :a.shape[1]] = a  # a goes in the top-left corner
    b[:, -1] = b[:, -2]  # new last column copies the second-last column
    # Done after the column copy so the new corner element is filled too.
    b[-1, :] = b[-2, :]
    # w[:, None] is a column vector, so each trailing row gets its own weight.
    b[-w.shape[0]:, :] = b[-w.shape[0]:, :] * w[:, None]
    return b
#% Time function calls
sizes = [3, 10, 50, 100, 500, 1000, 5000, 10_000]
# One row per size; column 0 holds the insert timing, column 1 the prealloc timing.
times = np.zeros((len(sizes), 2))
for i, size in enumerate(sizes):
    # a and w are module-level names on purpose: timeit evaluates the
    # statement strings below against globals().
    a = np.random.random((size, size))
    w = np.random.random((2,))
    # number=10 runs per measurement; dividing by 10 gives seconds per call.
    times[i, 0] = timeit.timeit("func_insert(a, w)", globals=globals(), number=10) / 10
    print(".")
    times[i, 1] = timeit.timeit("func_prealloc(a, w)", globals=globals(), number=10) / 10
    print("x")
#% Plot results
fig, ax = plt.subplots()
ax.plot(sizes, times[:, 0], label="Insert")
ax.plot(sizes, times[:, 1], label="Prealloc")
ax.set_xscale('log')
ax.set_yscale('log')
ax.legend()
ax.set_xlabel('Array size (NxN)')
ax.set_ylabel('Time per function call (s)')
ax.grid(True)
fig.tight_layout()
There's a consistent 3-5x speedup by preallocating.

Invalid RGBA argument: masked_array(data=[1.0, 0.5651961183210134, 0.0, 1.0], mask=False, when using Matplotlib

I am trying to plot a 4d graph using x,y,z labels with the fourth dimension being color. However, when trying to run this code, I run into this:
Invalid RGBA argument: masked_array(data=[1.0, 0.5651961183210134,
0.0, 1.0],
mask=False,
only whenever I try to change the z variable. The only time I don't get an error is when I set the z variable to this: np.random.standard_normal(100)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x = rainfall
y = airport_train_adult_pax
z = airport_total_pax
c = exchange_rate
img = ax.scatter(x, y, z, c=c, cmap=plt.hot())
fig.colorbar(img)
plt.show()
Just for some background of my data, the rainfall ranges from 0 to 200 and includes one decimal point, airport train and airport total ranges 2000000-3000000 range with no decimals, and exchange_rate ranges between 0 to 1 with two decimals.
I guess you have NaNs in your data.
Try to insert this code before ax.scatter(...):
df = pd.DataFrame({'x': x, 'y': y, 'z': z, 'c': c}).dropna()
x,y,z,c = [df[c] for c in df]

Value from iterative function in pandas

I have a dataframe and would like to have the values in one column being set through an iterative function as below.
import pandas as pd
import numpy as np
d = {'col1': [0.4444, 25.4615],
'col2': [0.5, 0.7],
'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3']/4
def func(df):
    """Count iterations of the asker's loop for a single row of values.

    ``df`` must behave like one row: ``df['col2']`` and ``df['col4']`` have
    to be scalars. Passing a whole DataFrame makes ``df['col2'] < a`` a
    Series, and the ``while`` test then raises
    "ValueError: The truth value of a Series is ambiguous".
    To evaluate every row, call this once per row, for example with
    ``DataFrame.apply(func, axis=1)``.

    Returns
    -------
    int : 1 plus the number of times the loop body ran.
    """
    a = np.exp(-df['col4'])
    n = 1
    while df['col2'] < a:
        a = a + df['col4'] * 4 / n
        n += 1
    return n
df['col5'] = func(df)
I get an error message "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." How can I run the function per row to solve the series/ambiguity problem?
EDIT: Added expected output.
out = {'col1': [0.4444, 25.4615],
'col2': [0.5, 0.7],
'col3': [7, 7],
'col4': [0.7777, 44.557625],
'col5': [0, 49]}
dfout = pd.DataFrame(out)
I am not sure what the values in col4 and col5 will be but according to the calculation I am trying to replicate those will be the values.
EDIT2: I had missed n+=1 in the while loop. added it now.
EDIT3: I am trying to apply
f(0) = e^-col4
f(n) = col4 * f(n-1) / n for n > 0
until f > col2 and then return the value of n per row.
Using the information you provided, this seems to be the solution:
import pandas as pd
import numpy as np
d = {'col1': [0.4444, 25.4615],
'col2': [0.5, 0.7],
'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3']/4
def func(df):
    """Return 1 for any input; ``df`` is accepted but never read."""
    n = 1
    return n
df['col5'] = func(df)
For what it is worth, here is an inefficient solution: after each iteration, keep track of which coefficient starts satisfying the condition.
import pandas as pd
import numpy as np
d = {'col1': [0.4444, 25.4615],
'col2': [0.5, 0.7],
'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3']/4
def func(df):
    """Return, per row, the counter value at which ``a`` first exceeds col2.

    ``a`` starts at ``exp(-col4)`` and is updated for all rows at once on
    every pass. The value of ``n`` at the moment a row first satisfies
    ``a > col2`` is stored for that row.

    Parameters
    ----------
    df : frame with numeric columns 'col2' (thresholds) and 'col4'.

    Returns
    -------
    list : one entry per row, in row order.

    Notes
    -----
    The loop only ends once every row satisfies the condition in the same
    pass, so it does not terminate for data where that never happens.
    """
    threshold = df['col2']
    rate = df['col4']
    a = np.exp(-rate)
    n = 1
    ns = [None] * len(threshold)

    def record_first_hits(status, n):
        # Iterate by position so the frame's index labels do not matter.
        for i, satisfied in enumerate(status):
            if ns[i] is None and satisfied:
                ns[i] = n

    status = a > threshold
    record_first_hits(status, n)
    # stops when all coefficients satisfy the condition
    while not status.all():
        # NOTE(review): this multiplies by n, whereas the recurrence stated in
        # the question is f(n) = col4 * f(n-1) / n -- confirm which is intended.
        a = a * rate * n
        status = a > threshold
        n += 1
        record_first_hits(status, n)
    return ns
df['col5'] = func(df)
print(df['col5'])

Vectorization of selective cumulative sum

I have a pandas Series where each element is a list with indices:
series_example = pd.Series([[1, 3, 2], [1, 2]])
In addition, I have an array with values associated to every index:
arr_example = np.array([3., 0.5, 0.25, 0.1])
I want to create a new Series with the cumulative sums of the elements of the array given by the indices in the row of the input Series. In the example, the output Series would have the following contents:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object
The non-vectorized way to do it would be the following:
def non_vector_transform(series, array):
    """Build, per row, the running sum of ``array`` values picked by index.

    Parameters
    ----------
    series : Series whose elements are lists of integer positions into
        ``array``.
    array : 1-D array of values.

    Returns
    -------
    Series of dtype object with one list per input row; entry k of a list
    is the sum of ``array[idx]`` over the first k + 1 indices of that row.
    The result carries a fresh default index.
    """
    # Size the result from the ``series`` argument itself rather than from a
    # module-level example, and walk it by position so the input's index
    # labels are irrelevant.
    rows = []
    for element_list in series:
        acum = 0
        running = []
        for element in element_list:
            acum += array[element]
            running.append(acum)
        rows.append(running)
    return pd.Series(rows, dtype=object)
I would like to do this in a vectorized way. Any vectorization magician to help me in here?
Use Series.apply and np.cumsum:
import numpy as np
import pandas as pd
series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])
result = series_example.apply(lambda x: np.cumsum(arr_example[x]))
print(result)
Or if you prefer a for loop:
import numpy as np
import pandas as pd
series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])
# Copy only if you do not want to overwrite the original series
result = series_example.copy()
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs.
for i, x in result.items():
    # x is a list of positions; fancy-index the array, then take the running sum.
    result[i] = np.cumsum(arr_example[x])
print(result)
Output:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object

matplotlib scatter with c=date

How to plot a pandas dataframe like the one below with x on the x-axis, the values on the y-axis (one line per row) and the lines colored by date
values = [[0.2, 3.1, 17.4, 28.9, 57.7, 76.9, 82.8, 87.6, 92.4, 98.9, 100.0],
[0.2, 2.1, 15.5, 26.0, 54.2, 75.6, 82.1, 87.4, 92.4, 98.9, 100.0]]
x = [0.1, 0.2, 0.315, 0.4, 0.63, 1, 1.25, 1.6, 2, 3.15, 4]
dates = pd.date_range(start='2017-07-01', freq='D', periods=2)
data = pd.DataFrame(data=values, columns=x)
data['dates'] = dates
edit: sorry for not being precise.
Is there a way to set the colors of the lines according to a column of Timestamps using data[x].T.plot(kind='line', legend=False)?
If this is not possible, how to set "c" in plt.scatter to an array of Timestamps?
edit: the plot should look like this but should have a colorbar instead of a legend
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# create test data with a structure similar to the real data
x_values = np.linspace(1, 10, 8)
dat = np.random.randn(100, 8)
df = pd.DataFrame(data=np.abs(dat), columns=x_values)
df = df.cumsum(axis=1)
df = df.divide(df.max(axis=1), axis='index')
# create discontinuous date range and add it to data frame
dates = pd.date_range(start=('2016-01-01'), end=('2017-05-01'), freq='D')
dates = dates[(dates < '2016-07-01') | (dates > '2017-03-01')]
df['date'] = sorted(random.sample(dates.date.tolist(), 100))
# create a dataframe with a continuous date range (see df) and corresponding colors
drange = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
colors = iter(plt.cm.jet(np.linspace(0, 1, drange.shape[0])))
cdf = pd.DataFrame(data=np.array([drange.date, list(colors)]).T, columns=['date', 'colors'])
# and merge colors to data
data = pd.merge(df, cdf)
# plot all data row by row with color of lines
# matching the date columns
fig, ax = plt.subplots()
for idx in data.index:
    # One line per row: y values come from the x_values columns, and the
    # colour and legend label from that row's 'colors' and 'date' entries.
    ax.plot(x_values, data.loc[idx, x_values],
            linestyle='-', alpha=0.75,
            color=data.loc[idx, 'colors'],
            label=data.loc[idx, 'date'])
# reduce entries of legend
handles, labels = ax.get_legend_handles_labels()
entries = int(data.shape[0]/10)
handles = handles[::entries]
labels = labels[::entries]
ax.legend(handles, labels)