I have a data frame, shown below, that includes price, side and volume parameters from multiple exchanges.
df = pd.DataFrame({
'price_ex1' : [9380.59650, 9394.85206, 9397.80000],
'side_ex1' : ['bid', 'bid', 'ask'],
'size_ex1' : [0.416, 0.053, 0.023],
'price_ex2' : [9437.24045, 9487.81185, 9497.81424],
'side_ex2' : ['bid', 'bid', 'ask'],
'size_ex2' : [10.0, 556.0, 23.0]
})
df
price_ex1 side_ex1 size_ex1 price_ex2 side_ex2 size_ex2
0 9380.59650 bid 0.416 9437.24045 bid 10.0
1 9394.85206 bid 0.053 9487.81185 bid 556.0
2 9397.80000 ask 0.023 9497.81424 ask 23.0
For each exchange (I have more than two exchanges), I want the index to be the union of all prices from all exchanges (i.e. the union of price_ex1, price_ex2, etc.) ranked from highest to lowest. Then I want to create two size columns (bid and ask) for each exchange based on the side parameter of that exchange, with empty cells as NaN.
I am not sure which pandas function is best for this (pivot or melt?), or how to use it when there is more than one binary bid/ask column to flatten.
Thank you for your help!
This is a three-step process: convert the columns to a MultiIndex, stack the dataset, then pivot it.
First, convert the columns to a MultiIndex so the data is easier to reshape:
df.columns = pd.MultiIndex.from_product([['1', '2'], [col[:-4] for col in df.columns[:3]]], names=['exchange', 'params'])
exchange 1 2
params price side size price side size
0 9380.59650 bid 0.416 9437.24045 bid 10.0
1 9394.85206 bid 0.053 9487.81185 bid 556.0
2 9397.80000 ask 0.023 9497.81424 ask 23.0
Then stack, and append the exchange number to the bid/ask values:
df = df.swaplevel(axis=1).stack()
df['side'] = df.apply(lambda row: row.side + '_ex' + row.name[1], axis=1)
params price side size
exchange
0 1 9380.59650 bid_ex1 0.416
2 9437.24045 bid_ex2 10.000
1 1 9394.85206 bid_ex1 0.053
2 9487.81185 bid_ex2 556.000
2 1 9397.80000 ask_ex1 0.023
2 9497.81424 ask_ex2 23.000
Finally, pivot and sort by price:
df.pivot_table(index=['price'], values=['size'], columns=['side']).sort_values('price', ascending=False)
params size
side ask_ex1 ask_ex2 bid_ex1 bid_ex2
price
9497.81424 NaN 23.0 NaN NaN
9487.81185 NaN NaN NaN 556.0
9437.24045 NaN NaN NaN 10.0
9397.80000 0.023 NaN NaN NaN
9394.85206 NaN NaN 0.053 NaN
9380.59650 NaN NaN 0.416 NaN
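The same idea extends to any number of exchanges. A minimal sketch, starting again from the original wide frame and assuming every exchange contributes a price_exN/side_exN/size_exN triple with the columns grouped per exchange in that order:
n_exchanges = df.shape[1] // 3  # three columns per exchange
df.columns = pd.MultiIndex.from_product(
    [[str(i + 1) for i in range(n_exchanges)], ['price', 'side', 'size']],
    names=['exchange', 'params'])
out = df.swaplevel(axis=1).stack()
out['side'] = out['side'] + '_ex' + out.index.get_level_values('exchange')
out = (out.pivot_table(index='price', columns='side', values='size')
          .sort_values('price', ascending=False))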
You can try something like this.
First save the data you showed us as a CSV file named 'example.csv' with these columns:
price_ex1 side_ex1 size_ex1 price_ex2 side_ex2 size_ex2
import pandas as pd
import numpy as np

df = pd.read_csv('example.csv')

# Slice out each exchange and give the three columns common names
df1 = df[['price_ex1', 'side_ex1', 'size_ex1']].copy()
df2 = df[['price_ex2', 'side_ex2', 'size_ex2']].copy()
df1.columns = ['price', 'side', 'size']
df2.columns = ['price', 'side', 'size']
df1['exchange'] = 'ex1'
df2['exchange'] = 'ex2'

# Stack the exchanges on top of each other (DataFrame.append is deprecated, so use pd.concat)
df3 = pd.concat([df1, df2], ignore_index=True)

# Route each size into a bid_exN or ask_exN column according to its side
for ex in ['ex1', 'ex2']:
    for side in ['bid', 'ask']:
        mask = (df3['exchange'] == ex) & (df3['side'] == side)
        df3[f'{side}_{ex}'] = np.where(mask, df3['size'], np.nan)

# Keep only the price and the per-exchange size columns, highest price first
df3 = (df3.drop(columns=['side', 'size', 'exchange'])
          .sort_values('price', ascending=False)
          .reset_index(drop=True))
print(df3)
One option is to reshape to long form with pivot_longer and then back to wide form with pivot_wider, both from pyjanitor:
# pip install pyjanitor
import pandas as pd
import janitor
(df
.pivot_longer(names_to = ('ex1', 'ex2', 'ex'),
values_to=('price','side','size'),
names_pattern=['price', 'side', 'size'])
.loc[:, ['price', 'side','ex','size']]
.assign(ex = lambda df: df.ex.str.split('_').str[-1])
.pivot_wider('price', ('side', 'ex'), 'size')
.sort_values('price', ascending = False)
)
price bid_ex1 ask_ex1 bid_ex2 ask_ex2
5 9497.81424 NaN NaN NaN 23.0
4 9487.81185 NaN NaN 556.0 NaN
3 9437.24045 NaN NaN 10.0 NaN
2 9397.80000 NaN 0.023 NaN NaN
1 9394.85206 0.053 NaN NaN NaN
0 9380.59650 0.416 NaN NaN NaN
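For completeness, a similar long-then-wide round trip can be sketched with stock pandas via wide_to_long (assuming the price_exN/side_exN/size_exN naming from the question):
long = (pd.wide_to_long(df.reset_index(),
                        stubnames=['price', 'side', 'size'],
                        i='index', j='ex', sep='_', suffix=r'ex\d+')
          .reset_index())
out = (long.assign(side=long['side'] + '_' + long['ex'])
           .pivot_table(index='price', columns='side', values='size')
           .sort_index(ascending=False))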
I would like to take a dataframe and concatenate consecutive rows for comparison.
e.g.
Take
xyt = pd.DataFrame(np.concatenate((np.random.randn(3,2), np.arange(3).reshape((3, 1))), axis=1), columns=['x','y','t'])
Which looks something like:
x y t
0 1.237007 -1.035837 0.0
1 -1.782458 1.042942 1.0
2 0.063130 0.355014 2.0
And make:
a b
x y t x y t
0 1.237007 -1.035837 0.0 -1.782458 1.042942 1.0
1 -1.782458 1.042942 1.0 0.063130 0.355014 2.0
The best I could come up with was:
pd.DataFrame(
[np.append(x,y) for (x, y) in zip(xyt.values, xyt[1:].values)],
columns=pd.MultiIndex.from_product([('a', 'b'), xyt.columns]))
Is there a better way?
Let's try concat on axis=1 with the shifted frame:
import pandas as pd
xyt = pd.DataFrame({'x': {0: 1.237007, 1: -1.782458, 2: 0.06313},
'y': {0: -1.035837, 1: 1.042942, 2: 0.355014},
't': {0: 0.0, 1: 1.0, 2: 2.0}})
merged = pd.concat((xyt, xyt.shift(-1)), axis=1, keys=('a', 'b')).iloc[:-1]
print(merged)
merged:
a b
x y t x y t
0 1.237007 -1.035837 0.0 -1.782458 1.042942 1.0
1 -1.782458 1.042942 1.0 0.063130 0.355014 2.0
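The same shift trick generalizes to windows of more than two consecutive rows (a sketch; k is an assumed window width):
k = 3
windows = pd.concat([xyt.shift(-i) for i in range(k)], axis=1,
                    keys=[f'r{i}' for i in range(k)]).iloc[:-(k - 1)]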
You can use pd.concat:
# Generate random data
n = 10
x, y = np.random.randn(2, n)
t = np.arange(n)
xyt = pd.DataFrame({
'x': x, 'y': y, 't': t
})
# The call
pd.concat([xyt, xyt.shift(-1)], axis=1, keys=['a','b'])
# Result
a b
x y t x y t
0 1.180544 1.707380 0 -0.227370 0.734225 1.0
1 -0.227370 0.734225 1 0.271997 -1.039424 2.0
2 0.271997 -1.039424 2 -0.729960 -1.081224 3.0
3 -0.729960 -1.081224 3 0.185301 0.530126 4.0
4 0.185301 0.530126 4 -0.175333 -0.126157 5.0
5 -0.175333 -0.126157 5 -0.634870 0.068683 6.0
6 -0.634870 0.068683 6 0.350867 0.361564 7.0
7 0.350867 0.361564 7 0.090678 -0.269504 8.0
8 0.090678 -0.269504 8 0.177076 -0.976640 9.0
9 0.177076 -0.976640 9 NaN NaN NaN
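Note that shift(-1) leaves NaN in the last row (which is why t in block b shows as float); if you don't want that incomplete final row, drop it:
pd.concat([xyt, xyt.shift(-1)], axis=1, keys=['a', 'b']).dropna()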
I'm trying to assign a list value to a column with the following code
In [105] df[df['review_meta_id'] == 5349]['tags'].head()
Out [105] 4 NaN
2035 NaN
2630 NaN
3085 NaN
6833 NaN
Name: tags, dtype: object
In [106] tags
Out [106] ['자연공원', '도심상점']
In [107] df.loc[df['review_meta_id'] == 5349,'tags'] = pd.Series(tags)
In [108] df[df['review_meta_id'] == 5349]['tags'].head()
Out [108] 4 NaN
2035 NaN
2630 NaN
3085 NaN
6833 NaN
Name: tags, dtype: object
In [109]
So why is the value not being assigned?
Edit:
So it seems I can do something like
df.loc[df['review_meta_id'] == 5349,'tags'] = pd.Series([tags] * len(df))
But why doesn't this work?
df.loc[df['review_meta_id'] == 5349,'tags'] = pd.Series([tags] * len(df[df['review_meta_id'] == 5349]))
The reason is that pandas aligns on the index when you assign a Series. A Series created without an explicit index gets a RangeIndex from 0 to n-1, which does not match the index labels of the rows selected by the mask, so those positions end up as NaN. Build the Series with the matching index instead:
df.loc[df['review_meta_id'] == 5349,'tags'] = pd.Series([tags] * len(df[df['review_meta_id'] == 5349]), index = df.index[df['review_meta_id'] == 5349] )
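A minimal, self-contained sketch of the alignment behaviour (the small frame and the matching rows here are made up for illustration):
import pandas as pd

# Hypothetical frame: the id matches the rows at index labels 0 and 2
df = pd.DataFrame({'review_meta_id': [5349, 1, 5349],
                   'tags': [None, None, None]})
mask = df['review_meta_id'] == 5349
tags = ['자연공원', '도심상점']

# A Series built without an index is labelled 0..1, which only overlaps
# label 0, so label 2 is left as NaN after alignment
df.loc[mask, 'tags'] = pd.Series([tags] * mask.sum())

# Reusing the masked labels as the index aligns every selected row
df.loc[mask, 'tags'] = pd.Series([tags] * mask.sum(), index=df.index[mask])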
I want to divide a dataframe by a number:
df = df/10
Is there a way to do this in a method chain?
# idea:
df = df.filter(['a','b']).query("a>100").assign(**divide by 10)
We can use DataFrame.div here:
df = df[['a','b']].query("a>100").div(10)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
Use DataFrame.pipe with a lambda to apply a function to the whole DataFrame:
df = pd.DataFrame({
'a':[400,500,40,50,5,700],
'b':[7,8,9,4,2,3],
'c':[1,3,5,7,1,0],
'd':[5,3,6,9,2,4]
})
df = df.filter(['a','b']).query("a>100").pipe(lambda x: x / 10)
print (df)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
If you use apply instead, each column is passed to the function separately (the result is the same here, but the mechanics differ):
df = df.filter(['a','b']).query("a>100").apply(lambda x: x / 10)
You can see the difference by printing inside the function:
df1 = df.filter(['a','b']).query("a>100").pipe(lambda x: print (x))
a b
0 400 7
1 500 8
5 700 3
df2 = df.filter(['a','b']).query("a>100").apply(lambda x: print (x))
0 400
1 500
5 700
Name: a, dtype: int64
0 7
1 8
5 3
Name: b, dtype: int64
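If you want to stay closer to the .assign(**...) idea from the question, a dict comprehension over the kept columns also works, starting from the original df (a sketch, assuming the columns are 'a' and 'b'):
df = (df.filter(['a', 'b'])
        .query("a > 100")
        # each lambda receives the intermediate frame; c=c pins the column name
        .assign(**{c: lambda d, c=c: d[c] / 10 for c in ['a', 'b']}))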
Suppose I have a dataframe as shown below:
import pandas as pd
import numpy as np
np.random.seed(42)
df = pd.DataFrame({'A':np.random.randn(5), 'B': np.zeros(5), 'C': np.zeros(5)})
df
>>>
A B C
0 0.496714 0.0 0.0
1 -0.138264 0.0 0.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 0.0 0.0
And I have a list of columns that I want to populate with the value 1 when A is negative.
idx = df.A < 0
cols = ['B', 'C']
So in this case, I want the indices [1, 'B'] and [4, 'C'] set to 1.
What I tried:
However, doing df.loc[idx, cols] = 1 sets every listed column of the matching rows to 1, not just the single column I want. I also tried df.loc[idx, cols] = pd.get_dummies(cols), which gave this result:
A B C
0 0.496714 0.0 0.0
1 -0.138264 0.0 1.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 NaN NaN
I'm assuming this is because the index of get_dummies and the dataframe don't line up.
Expected Output:
A B C
0 0.496714 0.0 0.0
1 -0.138264 1.0 0.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 0.0 1.0
So what's the best (read: fastest) way to do this? In my case, there are thousands of rows and 5 columns.
Timing of results:
TLDR: editing values directly is faster.
%%timeit
df.values[idx, df.columns.get_indexer(cols)] = 1
123 µs ± 2.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%%timeit
df.iloc[idx.array,df.columns.get_indexer(cols)]=1
266 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Use numpy indexing to improve performance:
idx = df.A < 0
res = ['B', 'C']
arr = df.values
arr[idx, df.columns.get_indexer(res)] = 1
print (arr)
[[ 0.49671415 0. 0. ]
[-0.1382643 1. 0. ]
[ 0.64768854 0. 0. ]
[ 1.52302986 0. 0. ]
[-0.23415337 0. 1. ]]
df = pd.DataFrame(arr, columns=df.columns, index=df.index)
print (df)
A B C
0 0.496714 0.0 0.0
1 -0.138264 1.0 0.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 0.0 1.0
Alternative:
idx = df.A < 0
res = ['B', 'C']
df.values[idx, df.columns.get_indexer(res)] = 1
print (df)
A B C
0 0.496714 0.0 0.0
1 -0.138264 1.0 0.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 0.0 1.0
Another option is DataFrame.at in a loop over the matching (row, column) pairs:
ind = df.index[idx]
for i, col in zip(ind, res):
    df.at[i, col] = 1
In [7]: df
Out[7]:
A B C
0 0.496714 0.0 0.0
1 -0.138264 1.0 0.0
2 0.647689 0.0 0.0
3 1.523030 0.0 0.0
4 -0.234153 0.0 1.0
Why do the following assignments behave differently?
df.loc[rows, [col]] = ...
df.loc[rows, col] = ...
For example:
r = pd.DataFrame({"response": [1,1,1],},index = [1,2,3] )
df = pd.DataFrame({"x": [999,99,9],}, index = [3,4,5] )
df = pd.merge(df, r, how="left", left_index=True, right_index=True)
df.loc[df["response"].isnull(), "response"] = 0
print df
x response
3 999 0.0
4 99 0.0
5 9 0.0
but
df.loc[df["response"].isnull(), ["response"]] = 0
print df
x response
3 999 1.0
4 99 0.0
5 9 0.0
Why should I expect the first to behave differently from the second?
df.loc[df["response"].isnull(), ["response"]]
returns a DataFrame, so anything you assign to it must be aligned by both index and columns.
Demo:
In [79]: df.loc[df["response"].isnull(), ["response"]] = \
pd.DataFrame([11,12], columns=['response'], index=[4,5])
In [80]: df
Out[80]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
Alternatively, you can assign an array of matching shape:
In [83]: df.loc[df["response"].isnull(), ["response"]] = [11, 12]
In [84]: df
Out[84]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
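A quick way to see that the two indexers return different kinds of objects (starting again from the merged frame in the question):
mask = df["response"].isnull()
print(type(df.loc[mask, "response"]))    # <class 'pandas.core.series.Series'>
print(type(df.loc[mask, ["response"]]))  # <class 'pandas.core.frame.DataFrame'>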
I'd also consider using the fillna() method:
In [88]: df.response = df.response.fillna(0)
In [89]: df
Out[89]:
x response
3 999 1.0
4 99 0.0
5 9 0.0