I want a DataFrame like so:
timestamp | bids | asks | ticker
-------------------------------------------------------------------
1598215600 | [[10, 20], [15, 30]] | [[20, 10], [25, 20]] | "AAPL"
1598222400 | [[11, 25], [16, 35]] | [[22, 15], [28, 25]] | "MSFT"
1598229200 | [[12, 30], [18, 40]] | [[24, 20], [30, 30]] | "GOOG"
The bids Series has a Vec<Vec<...>> structure: in plain words, each row is a vector holding [price, amount] pairs, where each pair is itself a two-element vector.
What Rust code is required to create this? If possible, answer in Rust, but Python works too; I can probably recreate it from there.
I'm new to Rust, so it's possible this is not optimal.
From looking around, it seems like ChunkedArray may be the way to go?
use polars::prelude::*;

// Build a Series with dtype list[list[i64]]: each row holds a list of
// [price, amount] pairs, and each pair becomes an inner Int64 list.
fn build_column(rows: &Vec<[[i64; 2]; 2]>) -> Series {
    ListChunked::from_iter(rows.into_iter().map(|row| {
        ListChunked::from_iter(
            row.into_iter()
                .map(|values| Int64Chunked::from_slice("", values).into_series()),
        )
        .into_series()
    }))
    .into_series()
}
fn main() -> PolarsResult<()> {
    let asks = vec![
        [[20, 10], [25, 20]],
        [[22, 15], [28, 25]],
        [[24, 20], [30, 30]],
    ];
    let bids = vec![
        [[10, 20], [15, 30]],
        [[11, 25], [16, 35]],
        [[12, 30], [18, 40]],
    ];
    let df = df!(
        "timestamp" => [1598215600, 1598222400, 1598229200],
        "asks" => build_column(&asks),
        "bids" => build_column(&bids),
        "ticker" => ["AAPL", "MSFT", "GOOG"]
    );
    println!("{:?}", df);
    Ok(())
}
Ok(shape: (3, 4)
┌────────────┬──────────────────────┬──────────────────────┬────────┐
│ timestamp ┆ asks ┆ bids ┆ ticker │
│ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ list[list[i64]] ┆ list[list[i64]] ┆ str │
╞════════════╪══════════════════════╪══════════════════════╪════════╡
│ 1598215600 ┆ [[20, 10], [25, 20]] ┆ [[10, 20], [15, 30]] ┆ AAPL │
│ 1598222400 ┆ [[22, 15], [28, 25]] ┆ [[11, 25], [16, 35]] ┆ MSFT │
│ 1598229200 ┆ [[24, 20], [30, 30]] ┆ [[12, 30], [18, 40]] ┆ GOOG │
└────────────┴──────────────────────┴──────────────────────┴────────┘)
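Since the question says a Python answer would work too, here is a rough sketch of what I believe the equivalent looks like in Python Polars; nested Python lists should be inferred as list[list[i64]] columns (the values are taken from the table above):
import polars as pl

# Nested Python lists per row; Polars should infer list[list[i64]] for bids/asks.
df = pl.DataFrame(
    {
        "timestamp": [1598215600, 1598222400, 1598229200],
        "bids": [
            [[10, 20], [15, 30]],
            [[11, 25], [16, 35]],
            [[12, 30], [18, 40]],
        ],
        "asks": [
            [[20, 10], [25, 20]],
            [[22, 15], [28, 25]],
            [[24, 20], [30, 30]],
        ],
        "ticker": ["AAPL", "MSFT", "GOOG"],
    }
)
print(df)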
I have a DataFrame with 2 columns, where the first column contains lists and the second column contains integer indexes. How can I get the element from the first column at the index given in the second column? Or, even better, put that element into a third column? So, for example, how do I get from this
a = pl.DataFrame([{'lst': [1, 2, 3], 'ind': 1}, {'lst': [4, 5, 6], 'ind': 2}])
┌───────────┬─────┐
│ lst ┆ ind │
│ --- ┆ --- │
│ list[i64] ┆ i64 │
╞═══════════╪═════╡
│ [1, 2, 3] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ [4, 5, 6] ┆ 2 │
└───────────┴─────┘
to this?
b = pl.DataFrame([{'lst': [1, 2, 3], 'ind': 1, 'list[ind]': 2}, {'lst': [4, 5, 6], 'ind': 2, 'list[ind]': 6}])
┌───────────┬─────┬───────────┐
│ lst ┆ ind ┆ list[ind] │
│ --- ┆ --- ┆ --- │
│ list[i64] ┆ i64 ┆ i64 │
╞═══════════╪═════╪═══════════╡
│ [1, 2, 3] ┆ 1 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ [4, 5, 6] ┆ 2 ┆ 6 │
└───────────┴─────┴───────────┘
Thanks.
Edit
As of Python Polars 0.14.24, this can be done more easily with:
df.with_column(pl.col("lst").arr.get(pl.col("ind")).alias("list[ind]"))
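For context, a minimal runnable sketch of that one-liner on the example data from the question (assuming a Polars version around 0.14.24, where with_column and .arr.get are available):
import polars as pl

df = pl.DataFrame({"lst": [[1, 2, 3], [4, 5, 6]], "ind": [1, 2]})
# .arr.get accepts an expression, so each row's list is indexed by that row's "ind" value
df = df.with_column(pl.col("lst").arr.get(pl.col("ind")).alias("list[ind]"))
print(df)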
Original answer
You can use with_row_count() to add a row count column for grouping, then explode() the list so that each list element sits on its own row. Then call take() with over() on the row count column to pick the element at the given index within each subgroup.
df = pl.DataFrame({"lst": [[1, 2, 3], [4, 5, 6]], "ind": [1, 2]})
df = (
    df.with_row_count()
    .with_column(
        pl.col("lst").explode().take(pl.col("ind")).over(pl.col("row_nr")).alias("list[ind]")
    )
    .drop("row_nr")
)
shape: (2, 3)
┌───────────┬─────┬───────────┐
│ lst ┆ ind ┆ list[ind] │
│ --- ┆ --- ┆ --- │
│ list[i64] ┆ i64 ┆ i64 │
╞═══════════╪═════╪═══════════╡
│ [1, 2, 3] ┆ 1 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ [4, 5, 6] ┆ 2 ┆ 6 │
└───────────┴─────┴───────────┘
Here is my approach:
Create a custom function to get the value at the required index:
def get_elem(d):
    # d is [idx, lista]: return the list element at the given index
    sel_idx = d[0]
    return d[1][sel_idx]
Here is some test data:
df = pl.DataFrame({'lista':[[1,2,3],[4,5,6]],'idx':[1,2]})
Now let's create a struct from these two columns (it creates a dict per row) and apply the above function:
df.with_columns([
    pl.struct(['idx', 'lista']).apply(lambda x: get_elem(list(x.values()))).alias('req_elem')
])
shape: (2, 3)
┌───────────┬─────┬──────────┐
│ lista ┆ idx ┆ req_elem │
│ --- ┆ --- ┆ --- │
│ list[i64] ┆ i64 ┆ i64 │
╞═══════════╪═════╪══════════╡
│ [1, 2, 3] ┆ 1 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ [4, 5, 6] ┆ 2 ┆ 6 │
└───────────┴─────┴──────────┘
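As a side note, if apply receives the struct row as a dict keyed by field name (my assumption for this Polars version), the fields can also be accessed by name, which avoids depending on the field order of list(x.values()):
df.with_columns([
    pl.struct(['idx', 'lista'])
    .apply(lambda x: x['lista'][x['idx']])  # assumes x is a dict like {'idx': ..., 'lista': [...]}
    .alias('req_elem')
])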
If the number of unique idx values isn't absolutely massive, you can build a when/then expression that selects based on the value of idx using arr.get(idx):
import polars as pl
df = pl.DataFrame([{"lst": [1, 2, 3], "ind": 1}, {"lst": [4, 5, 6], "ind": 2}])
# create when/then expression for each unique index
idxs = df["ind"].unique()
ind, lst = pl.col("ind"), pl.col("lst") # makes expression generator look cleaner
expr = pl.when(ind == idxs[0]).then(lst.arr.get(idxs[0]))
for idx in idxs[1:]:
    expr = expr.when(ind == idx).then(lst.arr.get(idx))
expr = expr.otherwise(None)
df.select(expr)
shape: (2, 1)
┌─────┐
│ lst │
│ --- │
│ i64 │
╞═════╡
│ 2 │
├╌╌╌╌╌┤
│ 6 │
└─────┘
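To put the result into a third column (as in the desired output) rather than selecting it on its own, the same expression can presumably be combined with with_columns and an alias:
# add the selected element as a new column next to lst and ind
df.with_columns(expr.alias("list[ind]"))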
Python version 3.7 - 3.9
pandas 1.4.4
>>> import pandas as pd
>>> import numpy as np
>>> df1 = pd.DataFrame({'name':[np.nan, np.nan, 'John', 'Mary', 'David'],
... 'age' :[np.nan, np.nan, 20, 30, 40],
... 'state': ['NY', 'CA', 'CA', 'NY', 'IL']})
>>> df1
name age state
0 NaN NaN NY
1 NaN NaN CA
2 John 20.0 CA
3 Mary 30.0 NY
4 David 40.0 IL
>>> df2 = pd.DataFrame({'name':['Lee', 'David', 'Mary', np.nan],
... 'age':[np.nan, 40, 30, np.nan],
... 'city':['Boston', 'Chicago', 'New York', 'Seattle']})
>>> df2
name age city
0 Lee NaN Boston
1 David 40.0 Chicago
2 Mary 30.0 New York
3 NaN NaN Seattle
>>> df1_new = df1.set_index(['name', 'age'])
>>> df2_new = df2.set_index(['name', 'age'])
>>> df1_new.index
MultiIndex([( nan, nan),
( nan, nan),
( 'John', 20.0),
( 'Mary', 30.0),
('David', 40.0)],
names=['name', 'age'])
>>> df2_new.index
MultiIndex([( 'Lee', nan),
('David', 40.0),
( 'Mary', 30.0),
( nan, nan)],
names=['name', 'age'])
>>> df1_new.index.union(df2_new.index)
MultiIndex([('David', 40.0),
( 'John', 20.0),
( 'Mary', 30.0),
( nan, nan),
( nan, nan)],
names=['name', 'age'])
>>> df2_new.index.union(df1_new.index)
MultiIndex([('David', 40.0),
( 'John', 20.0),
( 'Lee', nan),
( 'Mary', 30.0),
( nan, nan),
( nan, nan)],
names=['name', 'age'])
So the output of df1_new.index.union(df2_new.index) is missing the key ('Lee', nan), which does appear in df2_new.index.union(df1_new.index). This is unexpected behavior: mathematically, a union operation should not depend on the order of its operands.
I noticed this issue while trying to figure out how to ensure that pandas.merge(df1, df2, left_index=True, right_index=True, how='outer') will include all indices.
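For reference, this is a minimal sketch of the outer merge I am trying to make robust, using the MultiIndexed frames df1_new and df2_new from above (my assumption is that the join should align on the ('name', 'age') index):
# outer join on the shared MultiIndex; all keys from both frames should survive
merged = pd.merge(df1_new, df2_new, left_index=True, right_index=True, how='outer')
print(merged)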
First, we create a large dataset with a MultiIndex whose first record contains missing values (np.NaN):
In [200]: data = []
     ...: val = 0
     ...: for ind_1 in range(3000):
     ...:     if ind_1 == 0:
     ...:         data.append({'ind_1': 0, 'ind_2': np.NaN, 'val': np.NaN})
     ...:     else:
     ...:         for ind_2 in range(3000):
     ...:             data.append({'ind_1': ind_1, 'ind_2': ind_2, 'val': val})
     ...:             val += 1
     ...: df = pd.DataFrame(data).set_index(['ind_1', 'ind_2'])
In [201]: df
Out[201]:
val
ind_1 ind_2
0 NaN NaN
1 0.0 0.0
1.0 1.0
2.0 2.0
3.0 3.0
... ...
2999 2995.0 8996995.0
2996.0 8996996.0
2997.0 8996997.0
2998.0 8996998.0
2999.0 8996999.0
[8997001 rows x 1 columns]
I want to select all rows where ind_1 < 3 and ind_2 < 3
First, I create a MultiIndex i1 where ind_1 < 3:
In [202]: i1 = df.loc[df.index.get_level_values('ind_1') < 3].index
In [203]: i1
Out[203]:
MultiIndex([(0, nan),
(1, 0.0),
(1, 1.0),
(1, 2.0),
(1, 3.0),
(1, 4.0),
(1, 5.0),
(1, 6.0),
(1, 7.0),
(1, 8.0),
...
(2, 2990.0),
(2, 2991.0),
(2, 2992.0),
(2, 2993.0),
(2, 2994.0),
(2, 2995.0),
(2, 2996.0),
(2, 2997.0),
(2, 2998.0),
(2, 2999.0)],
names=['ind_1', 'ind_2'], length=6001)
Then I create a MultiIndex i2 where ind_2 < 3, written as ~(ind_2 > 2) so that the comparison does not exclude the NaN entry:
In [204]: i2 = df.loc[~(df.index.get_level_values('ind_2') > 2)].index
In [205]: i2
Out[205]:
MultiIndex([( 0, nan),
( 1, 0.0),
( 1, 1.0),
( 1, 2.0),
( 2, 0.0),
( 2, 1.0),
( 2, 2.0),
( 3, 0.0),
( 3, 1.0),
( 3, 2.0),
...
(2996, 2.0),
(2997, 0.0),
(2997, 1.0),
(2997, 2.0),
(2998, 0.0),
(2998, 1.0),
(2998, 2.0),
(2999, 0.0),
(2999, 1.0),
(2999, 2.0)],
names=['ind_1', 'ind_2'], length=8998)
Logically, the solution should be the intersection of these two sets
In [206]: df.loc[i1 & i2]
Out[206]:
val
ind_1 ind_2
1 0.0 0.0
1.0 1.0
2.0 2.0
2 0.0 3000.0
1.0 3001.0
2.0 3002.0
Why is the first record (0, nan) filtered out?
Use boolean arrays i1, i2 instead of indexes
In [27]: i1 = df.index.get_level_values('ind_1') < 3
In [28]: i2 = ~(df.index.get_level_values('ind_2') > 2)
In [29]: i1
Out[29]: array([ True, True, True, ..., False, False, False])
In [30]: i2
Out[30]: array([ True, True, True, ..., False, False, False])
In [31]: df.loc[i1 & i2]
Out[31]:
val
ind_1 ind_2
0 NaN NaN
1 0.0 0.0
1.0 1.0
2.0 2.0
2 0.0 3000.0
1.0 3001.0
2.0 3002.0
I have a series of events produced by different users over time.
How can I aggregate this series into groups of events that are close to each other? Two events a and b are close (in the same window) if:
b.user = a.user
and b.time >= a.time
and b.time - a.time <= interval '1 month'
This is a recursive condition. For example, the following dataset:
CREATE TABLE pg_temp.Data
("event" int, "user" int, "date" date, "value" int)
;
INSERT INTO pg_temp.Data
("event", "user", "date", "value")
VALUES
(1, 1, '2017-01-01', 5),
(2, 1, '2017-01-07', 3),
(3, 1, '2017-02-09', 2),
(4, 1, '2017-03-12', 4),
(5, 1, '2017-04-03', 7),
(6, 1, '2017-05-01', 6),
(7, 2, '2017-01-05', 9),
(8, 2, '2017-01-12', 1),
(9, 2, '2017-03-24', 6)
;
select * from pg_temp.Data
should be reduced to something like:
[
{
"init": "2017-01-01",
"latest": "2017-01-07",
"events": [
1,
2
],
"user": 1,
"value": 8
},
{
"init": "2017-02-09",
"latest": "2017-02-09",
"events": [
3
],
"user": 1,
"value": 2
},
{
"init": "2017-03-12",
"latest": "2017-05-01",
"events": [
4,
5,
6
],
"user": 1,
"value": 17
},
{
"init": "2017-01-05",
"latest": "2017-01-12",
"events": [
7,
8
],
"user": 2,
"value": 10
},
{
"init": "2017-03-24",
"latest": "2017-03-24",
"events": [
9
],
"user": 2,
"value": 6
}
]
Where init and latest are the window's time range and value is the sum of values in the window.
Note that events 6 and 4 are more than one month apart, but they've been aggregated into the same group because event 5 is in between them.
Use window functions:
SELECT min(date) AS init,
       max(date) AS latest,
       array_agg(event) AS events,
       "user",
       sum(value) AS value
FROM (SELECT event,
             "user",
             date,
             value,
             -- running count of group starts = session number per user
             count(grp_start)
                OVER (PARTITION BY "user" ORDER BY date) AS session_id
      FROM (SELECT event,
                   "user",
                   date,
                   value,
                   -- mark rows that start a new session: more than one month
                   -- after the previous event of the same user
                   CASE
                      WHEN date
                           > lag(date, 1, timestamp '-infinity')
                                OVER (PARTITION BY "user" ORDER BY date)
                             + INTERVAL '1 month'
                      THEN 1
                   END AS grp_start
            FROM data
           ) tagged
     ) numbered
GROUP BY "user", session_id
ORDER BY "user", init;
This will result in:
┌─────────────────────┬─────────────────────┬─────────┬──────┬───────┐
│ init │ latest │ events │ user │ value │
├─────────────────────┼─────────────────────┼─────────┼──────┼───────┤
│ 2017-01-01 00:00:00 │ 2017-01-07 00:00:00 │ {1,2} │ 1 │ 8 │
│ 2017-02-09 00:00:00 │ 2017-02-09 00:00:00 │ {3} │ 1 │ 2 │
│ 2017-03-12 00:00:00 │ 2017-05-01 00:00:00 │ {4,5,6} │ 1 │ 17 │
│ 2017-01-05 00:00:00 │ 2017-01-12 00:00:00 │ {7,8} │ 2 │ 10 │
│ 2017-03-24 00:00:00 │ 2017-03-24 00:00:00 │ {9} │ 2 │ 6 │
└─────────────────────┴─────────────────────┴─────────┴──────┴───────┘
(5 rows)
A word of advice: it is a good idea not to use column names like user that are reserved words. If you forget to put them inside double quotes, surprising things will happen (try it out).