How to compute a discounted future cumulative sum with Spark (PySpark window functions or SQL) - sql

Can I compute a discounted future cumulative sum using spark sql? Below is an example that computes the undiscounted cum future sum using window functions, and I hard coded in what I mean by the discounted cum sum:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
def undiscountedCummulativeFutureReward(df):
    """Add an ``undiscounted`` column holding each user's future reward sum.

    For every row, the value is the sum of ``reward`` over the rows of the
    same ``user`` whose ``time`` is greater than or equal to the row's own
    ``time``.

    Args:
        df: DataFrame with ``user``, ``time`` and ``reward`` columns.

    Returns:
        ``df`` with an extra ``undiscounted`` column.
    """
    # Range frame from the current row's ``time`` to the end of the
    # partition, i.e. "now and everything later" for that user.
    window_spec = (
        Window.partitionBy('user')
        .orderBy('time')
        .rangeBetween(Window.currentRow, Window.unboundedFollowing)
    )
    return df.withColumn('undiscounted', F.sum('reward').over(window_spec))
def makeData(spark, gamma=0.5):
    """Build the example rewards DataFrame with its expected discounted sums.

    ``discounted_cum`` is hard-coded per row as the sum of
    ``gamma ** k * reward_k`` over that user's current and later rewards.

    Args:
        spark: Session object used to create the DataFrame.
        gamma: Discount factor applied per step into the future.

    Returns:
        DataFrame with columns ``user``, ``time``, ``reward`` and
        ``discounted_cum``.
    """
    # (user, time, reward, expected discounted cumulative reward)
    rows = [
        ('bob', 3, 10, 10 + (gamma * 9) + ((gamma ** 2) * 11)),
        ('bob', 4, 9, 9 + gamma * 11),
        ('bob', 5, 11, 11.0),
        ('jo', 4, 6, 6 + gamma * 7),
        ('jo', 5, 7, 7.0),
    ]
    # float(): the FloatType field does not accept Python ints, which an
    # integer gamma (e.g. gamma=1) would otherwise produce.
    data = [
        {'user': user, 'time': time, 'reward': reward,
         'discounted_cum': float(expected)}
        for user, time, reward, expected in rows
    ]
    schema = T.StructType([
        T.StructField('user', T.StringType(), False),
        T.StructField('time', T.IntegerType(), False),
        T.StructField('reward', T.IntegerType(), False),
        T.StructField('discounted_cum', T.FloatType(), False),
    ])
    return spark.createDataFrame(data=data, schema=schema)
def main(spark):
    """Build the sample data, add the undiscounted sum, print and return it."""
    rewards = makeData(spark)
    with_totals = undiscountedCummulativeFutureReward(rewards)
    # Sorting is for display only; the returned frame is the unsorted one.
    with_totals.orderBy('user', 'time').show()
    return with_totals
When you run it you get:
+----+----+------+--------------+------------+
|user|time|reward|discounted_cum|undiscounted|
+----+----+------+--------------+------------+
| bob| 3| 10| 17.25| 30|
| bob| 4| 9| 14.5| 20|
| bob| 5| 11| 11.0| 11|
| jo| 4| 6| 9.5| 13|
| jo| 5| 7| 7.0| 7|
+----+----+------+--------------+------------+
That is, the discounted value is $\sum_{k=0}^{\infty} \gamma^k r_k$, where $r_k$ is the reward $k$ steps into the future.
I'm wondering if I can compute the discounted column with Window functions, like introduce a column with the rank, a literal with gamma, multiply things together - but still not quite clear - I suppose I can do it with some kind of UDF, but I think I'd have to first collect_as_list all the users, return a new list with the cum discounted sum, and then explode the list.

Suppose you were starting with the following DataFrame:
df.show()
#+----+----+------+
#|user|time|reward|
#+----+----+------+
#| bob| 3| 10|
#| bob| 4| 9|
#| bob| 5| 11|
#| jo| 4| 6|
#| jo| 5| 7|
#+----+----+------+
You can join this DataFrame to itself on the user column, and keep only those rows where the time column of the right table is greater than or equal to the time column of the left table. We make this easier by aliasing the DataFrames l and r.
After the join, you can group by user, time and reward from the left table and aggregate the reward column from the right table. However it seems that a groupBy followed by an orderBy is not guaranteed to maintain that order, so you should use a Window to be explicit.
from pyspark.sql import Window, functions as f
# One window per left-hand row (user, l.time, l.reward), ordered by the
# right-hand time. No explicit frame is given, and each joined row receives
# the list collected up to itself, which is why the output below shows
# growing prefixes such as [10], [10, 9], [10, 9, 11].
w = Window.partitionBy("user", "l.time", "l.reward").orderBy("r.time")
# Self-join on user and keep only right-hand rows at or after the left-hand
# row's time, i.e. the current and future rewards of that row.
df = df.alias("l").join(df.alias("r"), on="user")\
.where("r.time>=l.time")\
.select(
"user",
f.col("l.time").alias("time"),
f.col("l.reward").alias("reward"),
# Current and future rewards of the left-hand row, in time order.
f.collect_list("r.reward").over(w).alias("rewards")
)
df.show()
#+----+----+------+-----------+
#|user|time|reward| rewards|
#+----+----+------+-----------+
#| jo| 4| 6| [6]|
#| jo| 4| 6| [6, 7]|
#| jo| 5| 7| [7]|
#| bob| 3| 10| [10]|
#| bob| 3| 10| [10, 9]|
#| bob| 3| 10|[10, 9, 11]|
#| bob| 4| 9| [9]|
#| bob| 4| 9| [9, 11]|
#| bob| 5| 11| [11]|
#+----+----+------+-----------+
Now you have all of the elements required to compute your discounted_cum column.
Spark 2.1 and above:
You can use pyspark.sql.functions.posexplode to explode the rewards array along with the index in the list. This will make a new row for each value in the rewards array. Use distinct to drop duplicates that were introduced by using the Window function (instead of groupBy).
We'll call the index k and the reward rk. Now you can apply your function using pyspark.sql.functions.pow
gamma = 0.5
(
    # One row per (index k, reward rk) of every rewards list; distinct()
    # removes the repeats produced by the overlapping window frames.
    df.select(
        "user",
        "time",
        "reward",
        f.posexplode("rewards").alias("k", "rk"),
    )
    .distinct()
    # gamma ** k * rk for each future step k.
    .withColumn("discounted", f.pow(f.lit(gamma), f.col("k")) * f.col("rk"))
    .groupBy("user", "time")
    .agg(
        f.first("reward").alias("reward"),
        f.sum("discounted").alias("discounted_cum"),
    )
    .show()
)
#+----+----+------+--------------+
#|user|time|reward|discounted_cum|
#+----+----+------+--------------+
#| bob| 3| 10| 17.25|
#| bob| 4| 9| 14.5|
#| bob| 5| 11| 11.0|
#| jo| 4| 6| 9.5|
#| jo| 5| 7| 7.0|
#+----+----+------+--------------+
Older Versions of Spark
For older versions of spark, you'll have to use row_number()-1 to get the values for k after using explode:
# Same computation as the posexplode variant, for Spark versions without
# posexplode: explode the list, then rebuild the index k with row_number.
# NOTE(review): the row_number window below orders by "time", which is also a
# partition key, so every row in a partition ties and the k assigned to each
# rk is not guaranteed to follow the list order. distinct() is applied to
# (user, time, reward, rk), so two equal future rewards of the same row would
# also be merged into one. Verify on real data before relying on this variant.
df.select("user", "time", "reward", f.explode("rewards").alias("rk"))\
.distinct()\
.withColumn(
"k",
# k is meant to be the 0-based position of rk in the rewards list.
f.row_number().over(Window.partitionBy("user", "time").orderBy("time"))-1
)\
.withColumn("discounted", f.pow(f.lit(gamma), f.col("k"))*f.col("rk"))\
.groupBy("user", "time")\
.agg(f.first("reward").alias("reward"), f.sum("discounted").alias("discounted_cum"))\
.show()
#+----+----+------+--------------+
#|user|time|reward|discounted_cum|
#+----+----+------+--------------+
#| jo| 4| 6| 9.5|
#| jo| 5| 7| 7.0|
#| bob| 3| 10| 17.25|
#| bob| 4| 9| 14.5|
#| bob| 5| 11| 11.0|
#+----+----+------+--------------+

Related

Counting number of changes in categorical column in PySpark Dataframe

I have a PySpark dataframe that looks like this:
# Sample rows in the order (year, month, day, hour, process_id, state).
data = [(2010, 3, 12, 0, 'p1', 'state1'),
(2010, 3, 12, 0, 'p2', 'state2'),
(2010, 3, 12, 0, 'p3', 'state1'),
(2010, 3, 12, 0, 'p4', 'state2'),
(2010, 3, 12, 2, 'p1', 'state3'),
(2010, 3, 12, 2, 'p2', 'state1'),
(2010, 3, 12, 2, 'p3', 'state3'),
(2010, 3, 12, 4, 'p1', 'state1'),
(2010, 3, 12, 6, 'p1', 'state1')]
columns = ['year', 'month', 'day', 'hour', 'process_id','state']
# Only column names are supplied as the schema here.
df = spark.createDataFrame(data=data, schema=columns)
df.show()
+----+-----+---+----+----------+------+
|year|month|day|hour|process_id| state|
+----+-----+---+----+----------+------+
|2010| 3| 12| 0| p1|state1|
|2010| 3| 12| 0| p2|state2|
|2010| 3| 12| 0| p3|state1|
|2010| 3| 12| 0| p4|state2|
|2010| 3| 12| 2| p1|state3|
|2010| 3| 12| 2| p2|state1|
|2010| 3| 12| 2| p3|state3|
|2010| 3| 12| 4| p1|state1|
|2010| 3| 12| 6| p1|state1|
+----+-----+---+----+----------+------+
The dataframe is already sorted in an increasing order by the four columns: year, month, day and hour as above. The increment is in 2-hour interval.
I would like to find out, for each process_id, how many times its state changes within each day. For that, I intend to use groupby, something like this:
chg_count_df = df.groupby('process_id', 'year', 'month', 'day').
agg(.....)
For this example, the expected output is:
+----+-----+---+----------+----------+
|year|month|day|process_id| chg_count|
+----+-----+---+----------+----------+
|2010| 3| 12| p1| 2|
|2010| 3| 12| p2| 1|
|2010| 3| 12| p3| 1|
|2010| 3| 12| p4| 0|
+----+-----+---+----------+----------+
What should go into the agg(...) function? Or is there a better way to do this?
You could employ lag window function to check if a state was changed. Then groupBy using sum.
from pyspark.sql import functions as F, Window as W
# Walk each process's rows within a day by hour (descending here); the number
# of adjacent state changes is the same in either direction.
w = W.partitionBy('year', 'month', 'day', 'process_id').orderBy(F.desc('hour'))
# 1 when the neighbouring row's state differs, else 0. The first row of each
# partition has no neighbour (lag yields null), so coalesce maps it to 0.
df = df.withColumn('change', F.coalesce((F.lag('state').over(w) != F.col('state')).cast('int'), F.lit(0)))
# Total the per-row flags into one change count per process and day.
df = df.groupBy('year', 'month', 'day', 'process_id').agg(F.sum('change').alias('chg_count'))
df.show()
# +----+-----+---+----------+---------+
# |year|month|day|process_id|chg_count|
# +----+-----+---+----------+---------+
# |2010| 3| 12| p1| 2|
# |2010| 3| 12| p2| 1|
# |2010| 3| 12| p3| 1|
# |2010| 3| 12| p4| 0|
# +----+-----+---+----------+---------+
chg_count_df = df.groupby('process_id', 'year', 'month', 'day').count()

pyspark get latest non-null element of every column in one row

Let me explain my question using an example:
I have a dataframe:
# Sample data: columns B and C contain missing values (None).
pd_1 = pd.DataFrame({'day':[1,2,3,2,1,3],
'code': [10, 10, 20,20,30,30],
'A': [44, 55, 66,77,88,99],
'B':['a',None,'c',None,'d', None],
'C':[None,None,'12',None,None, None]
})
# Convert the pandas frame into a Spark DataFrame.
df_1 = sc.createDataFrame(pd_1)
df_1.show()
Output:
+---+----+---+----+----+
|day|code| A| B| C|
+---+----+---+----+----+
| 1| 10| 44| a|null|
| 2| 10| 55|null|null|
| 3| 20| 66| c| 12|
| 2| 20| 77|null|null|
| 1| 30| 88| d|null|
| 3| 30| 99|null|null|
+---+----+---+----+----+
What I want to achieve is a new dataframe, each row corresponds to a code, and for each column I want to have the most recent non-null value (with highest day).
In pandas, I can simply do
# Sort by day first so that last() yields, per code, the value from the
# highest day; as the result below shows, missing values are skipped
# (code 10 gets B == 'a' from day 1).
pd_2 = pd_1.sort_values('day', ascending=True).groupby('code').last()
# Turn the 'code' index back into a regular column.
pd_2.reset_index()
to get
code day A B C
0 10 2 55 a None
1 20 3 66 c 12
2 30 3 99 d None
My question is, how can I do it in pyspark (preferably version < 3)?
What I have tried so far is:
from pyspark.sql import Window
import pyspark.sql.functions as F
# Whole-partition frame per code, newest day first.
w = (
    Window.partitionBy('code')
    .orderBy(F.desc('day'))
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
# collect_list leaves out nulls, so element 0 is the most recent non-null
# value of each column. A single select replaces the earlier per-column
# withColumn loop (per Steven's suggestion).
df_1 = df_1.select(
    [F.collect_list(name).over(w).getItem(0).alias(name) for name in df_1.columns]
)
# Every row of a code now carries the same values; keep one of them.
df_1 = df_1.distinct()
df_1.show()
Output
+---+----+---+---+----+
|day|code| A| B| C|
+---+----+---+---+----+
| 2| 10| 55| a|null|
| 3| 30| 99| d|null|
| 3| 20| 66| c| 12|
+---+----+---+---+----+
Which I'm not very happy with, especially due to the for loop.
I think your current solution is quite nice. If you want another solution, you can try using first/last window functions :
from pyspark.sql import functions as F, Window
# Rows of each code, newest day first.
w = Window.partitionBy("code").orderBy(F.desc("day"))
# Same ordering, but looking at the whole partition from every row.
whole_partition = w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
# First non-null value of each column in newest-first order.
latest_non_null = [
    F.first(F.col(name), ignorenulls=True).over(whole_partition).alias(name)
    for name in ("A", "B", "C")
]
df2 = (
    df.select("day", "code", F.row_number().over(w).alias("rwnb"), *latest_non_null)
    # Keep only the newest row of each code, then drop the helper column.
    .where("rwnb = 1")
    .drop("rwnb")
)
and the result :
df2.show()
+---+----+---+---+----+
|day|code| A| B| C|
+---+----+---+---+----+
| 2| 10| 55| a|null|
| 3| 30| 99| d|null|
| 3| 20| 66| c| 12|
+---+----+---+---+----+
Here's another way of doing by using array functions and struct ordering instead of Window:
from pyspark.sql import functions as F
# "day" is listed first so that it is the leading field of the struct built
# below. NOTE(review): this presumably relies on structs being compared field
# by field, so that array_max returns the entry with the highest day; confirm
# against the Spark documentation.
other_cols = ["day", "A", "B", "C"]
df_1 = df_1.groupBy("code").agg(
# One struct per input row, gathered into an array per code.
F.collect_list(F.struct(*other_cols)).alias("values")
).selectExpr(
"code",
# Per column: drop the structs where that column is null, take the maximum
# remaining struct and read the column from it.
*[f"array_max(filter(values, x-> x.{c} is not null))['{c}'] as {c}" for c in other_cols]
)
df_1.show()
#+----+---+---+---+----+
#|code|day| A| B| C|
#+----+---+---+---+----+
#| 10| 2| 55| a|null|
#| 30| 3| 99| d|null|
#| 20| 3| 66| c| 12|
#+----+---+---+---+----+

How to transpose a long dataframe to wide dataframe

I have a dataframe looks like:
group, rate
A,0.1
A,0.2
B,0.3
B,0.1
C,0.1
C,0.2
How can I transpose this to a wide data frame. This is what I expect to get:
group, rate_1, rate_2
A,0.1,0.2
B,0.3,0.1
C,0.1,0.2
The number of records in each group is the same and also how to create a consistent column name with prefix or suffix while transposing?
Do you know which function I can use?
Thanks,
Try with groupBy, collect_list then dynamically split the array column as new columns.
Example:
df.show()
#+-----+----+
#|group|rate|
#+-----+----+
#| A| 0.1|
#| A| 0.2|
#| B| 0.3|
#| B| 0.1|
#+-----+----+
from pyspark.sql.functions import col, collect_list, expr

arr_size = 2  # number of rows, and therefore rate_N columns, per group
# rate_1 .. rate_N: the N-th element of each group's collected list.
exprs = ['group'] + [
    expr(f'lst[{i}]').alias(f'rate_{i + 1}') for i in range(arr_size)
]
df1 = df.groupBy("group").agg(collect_list(col("rate")).alias("lst"))
df1.select(*exprs).show()
#+-----+------+------+
#|group|rate_1|rate_2|
#+-----+------+------+
#| B| 0.3| 0.1|
#| A| 0.1| 0.2|
#+-----+------+------+
To preserve order in collect_list():
# Imports come first so that every name is defined before its first use.
# expr is not used in this step; it is imported for the column-splitting
# snippet that follows.
from pyspark.sql import Window
from pyspark.sql.functions import (
    col,
    collect_list,
    desc,
    expr,
    monotonically_increasing_id,
    row_number,
)

# "mid" records the original row order before the data is repartitioned.
df = (
    spark.createDataFrame(
        [('A', 0.1), ('A', 0.2), ('B', 0.3), ('B', 0.1)], ['group', 'rate']
    )
    .withColumn("mid", monotonically_increasing_id())
    .repartition(100)
)
w = Window.partitionBy("group").orderBy("mid")
w1 = Window.partitionBy("group").orderBy(desc("mid"))
# Running list in "mid" order; the row with the highest "mid" (snr == 1)
# holds the complete list for its group.
df1 = (
    df.withColumn("lst", collect_list(col("rate")).over(w))
    .withColumn("snr", row_number().over(w1))
    .filter(col("snr") == 1)
    .drop('mid', 'snr', 'rate')
)
df1.show()
#+-----+----------+
#|group| lst|
#+-----+----------+
#| B|[0.3, 0.1]|
#| A|[0.1, 0.2]|
#+-----+----------+
arr_size = 2
# Pull element i of the collected list out as column rate_<i + 1>.
rate_columns = [expr(f'lst[{i}]').alias(f'rate_{i + 1}') for i in range(arr_size)]
exprs = ['group'] + rate_columns
df1.select(*exprs).show()
+-----+------+------+
|group|rate_1|rate_2|
+-----+------+------+
| B| 0.3| 0.1|
| A| 0.1| 0.2|
+-----+------+------+
I would create a column to rank your "rate" column and then pivot:
First create a "rank" column and concatenate the string "rate_" to the row_number:
from pyspark.sql.functions import concat, first, lit, row_number
from pyspark.sql import Window
# Position of each rate within its group, smallest rate first.
by_rate = Window.partitionBy("group").orderBy("rate")
position = row_number().over(by_rate).cast("string")
# "rate_1", "rate_2", ... become the pivoted column names.
df = df.withColumn("rank", concat(lit("rate_"), position))
df.show()
#+-----+----+------+
#|group|rate| rank|
#+-----+----+------+
#| B| 0.1|rate_1|
#| B| 0.3|rate_2|
#| C| 0.1|rate_1|
#| C| 0.2|rate_2|
#| A| 0.1|rate_1|
#| A| 0.2|rate_2|
#+-----+----+------+
Now group by the "group" column and pivot on the "rank" column. Since you need an aggregation, use first.
df.groupBy("group").pivot("rank").agg(first("rate")).show()
#+-----+------+------+
#|group|rate_1|rate_2|
#+-----+------+------+
#| B| 0.1| 0.3|
#| C| 0.1| 0.2|
#| A| 0.1| 0.2|
#+-----+------+------+
The above does not depend on knowing the number of records in each group ahead of time.
However if (like you said) you know the number of records in each group you can make the pivot more efficient by passing in the values
num_records = 2
# The expected pivot values, "rate_1" .. "rate_<num_records>", listed explicitly.
values = [f"rate_{n}" for n in range(1, num_records + 1)]
df.groupBy("group").pivot("rank", values=values).agg(first("rate")).show()
#+-----+------+------+
#|group|rate_1|rate_2|
#+-----+------+------+
#| B| 0.1| 0.3|
#| C| 0.1| 0.2|
#| A| 0.1| 0.2|
#+-----+------+------+

Pyspark : how to code complicated dataframe calculation lead sum

I have given dataframe that looks like this.
THIS dataframe is sorted by date, and col1 is just some random value.
# Two nullable columns: the date kept as a string, and an integer value.
TEST_schema = StructType([StructField("date", StringType(), True),\
StructField("col1", IntegerType(), True),\
])
# (date, col1) pairs, listed in date order.
TEST_data = [('2020-08-01',3),('2020-08-02',1),('2020-08-03',-1),('2020-08-04',-1),('2020-08-05',3),\
('2020-08-06',-1),('2020-08-07',6),('2020-08-08',4),('2020-08-09',5)]
# NOTE(review): rdd3 is not used below; the DataFrame is built from TEST_data directly.
rdd3 = sc.parallelize(TEST_data)
TEST_df = sqlContext.createDataFrame(TEST_data, TEST_schema)
TEST_df.show()
+----------+----+
| date|col1|
+----------+----+
|2020-08-01| 3|
|2020-08-02| 1|
|2020-08-03| -1|
|2020-08-04| -1|
|2020-08-05| 3|
|2020-08-06| -1|
|2020-08-07| 6|
|2020-08-08| 4|
|2020-08-09| 5|
+----------+----+
LOGIC : lead(col1) +1, if col1 ==-1, then from the previous value lead(col1) +2...
the resulted dataframe will look like this (want column is what i want as output)
+----------+----+----+
| date|col1|WANT|
+----------+----+----+
|2020-08-01| 3| 2|
|2020-08-02| 1| 6|
|2020-08-03| -1| 5|
|2020-08-04| -1| 4|
|2020-08-05| 3| 8|
|2020-08-06| -1| 7|
|2020-08-07| 6| 5|
|2020-08-08| 4| 6|
|2020-08-09| 5| -1|
+----------+----+----+
Let's look at last row, where col1==5, that 5 is leaded +1 which is in want==6 (2020-08-08)
If we have col==-1, then we add +1 more ,, if we have col==-1 repeated twice, then we add +2 more..
This is hard to explain in words. Lastly, since the lead leaves a null in the last row, replace that null with -1. I have a diagram
You can check if the following code and logic works for you:
create a sub-group label g which take running sum of int(col1!=-1), and we only concern about Rows with col1 == -1, and nullify all other Rows.
the residual is 1 and if col1 == -1, plus the running count on Window w2
take the prev_col1 over w1 which is not -1 (using nullif). The name prev_col1 might be confusing: the value is forward-filled (the typical PySpark way to do ffill) only where col1 = -1; otherwise the original value is kept.
set val = prev_col1 + residual, take the lag and set null to -1
Code below:
from pyspark.sql import Window
# lit is needed for the -1 default of the final column.
from pyspark.sql.functions import coalesce, col, count, desc, expr, lag, lit, when

# w1 walks the whole frame from the newest date to the oldest;
# w2 does the same inside each sub-group g.
w1 = Window.orderBy(desc('date'))
w2 = Window.partitionBy('g').orderBy(desc('date'))

(
    TEST_df
    # g: sub-group label for rows with col1 == -1, taken from the running
    # count (newest date first) of rows whose col1 is not -1; null elsewhere.
    .withColumn('g', when(col('col1') == -1, expr("sum(int(col1!=-1))").over(w1)))
    # residual: 1 for ordinary rows; for -1 rows, 1 plus the running count
    # inside their sub-group.
    .withColumn('residual', when(col('col1') == -1, count('*').over(w2) + 1).otherwise(1))
    # prev_col1: col1 itself, or for -1 rows the last non -1 value seen so far
    # in w1 order (nullif turns -1 into null, last(..., True) ignores nulls).
    .withColumn('prev_col1', expr("last(nullif(col1,-1),True)").over(w1))
    # want: prev_col1 + residual of the preceding row in w1 order (the next
    # date); -1 where no such row exists.
    .withColumn('want', coalesce(lag(expr("prev_col1 + residual")).over(w1), lit(-1)))
    .orderBy('date')
    .show()
)
+----------+----+----+--------+---------+----+
| date|col1| g|residual|prev_col1|want|
+----------+----+----+--------+---------+----+
|2020-08-01| 3|null| 1| 3| 2|
|2020-08-02| 1|null| 1| 1| 6|
|2020-08-03| -1| 4| 3| 3| 5|
|2020-08-04| -1| 4| 2| 3| 4|
|2020-08-05| 3|null| 1| 3| 8|
|2020-08-06| -1| 3| 2| 6| 7|
|2020-08-07| 6|null| 1| 6| 5|
|2020-08-08| 4|null| 1| 4| 6|
|2020-08-09| 5|null| 1| 5| -1|
+----------+----+----+--------+---------+----+

how to create & sort by an ordered categorical variable in pyspark

I'm migrating some code from pandas to pyspark. My source dataframe looks like this:
a b c
0 1 insert 1
1 2 update 1
2 3 seed 1
3 4 insert 2
4 5 update 2
5 6 delete 2
6 7 snapshot 1
and the operation (in python / pandas) that I'm applying is:
# Make column b an ordered categorical whose order is the listed one.
df.b = pd.Categorical(df.b, ordered=True, categories=['insert', 'seed', 'update', 'snapshot', 'delete'])
# Sort by c, then by b.
df.sort_values(['c', 'b'])
resulting in the output dataframe:
a b c
0 1 insert 1
2 3 seed 1
1 2 update 1
6 7 snapshot 1
3 4 insert 2
4 5 update 2
5 6 delete 2
I'm unsure how best to set up ordered categoricals using pyspark, and my initial approach creates a new column using case-when and attempts to use that subsequently:
# Helper column holding the desired sort position (1-5) of each value of b;
# there is no otherwise() branch for values outside the five listed ones.
df = df.withColumn(
"_precedence",
when(col("b") == "insert", 1)
.when(col("b") == "seed", 2)
.when(col("b") == "update", 3)
.when(col("b") == "snapshot", 4)
.when(col("b") == "delete", 5)
)
You can use a map:
from pyspark.sql.functions import create_map, lit, col
categories = ['insert', 'seed', 'update', 'snapshot', 'delete']
# Literal map from category name to its rank. Both key and value are wrapped
# in lit(): create_map treats a bare string as a column name, not as a value.
# (List-comprehension form per HaleemurAli's suggestion.)
map1 = create_map(
    [entry for rank, name in enumerate(categories) for entry in (lit(name), lit(rank))]
)
#Column<b'map(insert, 0, seed, 1, update, 2, snapshot, 3, delete, 4)'>
# Sort by c, then by the rank looked up for each row's b value.
df.orderBy('c', map1[col('b')]).show()
+---+---+--------+---+
| id| a| b| c|
+---+---+--------+---+
| 0| 1| insert| 1|
| 2| 3| seed| 1|
| 1| 2| update| 1|
| 6| 7|snapshot| 1|
| 3| 4| insert| 2|
| 4| 5| update| 2|
| 5| 6| delete| 2|
+---+---+--------+---+
to reverse the order on column-b: df.orderBy('c', map1[col('b')].desc()).show()
You could also do this using coalesce with your when statements.
from pyspark.sql import functions as F
categories = ['insert', 'seed', 'update', 'snapshot', 'delete']
# One WHEN per category yielding its 1-based rank; each yields null for rows
# of any other category, so coalesce picks the single matching rank.
cols = [
    F.when(F.col("b") == name, F.lit(rank))
    for rank, name in enumerate(categories, start=1)
]
df.orderBy("c", F.coalesce(*cols)).show()
#+---+--------+---+
#| a| b| c|
#+---+--------+---+
#| 1| insert| 1|
#| 3| seed| 1|
#| 2| update| 1|
#| 7|snapshot| 1|
#| 4| insert| 2|
#| 5| update| 2|
#| 6| delete| 2|
#+---+--------+---+