PySpark: how to code a complicated dataframe calculation (lead sum) - apache-spark-sql

I am given a dataframe that looks like this. The dataframe is sorted by date, and col1 is just some random value.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

TEST_schema = StructType([StructField("date", StringType(), True),
                          StructField("col1", IntegerType(), True)])
TEST_data = [('2020-08-01', 3), ('2020-08-02', 1), ('2020-08-03', -1), ('2020-08-04', -1), ('2020-08-05', 3),
             ('2020-08-06', -1), ('2020-08-07', 6), ('2020-08-08', 4), ('2020-08-09', 5)]
TEST_df = sqlContext.createDataFrame(TEST_data, TEST_schema)
TEST_df.show()
+----------+----+
| date|col1|
+----------+----+
|2020-08-01| 3|
|2020-08-02| 1|
|2020-08-03| -1|
|2020-08-04| -1|
|2020-08-05| 3|
|2020-08-06| -1|
|2020-08-07| 6|
|2020-08-08| 4|
|2020-08-09| 5|
+----------+----+
LOGIC: want = lead(col1) + 1; but whenever the leading value is -1, skip over it and add 1 more for each -1 skipped (one -1 adds +1 more, two consecutive -1s add +2 more, and so on).
The resulting dataframe will look like this (the WANT column is what I want as output):
+----------+----+----+
| date|col1|WANT|
+----------+----+----+
|2020-08-01| 3| 2|
|2020-08-02| 1| 6|
|2020-08-03| -1| 5|
|2020-08-04| -1| 4|
|2020-08-05| 3| 8|
|2020-08-06| -1| 7|
|2020-08-07| 6| 5|
|2020-08-08| 4| 6|
|2020-08-09| 5| -1|
+----------+----+----+
Look at the last row, where col1 == 5: that 5 is leaded and gets +1, which shows up as want == 6 on the previous date (2020-08-08). If the leading value is -1, we add +1 more; if -1 is repeated twice, we add +2 more, and so on. For example, for 2020-08-02 the next non-(-1) value is 3 (on 2020-08-05) and two -1 rows were skipped, so want = 3 + 1 + 2 = 6. This is hard to explain in words. Lastly, since the last row has nothing to lead and would be null, replace it with -1.

You can check if the following code and logic work for you:

1. Create a sub-group label g as the running sum of int(col1 != -1); we only care about rows with col1 == -1, so nullify all other rows.
2. Set residual to 1; if col1 == -1, add the running count over Window w2.
3. Take prev_col1 over w1, the last value that is not -1 (using nullif). (The name prev_col1 might be confusing, since it forward-fills, in the typical PySpark ffill way, only where col1 == -1 and otherwise keeps the original value.)
4. Compute prev_col1 + residual, take its lag over w1, and replace the resulting null with -1.

Code below:
from pyspark.sql.functions import when, col, expr, count, desc, lag, coalesce, lit
from pyspark.sql import Window

w1 = Window.orderBy(desc('date'))
w2 = Window.partitionBy('g').orderBy(desc('date'))

TEST_df.withColumn('g', when(col('col1') == -1, expr("sum(int(col1!=-1))").over(w1))) \
    .withColumn('residual', when(col('col1') == -1, count('*').over(w2) + 1).otherwise(1)) \
    .withColumn('prev_col1', expr("last(nullif(col1,-1),True)").over(w1)) \
    .withColumn('want', coalesce(lag(expr("prev_col1 + residual")).over(w1), lit(-1))) \
    .orderBy('date').show()
+----------+----+----+--------+---------+----+
| date|col1| g|residual|prev_col1|want|
+----------+----+----+--------+---------+----+
|2020-08-01| 3|null| 1| 3| 2|
|2020-08-02| 1|null| 1| 1| 6|
|2020-08-03| -1| 4| 3| 3| 5|
|2020-08-04| -1| 4| 2| 3| 4|
|2020-08-05| 3|null| 1| 3| 8|
|2020-08-06| -1| 3| 2| 6| 7|
|2020-08-07| 6|null| 1| 6| 5|
|2020-08-08| 4|null| 1| 4| 6|
|2020-08-09| 5|null| 1| 5| -1|
+----------+----+----+--------+---------+----+

Related

pyspark get latest non-null element of every column in one row

Let me explain my question using an example:
I have a dataframe:
import pandas as pd

pd_1 = pd.DataFrame({'day': [1, 2, 3, 2, 1, 3],
                     'code': [10, 10, 20, 20, 30, 30],
                     'A': [44, 55, 66, 77, 88, 99],
                     'B': ['a', None, 'c', None, 'd', None],
                     'C': [None, None, '12', None, None, None]})
df_1 = spark.createDataFrame(pd_1)
df_1.show()
Output:
+---+----+---+----+----+
|day|code| A| B| C|
+---+----+---+----+----+
| 1| 10| 44| a|null|
| 2| 10| 55|null|null|
| 3| 20| 66| c| 12|
| 2| 20| 77|null|null|
| 1| 30| 88| d|null|
| 3| 30| 99|null|null|
+---+----+---+----+----+
What I want to achieve is a new dataframe where each row corresponds to a code, and for each column I want the most recent non-null value (the one with the highest day).
In pandas, I can simply do
pd_2 = pd_1.sort_values('day', ascending=True).groupby('code').last()
pd_2.reset_index()
to get
   code  day   A  B     C
0    10    2  55  a  None
1    20    3  66  c    12
2    30    3  99  d  None
My question is, how can I do it in pyspark (preferably version < 3)?
What I have tried so far is:
from pyspark.sql import Window
import pyspark.sql.functions as F

w = Window.partitionBy('code').orderBy(F.desc('day')).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
## Update: after applying @Steven's idea to remove the for loop:
df_1 = df_1.select([F.collect_list(x).over(w).getItem(0).alias(x) for x in df_1.columns])
## for x in df_1.columns:
##     df_1 = df_1.withColumn(x, F.collect_list(x).over(w).getItem(0))
df_1 = df_1.distinct()
df_1.show()
Output
+---+----+---+---+----+
|day|code| A| B| C|
+---+----+---+---+----+
| 2| 10| 55| a|null|
| 3| 30| 99| d|null|
| 3| 20| 66| c| 12|
+---+----+---+---+----+
Which I'm not very happy with, especially due to the for loop.
I think your current solution is quite nice. If you want another one, you can try using the first/last window functions:
from pyspark.sql import functions as F, Window

w = Window.partitionBy("code").orderBy(F.col("day").desc())
df2 = (
    df.select(
        "day",
        "code",
        F.row_number().over(w).alias("rwnb"),
        *(
            F.first(F.col(col), ignorenulls=True)
            .over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
            .alias(col)
            for col in ("A", "B", "C")
        ),
    )
    .where("rwnb = 1")
    .drop("rwnb")
)
and the result:
df2.show()
+---+----+---+---+----+
|day|code| A| B| C|
+---+----+---+---+----+
| 2| 10| 55| a|null|
| 3| 30| 99| d|null|
| 3| 20| 66| c| 12|
+---+----+---+---+----+
Here's another way of doing it, using array functions and struct ordering instead of a Window:
from pyspark.sql import functions as F
other_cols = ["day", "A", "B", "C"]
df_1 = df_1.groupBy("code").agg(
F.collect_list(F.struct(*other_cols)).alias("values")
).selectExpr(
"code",
*[f"array_max(filter(values, x-> x.{c} is not null))['{c}'] as {c}" for c in other_cols]
)
df_1.show()
#+----+---+---+---+----+
#|code|day| A| B| C|
#+----+---+---+---+----+
#| 10| 2| 55| a|null|
#| 30| 3| 99| d|null|
#| 20| 3| 66| c| 12|
#+----+---+---+---+----+
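Why the struct trick works (a note not from the original answer, assuming Spark 2.4+ for the higher-order filter and array_max functions): structs are compared field by field, and day is the first field of each struct, so array_max returns the struct with the highest day among those whose column is not null. A quick way to convince yourself:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# structs compare on their first field (day) first, so array_max keeps the
# struct with the largest day, and B = 'c' rides along with it
spark.sql(
    "SELECT array_max(array(named_struct('day', 1, 'B', 'a'), named_struct('day', 3, 'B', 'c'))) AS latest"
).show(truncate=False)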

SQL query to find an output table

I have three dimension tables and a fact table, and I need to write a query that joins the dimension tables with the fact table to find the top 10 ATMs with the most transactions in the 'inactive' state. I tried a query with a cartesian join, but I don't know if this is the right way to join the tables.
select a.atm_number, a.atm_manufacturer, b.location,
       count(c.trans_id) as total_transaction_count,
       count(c.atm_status) as inactive_count
from dimen_atm a, dimen_location b, fact_atm_trans c
where a.atm_id = c.atm_id and b.location = c.location
order by inactive_count desc limit 10;
dimen_card_type
+------------+---------+
|card_type_id|card_type|
+------------+---------+
| 1| CIRRUS|
| 2| Dankort|
+------------+---------+
dimen_atm
+------+----------+----------------+---------------+
|atm_id|atm_number|atm_manufacturer|atm_location_id|
+------+----------+----------------+---------------+
| 1| 1| NCR| 16|
| 2| 2| NCR| 64|
+------+----------+----------------+---------------+
dimen_location
+-----------+--------------------+----------------+-------------+-------+------+------+
|location_id| location| streetname|street_number|zipcode| lat| lon|
+-----------+--------------------+----------------+-------------+-------+------+------+
| 1|Intern København|Rådhuspladsen| 75| 1550|55.676|12.571|
| 2| København| Regnbuepladsen| 5| 1550|55.676|12.571|
+-----------+--------------------+----------------+-------------+-------+------+------+
fact_atm_trans
+--------+------+--------------+-------+------------+----------+--------+----------+------------------+------------+------------+-------+----------+----------+------------+-------------------+
|trans_id|atm_id|weather_loc_id|date_id|card_type_id|atm_status|currency| service|transaction_amount|message_code|message_text|rain_3h|clouds_all|weather_id|weather_main|weather_description|
+--------+------+--------------+-------+------------+----------+--------+----------+------------------+------------+------------+-------+----------+----------+------------+-------------------+
| 1| 1| 16| 5229| 3| Active| DKK|Withdrawal| 5980| null| null| 0.0| 80| 803| Clouds| broken cloudsr|
| 2| 1| 16| 4090| 10| Active| DKK|Withdrawal| 3992| null| null| 0.0| 32| 802| Clouds| scattered cloudsr|
+--------+------+--------------+-------+------------+----------+--------+----------+------------------+------------+------------+-------+----------+----------+------------+-------------------+
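One possible direction (a hedged sketch, not a verified answer): assuming dimen_atm.atm_location_id references dimen_location.location_id and the status literal is spelled 'Inactive', explicit joins plus a conditional count avoid the cartesian product, and the aggregates need a GROUP BY:
# assumes the three tables are available to Spark SQL as tables or temp views
top10_inactive = spark.sql("""
    SELECT a.atm_number,
           a.atm_manufacturer,
           b.location,
           COUNT(c.trans_id) AS total_transaction_count,
           COUNT(CASE WHEN c.atm_status = 'Inactive' THEN 1 END) AS inactive_count
    FROM fact_atm_trans c
    JOIN dimen_atm a ON a.atm_id = c.atm_id
    JOIN dimen_location b ON b.location_id = a.atm_location_id
    GROUP BY a.atm_number, a.atm_manufacturer, b.location
    ORDER BY inactive_count DESC
    LIMIT 10
""")
top10_inactive.show()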

Spark SQL: Is there a way to distinguish columns with same name?

I have a csv with a header that contains columns with the same name.
I want to process it with Spark using only SQL and be able to refer to these columns unambiguously.
Ex.:
id name age height name
1 Alex 23 1.70
2 Joseph 24 1.89
I want to get only the first name column, using only Spark SQL.
As mentioned in the comments, I think the least error-prone method would be to change the schema of the input data.
Yet, in case you are looking for a quick workaround, you can simply index the duplicated column names.
For instance, let's create a dataframe with three id columns.
val df = spark.range(3)
  .select('id * 2 as "id", 'id * 3 as "x", 'id, 'id * 4 as "y", 'id)
df.show
+---+---+---+---+---+
| id| x| id| y| id|
+---+---+---+---+---+
| 0| 0| 0| 0| 0|
| 2| 3| 1| 4| 1|
| 4| 6| 2| 8| 2|
+---+---+---+---+---+
Then I can use toDF to set new column names. Let's assume I know that only id is duplicated. If we don't know which columns are duplicated, adding the extra logic to figure it out would not be very difficult (see the PySpark sketch after the output below).
var i = -1
val names = df.columns.map( n =>
  if (n == "id") {
    i += 1
    s"id_$i"
  } else n )
val new_df = df.toDF(names : _*)
new_df.show
+----+---+----+---+----+
|id_0| x|id_1| y|id_2|
+----+---+----+---+----+
| 0| 0| 0| 0| 0|
| 2| 3| 1| 4| 1|
| 4| 6| 2| 8| 2|
+----+---+----+---+----+
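If you are working from PySpark rather than Scala, here is a hedged sketch of the same renaming idea, generalized to suffix every duplicated column name (the dedupe_columns helper is illustrative, not an existing API):
from collections import Counter

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def dedupe_columns(df):
    # Count each column name; names that occur more than once get a running
    # index suffix so every column can be referenced unambiguously.
    totals = Counter(df.columns)
    seen = Counter()
    renamed = []
    for name in df.columns:
        if totals[name] > 1:
            renamed.append(f"{name}_{seen[name]}")
            seen[name] += 1
        else:
            renamed.append(name)
    return df.toDF(*renamed)

df = spark.range(3).selectExpr("id * 2 AS id", "id * 3 AS x", "id", "id * 4 AS y", "id")
dedupe_columns(df).show()
# columns come out as id_0, x, id_1, y, id_2
After the rename, a plain Spark SQL query can refer to the first duplicated column unambiguously (name_0 in the csv example from the question).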

Add aggregated columns to pivot without join

Considering the table:
df=sc.parallelize([(1,1,1),(5,0,2),(27,1,1),(1,0,3),(5,1,1),(1,0,2)]).toDF(['id', 'error', 'timestamp'])
df.show()
+---+-----+---------+
| id|error|timestamp|
+---+-----+---------+
| 1| 1| 1|
| 5| 0| 2|
| 27| 1| 1|
| 1| 0| 3|
| 5| 1| 1|
| 1| 0| 2|
+---+-----+---------+
I would like to pivot on the timestamp column while keeping some other aggregated information from the original table. The result I am interested in can be achieved by:
import pyspark.sql.functions as sf

df1 = df.groupBy('id').agg(sf.sum('error').alias('Ne'), sf.count('*').alias('cnt'))
df2 = df.groupBy('id').pivot('timestamp').agg(sf.count('*')).fillna(0)
df1.join(df2, on='id').filter(sf.col('cnt') > 1).show()
with the resulting table:
+---+---+---+---+---+---+
| id| Ne|cnt| 1| 2| 3|
+---+---+---+---+---+---+
| 5| 1| 2| 1| 1| 0|
| 1| 1| 3| 1| 1| 1|
+---+---+---+---+---+---+
However, there are at least two issues with this solution:

1. I am filtering by cnt at the end of the script. If I could do this at the beginning, I could avoid almost all of the processing, because a large portion of the data is removed by this filter. Is there any way to do this other than the collect and isin methods?
2. I am doing groupBy on id twice: first to aggregate the columns I need in the result, and a second time to get the pivot columns. Finally, I need a join to merge these columns. I feel I must be missing something, because it should be possible to do this with just one groupBy and without a join, but I cannot figure out how.
I don't think you can get around the join: the pivot needs the timestamp values, while the first grouping must not consider them. To create the Ne and cnt values you have to group the dataframe by id only, which loses the timestamp; to preserve those values as columns you have to do the pivot separately, as you did, and join it back.
The only improvement that can be made is to move the filter to the df1 creation. As you said, this alone could already improve performance, since df1 should be much smaller after filtering on your real data.
from pyspark.sql.functions import *
df=sc.parallelize([(1,1,1),(5,0,2),(27,1,1),(1,0,3),(5,1,1),(1,0,2)]).toDF(['id', 'error', 'timestamp'])
df1=df.groupBy('id').agg(sum('error').alias('Ne'),count('*').alias('cnt')).filter(col('cnt')>1)
df2=df.groupBy('id').pivot('timestamp').agg(count('*')).fillna(0)
df1.join(df2, on='id').show()
Output:
+---+---+---+---+---+---+
| id| Ne|cnt| 1| 2| 3|
+---+---+---+---+---+---+
| 5| 1| 2| 1| 1| 0|
| 1| 1| 3| 1| 1| 1|
+---+---+---+---+---+---+
Actually, it is indeed possible to avoid the join by using a Window:
from pyspark.sql import Window
import pyspark.sql.functions as sf

w1 = Window.partitionBy('id')
w2 = Window.partitionBy('id', 'timestamp')
df.select('id', 'timestamp',
          sf.sum('error').over(w1).alias('Ne'),
          sf.count('*').over(w1).alias('cnt'),
          sf.count('*').over(w2).alias('cnt_2')
).filter(sf.col('cnt') > 1) \
 .groupBy('id', 'Ne', 'cnt').pivot('timestamp').agg(sf.first('cnt_2')).fillna(0).show()

how compute discounted future cumulative sum with spark pyspark window functions or sql

Can I compute a discounted future cumulative sum using Spark SQL? Below is an example that computes the undiscounted cumulative future sum using window functions, and I hard-coded what I mean by the discounted cumulative sum:
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window


def undiscountedCummulativeFutureReward(df):
    windowSpec = Window \
        .partitionBy('user') \
        .orderBy('time') \
        .rangeBetween(0, Window.unboundedFollowing)
    tot_reward = F.sum('reward').over(windowSpec)
    df_tot_reward = df.withColumn('undiscounted', tot_reward)
    return df_tot_reward


def makeData(spark, gamma=0.5):
    data = [{'user': 'bob', 'time': 3, 'reward': 10, 'discounted_cum': 10 + (gamma * 9) + ((gamma ** 2) * 11)},
            {'user': 'bob', 'time': 4, 'reward': 9, 'discounted_cum': 9 + gamma * 11},
            {'user': 'bob', 'time': 5, 'reward': 11, 'discounted_cum': 11.0},
            {'user': 'jo', 'time': 4, 'reward': 6, 'discounted_cum': 6 + gamma * 7},
            {'user': 'jo', 'time': 5, 'reward': 7, 'discounted_cum': 7.0}]
    schema = T.StructType([T.StructField('user', T.StringType(), False),
                           T.StructField('time', T.IntegerType(), False),
                           T.StructField('reward', T.IntegerType(), False),
                           T.StructField('discounted_cum', T.FloatType(), False)])
    return spark.createDataFrame(data=data, schema=schema)


def main(spark):
    df = makeData(spark)
    df = undiscountedCummulativeFutureReward(df)
    df.orderBy('user', 'time').show()
    return df
When you run it you get:
+----+----+------+--------------+------------+
|user|time|reward|discounted_cum|undiscounted|
+----+----+------+--------------+------------+
| bob| 3| 10| 17.25| 30|
| bob| 4| 9| 14.5| 20|
| bob| 5| 11| 11.0| 11|
| jo| 4| 6| 9.5| 13|
| jo| 5| 7| 7.0| 7|
+----+----+------+--------------+------------+
That is, the discounted value is the sum of gamma^k * r_k for k = 0, 1, 2, ..., where r_0 is the current row's reward and r_1, r_2, ... are the rewards on the following rows for the same user.
I'm wondering if I can compute the discounted column with window functions, e.g. by introducing a column with the rank and a literal with gamma and multiplying things together, but it's still not quite clear to me how. I suppose I could do it with some kind of UDF, but I think I'd have to first collect each user's rewards into a list, return a new list with the cumulative discounted sums, and then explode it.
Suppose you were starting with the following DataFrame:
df.show()
#+----+----+------+
#|user|time|reward|
#+----+----+------+
#| bob| 3| 10|
#| bob| 4| 9|
#| bob| 5| 11|
#| jo| 4| 6|
#| jo| 5| 7|
#+----+----+------+
You can join this DataFrame to itself on the user column, keeping only those rows where the time column of the right table is greater than or equal to the time column of the left table. We make this easier by aliasing the DataFrames as l and r.
After the join, you can group by user, time, and reward from the left table and aggregate the reward column from the right table. However, it seems that a groupBy followed by an orderBy is not guaranteed to maintain that order, so you should use a Window to be explicit.
from pyspark.sql import Window, functions as f
w = Window.partitionBy("user", "l.time", "l.reward").orderBy("r.time")
df = df.alias("l").join(df.alias("r"), on="user")\
.where("r.time>=l.time")\
.select(
"user",
f.col("l.time").alias("time"),
f.col("l.reward").alias("reward"),
f.collect_list("r.reward").over(w).alias("rewards")
)
df.show()
#+----+----+------+-----------+
#|user|time|reward| rewards|
#+----+----+------+-----------+
#| jo| 4| 6| [6]|
#| jo| 4| 6| [6, 7]|
#| jo| 5| 7| [7]|
#| bob| 3| 10| [10]|
#| bob| 3| 10| [10, 9]|
#| bob| 3| 10|[10, 9, 11]|
#| bob| 4| 9| [9]|
#| bob| 4| 9| [9, 11]|
#| bob| 5| 11| [11]|
#+----+----+------+-----------+
Now you have all of the elements required to compute your discounted_cum column.
Spark 2.1 and above:
You can use pyspark.sql.functions.posexplode to explode the rewards array along with the index in the list. This will make a new row for each value in the rewards array. Use distinct to drop the duplicates that were introduced by using the Window function (instead of a groupBy).
We'll call the index k and the reward rk. Now you can apply your function using pyspark.sql.functions.pow:
gamma = 0.5
df.select("user", "time", "reward", f.posexplode("rewards").alias("k", "rk"))\
.distinct()\
.withColumn("discounted", f.pow(f.lit(gamma), f.col("k"))*f.col("rk"))\
.groupBy("user", "time")\
.agg(f.first("reward").alias("reward"), f.sum("discounted").alias("discounted_cum"))\
.show()
#+----+----+------+--------------+
#|user|time|reward|discounted_cum|
#+----+----+------+--------------+
#| bob| 3| 10| 17.25|
#| bob| 4| 9| 14.5|
#| bob| 5| 11| 11.0|
#| jo| 4| 6| 9.5|
#| jo| 5| 7| 7.0|
#+----+----+------+--------------+
Older Versions of Spark
For older versions of Spark, you'll have to use row_number() - 1 to get the values for k after using explode:
df.select("user", "time", "reward", f.explode("rewards").alias("rk"))\
.distinct()\
.withColumn(
"k",
f.row_number().over(Window.partitionBy("user", "time").orderBy("time"))-1
)\
.withColumn("discounted", f.pow(f.lit(gamma), f.col("k"))*f.col("rk"))\
.groupBy("user", "time")\
.agg(f.first("reward").alias("reward"), f.sum("discounted").alias("discounted_cum"))\
.show()
#+----+----+------+--------------+
#|user|time|reward|discounted_cum|
#+----+----+------+--------------+
#| jo| 4| 6| 9.5|
#| jo| 5| 7| 7.0|
#| bob| 3| 10| 17.25|
#| bob| 4| 9| 14.5|
#| bob| 5| 11| 11.0|
#+----+----+------+--------------+
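If you would rather avoid the self-join entirely, here is a hedged sketch of a window-only alternative (not part of the answer above). It assumes Spark 2.4+ for the aggregate higher-order function, and that collect_list over an ordered window returns the rewards in frame order; starting from the bare (user, time, reward) DataFrame used in the answer, with gamma = 0.5 it should reproduce the discounted_cum column:
from pyspark.sql import Window, functions as F

gamma = 0.5
# frame = the current row and everything after it, per user, ordered by time
w_future = Window.partitionBy("user").orderBy("time") \
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)

df_discounted = (
    df.withColumn("future_rewards", F.collect_list("reward").over(w_future))
      .withColumn(
          "discounted_cum",
          # fold the future rewards, multiplying the running weight by gamma at each step
          F.expr(f"""
              aggregate(
                  future_rewards,
                  named_struct('acc', cast(0.0 AS double), 'g', cast(1.0 AS double)),
                  (s, r) -> named_struct('acc', s.acc + s.g * r, 'g', s.g * {gamma}),
                  s -> s.acc
              )
          """),
      )
      .drop("future_rewards")
)
df_discounted.orderBy("user", "time").show()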