How to convert this SQL query to PySpark?

SELECT A.* FROM df1 A
WHERE ID NOT IN (SELECT ID FROM df2)
This is the query I'm trying to convert.

Spark 2.2+ supports subqueries, so you can register the dataframes as temp views and run the same query with spark.sql (see the sketch after the examples below).
In the DataFrame API, use a left_anti join to mimic NOT IN.
Example:
df.show()
#+----+---+
#|name| id|
#+----+---+
#| a| 1|
#| c| 3|
#+----+---+
df1.show()
#+----+---+
#|name| id|
#+----+---+
#| a| 1|
#| b| 2|
#+----+---+
df.join(df1,'id','left_anti').show()
#+---+----+
#| id|name|
#+---+----+
#| 3| c|
#+---+----+
#only if the exclusion dataframe (df2 in your query, df1 here) has few id values
from pyspark.sql.functions import col
ids=df1.select("id").rdd.map(lambda x:x.id).collect()
#isin accepts only values, not columns from another dataframe
df.filter(~col("id").isin(ids)).show()
#+----+---+
#|name| id|
#+----+---+
#| c| 3|
#+----+---+
If the two dataframes have the same number of columns, you can use the exceptAll function:
df.exceptAll(df1).show()
#+----+---+
#|name| id|
#+----+---+
#| c| 3|
#+----+---+
For an IN subquery, use a left_semi join:
df.join(df1,'id','left_semi').show()
#+---+----+
#| id|name|
#+---+----+
#| 1| a|
#+---+----+
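For the temp-view route mentioned at the top, here is a minimal sketch (the view names df_a and df_b are made up for the example; note that NOT IN returns no rows if the subquery column contains nulls):
df.createOrReplaceTempView("df_a")
df1.createOrReplaceTempView("df_b")
spark.sql("""
    SELECT A.*
    FROM df_a A
    WHERE A.id NOT IN (SELECT id FROM df_b)
""").show()
#+----+---+
#|name| id|
#+----+---+
#|   c|  3|
#+----+---+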

Related

Pyspark sum of columns after union of dataframe

How can I sum all columns after unioning two dataframes?
I have this first df with one row per user:
df = sqlContext.createDataFrame([("2022-01-10", 3, 2,"a"),("2022-01-10",3,4,"b"),("2022-01-10", 1,3,"c")], ["date", "value1", "value2", "userid"])
df.show()
+----------+------+------+------+
| date|value1|value2|userid|
+----------+------+------+------+
|2022-01-10| 3| 2| a|
|2022-01-10| 3| 4| b|
|2022-01-10| 1| 3| c|
+----------+------+------+------+
The date value will always be today's date.
And I have another df, this time with multiple rows per userid, one value for each day:
df2 = sqlContext.createDataFrame([("2022-01-01", 13, 12,"a"),("2022-01-02",13,14,"b"),("2022-01-03", 11,13,"c"),
("2022-01-04", 3, 2,"a"),("2022-01-05",3,4,"b"),("2022-01-06", 1,3,"c"),
("2022-01-10", 31, 21,"a"),("2022-01-07",31,41,"b"),("2022-01-09", 11,31,"c")], ["date", "value3", "value4", "userid"])
df2.show()
+----------+------+------+------+
| date|value3|value4|userid|
+----------+------+------+------+
|2022-01-01| 13| 12| a|
|2022-01-02| 13| 14| b|
|2022-01-03| 11| 13| c|
|2022-01-04| 3| 2| a|
|2022-01-05| 3| 4| b|
|2022-01-06| 1| 3| c|
|2022-01-10| 31| 21| a|
|2022-01-07| 31| 41| b|
|2022-01-09| 11| 31| c|
+----------+------+------+------+
After unioning the two of them with this function, here is what I have:
import pyspark.sql.functions as f

def union_different_tables(df1, df2):
    columns_df1 = df1.columns
    columns_df2 = df2.columns
    data_types_df1 = [i.dataType for i in df1.schema.fields]
    data_types_df2 = [i.dataType for i in df2.schema.fields]
    for col, _type in zip(columns_df1, data_types_df1):
        if col not in df2.columns:
            df2 = df2.withColumn(col, f.lit(None).cast(_type))
    for col, _type in zip(columns_df2, data_types_df2):
        if col not in df1.columns:
            df1 = df1.withColumn(col, f.lit(None).cast(_type))
    union = df1.unionByName(df2)
    return union
+----------+------+------+------+------+------+
| date|value1|value2|userid|value3|value4|
+----------+------+------+------+------+------+
|2022-01-10| 3| 2| a| null| null|
|2022-01-10| 3| 4| b| null| null|
|2022-01-10| 1| 3| c| null| null|
|2022-01-01| null| null| a| 13| 12|
|2022-01-02| null| null| b| 13| 14|
|2022-01-03| null| null| c| 11| 13|
|2022-01-04| null| null| a| 3| 2|
|2022-01-05| null| null| b| 3| 4|
|2022-01-06| null| null| c| 1| 3|
|2022-01-10| null| null| a| 31| 21|
|2022-01-07| null| null| b| 31| 41|
|2022-01-09| null| null| c| 11| 31|
+----------+------+------+------+------+------+
What I want to get is the sum of all the columns coming from df2 (I have 10 of them in the real case) up to today's date, for each userid, so one row per user:
+----------+------+------+------+------+------+
| date|value1|value2|userid|value3|value4|
+----------+------+------+------+------+------+
|2022-01-10| 3| 2| a| 47 | 35 |
|2022-01-10| 3| 4| b| 47 | 59 |
|2022-01-10| 1| 3| c| 23 | 47 |
+----------+------+------+------+------+------+
Since I have to do this operation for multiple tables, here is what I tried:
from pyspark.sql import Window

user_window = Window.partitionBy(['userid']).orderBy('date')
list_tables = [df2]
list_col_original = df.columns
for table in list_tables:
    df = union_different_tables(df, table)
    list_column = list(set(table.columns) - set(list_col_original))
    list_col_original.extend(list_column)
    df = df.select('userid',
                   *[f.sum(f.col(col_name)).over(user_window).alias(col_name) for col_name in list_column])
df.show()
+------+------+------+
|userid|value4|value3|
+------+------+------+
| c| 13| 11|
| c| 16| 12|
| c| 47| 23|
| c| 47| 23|
| b| 14| 13|
| b| 18| 16|
| b| 59| 47|
| b| 59| 47|
| a| 12| 13|
| a| 14| 16|
| a| 35| 47|
| a| 35| 47|
+------+------+------+
But that gives me a sort of cumulative sum, and I didn't find a way to keep all the columns in the resulting df.
The catch is that I can't do any join! My dataframes are very large and any join takes too long to compute.
Do you know how I can fix my code to get the result I want?
After the union of df1 and df2, you can group by userid and sum all columns except date, for which you take the max.
Note that for the union part you can simply use DataFrame.unionByName: the shared columns must have the same data types, but the set of columns can differ:
df = df1.unionByName(df2, allowMissingColumns=True)
Then group by and agg:
import pyspark.sql.functions as F
result = df.groupBy("userid").agg(
F.max("date").alias("date"),
*[F.sum(c).alias(c) for c in df.columns if c not in ("date", "userid")]
)
result.show()
#+------+----------+------+------+------+------+
#|userid| date|value1|value2|value3|value4|
#+------+----------+------+------+------+------+
#| a|2022-01-10| 3| 2| 47| 35|
#| b|2022-01-10| 3| 4| 47| 59|
#| c|2022-01-10| 1| 3| 23| 47|
#+------+----------+------+------+------+------+
This assumes the second dataframe contains no dates later than today's date in the first one. Otherwise, you'll need to filter df2 before the union, for example as sketched below.
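A minimal sketch of that filter, assuming date is stored as a string like in the example (the cast and the cutoff choice are assumptions):
import pyspark.sql.functions as F

# keep only rows of df2 up to today's date, then union as before
df2_filtered = df2.filter(F.col("date").cast("date") <= F.current_date())
df = df1.unionByName(df2_filtered, allowMissingColumns=True)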

How to join two dataframes together

I have two dataframes.
One is coming from groupBy and the other is the total summary:
a = data.groupBy("bucket").agg(sum(data.total))
b = data.agg(sum(data.total))
I want to put the total from b onto dataframe a so that I can calculate the % for each bucket.
Do you know what kind of join I should use?
Use .crossJoin: you will get the total from b added to all rows of df a, and then you can calculate the percentage (see the sketch after the example output).
Example:
a.crossJoin(b).show()
#+------+----------+----------+
#|bucket|sum(total)|sum(total)|
#+------+----------+----------+
#| c| 4| 10|
#| b| 3| 10|
#| a| 3| 10|
#+------+----------+----------+
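Since both columns end up named sum(total), it helps to alias the aggregates before the crossJoin; a minimal sketch, assuming the source dataframe and column names from the question (bucket_total, grand_total and pct are made-up names):
import pyspark.sql.functions as F

a = data.groupBy("bucket").agg(F.sum("total").alias("bucket_total"))
b = data.agg(F.sum("total").alias("grand_total"))
# percentage of each bucket over the grand total
a.crossJoin(b).withColumn("pct", F.col("bucket_total") / F.col("grand_total")).show()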
Instead of a crossJoin, you can try using window functions as shown below.
df.show()
#+-----+------+
#|total|bucket|
#+-----+------+
#| 1| a|
#| 2| a|
#| 3| b|
#| 4| c|
#+-----+------+
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.window import *
import sys
w=Window.partitionBy(col("bucket"))
w1=Window.orderBy(lit("1")).rowsBetween(-sys.maxsize,sys.maxsize)
df.withColumn("sum_b",sum(col("total")).over(w)).withColumn("sum_c",sum(col("total")).over(w1)).show()
#+-----+------+-----+-----+
#|total|bucket|sum_b|sum_c|
#+-----+------+-----+-----+
#| 4| c| 4| 10|
#| 3| b| 3| 10|
#| 1| a| 3| 10|
#| 2| a| 3| 10|
#+-----+------+-----+-----+
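The percentage per bucket is then just a column division on top of that result (pct is an assumed column name):
df.withColumn("sum_b",sum(col("total")).over(w)) \
  .withColumn("sum_c",sum(col("total")).over(w1)) \
  .withColumn("pct",col("sum_b")/col("sum_c")) \
  .show()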
You can also use collect(), since you are only returning a single scalar value to the driver:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()
df = spark.sql("select 'A' as bucket, 5 as value union all select 'B' as bucket, 8 as value")
df_total = spark.sql("select 9 as total")
df=df.withColumn('total',lit(df_total.collect()[0]['total']))
+------+-----+-----+
|bucket|value|total|
+------+-----+-----+
| A| 5| 9|
| B| 8| 9|
+------+-----+-----+
df= df.withColumn('pourcentage', col('total') / col('value'))
+------+-----+-----+-----------+
|bucket|value|total|pourcentage|
+------+-----+-----+-----------+
| A| 5| 9| 1.8|
| B| 8| 9| 1.125|
+------+-----+-----+-----------+

Pyspark Dataframe Compare

I have 2 Spark dataframes with same number of columns.
DF1:
ID KEY
1 A
1 A
2 B
3 C
3 C
DF2:
ID KEY
1 A
1 A
1 A
2 B
3 C
3 C
4 D
5 E
5 E
I want to compare these 2 dataframes and write those records that are there in DF2 but not in DF1.
Expected output:
ID KEY
1 A
4 D
5 E
5 E
Use the .exceptAll function.
Example:
df1.show()
#+---+---+
#| ID|KEY|
#+---+---+
#| 1| A|
#| 1| A|
#| 2| B|
#| 3| c|
#| 3| c|
#+---+---+
df2.show()
#+---+---+
#| ID|KEY|
#+---+---+
#| 1| A|
#| 1| A|
#| 1| A|
#| 2| B|
#| 3| c|
#| 3| c|
#| 4| D|
#| 5| E|
#| 5| E|
#+---+---+
df2.exceptAll(df1).orderBy("ID").show()
#+---+---+
#| ID|KEY|
#+---+---+
#| 1| A|
#| 4| D|
#| 5| E|
#| 5| E|
#+---+---+

How to split data into groups in pyspark

I need to find groups in time series data.
Data sample
I need to output a group column based on value and day.
I've tried using lag, lead and row_number but it led nowhere.
It seems like you want to increment the group every time the value changes. If so, this is a kind of gaps-and-islands problem.
Here is one approach that uses lag() and a cumulative sum():
select
    value,
    day,
    sum(case when value = lag_value then 0 else 1 end) over (order by day) as grp
from (
    select t.*, lag(value) over (order by day) as lag_value
    from mytable t
) t
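Since the question is tagged pyspark, the same SQL can also be run by registering the dataframe as a temp view (the view name mytable is an assumption):
df.createOrReplaceTempView("mytable")
spark.sql("""
    select value, day,
           sum(case when value = lag_value then 0 else 1 end) over (order by day) as grp
    from (select t.*, lag(value) over (order by day) as lag_value from mytable t) t
""").show()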
The PySpark way to do this: find the group endpoints using lag, do an incremental sum over this lag flag to get the groups, and add 1 to get your desired group numbers.
from pyspark.sql.window import Window
from pyspark.sql import functions as F
w1=Window().orderBy("day")
df.withColumn("lag", F.when(F.lag("value").over(w1)!=F.col("value"), F.lit(1)).otherwise(F.lit(0)))\
.withColumn("group", F.sum("lag").over(w1) + 1).drop("lag").show()
#+-----+---+-----+
#|value|day|group|
#+-----+---+-----+
#| 1| 1| 1|
#| 1| 2| 1|
#| 1| 3| 1|
#| 1| 4| 1|
#| 1| 5| 1|
#| 2| 6| 2|
#| 2| 7| 2|
#| 1| 8| 3|
#| 1| 9| 3|
#| 1| 10| 3|
#| 1| 11| 3|
#| 1| 12| 3|
#| 1| 13| 3|
#+-----+---+-----+

SQL or Pyspark - Get the last time a column had a different value for each ID

I am using pyspark so I have tried both pyspark code and SQL.
I am trying to get the last time the ADDRESS column had a different value, grouped by USER_ID. The rows are ordered by TIME. Take the below table:
+---+-------+-------+----+
| ID|USER_ID|ADDRESS|TIME|
+---+-------+-------+----+
| 1| 1| A| 10|
| 2| 1| B| 15|
| 3| 1| A| 20|
| 4| 1| A| 40|
| 5| 1| A| 45|
+---+-------+-------+----+
The correct new column I would like is as below:
+---+-------+-------+----+---------+
| ID|USER_ID|ADDRESS|TIME|LAST_DIFF|
+---+-------+-------+----+---------+
| 1| 1| A| 10| null|
| 2| 1| B| 15| 10|
| 3| 1| A| 20| 15|
| 4| 1| A| 40| 15|
| 5| 1| A| 45| 15|
+---+-------+-------+----+---------+
I have tried using different windows but none ever seem to get exactly what I want. Any ideas?
A simplified version of #jxc's answer.
from pyspark.sql.functions import *
from pyspark.sql import Window
#Window definition
w = Window.partitionBy(col('user_id')).orderBy(col('id'))
#Getting the previous time and classifying rows into groups
grp_df = df.withColumn('grp',sum(when(lag(col('address')).over(w) == col('address'),0).otherwise(1)).over(w)) \
.withColumn('prev_time',lag(col('time')).over(w))
#Window definition with groups
w_grp = Window.partitionBy(col('user_id'),col('grp')).orderBy(col('id'))
grp_df.withColumn('last_addr_change_time',min(col('prev_time')).over(w_grp)).show()
Use lag with a running sum to assign groups whenever the column value changes (based on the defined window), and also grab the time from the previous row, which is used in the next step.
Once you have the groups, use the running minimum to get the timestamp of the last column value change. (I suggest looking at the intermediate results to understand the transformations better.)
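To line this up with the asked-for LAST_DIFF column, a short follow-up on grp_df from above (the column rename and drops are my own):
grp_df.withColumn('LAST_DIFF', min(col('prev_time')).over(w_grp)) \
      .drop('grp', 'prev_time') \
      .orderBy('ID').show()
#+---+-------+-------+----+---------+
#| ID|USER_ID|ADDRESS|TIME|LAST_DIFF|
#+---+-------+-------+----+---------+
#|  1|      1|      A|  10|     null|
#|  2|      1|      B|  15|       10|
#|  3|      1|      A|  20|       15|
#|  4|      1|      A|  40|       15|
#|  5|      1|      A|  45|       15|
#+---+-------+-------+----+---------+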
One way using two Window specs:
from pyspark.sql.functions import when, col, lag, sum as fsum
from pyspark.sql import Window
w1 = Window.partitionBy('USER_ID').orderBy('ID')
w2 = Window.partitionBy('USER_ID').orderBy('g')
# create a new sub-group label based on the values of ADDRESS and Previous ADDRESS
df1 = df.withColumn('g', fsum(when(col('ADDRESS') == lag('ADDRESS').over(w1), 0).otherwise(1)).over(w1))
# group by USER_ID and the above sub-group label and calculate the sum of time in the group as diff
# calculate the last_diff and then join the data back to the df1
df2 = df1.groupby('USER_ID', 'g').agg(fsum('Time').alias('diff')).withColumn('last_diff', lag('diff').over(w2))
df1.join(df2, on=['USER_ID', 'g']).show()
+-------+---+---+-------+----+----+---------+
|USER_ID| g| ID|ADDRESS|TIME|diff|last_diff|
+-------+---+---+-------+----+----+---------+
| 1| 1| 1| A| 10| 10| null|
| 1| 2| 2| B| 15| 15| 10|
| 1| 3| 3| A| 20| 105| 15|
| 1| 3| 4| A| 40| 105| 15|
| 1| 3| 5| A| 45| 105| 15|
+-------+---+---+-------+----+----+---------+
df_new = df1.join(df2, on=['USER_ID', 'g']).drop('g', 'diff')