N conditions in a dynamic 'when' clause - dataframe

I have the following code
from pyspark.sql.functions import col, count, when
from functools import reduce
df = spark.createDataFrame([ (1,""), (2,None),(3,"c"),(4,"d") ], ['id','name'])
filter1 = col("name").isNull()
filter2 = col("name") == ""
dfresult = df.filter(filter1 | filter2).select(col("id"), when(filter1, "name is null").when(filter2, "name is empty").alias("new_col"))
dfresult.show()
+---+-------------+
| id| new_col|
+---+-------------+
| 1|name is empty|
| 2| name is null|
+---+-------------+
For the scenario with N filters, I'm thinking of something like this:
filters = []
filters.append({ "item": filter1, "msg":"name is null"})
filters.append({ "item": filter2, "msg":"name is empty"})
dynamic_filter = reduce(
    lambda x, y: x | y,
    [s["item"] for s in filters]
)
df2 = df.filter(dynamic_filter).select(col("id"), when(filter1, "name is null").when(filter2, "name is empty").alias("new_col"))
df2.show()
How can I build the new_col column with a dynamic when chain instead of hard-coding each condition?

Simply use functools.reduce, as you already did for the filter expression:
from functools import reduce
from pyspark.sql import functions as F
new_col = reduce(
    lambda acc, x: acc.when(x["item"], F.lit(x["msg"])),
    filters,
    F  # the functions module itself, so the first call in the fold is F.when(...)
)
df2 = df.filter(dynamic_filter).select(col("id"), new_col.alias("new_col"))
df2.show()
#+---+-------------+
#| id| new_col|
#+---+-------------+
#| 1|name is empty|
#| 2| name is null|
#+---+-------------+
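If you also want a default label for rows that match none of the conditions (not needed here, since the filter uses the same conditions, but useful in general), you can append otherwise to the reduced column. A small sketch building on the code above, with an illustrative default message:
new_col = reduce(
    lambda acc, x: acc.when(x["item"], F.lit(x["msg"])),
    filters,
    F
).otherwise(F.lit("no match"))  # "no match" is just an example default label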

Related

How to retain the preceding updated row values in PySpark and use it in the next row calculation?

The conditions below need to be applied to the RANK and RANKA columns.
Input table:
Condition for RANK column:
IF RANK == 0: then RANK = previous RANK value + 1;
else: RANK = RANK
Condition for RANKA column:
IF RANKA == 0: then RANKA = previous RANKA value + current row Salary value;
else: RANKA = RANKA
Below is a piece of code that I tried.
I have created dummy columns named RANK_new and RANKA_new to store the desired outputs of the RANK and RANKA columns after applying the conditions.
Once I get the correct values, I can replace the RANK and RANKA columns with those dummy columns.
# importing necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import when, col, lit, lag, lead

# function to create a new SparkSession
def create_session():
    spk = SparkSession.builder \
        .master("local") \
        .appName("employee_profile.com") \
        .getOrCreate()
    return spk

def create_df(spark, data, schema):
    df1 = spark.createDataFrame(data, schema)
    return df1

if __name__ == "__main__":
    # calling function to create SparkSession
    spark = create_session()

    input_data = [(1, "Shivansh", "Data Scientist", 2, 1, 1, 2),
                  (0, "Rishabh", "Software Developer", 5, 2, 0, 3),
                  (0, "Swati", "Data Analyst", 10, 3, 10, 4),
                  (1, "Amar", "Data Analyst", 2, 4, 9, 0),
                  (0, "Arpit", "Android Developer", 3, 5, 0, 0),
                  (0, "Ranjeet", "Python Developer", 4, 6, 0, 0),
                  (0, "Priyanka", "Full Stack Developer", 5, 7, 0, 0)]
    schema = ["Id", "Name", "Job Profile", "Salary", 'hi', 'RANK', 'RANKA']

    # calling function to create dataframe
    dff = create_df(spark, input_data, schema)

    # Below 3 lines for RANK
    df1 = dff.repartition(1)
    df2 = df1.withColumn('RANK_new', when(col('RANK') == 0, lag(col('RANK') + lit(1)).over(Window.orderBy(col('hi')))).otherwise(col('RANK')))
    df2 = df2.withColumn('RANK_new', when((col('RANK') == 0) & (lag(col('RANK')).over(Window.orderBy(col('hi'))) == 0), lag(col('RANK_new') + lit(1)).over(Window.orderBy(col('hi')))).otherwise(col('RANK_new')))

    # Below line for RANKA
    df2 = df2.withColumn('RANKA_new', when(col('RANKA') == 0, lag(col("RANKA")).over(Window.orderBy("hi")) + col("Salary")).otherwise(col('RANKA')))
    df2.show()
The issue with this code is that the lag function is not taking the updated values of the previous rows.
This could be done with a for loop, but since my data is so huge, I need a solution without one.
Below is the desired output (the original question showed it as a summarized picture comparing the output I got with the desired output):
RANK_new, RANKA_new --> the output I got for the RANK and RANKA columns after applying the above code
RANK_desired, RANKA_desired --> what is expected to be produced
You can first create group columns to partition by, one for RANK and one for RANKA. Then a cumulative sum inside those partitions should work.
Input
from pyspark.sql import functions as F, Window as W
input_data = [(1, "Shivansh", "Data Scientist", 2, 1, 1, 2),
              (0, "Rishabh", "Software Developer", 5, 2, 0, 3),
              (0, "Swati", "Data Analyst", 10, 3, 10, 4),
              (1, "Amar", "Data Analyst", 2, 4, 9, 0),
              (0, "Arpit", "Android Developer", 3, 5, 0, 0),
              (0, "Ranjeet", "Python Developer", 4, 6, 0, 0),
              (0, "Priyanka", "Full Stack Developer", 5, 7, 0, 0)]
schema = ["Id", "Name", "Job Profile", "Salary",'hi','RANK','RANKA']
dff = spark.createDataFrame(input_data, schema)
Script:
w0 = W.orderBy('hi')
rank_grp = F.when(F.col('RANK') != 0, 1).otherwise(0)
dff = dff.withColumn('RANK_grp', F.sum(rank_grp).over(w0))
w1 = W.partitionBy('RANK_grp').orderBy('hi')
ranka_grp = F.when(F.col('RANKA') != 0, 1).otherwise(0)
dff = dff.withColumn('RANKA_grp', F.sum(ranka_grp).over(w0))
w2 = W.partitionBy('RANKA_grp').orderBy('hi')
dff = (dff
    .withColumn('RANK_new', F.sum(F.when(F.col('RANK') == 0, 1).otherwise(F.col('RANK'))).over(w1))
    .withColumn('RANKA_new', F.sum(F.when(F.col('RANKA') == 0, F.col('Salary')).otherwise(F.col('RANKA'))).over(w2))
    .drop('RANK_grp', 'RANKA_grp')
)
dff.show()
# +---+--------+--------------------+------+---+----+-----+--------+---------+
# | Id| Name| Job Profile|Salary| hi|RANK|RANKA|RANK_new|RANKA_new|
# +---+--------+--------------------+------+---+----+-----+--------+---------+
# | 1|Shivansh| Data Scientist| 2| 1| 1| 2| 1| 2|
# | 0| Rishabh| Software Developer| 5| 2| 0| 3| 2| 3|
# | 0| Swati| Data Analyst| 10| 3| 10| 4| 10| 4|
# | 1| Amar| Data Analyst| 2| 4| 9| 0| 9| 6|
# | 0| Arpit| Android Developer| 3| 5| 0| 0| 10| 9|
# | 0| Ranjeet| Python Developer| 4| 6| 0| 0| 11| 13|
# | 0|Priyanka|Full Stack Developer| 5| 7| 0| 0| 12| 18|
# +---+--------+--------------------+------+---+----+-----+--------+---------+
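If it helps to understand the trick, the helper group columns can be kept and inspected instead of dropped. A small sketch reusing the rank_grp, ranka_grp and w0 defined above (each non-zero RANK / RANKA value starts a new group):
dff.withColumn('RANK_grp', F.sum(rank_grp).over(w0)) \
   .withColumn('RANKA_grp', F.sum(ranka_grp).over(w0)) \
   .select('Name', 'RANK', 'RANK_grp', 'RANKA', 'RANKA_grp') \
   .show()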

Replacing column values in PySpark by iterating through a list

I have a pyspark data frame as
+---+----+-----+----+
| ID|colA| colB|colC|
+---+----+-----+----+
|ID1|   3| 5.85|  LB|
|ID2|   4|12.67|  RF|
|ID3|   2|20.78| LCM|
|ID4|   1|    2| LWB|
|ID5|   6|    3|  LF|
|ID6|   7|    4|  LM|
|ID7|   8|    5|  RS|
+---+----+-----+----+
My goal is to replace the values in colC: the values LB, LWB, LF should be replaced with x, and so on, as shown below.
x = ['LB','LWB','LF']
y = ['RF','LCM']
z = ['LM','RS']
Currently I'm able to achieve this by replacing each of the values manually, as in the code below:
# Replacing the values LB, LWB, LF with x
df_new = df.withColumn('ColC',f.when((f.col('ColC') == 'LB')|(f.col('ColC') == 'LWB')|(f.col('ColC') == 'LF'),'x').otherwise(df.ColC))
My question is: how can we replace the values of a column (colC in my example) dynamically, by iterating through the lists (x, y, z) at once, using PySpark? What is the time complexity involved? Also, how can we truncate the decimal values in colB to 1 decimal place?
You can coalesce the when statements if you have many conditions to match. You can also use a dictionary to hold the value mappings, and construct the when statements dynamically with a list comprehension over the dictionary items. As for rounding to 1 decimal place, you can use round.
import pyspark.sql.functions as F

xyz_dict = {'x': ['LB', 'LWB', 'LF'],
            'y': ['RF', 'LCM'],
            'z': ['LM', 'RS']}

df2 = df.withColumn(
    'colC',
    F.coalesce(*[F.when(F.col('colC').isin(v), k) for (k, v) in xyz_dict.items()])
).withColumn(
    'colB',
    F.round('colB', 1)
)
df2.show()
+---+----+----+----+
| ID|colA|colB|colC|
+---+----+----+----+
|ID1| 3| 5.9| x|
|ID2| 4|12.7| y|
|ID3| 2|20.8| y|
|ID4| 1| 2.0| x|
|ID5| 6| 3.0| x|
|ID6| 7| 4.0| z|
|ID7| 8| 5.0| z|
+---+----+----+----+
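Note that with this approach, any value of colC that is not in one of the lists becomes null, because none of the when branches match. If you would rather keep the original value in that case (an assumption about the desired behaviour, not part of the question), append the original column as the last argument to coalesce:
df2 = df.withColumn(
    'colC',
    F.coalesce(*[F.when(F.col('colC').isin(v), k) for k, v in xyz_dict.items()],
               F.col('colC'))  # fall back to the original value when nothing matches
)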
You can use replace on the dataframe to replace the values in colC by passing a dict object for the mappings, and the round function to limit the number of decimals in colB:
from pyspark.sql import functions as F
replacement = {
"LB": "x", "LWB": "x", "LF": "x",
"RF": "y", "LCM": "y",
"LM": "z", "RS": "z"
}
df1 = df.replace(replacement, ["colC"]).withColumn("colB", F.round("colB", 1))
df1.show()
#+---+----+----+----+
#| ID|colA|colB|colC|
#+---+----+----+----+
#|ID1| 3| 5.9| x|
#|ID2| 4|12.7| y|
#|ID3| 2|20.8| y|
#|ID4| 1| 2.0| x|
#|ID5| 6| 3.0| x|
#|ID6| 7| 4.0| z|
#|ID7| 8| 5.0| z|
#+---+----+----+----+
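If you already have the lists from the question, the mapping dict for replace can be built from them rather than written out by hand. A small sketch, assuming the same x, y, z lists:
groups = {"x": ["LB", "LWB", "LF"], "y": ["RF", "LCM"], "z": ["LM", "RS"]}
replacement = {value: label for label, values in groups.items() for value in values}
df1 = df.replace(replacement, ["colC"]).withColumn("colB", F.round("colB", 1))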
You can also use the isin function:
from pyspark.sql.functions import col, when
x = ['LB','LWB','LF']
y = ['LCM','RF']
z = ['LM','RS']
df = df.withColumn('ColC', when(col('colC').isin(x), "x")
                   .otherwise(when(col('colC').isin(y), "y")
                   .otherwise(when(col('colC').isin(z), "z")
                   .otherwise(df.ColC))))
If you have a few lists, each with many values, the complexity this way is lower than in blackbishop's answer, but for this problem his answer is easier.
You can also try a regular expression with regexp_replace:
import pyspark.sql.functions as f
replacements = [
    ("(LB)|(LWB)|(LF)", "x"),
    ("(LCM)|(RF)", "y"),
    ("(LM)|(RS)", "z")
]
for x, y in replacements:
    df = df.withColumn("colC", f.regexp_replace("colC", x, y))

Is there any method to find the number of columns having data in a PySpark data frame?

I have a PySpark data frame that has 7 columns. I have to add a new column named "sum" and calculate, for each row, the number of columns that have data (not null) into that sum column. An example data frame was attached in which the yellow-highlighted part is the required answer.
This sum can be calculated like this:
df = spark.createDataFrame([
    (1, "a", "xxx", None, "abc", "xyz", "fgh"),
    (2, "b", None, 3, "abc", "xyz", "fgh"),
    (3, "c", "a23", None, None, "xyz", "fgh")
], ("ID", "flag", "col1", "col2", "col3", "col4", "col5"))
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
df2 = df.withColumn("sum",sum([(~F.isnull(df[col])).cast(IntegerType()) for col in df.columns]))
df2.show()
+---+----+----+----+----+----+----+---+
| ID|flag|col1|col2|col3|col4|col5|sum|
+---+----+----+----+----+----+----+---+
| 1| a| xxx|null| abc| xyz| fgh| 6|
| 2| b|null| 3| abc| xyz| fgh| 6|
| 3| c| a23|null|null| xyz| fgh| 5|
+---+----+----+----+----+----+----+---+
Hope this helps!
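A side note on why this works: adding two Column objects with + yields a new Column expression, so Python's builtin sum simply folds the per-column null checks together. If you prefer the reduction spelled out explicitly, an equivalent sketch (same dataframe as above):
from functools import reduce
from operator import add
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

df2 = df.withColumn(
    "sum",
    reduce(add, [(~F.isnull(df[c])).cast(IntegerType()) for c in df.columns])
)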

Using Spark SQL and Spark dataframes, how can we find the COLUMN NAME based on the minimum value in a row?

I have a dataframe df. It has 4 columns:
+-------+-------+-------+-------+
| dist1 | dist2 | dist3 | dist4 |
+-------+-------+-------+-------+
| 42 | 53 | 24 | 17 |
+-------+-------+-------+-------+
The output I want is:
dist4
It seems easy, but I did not find any proper solution using a dataframe or a Spark SQL query.
You may use the least function, as in:
select least(dist1,dist2,dist3,dist4) as min_dist
from yourTable;
For the opposite case, greatest may be used.
EDIT:
To detect column names, the following may be used to get rows:
select inline(array(struct(42, 'dist1'), struct(53, 'dist2'),
struct(24, 'dist3'), struct(17, 'dist4') ))
42 dist1
53 dist2
24 dist3
17 dist4
and then min function may be applied to get dist4
Try this,
df.show
+---+---+---+---+
| A| B| C| D|
+---+---+---+---+
| 1| 2| 3| 4|
| 5| 4| 3| 1|
+---+---+---+---+
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
val temp_df = df.columns.foldLeft(df) { (acc: DataFrame, colName: String) => acc.withColumn(colName, concat(col(colName), lit("," + colName))) }
val minval = udf((ar: Seq[String]) => ar.min.split(",")(1))
val result = temp_df.withColumn("least", split(concat_ws(":", temp_df.columns.map(col(_)): _*), ":")).withColumn("least_col", minval(col("least")))
result.show
+---+---+---+---+--------------------+---------+
| A| B| C| D| least|least_col|
+---+---+---+---+--------------------+---------+
|1,A|2,B|3,C|4,D|[1,A, 2,B, 3,C, 4,D]| A|
|5,A|4,B|3,C|1,D|[5,A, 4,B, 3,C, 1,D]| D|
+---+---+---+---+--------------------+---------+
RDD way and without udf()s.
scala> import org.apache.spark.sql.functions._
scala> import org.apache.spark.sql.types.{StructField, StringType}
scala> import org.apache.spark.sql.Row
scala> import scala.collection.mutable.WrappedArray
scala> val df = Seq((1,2,3,4),(5,4,3,1)).toDF("A","B","C","D")
df: org.apache.spark.sql.DataFrame = [A: int, B: int ... 2 more fields]
scala> val df2 = df.withColumn("arr", array(df.columns.map(col(_)):_*))
df2: org.apache.spark.sql.DataFrame = [A: int, B: int ... 3 more fields]
scala> val rowarr = df.columns
rowarr: Array[String] = Array(A, B, C, D)
scala> val rdd1 = df2.rdd.map( x=> {val p = x.getAs[WrappedArray[Int]]("arr").toArray; val q=rowarr(p.indexWhere(_==p.min));Row.merge(x,Row(q)) })
rdd1: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[83] at map at <console>:47
scala> spark.createDataFrame(rdd1,df2.schema.add(StructField("mincol",StringType))).show
+---+---+---+---+------------+------+
| A| B| C| D| arr|mincol|
+---+---+---+---+------------+------+
| 1| 2| 3| 4|[1, 2, 3, 4]| A|
| 5| 4| 3| 1|[5, 4, 3, 1]| D|
+---+---+---+---+------------+------+
scala>
you can do something like,
import org.apache.spark.sql.functions._
val cols = df.columns
val u1 = udf((s: Seq[Int]) => cols(s.zipWithIndex.min._2))
df.withColumn("res", u1(array("*")))
You could access the row's schema, retrieve the list of field names from it, access the row's values by name, and then figure it out that way.
See: https://spark.apache.org/docs/2.3.2/api/scala/index.html#org.apache.spark.sql.Row
It would look roughly like this
dataframe.map(
  row => {
    val schema = row.schema
    val fieldNames: List[String] = ??? // extract names from schema
    fieldNames.foldLeft(("", 0))(???)  // retrieve each field value by its name and retain the minimum
  }
)
This would yield a Dataset[String]
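For completeness, the same idea can be written against the DataFrame API in PySpark. A minimal sketch (my addition, assuming a dataframe df with the four dist columns): compute the row-wise least and pick the name of the first column that equals it.
from pyspark.sql import functions as F

cols = ["dist1", "dist2", "dist3", "dist4"]
min_val = F.least(*[F.col(c) for c in cols])
# first column whose value equals the row minimum
min_col_expr = F.coalesce(*[F.when(F.col(c) == min_val, F.lit(c)) for c in cols])
df.withColumn("min_col", min_col_expr).show()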

Including null values in an Apache Spark Join

I would like to include null values in an Apache Spark join. Spark doesn't include rows with null by default.
Here is the default Spark behavior.
val numbersDf = Seq(
  ("123"),
  ("456"),
  (null),
  ("")
).toDF("numbers")
val lettersDf = Seq(
  ("123", "abc"),
  ("456", "def"),
  (null, "zzz"),
  ("", "hhh")
).toDF("numbers", "letters")
val joinedDf = numbersDf.join(lettersDf, Seq("numbers"))
Here is the output of joinedDf.show():
+-------+-------+
|numbers|letters|
+-------+-------+
| 123| abc|
| 456| def|
| | hhh|
+-------+-------+
This is the output I would like:
+-------+-------+
|numbers|letters|
+-------+-------+
| 123| abc|
| 456| def|
| | hhh|
| null| zzz|
+-------+-------+
Spark provides a special NULL safe equality operator:
numbersDf
.join(lettersDf, numbersDf("numbers") <=> lettersDf("numbers"))
.drop(lettersDf("numbers"))
+-------+-------+
|numbers|letters|
+-------+-------+
| 123| abc|
| 456| def|
| null| zzz|
| | hhh|
+-------+-------+
Be careful not to use it with Spark 1.5 or earlier. Prior to Spark 1.6 it required a Cartesian product (SPARK-11111 - Fast null-safe join).
In Spark 2.3.0 or later you can use Column.eqNullSafe in PySpark:
numbers_df = sc.parallelize([
("123", ), ("456", ), (None, ), ("", )
]).toDF(["numbers"])
letters_df = sc.parallelize([
("123", "abc"), ("456", "def"), (None, "zzz"), ("", "hhh")
]).toDF(["numbers", "letters"])
numbers_df.join(letters_df, numbers_df.numbers.eqNullSafe(letters_df.numbers))
+-------+-------+-------+
|numbers|numbers|letters|
+-------+-------+-------+
| 456| 456| def|
| null| null| zzz|
| | | hhh|
| 123| 123| abc|
+-------+-------+-------+
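The result above keeps both numbers columns. If you only want one of them, the right-hand copy can be dropped; a small follow-up sketch with the same dataframes:
numbers_df.join(
    letters_df, numbers_df.numbers.eqNullSafe(letters_df.numbers)
).drop(letters_df.numbers).show()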
and %<=>% in SparkR:
numbers_df <- createDataFrame(data.frame(numbers = c("123", "456", NA, "")))
letters_df <- createDataFrame(data.frame(
numbers = c("123", "456", NA, ""),
letters = c("abc", "def", "zzz", "hhh")
))
head(join(numbers_df, letters_df, numbers_df$numbers %<=>% letters_df$numbers))
  numbers numbers letters
1     456     456     def
2    <NA>    <NA>     zzz
3                     hhh
4     123     123     abc
With SQL (Spark 2.2.0+) you can use IS NOT DISTINCT FROM:
SELECT * FROM numbers JOIN letters
ON numbers.numbers IS NOT DISTINCT FROM letters.numbers
This can be used with the DataFrame API as well:
numbersDf.alias("numbers")
.join(lettersDf.alias("letters"))
.where("numbers.numbers IS NOT DISTINCT FROM letters.numbers")
val numbers2 = numbersDf.withColumnRenamed("numbers","num1") //rename columns so that we can disambiguate them in the join
val letters2 = lettersDf.withColumnRenamed("numbers","num2")
val joinedDf = numbers2.join(letters2, $"num1" === $"num2" || ($"num1".isNull && $"num2".isNull) ,"outer")
joinedDf.select("num1","letters").withColumnRenamed("num1","numbers").show //rename the columns back to the original names
Based on K L's idea, you could use foldLeft to generate join column expression:
def nullSafeJoin(rightDF: DataFrame, columns: Seq[String], joinType: String)(leftDF: DataFrame): DataFrame = {
  val colExpr: Column = leftDF(columns.head) <=> rightDF(columns.head)
  val fullExpr = columns.tail.foldLeft(colExpr) {
    (colExpr, p) => colExpr && leftDF(p) <=> rightDF(p)
  }
  leftDF.join(rightDF, fullExpr, joinType)
}
Then you can call this function like:
aDF.transform(nullSafeJoin(bDF, columns, joinType))
Complementing the other answers: for PySpark < 2.3.0 you have neither Column.eqNullSafe nor IS NOT DISTINCT FROM.
You can still build the <=> operator with an SQL expression and include it in the join, as long as you define aliases for the joined dataframes:
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

numbers_df = spark.createDataFrame(["123", "456", None, ""], StringType()).toDF("numbers")
letters_df = spark.createDataFrame([("123", "abc"), ("456", "def"), (None, "zzz"), ("", "hhh")]) \
    .toDF("numbers", "letters")
joined_df = numbers_df.alias("numbers").join(
    letters_df.alias("letters"),
    F.expr('numbers.numbers <=> letters.numbers')
).select('letters.*')
joined_df.show()
+-------+-------+
|numbers|letters|
+-------+-------+
| 456| def|
| null| zzz|
| | hhh|
| 123| abc|
+-------+-------+
Based on timothyzhang's idea one can further improve it by removing duplicate columns:
def dropDuplicateColumns(df: DataFrame, rightDf: DataFrame, cols: Seq[String]): DataFrame =
  cols.foldLeft(df)((df, c) => df.drop(rightDf(c)))

def joinTablesWithSafeNulls(rightDF: DataFrame, leftDF: DataFrame, columns: Seq[String], joinType: String): DataFrame = {
  val colExpr: Column = leftDF(columns.head) <=> rightDF(columns.head)
  val fullExpr = columns.tail.foldLeft(colExpr) {
    (colExpr, p) => colExpr && leftDF(p) <=> rightDF(p)
  }
  val finalDF = leftDF.join(rightDF, fullExpr, joinType)
  val filteredDF = dropDuplicateColumns(finalDF, rightDF, columns)
  filteredDF
}
Try the following method to include the null rows in the result of the JOIN operator:
def nullSafeJoin(leftDF: DataFrame, rightDF: DataFrame, columns: Seq[String], joinType: String): DataFrame = {
  var columnsExpr: Column = leftDF(columns.head) <=> rightDF(columns.head)
  columns.drop(1).foreach(column => {
    columnsExpr = columnsExpr && (leftDF(column) <=> rightDF(column))
  })
  var joinedDF: DataFrame = leftDF.join(rightDF, columnsExpr, joinType)
  columns.foreach(column => {
    joinedDF = joinedDF.drop(leftDF(column))
  })
  joinedDF
}
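For a PySpark version of the same multi-column null-safe join helper, here is a minimal sketch (my addition; the function name and signature are illustrative, and it requires Spark 2.3+ for eqNullSafe):
from functools import reduce
from pyspark.sql import DataFrame

def null_safe_join(left_df: DataFrame, right_df: DataFrame, columns, join_type="inner") -> DataFrame:
    # AND together one null-safe equality per join column
    cond = reduce(
        lambda acc, c: acc & left_df[c].eqNullSafe(right_df[c]),
        columns[1:],
        left_df[columns[0]].eqNullSafe(right_df[columns[0]]),
    )
    joined = left_df.join(right_df, cond, join_type)
    # drop the right-hand copies of the join columns so each appears only once
    return reduce(lambda df, c: df.drop(right_df[c]), columns, joined)
It could be called, for example, as null_safe_join(numbers_df, letters_df, ["numbers"]).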