Is there a way in pyspark to count unique values - dataframe

I have a spark dataframe (12m x 132) and I am trying to calculate the number of unique values by column, and remove columns that have only 1 unique value.
So far, I have used the pandas nunique function as such:
import pandas as pd
df = sql_dw.read_table(<table>)
df_p = df.toPandas()
# Number of distinct values in every column of the pandas copy.
nun = df_p.nunique(axis=0)
nundf = pd.DataFrame({'atr': nun.index, 'countU': nun.values})
# Columns holding exactly one distinct value are the ones to remove.
dropped = [name for name, n_unique in nundf.values if n_unique == 1]
# Drop them from the Spark DataFrame in a single call.
df = df.drop(*dropped)
print(dropped)
Is there a way to do this that is more native to spark - i.e. not using pandas?

Please have a look at the commented example below. The solution requires more Python knowledge than PySpark-specific knowledge.
import pyspark.sql.functions as F

# Demo data: 'asin' is constant, while 'ctx' and 'fo' take several values.
columns = ['asin', 'ctx', 'fo']
rows = [
    ('ASIN1', 'CTX1', 'FO1'),
    ('ASIN1', 'CTX1', 'FO1'),
    ('ASIN1', 'CTX1', 'FO2'),
    ('ASIN1', 'CTX2', 'FO1'),
    ('ASIN1', 'CTX2', 'FO2'),
    ('ASIN1', 'CTX2', 'FO2'),
    ('ASIN1', 'CTX2', 'FO3'),
    ('ASIN1', 'CTX3', 'FO1'),
    ('ASIN1', 'CTX3', 'FO3'),
]
df = spark.createDataFrame(rows, columns)
df.show()

# One countDistinct aggregate per column, each aliased to the column's name.
expr = [F.countDistinct(c).alias(c) for c in df.columns]

# Evaluating all aggregates in one select yields a single-row DataFrame.
countdf = df.select(*expr)
countdf.show()

# Pull that single row to the driver and keep the columns whose count is 1.
distinct_counts = countdf.collect()[0].asDict()
cols2drop = [name for name, n_distinct in distinct_counts.items() if n_distinct == 1]
df.drop(*cols2drop).show()
Output:
+-----+----+---+
| asin| ctx| fo|
+-----+----+---+
|ASIN1|CTX1|FO1|
|ASIN1|CTX1|FO1|
|ASIN1|CTX1|FO2|
|ASIN1|CTX2|FO1|
|ASIN1|CTX2|FO2|
|ASIN1|CTX2|FO2|
|ASIN1|CTX2|FO3|
|ASIN1|CTX3|FO1|
|ASIN1|CTX3|FO3|
+-----+----+---+
+----+---+---+
|asin|ctx| fo|
+----+---+---+
| 1| 3| 3|
+----+---+---+
+----+---+
| ctx| fo|
+----+---+
|CTX1|FO1|
|CTX1|FO1|
|CTX1|FO2|
|CTX2|FO1|
|CTX2|FO2|
|CTX2|FO2|
|CTX2|FO3|
|CTX3|FO1|
|CTX3|FO3|
+----+---+

My apologies as I don't have the solution in pyspark but in pure spark, which may be transferable or used in case you can't find a pyspark way.
You can create a blank list and then using a foreach, check which columns have a distinct count of 1, then append them to the blank list.
From there you can use the list as a filter and drop those columns from your dataframe.
// Names of the columns that contain exactly one distinct value.
// Each column triggers its own distinct/count job.
val list_of_columns: List[String] =
  df_p.columns.filter(c => df_p.select(c).distinct.count == 1).toList

// Drop all of those columns from the DataFrame in one call.
val df_p_new = df_p.drop(list_of_columns: _*)

you can group your df by that column and count distinct value of this column:
# Group the rows by "column_name" and, per group, count the distinct values of that same column.
# NOTE(review): each group holds a single value of its grouping column, so distinct_count
# looks like it can never exceed 1 - confirm this does what the answer intends.
df = df.groupBy("column_name").agg(countDistinct("column_name").alias("distinct_count"))
And then filter your df by row which has more than 1 distinct_count:
# Keep only the rows whose distinct_count is greater than 1.
df = df.filter(df.distinct_count > 1)

Related

How to add multiple column dynamically based on filter condition

I am trying to create multiple columns dynamically based on filter condition after comparing two data frame with below code
source_df
+---+-----+-----+------+
|key|val11|val12|date  |
+---+-----+-----+------+
|abc|  1.1| john|2-3-21|
|def|  3.0| dani|2-2-21|
+---+-----+-----+------+
dest_df
+---+-----+-----+------+
|key|val11|val12|date  |
+---+-----+-----+------+
|abc|  2.1| jack|2-3-21|
|def|  3.0| dani|2-2-21|
+---+-----+-----+------+
# Every column except the first one ('key') is compared between the two frames.
columns = source_df.columns[1:]
joined_df = source_df.join(dest_df, 'key', 'full')
for column in columns:
    column_name = "difference_in_" + str(column)
    # `report` is rebuilt from `joined_df` on every pass, so after the loop it
    # only contains the difference column added in the final iteration.
    report = (
        joined_df
        .filter(source_df[column] != dest_df[column])
        .withColumn(
            column_name,
            F.concat(
                F.lit('[src:'), source_df[column],
                F.lit(',dst:'), dest_df[column],
                F.lit(']'),
            ),
        )
    )
The output I expect is
#Expected
+---+-----------------+------------------+
|key| difference_in_val11| difference_in_val12 |
+---+-----------------+------------------+
|abc|[src:1.1,dst:2.1]|[src:john,dst:jack]|
+---+-----------------+-------------------+
I get only one column result
#Actual
+---+-----------------+-
|key| difference_in_val12 |
+---+-----------------+-|
|abc|[src:john,dst:jack]|
+---+-----------------+-
How to generate multiple columns based on filter condition dynamically?
Dataframes are immutable objects. Having said that, you need to create another dataframe using the one that got generated in the 1st iteration. Something like below -
from pyspark.sql import functions as F

# Every column except the first one ('key') is compared between the two frames.
columns = source_df.columns[1:]
joined_df = source_df.join(dest_df, 'key', 'full')

# DataFrames are immutable: each step must build on the result of the previous
# one, otherwise the columns added earlier are lost.
report = joined_df
for column in columns:
    column_name = "difference_in_" + str(column)
    # The filters accumulate, so only rows that differ in every compared
    # column remain in the final result.
    report = (
        report
        .filter(source_df[column] != dest_df[column])
        .withColumn(
            column_name,
            F.concat(
                F.lit('[src:'), source_df[column],
                F.lit(',dst:'), dest_df[column],
                F.lit(']'),
            ),
        )
    )

report1 = report
report1.show()
Output -
+---+-----+-----+-----+-----+-------------------+-------------------+
|key|val11|val12|val11|val12|difference_in_val11|difference_in_val12|
+---+-----+-----+-----+-----+-------------------+-------------------+
|abc| 1.1| john| 2.1| jack| [src:1.1,dst:2.1]|[src:john,dst:jack]|
+---+-----+-----+-----+-----+-------------------+-------------------+
You could also do this with a union of both dataframes and then collect list only if collect_set size is greater than 1 , this can avoid joining the dataframes:
from pyspark.sql import functions as F

# Columns to compare: everything except the join key.
cols = source_df.drop("key").columns

# Tag every row with the frame it came from before stacking the two frames.
tagged_src = source_df.withColumn("ref", F.lit("src:"))
tagged_dst = dest_df.withColumn("ref", F.lit("dst:"))

# Per column: when a key has more than one distinct value, collect the tagged
# values; otherwise the expression yields null.
diff_exprs = [
    F.when(
        F.size(F.collect_set(i)) > 1,
        F.collect_list(F.concat("ref", i)),
    ).alias(i)
    for i in cols
]

output = (
    tagged_src
    .unionByName(tagged_dst)
    .groupBy("key")
    .agg(*diff_exprs)
    # Discard keys where no compared column differs.
    .dropna(subset=cols, how='all')
)
output.show()
+---+------------------+--------------------+
|key| val11| val12|
+---+------------------+--------------------+
|abc|[src:1.1, dst:2.1]|[src:john, dst:jack]|
+---+------------------+--------------------+

PySpark - Get top 5 most frequent values for every column (without UDF)

Currently I'm gathering the top 5 most frequent values with a UDF.
The goal is to achieve the same result without using UDF and have the most efficient solution (avoid groupBy in loops).
Here's the code I'm using to have the result :
from pyspark.sql import functions as F

df = df.select('A', 'B', ...)


# NOTE(review): no return type is given to F.udf here; confirm the resulting
# column type is the one wanted for the list output.
@F.udf
def get_top_5_udf(x):
    """Return up to five of the most frequent elements of ``x``, most common first."""
    from collections import Counter
    return [elem[0] for elem in Counter(x).most_common(5)]


# Collect each column into a list and reduce it to its top-5 values.
agg_expr = [get_top_5_udf(F.collect_list(col)).alias(col) for col in df.columns]
df_top5 = df.agg(*agg_expr)
The result looks like the following :
# result
#+-----------------+--------------+---------------+
#| A | B | ... |
#+-----------------+--------------+---------------+
#| [1, 2, 3, 4, 5] | [...] | ... |
#+-----------------+--------------+---------------+
You can try using count over window partitioned by each column before aggregating:
from pyspark.sql import functions as F, Window

# Step 1: replace every column by a (cnt, val) struct, where cnt is the number
# of rows sharing that value (count over a window partitioned by the column).
counted_cols = [
    F.struct(
        F.count(c).over(Window.partitionBy(c)).alias("cnt"),
        F.col(c).alias("val"),
    ).alias(c)
    for c in df.columns
]

# Step 2: per column, gather the distinct structs, sort them in descending
# order, keep only the value field and take the first five entries.
top5_cols = [
    F.slice(
        F.expr(f"transform(sort_array(collect_set({c}), false), x -> x.val)"),
        1, 5
    ).alias(c)
    for c in df.columns
]

result = df.select(*counted_cols).agg(*top5_cols)
result.show()

Pyspark -Sql loop optimization

I have a scenario where I need to filter a date column on a date condition, and I need to do this for every day of an entire month. The problem is that looping over each date takes a long time; I would like to process the entire month in one go. The code follows.
# Days 1 through 30 of the month.
target_date = list(range(1, 31))
for i in target_date:
    # Rows whose [x_date, y_date] interval contains day i.
    df = spark.sql(f'select * from table where x_date <={i} and y_date >={i}')
    df = df.withColumn('load_date', f.lit(i))
    # One append per day: this repeated write is the slow part being asked about.
    df.write.partitionBy('load_date').mode('append').parquet(output_path)
Any approaches to make this faster
Maybe you can move the write to outside the loop. Something like
# Days 1 through 30 of the month.
target_date = list(range(1, 31))
# Accumulates the union of all per-day results; None until the first day is added.
df_final = None
for i in target_date:
    df = spark.sql(f'select * from table where x_date <={i} and y_date >={i}')
    df = df.withColumn('load_date', f.lit(i))
    df_final = df if df_final is None else df_final.union(df)
# A single write after the loop instead of one write per day.
df_final.write.partitionBy('load_date').parquet(output_path)
I believe you could solve it with a kind of cross-join like this:
# Single-column DataFrame with one row per load date, 1 through 30.
load_dates = spark.createDataFrame([(day,) for day in range(1, 31)], ['load_date'])
load_dates.show()
+---------+
|load_date|
+---------+
| 1|
| 2|
| 3|
| ...|
| 30|
+---------+
df = spark.sql('select * from table')
# Non-equi join: every row is paired with each load_date that lies inside its
# [x_date, y_date] range. join() returns a new DataFrame, so the result has to
# be assigned for the write below to see the load_date column.
df = df.join(
    load_dates,
    on=(F.col('x_date') <= F.col('load_date')) & (F.col('y_date') >= F.col('load_date')),
    how='inner',
)
df.write.partitionBy('load_date').parquet(output_path)
You should be able to do it by
Creating an array of load_dates in each row
Exploding the array so that you have a unique load_date per original row
Filtering to get just the load_dates you want
For example
# Days 1 through 30 of the month.
target_dates = list(range(1, 31))
df = spark.sql('select * from table')
# Create an array of all load_dates in each row
df = df.withColumn("load_date", F.array([F.lit(i) for i in target_dates]))
# Explode the array so that each original row is repeated once per load_date
df = df.withColumn("load_date", F.explode("load_date"))
# Keep only the load_dates that fall inside the row's [x_date, y_date] range
df = df.filter("x_date <= load_date and y_date >=load_date")
df.write.partitionBy('load_date').mode('append').parquet(output_path)

how to split one spark dataframe column into two columns by conditional when

I would like to replace a column of pyspark dataframe.
the dataframe:
price
90.16|USD
I need:
dollar_price currency
9016 USD
Pyspark code:
# Attempt to take the text before the '|' separator as the dollar amount.
# NOTE(review): the length argument is a Column built from F.instr rather than a plain int,
# and it reads 'retail_value' while the data column is 'price' - likely related to the
# reported TypeError; confirm.
# NOTE(review): `null` is not defined in this snippet - presumably None was intended; confirm.
new_col = F.when(F.col("price").isNull() == False, F.substring(F.col('price'), 1, F.instr(F.col('retail_value'), '|')-1)).otherwise(null)
new_df = df.withColumn('dollar_price', new_col)
# Same pattern for the three characters that follow the separator (the currency).
new_col = F.when(F.col("price").isNull() == False, F.substring(F.col('price'), F.instr(F.col('retail_value'), '|')+1, 3)).otherwise(null)
new_df_1 = new_df.withColumn('currency', new_col)
I got error:
TypeError: Column is not iterable
Could you please tell me what I missed ?
I have tried
Split a dataframe column's list into two dataframe columns
but it does not work.
thanks
Try with expr as you are computing value from instr function.
Example:
df.show()
#+---------+
#| price|
#+---------+
#|90.16|USD|
#+---------+
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Rows with a null price get null in both derived columns.
has_price = col("price").isNotNull()
# instr() is evaluated inside the SQL expression, so substring() can use it
# for its position/length arguments.
dollar_price = when(has_price, expr("substring(price,1,instr(price,'|')-1)")).otherwise(None)
currency = when(has_price, expr("substring(price,instr(price,'|')+1,3)")).otherwise(None)

(
    df
    .withColumn("dollar_price", dollar_price)
    .withColumn("currency", currency)
    .show()
)
#+---------+------------+--------+
#| price|dollar_price|currency|
#+---------+------------+--------+
#|90.16|USD| 90.16| USD|
#+---------+------------+--------+

Count the number of missing values in a dataframe Spark

I have a dataset with missing values, and I would like to get the number of missing values for each column. The following is what I did; it gives me the number of non-missing values. How can I use it to get the number of missing values?
// Keep only the "count" row of the describe() summary and print it.
df.describe().filter($"summary" === "count").show
+-------+---+---+---+
|summary| x| y| z|
+-------+---+---+---+
| count| 1| 2| 3|
+-------+---+---+---+
Any help please to get a dataframe in which we'll find columns and number of missing values for each one.
You could count the missing values by summing the boolean output of the isNull() method, after converting it to type integer:
In Scala:
import org.apache.spark.sql.functions.{sum, col}

// One aggregate per column: the isNull flag cast to int and summed,
// aliased back to the column's own name.
val nullCounts = df.columns.map { name =>
  sum(col(name).isNull.cast("int")).alias(name)
}
df.select(nullCounts: _*).show
In Python:
# Import Spark's sum under an alias so that Python's builtin sum() is not shadowed.
from pyspark.sql.functions import col, sum as spark_sum

# Per column: cast the isNull flag to int and sum it to get the null count.
df.select(*(spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()
Alternatively, you could also use the output of df.describe().filter($"summary" === "count"), and subtract the number in each cell by the number of rows in the data:
In Scala:
import org.apache.spark.sql.functions.{col, lit}

// Total number of rows in the data.
val rows = df.count()
// The "count" row of describe() holds the per-column counts.
val summary = df.describe().filter($"summary" === "count")
// Missing values per column = total rows - that column's count.
summary.select(df.columns.map(c => (lit(rows) - col(c)).alias(c)): _*).show
In Python:
from pyspark.sql.functions import col, lit

# Total number of rows in the data.
rows = df.count()
# The "count" row of describe() holds the per-column counts.
summary = df.describe().filter(col("summary") == "count")
# Missing values per column = total rows - that column's count.
summary.select(*((lit(rows) - col(c)).alias(c) for c in df.columns)).show()
from pyspark.sql.functions import isnull, when, count, col

# when() without otherwise() yields null for non-matching rows and count()
# skips nulls, so each aggregate counts the rows where the column is null.
null_count_exprs = [count(when(isnull(c), c)).alias(c) for c in df.columns]
nacounts = df.select(null_count_exprs).toPandas()
nacounts
# Count the rows once instead of re-running the same job for every column.
total_rows = df.count()
for i in df.columns:
    # Rows removed when dropping nulls in column i == number of nulls in i.
    print(i, total_rows - df.na.drop(subset=[i]).count())