Add single quotes to the DataFrame column values

My DataFrame has a column QUALIFY with values like below.
QUALIFY
=================
ColA|ColB|ColC
ColA
ColZ|ColP
The values in this column are separated by "|". I want the values in this column to look like 'ColA','ColB','ColC' ...
With the code below I am able to replace | with ','. How can I also add a single quote at the start and end of each value?
newDf = df_qualify.withColumn('QUALIFY2', regexp_replace('QUALIFY', "\\|", "\\','"))

Your solution is almost there - you just need to add a single quote to the start and end. You can achieve this using pyspark.sql.functions.concat:
from pyspark.sql.functions import col, concat, lit, regexp_replace
df.withColumn(
    "QUALIFY2",
    concat(lit("'"), regexp_replace(col('QUALIFY'), r"\|", r"','"), lit("'"))
).show()
#+--------------+--------------------+
#| QUALIFY| QUALIFY2|
#+--------------+--------------------+
#|ColA|ColB|ColC|'ColA','ColB','ColC'|
#| ColA| 'ColA'|
#| ColZ|ColP| 'ColZ','ColP'|
#+--------------+--------------------+
Alternatively, you can avoid regular expressions and achieve the same using split and concat_ws:
from pyspark.sql.functions import concat, concat_ws, lit, split
df.withColumn(
    "QUALIFY2",
    concat(lit("'"), concat_ws("','", split("QUALIFY", r"\|")), lit("'"))
).show()
#+--------------+--------------------+
#| QUALIFY| QUALIFY2|
#+--------------+--------------------+
#|ColA|ColB|ColC|'ColA','ColB','ColC'|
#| ColA| 'ColA'|
#| ColZ|ColP| 'ColZ','ColP'|
#+--------------+--------------------+

Split the column on | and then join the resulting array back into a string:
import pyspark.sql.functions as F
import pyspark.sql.types as T
def str_list(x):
    # str() of a Python list looks like "['ColA', 'ColB']"; strip the brackets
    return str(x).replace("[", "").replace("]", "")

str_udf = F.udf(str_list, T.StringType())

df = df.withColumn("arr_split", F.split(F.col("QUALIFY"), r"\|"))  # escape |, it's a regex metacharacter
df = df.withColumn("QUALIFY2", str_udf(F.col("arr_split")))
My sample output frame:
df.drop("arr_split").show() # Please ignore a and b columns
+---+---+--------------+--------------------+
| a| b| abc| QUALIFY2|
+---+---+--------------+--------------------+
| 1| 1|col1|col2|col3|'col1', 'col2', '...|
| 2| 2|col1|col2|col3|'col1', 'col2', '...|
| 3| 3|col1|col2|col3|'col1', 'col2', '...|
| 4| 4|col1|col2|col3|'col1', 'col2', '...|
| 5| 5|col1|col2|col3|'col1', 'col2', '...|
+---+---+--------------+--------------------+

The code below worked for me; I added the square brackets back to make it look like an array:
import pyspark.sql.functions as F
import pyspark.sql.types as T
def str_list(x):
    return str(x).replace("[", "").replace("]", "")

str_udf = F.udf(str_list, T.StringType())

df = df.withColumn(column_name, str_udf(F.col(column_name)))
df = df.withColumn(column_name, F.expr("concat('[', " + column_name + ", ']')"))
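For context, column_name is not defined in the snippet; a hypothetical usage sketch, assuming the arr_split array column produced by the split in the previous answer:
# Hypothetical usage of the snippet above on the arr_split column
column_name = "arr_split"
df = df.withColumn(column_name, str_udf(F.col(column_name)))                      # "'col1', 'col2', 'col3'"
df = df.withColumn(column_name, F.expr("concat('[', " + column_name + ", ']')"))  # "['col1', 'col2', 'col3']"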

Related

Splitting a column on a dot separator

How do I split a PySpark dataframe column with a dot (.) as the separator? split doesn't seem to work when I use it on a dot.
E.g. a column with value abcd.efgh should be split into two columns with values abcd and efgh.
This is the df based on your example.
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('abcd.efgh',)], ['c1'])
df.show()
#+---------+
#| c1|
#+---------+
#|abcd.efgh|
#+---------+
The dot is a regex metacharacter (it matches any character), so it has to be escaped or placed in a character class. For splitting, one can use split like this:
splitCol = F.split('c1', '[.]', 2)
df = df.select(
    splitCol[0].alias('c1_0'),
    splitCol[1].alias('c1_1'),
)
df.show()
#+----+----+
#|c1_0|c1_1|
#+----+----+
#|abcd|efgh|
#+----+----+
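Equivalently, the dot can be escaped instead of being wrapped in a character class; a quick sketch against the original single-column df created above (before the select):
# Escaping the dot works the same way as the character class [.]
splitCol = F.split('c1', r'\.', 2)
df.select(splitCol[0].alias('c1_0'), splitCol[1].alias('c1_1')).show()
#+----+----+
#|c1_0|c1_1|
#+----+----+
#|abcd|efgh|
#+----+----+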

How to remove double quotes from column name while saving dataframe in csv in spark?

I am saving a Spark dataframe to a csv file. All the records are saved in double quotes, which is fine, but the column names also come out in double quotes. Could you please help me remove them?
Example:
"Source_System"|"Date"|"Market_Volume"|"Volume_Units"|"Market_Value"|"Value_Currency"|"Sales_Channel"|"Competitor_Name"
"IMS"|"20080628"|"183.0"|"16470.0"|"165653.256349"|"AUD"|"AUSTRALIA HOSPITAL"|"PFIZER"
Desirable Output:
Source_System|Date|Market_Volume|Volume_Units|Market_Value|Value_Currency|Sales_Channel|Competitor_Name
"IMS"|"20080628"|"183.0"|"16470.0"|"165653.256349"|"AUD"|"AUSTRALIA HOSPITAL"|"PFIZER"
I am using the below code:
df4.repartition(1).write.csv(Output_Path_ASPAC, quote='"', header=True, quoteAll=True, sep='|', mode='overwrite')
I think the only workaround would be to concat quotes onto the column values in the dataframe before writing to csv.
Example:
df.show()
#+---+----+------+
#| id|name|salary|
#+---+----+------+
#| 1| a| 100|
#+---+----+------+
from pyspark.sql.functions import col, concat, lit
cols = [concat(lit('"'), col(i), lit('"')).alias(i) for i in df.columns]
df1 = df.select(*cols)
df1.show()
#+---+----+------+
#| id|name|salary|
#+---+----+------+
#|"1"| "a"| "100"|
#+---+----+------+
df1.write.csv("<path>", header=True, sep='|', escape='', quote='', mode='overwrite')
#output
#cat tmp4/part*
#id|name|salary
#"1"|"a"|"100"

How to transform two arrays of each column into a pair for a Spark DataFrame?

I have a DataFrame which has two columns of array values like below
var ds = Seq((Array("a","b"),Array("1","2")),(Array("p","q"),Array("3","4")))
var df = ds.toDF("col1", "col2")
+------+------+
| col1| col2|
+------+------+
|[a, b]|[1, 2]|
|[p, q]|[3, 4]|
+------+------+
I want to transform this into an array of pairs like below
+------+------+---------------+
| col1| col2| col3|
+------+------+---------------+
|[a, b]|[1, 2]|[[a, 1],[b, 2]]|
|[p, q]|[3, 4]|[[p, 3],[q, 4]]|
+------+------+---------------+
I guess I can use struct and then some udf. But I wanted to know if there is any built-in higher order method to do this efficiently.
From Spark 2.4, use the arrays_zip function.
Example:
df.show()
#+------+------+
#| col1| col2|
#+------+------+
#|[a, b]|[1, 2]|
#|[p, q]|[3, 4]|
#+------+------+
from pyspark.sql.functions import *
df.withColumn("col3",arrays_zip(col("col1"),col("col2"))).show()
#+------+------+----------------+
#| col1| col2| col3|
#+------+------+----------------+
#|[a, b]|[1, 2]|[[a, 1], [b, 2]]|
#|[p, q]|[3, 4]|[[p, 3], [q, 4]]|
#+------+------+----------------+
For Spark 2.3 or below, I found the iterator zip method really handy for this use case (which I was unaware of while posting the question). I can define a small UDF:
val zip = udf((xs: Seq[String], ys: Seq[String]) => xs.zip(ys))
and use it as
var out = df.withColumn("col3", zip(df("col1"), df("col2")))
This gives me the desired result.
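For anyone on PySpark rather than Scala, a rough equivalent of that pre-2.4 workaround (a sketch, not from the original answer; it returns an array of [x, y] pairs rather than an array of structs like arrays_zip does):
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType

# Zip the two array columns element-wise inside a Python UDF
zip_udf = udf(lambda xs, ys: [[x, y] for x, y in zip(xs, ys)],
              ArrayType(ArrayType(StringType())))
out = df.withColumn("col3", zip_udf(col("col1"), col("col2")))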

Merging two dataframes using Pyspark

I have 2 DF to merge:
DF1 --> contains Stocks
Plant Art_nr Tot
A X 5
B Y 4
DF2 --> contains open deliveries
Plant Art_nr Tot
A X 1
C Z 3
I would like to obtain a DF3 where for each combination of Plant and Art_nr:
- if there is a match between DF1.Plant&Art_nr and DF2.Plant&Art_nr I get the difference between DF1 and DF2
- if there is no match between DF1.Plant&Art_nr and DF2.Plant&Art_nr I keep the original values from DF1 and DF2
DF3 -->
Plant Art_nr Total
A X 4
B Y 4
C Z 3
I created a "Concat" field in DF1 and DF2 to concatenate Plant and Art_nr and I tried with a full join + when + otherwise but I can't find the correct syntax
DF1.join(DF2, ["Concat"],"full").withColumn("Total",when(DF1.Concat.isin(DF2.Concat)), DF1.Tot - DF2.Tot).otherwise(when(not(DF1.Concat.isin(DF2.Concat)), DF1.Tot)).show()
Any suggestions about alternative functions I could use, or how to correctly use those?
You have to join both dataframes and then apply a CASE (if-else) expression or the coalesce function.
This could be done in multiple ways; here are a few examples.
Option 1: Use the coalesce function as an alternative to CASE-WHEN-NULL
from pyspark.sql.functions import abs, coalesce, lit

cond = [df1.Plant == df2.Plant, df1.Art_nr == df2.Art_nr]
df1.join(df2, cond, 'full') \
    .select(coalesce(df1.Plant, df2.Plant).alias('Plant'),
            coalesce(df1.Art_nr, df2.Art_nr).alias('Art_nr'),
            abs(coalesce(df1.Tot, lit(0)) - coalesce(df2.Tot, lit(0))).alias('Tot')) \
    .show()
Option 2: Use a CASE expression within selectExpr()
cond = [df1.Plant == df2.Plant, df1.Art_nr == df2.Art_nr]
df1.alias('a').join(df2.alias('b'), cond, 'full') \
    .selectExpr("CASE WHEN a.Plant IS NULL THEN b.Plant ELSE a.Plant END AS Plant",
                "CASE WHEN a.Art_nr IS NULL THEN b.Art_nr ELSE a.Art_nr END AS Art_nr",
                "abs(coalesce(a.Tot,0) - coalesce(b.Tot,0)) AS Tot") \
    .show()
#+-----+------+---+
#|Plant|Art_nr|Tot|
#+-----+------+---+
#| A| X| 4|
#| B| Y| 4|
#| C| Z| 3|
#+-----+------+---+
Option 3: Use when().otherwise()
from pyspark.sql.functions import abs, coalesce, lit, when

cond = [df1.Plant == df2.Plant, df1.Art_nr == df2.Art_nr]
df1.join(df2, cond, 'full') \
    .select(when(df1.Plant.isNull(), df2.Plant).otherwise(df1.Plant).alias('Plant'),
            when(df1.Art_nr.isNull(), df2.Art_nr).otherwise(df1.Art_nr).alias('Art_nr'),
            abs(coalesce(df1.Tot, lit(0)) - coalesce(df2.Tot, lit(0))).alias('Tot')) \
    .show()
Output:
#+-----+------+---+
#|Plant|Art_nr|Tot|
#+-----+------+---+
#| A| X| 4|
#| B| Y| 4|
#| C| Z| 3|
#+-----+------+---+
Using a UDF seems verbose but gives more clarity:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, array
def score(arr):
    # arr holds [Tot from DF1, Tot from DF2]; either side may be null after the full join
    if arr[0] is None:
        return int(arr[1])
    elif arr[1] is None:
        return int(arr[0])
    return int(arr[0]) - int(arr[1])

udf_final = udf(lambda arr: score(arr), IntegerType())

# cond is the join condition from the options above; "Total" assumes DF2's Tot
# column has been renamed to avoid ambiguity after the full join
DF1.join(DF2, cond, "full").withColumn("final_score", udf_final(array("Tot", "Total")))
I would probably do a union with a groupBy and some reformatting, to avoid UDFs and large blocks of code.
from pyspark.sql.functions import *
DF3 = DF1.union(DF2.withColumn("Tot", col("Tot") * (-1)))
DF3 = DF3.groupBy("Plant", "Art_nr").agg(sum("Tot").alias("Tot"))
DF3 = DF3.withColumn("Tot", abs(col("Tot")))
I'm not 100% sure there are no side effects I haven't considered, or whether it fits your needs.

Statistics of columns computed in parallel

Best way to get the max value in a Spark dataframe column
This post shows how to run an aggregation (distinct, min, max) on a table, something like:
for colName in df.columns:
    dt = df[[colName]].distinct().count()
    mx = df.agg({colName: "max"}).collect()[0][0]
    mn = df.agg({colName: "min"}).collect()[0][0]
    print(colName, dt, mx, mn)
This can be easily done with COMPUTE STATISTICS. The stats from Hive and Spark are different:
Hive gives - distinct, max, min, nulls, length, version
Spark Gives - count, mean, stddev, min, max
Looks like there are quite a few statistics that are calculated. How do I get all of them for all columns using one command?
However, I have thousands of columns and doing this serially is very slow. Suppose I want to compute some other function, say standard deviation, on each of the columns - how can that be done in parallel?
You can use pyspark.sql.DataFrame.describe() to get aggregate statistics like count, mean, min, max, and standard deviation for all columns where such statistics are applicable. (If you don't pass in any arguments, stats for all columns are returned by default)
df = spark.createDataFrame(
    [(1, "a"), (2, "b"), (3, "a"), (4, None), (None, "c")],
    ["id", "name"]
)
df.describe().show()
#+-------+------------------+----+
#|summary| id|name|
#+-------+------------------+----+
#| count| 4| 4|
#| mean| 2.5|null|
#| stddev|1.2909944487358056|null|
#| min| 1| a|
#| max| 4| c|
#+-------+------------------+----+
As you can see, these statistics ignore any null values.
If you're using Spark version 2.3 or later, there is also pyspark.sql.DataFrame.summary(), which supports the following aggregates:
count, mean, stddev, min, max, and arbitrary approximate percentiles specified as a percentage (e.g. 75%)
df.summary("count", "min", "max").show()
#+-------+------------------+----+
#|summary| id|name|
#+-------+------------------+----+
#| count| 4| 4|
#| min| 1| a|
#| max| 4| c|
#+-------+------------------+----+
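Approximate percentiles can be requested the same way, mixed in with the other statistics (a sketch; the numbers returned depend on the data, and non-numeric columns show null):
# Percentiles are passed as strings such as "25%" or "75%"
df.summary("min", "25%", "75%", "max").show()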
If you wanted some other aggregate statistic for all columns, you could also use a list comprehension with pyspark.sql.DataFrame.agg(). For example, if you wanted to replicate what you say Hive gives (distinct, max, min and nulls - I'm not sure what length and version mean):
import pyspark.sql.functions as f
from itertools import chain
agg_distinct = [f.countDistinct(c).alias("distinct_"+c) for c in df.columns]
agg_max = [f.max(c).alias("max_"+c) for c in df.columns]
agg_min = [f.min(c).alias("min_"+c) for c in df.columns]
agg_nulls = [f.count(f.when(f.isnull(c), c)).alias("nulls_"+c) for c in df.columns]
df.agg(
    *chain.from_iterable([agg_distinct, agg_max, agg_min, agg_nulls])
).show()
#+-----------+-------------+------+--------+------+--------+--------+----------+
#|distinct_id|distinct_name|max_id|max_name|min_id|min_name|nulls_id|nulls_name|
#+-----------+-------------+------+--------+------+--------+--------+----------+
#| 4| 3| 4| c| 1| a| 1| 1|
#+-----------+-------------+------+--------+------+--------+--------+----------+
Though this method will return one row, rather than one row per statistic as describe() and summary() do.
You can put as many expressions into an agg as you want; when you collect, they all get computed at once. The result is a single row with all the values. Here's an example:
from pyspark.sql.functions import min, max, countDistinct
r = df.agg(
    min(df.col1).alias("minCol1"),
    max(df.col1).alias("maxCol1"),
    (max(df.col1) - min(df.col1)).alias("diffMinMax"),
    countDistinct(df.col2).alias("distinctItemsInCol2"))
r.printSchema()
# root
# |-- minCol1: long (nullable = true)
# |-- maxCol1: long (nullable = true)
# |-- diffMinMax: long (nullable = true)
# |-- distinctItemsInCol2: long (nullable = false)
row = r.collect()[0]
print(row.distinctItemsInCol2, row.diffMinMax)
# (10, 9)
You can also use the dictionary syntax here, but it's harder to manage for more complex things.
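A minimal sketch of that dictionary syntax, assuming the same df: each key is a column name and the value is the aggregate function name, so you get one aggregate per column and no aliases.
# Dictionary form of agg: {column name: aggregate function name}
r2 = df.agg({"col1": "max", "col2": "count"})
r2.show()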