Speed up pyspark parsing large nested json file - dataframe

Hello, I have nested JSON files of about 400 megabytes with 200k records each. I created a solution using PySpark to parse such a file and store it in a customized dataframe, but it takes about 5-7 minutes to do this operation, which is very slow.
Here is an example of a JSON file (a small one, but with the same structure as the large ones):
{"status":"success",
"data":{"resultType":"matrix","result":
[{"metric":{"data0":"T" ,"data1":"O"},"values":[[90,"0"],[80, "0"]]},
{"metric":{"data0":"K" ,"data1":"S"},"values":[[70,"0"],[60, "0"]]},
{"metric":{"data2":"J" ,"data3":"O"},"values":[[50,"0"],[40, "0"]]}]}}
Here is the structure of the output dataframe I want:
time | value | data0 | data1 | data2 | data3
90   | "0"   | "T"   | "O"   | nan   | nan
80   | "0"   | "T"   | "O"   | nan   | nan
70   | "0"   | "K"   | "S"   | nan   | nan
60   | "0"   | "K"   | "S"   | nan   | nan
50   | "0"   | nan   | nan   | "J"   | "O"
40   | "0"   | nan   | nan   | "J"   | "O"
and this is the PySpark code I run on the large file to produce the dataframe structure listed above:
from datetime import datetime
import json
import rapidjson
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from util import schema, meta_date

new_schema = StructType.fromJson(json.loads(schema))

# Count the result entries on the driver first (so the file is read twice).
with open("largefile.json", "r") as json_file:
    result_count = len(rapidjson.load(json_file)["data"]["result"])

spark = SparkSession.builder.master("spark://IP").getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '5g'),
                                        ('spark.executor.cores', '4'),
                                        ('spark.driver.memory', '4g')])
spark.sparkContext.stop()
spark = SparkSession.builder.config(conf=conf).getOrCreate()

df = spark.read.json("largefile.json")

# One array column per metric key, holding that key's value across all results.
for data_name in meta_date:
    df = df.withColumn(
        data_name, F.expr(f"transform(data.result, x -> x.metric.{data_name})")
    )

df = (
    df.withColumn("values", F.expr("transform(data.result, x -> x.values)"))
    .withColumn("items", F.array(*[F.lit(x) for x in range(0, result_count)]))
    .withColumn("items", F.explode(F.col("items")))
)

# Pick the element matching the exploded index from every array column.
for data_name in meta_date:
    df = df.withColumn(data_name, F.col(data_name).getItem(F.col("items")))

df = (df.withColumn("values", F.col("values").getItem(F.col("items")))
      .withColumn("values", F.explode("values"))
      .withColumn("time", F.col("values").getItem(0))
      .withColumn("value", F.col("values").getItem(1))
      .drop("data", "status", "items", "values")).show()
My machine has 4 cores (8 logical cores) and 16 GB of memory. I'm using standalone mode with a cluster of one master and 2 worker nodes.
Any help on how to speed up this process, either by editing the cluster configuration or by refactoring the transformations in the code?

What about this? Read the JSON, select the columns with explode, and it looks like it matches your desired result.
import pyspark.sql.functions as f

df = spark.read.json("largefile.json")
df.select(f.explode('data.result').alias('result')) \
  .select('result.metric.*', f.explode('result.values').alias('values')) \
  .withColumn('time', f.col('values')[0]) \
  .withColumn('value', f.col('values')[1]) \
  .drop('values') \
  .show(truncate=False)
+-----+-----+-----+-----+----+-----+
|data0|data1|data2|data3|time|value|
+-----+-----+-----+-----+----+-----+
|T |O |null |null |90 |0 |
|T |O |null |null |80 |0 |
|K |S |null |null |70 |0 |
|K |S |null |null |60 |0 |
|null |null |J |O |50 |0 |
|null |null |J |O |40 |0 |
+-----+-----+-----+-----+----+-----+
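If the read itself is part of the cost, a further tweak (a sketch, assuming the schema you already build from util in your question describes the full file layout) is to pass that explicit schema to the reader so Spark can skip the schema-inference pass over the 400 MB file; the separate rapidjson pre-read on the driver also becomes unnecessary, since this query never needs result_count:
import json
from pyspark.sql.types import StructType
from util import schema  # assumption: the same util module as in the question

# Reuse the explicit schema instead of letting Spark infer it from the file,
# then run the same explode/select chain as above on this df.
new_schema = StructType.fromJson(json.loads(schema))
df = spark.read.schema(new_schema).json("largefile.json")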

Related

Creating MAPTYPE field from multiple columns - Spark SQL

I have a use case wherein multiple keys are distributed across the dataset in a JSON format, which needs to be aggregated into a consolidated result set for further processing.
I have been able to develop a code structure that achieves this using both the Python API (PySpark) and Spark SQL, but the latter involves a more convoluted and tardy way of doing it, with intermediate conversions that can lead to errors in the future.
Using the below snippets, is there a better way to achieve this using Spark SQL, by creating a MAP<STRING,ARRAY<STRING>> using key and value?
Data Preparation
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pandas as pd
from io import StringIO

s = StringIO("""
id|json_struct
1|{"a":["tyeqb","",""],"e":["qwrqc","",""]}
1|{"t":["sartq","",""],"r":["fsafsq","",""]}
1|{"b":["puhqiqh","",""],"e":["hjfsaj","",""]}
2|{"b":["basajhjwa","",""],"e":["asfafas","",""]}
2|{"n":["gaswq","",""],"r":["sar","",""],"l":["sar","",""],"s":["rqqrq","",""],"m":["wrqwrq","",""]}
2|{"s":["tqqwjh","",""],"t":["afs","",""],"l":["fsaafs","",""]}
""")

df = pd.read_csv(s, delimiter='|')
sparkDF = spark.createDataFrame(df)
sparkDF.registerTempTable("INPUT")

sparkDF = sparkDF.withColumn(
    'json_struct',
    F.from_json(F.col('json_struct'), schema=MapType(StringType(), ArrayType(StringType()), True))
)
sparkDF.show(truncate=False)
+---+---------------------------------------------------------------------------------------+
|id |json_struct |
+---+---------------------------------------------------------------------------------------+
|1 |{a -> [tyeqb, , ], e -> [qwrqc, , ]} |
|1 |{t -> [sartq, , ], r -> [fsafsq, , ]} |
|1 |{b -> [puhqiqh, , ], e -> [hjfsaj, , ]} |
|2 |{b -> [basajhjwa, , ], e -> [asfafas, , ]} |
|2 |{n -> [gaswq, , ], r -> [sar, , ], l -> [sar, , ], s -> [rqqrq, , ], m -> [wrqwrq, , ]}|
|2 |{s -> [tqqwjh, , ], t -> [afs, , ], l -> [fsaafs, , ]} |
+---+---------------------------------------------------------------------------------------+
Python API (PySpark) - Implementation
As you can see, the resultant key from explode is natively a STRING type, and since PySpark has create_map, which is not available within Spark SQL, it can readily be used to generate the final json_struct column, ensuring a single key with a variable-length ARRAY<STRING> value.
sparkDF.select(
    F.col('id'),
    F.explode(F.col('json_struct'))
).withColumn(
    'value', F.filter(F.col('value'), lambda x: x != '')
).withColumn(
    'value', F.concat_ws(',', F.col('value'))
).groupBy(
    'id', 'key'
).agg(
    F.collect_set(F.col('value')).alias('value')
).withColumn(
    'json_struct', F.to_json(F.create_map("key", "value"))
).orderBy(
    'id'
).show(truncate=False)
+---+---+---------------+------------------------+
|id |key|value |json_struct |
+---+---+---------------+------------------------+
|1 |a |[tyeqb] |{"a":["tyeqb"]} |
|1 |e |[hjfsaj, qwrqc]|{"e":["hjfsaj","qwrqc"]}|
|1 |r |[fsafsq] |{"r":["fsafsq"]} |
|1 |b |[puhqiqh] |{"b":["puhqiqh"]} |
|1 |t |[sartq] |{"t":["sartq"]} |
|2 |b |[basajhjwa] |{"b":["basajhjwa"]} |
|2 |n |[gaswq] |{"n":["gaswq"]} |
|2 |t |[afs] |{"t":["afs"]} |
|2 |s |[tqqwjh, rqqrq]|{"s":["tqqwjh","rqqrq"]}|
|2 |e |[asfafas] |{"e":["asfafas"]} |
|2 |l |[sar, fsaafs] |{"l":["sar","fsaafs"]} |
|2 |r |[sar] |{"r":["sar"]} |
|2 |m |[wrqwrq] |{"m":["wrqwrq"]} |
+---+---+---------------+------------------------+
Spark SQL - Implementation
Within this implementation, I have to take additional steps to ensure both the key and value columns are of ArrayType and of consistent lengths, since map_from_arrays takes arrays as inputs.
Is there a way to bypass these steps and create a schema similar to the one produced with the Python API?
spark.sql("""
SELECT
    id,
    KEY,
    VALUE,
    TO_JSON(MAP_FROM_ARRAYS(KEY, VALUE)) AS json_struct
FROM (
    SELECT
        id,
        key,
        ARRAY(COLLECT_SET(value)) AS value -- <------- ### Ensuring Value is a NESTED ARRAY
    FROM (
        SELECT
            id,
            SPLIT(k, '|', 1) AS key, -- <------- ### Ensuring Key is an Array
            CONCAT_WS(',', FILTER(v, x -> x != '')) AS value
        FROM (
            SELECT
                id,
                EXPLODE(FROM_JSON(json_struct, 'MAP<STRING,ARRAY<STRING>>')) AS (k, v)
            FROM INPUT
        )
    )
    GROUP BY 1, 2
)
ORDER BY 1
""").show(truncate=False)
+---+---+-----------------+------------------------+
|id |KEY|VALUE |json_struct |
+---+---+-----------------+------------------------+
|1 |[a]|[[tyeqb]] |{"a":["tyeqb"]} |
|1 |[e]|[[hjfsaj, qwrqc]]|{"e":["hjfsaj","qwrqc"]}|
|1 |[b]|[[puhqiqh]] |{"b":["puhqiqh"]} |
|1 |[r]|[[fsafsq]] |{"r":["fsafsq"]} |
|1 |[t]|[[sartq]] |{"t":["sartq"]} |
|2 |[n]|[[gaswq]] |{"n":["gaswq"]} |
|2 |[b]|[[basajhjwa]] |{"b":["basajhjwa"]} |
|2 |[t]|[[afs]] |{"t":["afs"]} |
|2 |[s]|[[tqqwjh, rqqrq]]|{"s":["tqqwjh","rqqrq"]}|
|2 |[e]|[[asfafas]] |{"e":["asfafas"]} |
|2 |[l]|[[sar, fsaafs]] |{"l":["sar","fsaafs"]} |
|2 |[r]|[[sar]] |{"r":["sar"]} |
|2 |[m]|[[wrqwrq]] |{"m":["wrqwrq"]} |
+---+---+-----------------+------------------------+
Spark SQL has map instead of create_map. Your PySpark code could be translated into this:
df = spark.sql("""
    WITH
    TBL2 AS (SELECT id, EXPLODE(FROM_JSON(json_struct, 'MAP<STRING,ARRAY<STRING>>')) FROM INPUT),
    TBL3 AS (SELECT id, key, FLATTEN(COLLECT_SET(FILTER(value, x -> x != ''))) value
             FROM TBL2
             GROUP BY id, key)
    SELECT *, TO_JSON(MAP(key, value)) json_struct
    FROM TBL3
""")
df.show(truncate=0)
# +---+---+---------------+------------------------+
# |id |key|value |json_struct |
# +---+---+---------------+------------------------+
# |1 |a |[tyeqb] |{"a":["tyeqb"]} |
# |1 |e |[qwrqc, hjfsaj]|{"e":["qwrqc","hjfsaj"]}|
# |1 |b |[puhqiqh] |{"b":["puhqiqh"]} |
# |1 |r |[fsafsq] |{"r":["fsafsq"]} |
# |1 |t |[sartq] |{"t":["sartq"]} |
# |2 |b |[basajhjwa] |{"b":["basajhjwa"]} |
# |2 |n |[gaswq] |{"n":["gaswq"]} |
# |2 |s |[rqqrq, tqqwjh]|{"s":["rqqrq","tqqwjh"]}|
# |2 |t |[afs] |{"t":["afs"]} |
# |2 |e |[asfafas] |{"e":["asfafas"]} |
# |2 |l |[fsaafs, sar] |{"l":["fsaafs","sar"]} |
# |2 |r |[sar] |{"r":["sar"]} |
# |2 |m |[wrqwrq] |{"m":["wrqwrq"]} |
# +---+---+---------------+------------------------+

How to append data to a column value in dataframe

In Spark, I have a dataframe with a column named goals which holds a numeric value. I just want to append the string "goal" or "goals" to the actual value.
I want to print it as
if,
value = 1 then 1 goal
value = 2 then 2 goals and so on..
My data looks like this
val goalsDF = Seq(("meg", 2), ("meg", 4), ("min", 3),
("min2", 1), ("ss", 1)).toDF("name", "goals")
goalsDF.show()
+-----+-----+
|name |goals|
+-----+-----+
|meg |2 |
|meg |4 |
|min |3 |
|min2 |1 |
|ss |1 |
+-----+-----+
Expected Output:
+-----+---------+
|name |goals |
+-----+---------+
|meg |2 goals |
|meg |4 goals |
|min |3 goals |
|min2 |1 goal |
|ss |1 goal |
+-----+---------+
I tried the below code but it doesn't work and prints the data as null:
goalsDF.withColumn("goals", col("goals") + lit("goals")).show()
+----+-----+
|name|goals|
+----+-----+
| meg| null|
| meg| null|
| min| null|
|min2| null|
| ss| null|
+----+-----+
Please suggest if we can do this inside .withColumn() without any additional user-defined method.
You should use case when. Here is a PySpark example, but you should be able to reference it and adapt it to Scala.
import pyspark.sql.functions as F

DF.withColumn(
    'goals',
    F.when(F.col('goals') == 1, '1 goal')
     .otherwise(F.concat_ws(' ', F.col('goals'), F.lit('goals')))
)
For scala example see here: https://stackoverflow.com/a/37108127/5899997
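If you prefer to build the whole string in a single expression, here is a minimal PySpark sketch (not from the linked answer; it assumes a DataFrame equivalent to the goalsDF in the question) that also picks the singular/plural suffix:
from pyspark.sql import functions as F

# Sketch: cast the count to string and choose " goal" vs " goals" with when().
goalsDF.withColumn(
    'goals',
    F.concat(
        F.col('goals').cast('string'),
        F.when(F.col('goals') == 1, F.lit(' goal')).otherwise(F.lit(' goals'))
    )
).show()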

How to get the maximum row_number in a window in a Spark dataframe

I have a dataframe that looks as below, and I'm using the below-mentioned code to get it:
+---+----------+--------+----------+
|EK |Value |date |row_number|
+---+----------+--------+----------+
|5 |100 |1/1/2020|1 |
|5 |150 |1/3/2020|2 |
|5 |175 |1/5/2020|3 |
|62 |200 |2/9/1999|1 |
|62 |21 |9/2/2000|2 |
+---+----------+--------+----------+
from pyspark.sql import Window
from pyspark.sql.functions import row_number

window = Window.partitionBy("EK").orderBy("date")
df = df.withColumn("row_number", row_number().over(window))
The expected result is to get the maximum row number in every window as shown below,
+---+----------+--------+----------+
|EK |Value |date |row_number|
+---+----------+--------+----------+
|5 |175 |1/5/2020|3 |
|62 |21 |9/2/2000|2 |
+---+----------+--------+----------+
Thank you.
You can use one more window to get the last value.
from pyspark.sql import functions as f
from pyspark.sql import Window
w1 = Window.partitionBy('EK').orderBy('date')
w2 = Window.partitionBy('EK')
df.withColumn('row_number', f.row_number().over(w1)) \
.withColumn('last', f.last('row_number').over(w2)) \
.filter('row_number = last') \
.show(truncate=False)
+---+-----+--------+----------+----+
|EK |Value|date |row_number|last|
+---+-----+--------+----------+----+
|5 |175 |1/5/2020|3 |3 |
|62 |21 |9/2/2000|2 |2 |
+---+-----+--------+----------+----+
If you don't care about the row_number,
from pyspark.sql import functions as f
from pyspark.sql import Window
w = Window.partitionBy('EK')
df.withColumn('last', f.last('date').over(w)) \
.filter('date = last').show(truncate=False)
+---+-----+--------+--------+
|EK |Value|date |last |
+---+-----+--------+--------+
|5 |175 |1/5/2020|1/5/2020|
|62 |21 |9/2/2000|9/2/2000|
+---+-----+--------+--------+
and drop last.
You can create a temporary column with maximum row number by partition, then filter and drop it:
from pyspark.sql import Window
from pyspark.sql.functions import col, max, row_number
window = Window.partitionBy("EK").orderBy("date")
df = df.withColumn("row_number", row_number().over(window))
df = (df
.withColumn('max_row_number', max('row_number').over(Window.partitionBy("EK")))
.where(col('row_number') == col('max_row_number'))
.drop('max_row_number'))
df.show(truncate=False)

How to add more rows in pyspark df by column value

I've been stuck on this problem for quite a while and am probably making it bigger than it really is. I will try to simplify it.
I'm using PySpark and DataFrame functions throughout my code.
I already have a df as:
+--+-----+---------+
|id|col1 |col2 |
+--+-----+---------+
|1 |Hello|Repeat |
|2 |Word |Repeat |
|3 |Aux |No repeat|
|4 |Test |Repeat |
+--+-----+---------+
What I want to achieve is to repeat the df's rows when col2 is 'Repeat', appending 1, 2 and 3 to col1's value:
+--+-----+---------+------+
|id|col1 |col2 |col3 |
+--+-----+---------+------+
|1 |Hello|Repeat |Hello1|
|1 |Hello|Repeat |Hello2|
|1 |Hello|Repeat |Hello3|
|2 |Word |Repeat |Word1 |
|2 |Word |Repeat |Word2 |
|2 |Word |Repeat |Word3 |
|3 |Aux |No repeat|Aux |
|4 |Test |Repeat |Test1 |
|4 |Test |Repeat |Test2 |
|4 |Test |Repeat |Test3 |
+--+-----+---------+------+
My first approach was to use the withColumn operator to create a new column with the help of a UDF:
my_func = udf(lambda words: (words + str(i + 1 for i in range(3))), StringType())
df = df\
.withColumn('col3', when(col('col2') == 'No Repeat', col('col1'))
.otherwise(my_func(col('col1'))))
But when I evaluate this with df.show(10, False) it throws an error. My guess is that I just can't create more rows with the withColumn function in that way.
So I decided to go for another approach, also without success, using rdd.flatMap:
test = df.rdd.flatMap(lambda row: (row if (row.col2== 'No Repeat') else (row.col1 + str(i+1) for i in range(3))))
print(test.collect())
But here I lose the df schema and I cannot emit the full row in the else branch; it only gives me the col1 words plus the iterator.
Do you know a proper way to solve this?
In the end, my problem is that I can't find a proper way to create more rows based on column values, because I'm quite new to this world. Also, the answers I found don't seem to fit this problem.
All help will be appreciated.
One way is to use a condition to assign an array, then explode:
import pyspark.sql.functions as F

(df.withColumn("test", F.when(df['col2'] == 'Repeat',
                              F.array([F.lit(str(i)) for i in range(1, 4)]))
                        .otherwise(F.array(F.lit(''))))
   .withColumn("col3", F.explode(F.col("test"))).drop("test")
   .withColumn("col3", F.concat(F.col("col1"), F.col("col3")))).show()
A neater version of the same as suggested by #MohammadMurtazaHashmi would look like:
(df.withColumn("test", F.when(df['col2'] == 'Repeat',
                              F.array([F.concat(F.col("col1"), F.lit(str(i))) for i in range(1, 4)]))
                        .otherwise(F.array(F.col("col1"))))
   .select("id", "col1", "col2", F.explode("test").alias("col3"))).show()
+---+-----+---------+------+
| id| col1| col2| col3|
+---+-----+---------+------+
| 1|Hello| Repeat|Hello1|
| 1|Hello| Repeat|Hello2|
| 1|Hello| Repeat|Hello3|
| 2| Word| Repeat| Word1|
| 2| Word| Repeat| Word2|
| 2| Word| Repeat| Word3|
| 3| Aux|No repeat| Aux|
| 4| Test| Repeat| Test1|
| 4| Test| Repeat| Test2|
| 4| Test| Repeat| Test3|
+---+-----+---------+------+
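For Spark 2.4+, a sketch of the same idea (an assumption, not part of the answers above) can build the suffixed values with sequence and transform on the SQL side instead of a Python list comprehension:
import pyspark.sql.functions as F

# Sketch (assumes Spark 2.4+ for sequence/transform): build "Hello1".."Hello3"
# as an array inside Spark SQL, fall back to the bare col1 for 'No repeat' rows.
(df.withColumn(
        "col3",
        F.when(F.col("col2") == "Repeat",
               F.expr("transform(sequence(1, 3), i -> concat(col1, cast(i as string)))"))
         .otherwise(F.array(F.col("col1"))))
   .withColumn("col3", F.explode("col3"))).show()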

Merge two columns but with different structure in hive

I have loaded a parquet file and created a Data frame as shown below
----------------------------------------------------------------------
time | data1 | data2
-----------------------------------------------------------------------
1-40 | [ lion-> 34, bear -> 2 ] | [ monkey -> [9,23], goose -> [4,5] ]
So, the data type of the data1 column is a string->integer map, whereas the data type of the data2 column is a string->array map.
I want to explode the above dataframe into the below structure:
------------------------
time | key | val
------------------------
1-40 | lion | 34
1-40 | bear | 2
1-40 | monkey_0 | 9
1-40 | monkey_1 | 23
1-40 | goose_0 | 4
1-40 | goose_1 | 5
I tried to convert both data1 and data2 into the same data type (string->array) using UDFs in PySpark and then exploded the column as shown below:
from pyspark.sql.functions import udf, explode, col
from pyspark.sql.types import MapType, StringType, ArrayType, IntegerType

def to_map(col1, col2):
    # Move every key of the int map into the array map as a one-element list.
    for i in col1.keys():
        col2[i] = [col1[i]]
    return col2

caster = udf(to_map, MapType(StringType(), ArrayType(IntegerType())))
pm_df = pm_df.withColumn("animals", caster('data1', 'data2'))
pm_df.select('time', explode(col('animals')))
I also tried using Hive SQL, assuming Hive SQL performs better than PySpark UDFs.
rdd = spark.sparkContext.parallelize([[datetime.datetime.now(), {'lion': 34, 'bear': 2}, {'monkey': [9, 23], 'goose':[4,5]} ]])
df = rdd.toDF(fields)
df.createOrReplaceTempView("df")
df = spark.sql("select time, explode(data1), data2 from df")
df.createOrReplaceTempView("df")
df = spark.sql("select time,key as animal,value,posexplode(data2) from df").show(truncate=False)
But I am stuck with the below result and don't know how to merge the split columns as per my requirement. The output of the above Hive SQL is:
+--------------------------+------+-----+---+------+-------+
|time |animal|value|pos|key |value |
+--------------------------+------+-----+---+------+-------+
|2019-06-12 19:23:00.169739|bear |2 |0 |goose |[4, 5] |
|2019-06-12 19:23:00.169739|bear |2 |1 |monkey|[9, 23]|
|2019-06-12 19:23:00.169739|lion |34 |0 |goose |[4, 5] |
|2019-06-12 19:23:00.169739|lion |34 |1 |monkey|[9, 23]|
+--------------------------+------+-----+---+------+-------+
I know that while using Python UDFs there is a lot of overhead for communication between the Python processes and the JVM. Is there any way to achieve the above expected result using built-in functions or Hive SQL?
I would process data1 and data2 separately and then union the resultset:
from pyspark.sql import functions as F
df1 = df.select('time', F.explode('data1').alias('key', 'value'))
>>> df1.show()
#+--------------------+----+-----+
#| time| key|value|
#+--------------------+----+-----+
#|2019-06-12 20:19:...|bear| 2|
#|2019-06-12 20:19:...|lion| 34|
#+--------------------+----+-----+
df2 = df.select('time', F.explode('data2').alias('key', 'values')) \
.select('time', 'key', F.posexplode('values').alias('pos','value')) \
.select('time', F.concat('key', F.lit('_'), 'pos').alias('key'), 'value')
>>> df2.show()
#+--------------------+--------+-----+
#| time| key|value|
#+--------------------+--------+-----+
#|2019-06-12 20:19:...| goose_0| 4|
#|2019-06-12 20:19:...| goose_1| 5|
#|2019-06-12 20:19:...|monkey_0| 9|
#|2019-06-12 20:19:...|monkey_1| 23|
#+--------------------+--------+-----+
df_new = df1.union(df2)
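Since you also asked about Hive SQL: the same idea can be expressed with built-in SQL generators only. This is a sketch under the assumption that the dataframe is registered as a temp view named df, as in your own attempt:
df.createOrReplaceTempView("df")
df_new = spark.sql("""
    SELECT time, key, value FROM (
        SELECT time, EXPLODE(data1) AS (key, value) FROM df
    )
    UNION ALL
    SELECT time, CONCAT(key, '_', CAST(pos AS STRING)) AS key, value FROM (
        SELECT time, key, POSEXPLODE(vals) AS (pos, value) FROM (
            SELECT time, EXPLODE(data2) AS (key, vals) FROM df
        )
    )
""")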