combine and count a json column in spark dataframe - sql

I would like to aggregate the values of a JSON column in a Spark dataframe and in a Hive table.
e.g.
year, month, val (json)
2010 01 [{"a_id":"caes"},{"a_id":"rgvtsa"},{"a_id":"btbsdv"}]
2010 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]
2008 10 [{"a_id":"rfve"},{"a_id":"yjndf"},{"a_id":"onbds"}]
2008 10 [{"a_id":"fvds"},{"a_id":"yjndf"},{"a_id":"yesva"}]
I need:
year, month, val (json), num (int)
2010 01 [{"a_id":"caes"},{"a_id":"rgvtsa"},{"a_id":"btbsdv"},{"a_id":"uktf"},{"a_id":"ohcwa"}] 5
2008 10 [{"a_id":"rfve"},{"a_id":"yjndf"},{"a_id":"onbds"},{"a_id":"fvds"},{"a_id":"yesva"}] 5
I need to remove the duplicates and also find the size of the resulting JSON array (the number of "a_id" entries in it).
The data is saved as a Hive table, so would it be better to work on it with PySpark SQL?
I would also like to know how to do this if the data is held in a Spark dataframe.
I have tried:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType
# NOTE(review): the sample rows show `val` holding a JSON *array* of objects,
# while this schema describes a single object with one `a_id` field. That
# mismatch is presumably why from_json yields nulls here -- confirm by trying
# an ArrayType(StructType([...])) schema instead.
schema = StructType(
[
StructField('a_id', StringType(), True)
]
)
# Parse `val` with the schema above, then expand the parsed struct's fields
# into top-level columns next to year and month.
df.withColumn("val", from_json("val", schema))\
.select(col('year'), col('month'), col('val.*'))\
.show()
But all values in "val" are null.
thanks
UPDATE
my hive version:
%sh
ls /databricks/hive | grep "hive"
spark--maven-trees--spark_1.4_hive_0.13
My DDL:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import *
def concate_elements(val):
    """Flatten one level of nesting in ``val`` and return the items as a list.

    ``val`` is an iterable of sequences (for example the Rows/tuples Spark
    hands to a UDF for an ``array<struct<...>>`` column). The members of every
    inner sequence are concatenated, in order, into a single flat list.

    Returns ``None`` when ``val`` is ``None`` (a null column value) and an
    empty list when ``val`` is empty, instead of raising.
    """
    # Local import: `reduce` is not a builtin on Python 3, and chaining is the
    # linear-time way to concatenate many small sequences.
    from itertools import chain

    if val is None:
        return None
    return list(chain.from_iterable(val))
# Wrap the helpers as Spark UDFs; both return array<string>.
flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))
# set() drops duplicates but does not keep the original element order.
# A null array is passed through as null instead of raising inside the UDF.
remove_duplicates = F.udf(
    lambda row: None if row is None else list(set(row)),
    T.ArrayType(T.StringType()),
)
#final results
(
    df.select("year", "month", flatten_array("val").alias("flattenvalues"))
    .withColumn("uniquevalues", remove_duplicates("flattenvalues"))
    .withColumn("size", F.size("uniquevalues"))
    .show()
)

Consider the following input data, stored in the JSON file json-input.json:
{"year":"2010","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]}
{"year":"2011","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]}
Approach 1. Read data from hive
1. insert data into hive
-- Register the JSON SerDe jar so Hive can load the class named in the
-- ROW FORMAT clause below. Each Hive statement must end with a semicolon.
ADD JAR /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar;
-- `value` is declared as an array of structs, each with one string field a_id.
CREATE EXTERNAL TABLE json_table (
  year string,
  month string,
  value array<struct<a_id:string>>)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
load data local inpath '/home/sathya/json-input.json' into table json_table;
select * from json_table;
OK
2010 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]
2011 01 [{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]
2. Read data from spark:
pyspark --jars /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar --driver-class-path /home/sathya/Downloads/json-serde-1.3.7-jar-with-dependencies.jar
df=spark.sql("select * from default.json_table")
df.show(truncate=False)
'''
+----+-----+----------------------------------+
|year|month|value |
+----+-----+----------------------------------+
|2010|01 |[[caes], [uktf], [ohcwa]] |
|2011|01 |[[caes], [uktf], [uktf], [sathya]]|
+----+-----+----------------------------------+
'''
#UDFs for concatenating the array elements & removing duplicates in an array
def concate_elements(val):
    """Flatten one level of nesting in ``val`` and return the items as a list.

    ``val`` is an iterable of sequences (for example the Rows/tuples Spark
    hands to a UDF for an ``array<struct<...>>`` column). The members of every
    inner sequence are concatenated, in order, into a single flat list.

    Returns ``None`` when ``val`` is ``None`` (a null column value) and an
    empty list when ``val`` is empty, instead of raising.
    """
    # Local import: `reduce` is not a builtin on Python 3, and chaining is the
    # linear-time way to concatenate many small sequences.
    from itertools import chain

    if val is None:
        return None
    return list(chain.from_iterable(val))
# Wrap the helpers as Spark UDFs; both return array<string>.
flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))
# set() drops duplicates but does not keep the original element order.
# A null array is passed through as null instead of raising inside the UDF.
remove_duplicates = F.udf(
    lambda row: None if row is None else list(set(row)),
    T.ArrayType(T.StringType()),
)
#final results
# The flattening UDF is `flatten_array` (there is no `flattenUdf`), and `size`
# is reached through the `F` alias like the other functions here.
(
    df.select("year", "month", flatten_array("value").alias("flattenvalues"))
    .withColumn("uniquevalues", remove_duplicates("flattenvalues"))
    .withColumn("size", F.size("uniquevalues"))
    .show()
)
'''
+----+-----+--------------------------+--------------------+----+
|year|month|flattenvalues |uniquevalues |size|
+----+-----+--------------------------+--------------------+----+
|2010|01 |[caes, uktf, ohcwa] |[caes, uktf, ohcwa] |3 |
|2011|01 |[caes, uktf, uktf, sathya]|[caes, sathya, uktf]|3 |
+----+-----+--------------------------+--------------------+----+
'''
Approach 2 - direct read from input Json file json-input.json
{"year":"2010","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]}
{"year":"2011","month":"01","value":[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"uktf"},{"a_id":"sathya"}]}
code for your scenario is:
import os
import logging
from pyspark.sql import SQLContext,SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
import pyspark.sql.types as T
df=spark.read.json("file:///home/sathya/json-input.json")
df.show(truncate=False)
'''
+-----+----------------------------------+----+
|month|value |year|
+-----+----------------------------------+----+
|01 |[[caes], [uktf], [ohcwa]] |2010|
|01 |[[caes], [uktf], [uktf], [sathya]]|2011|
+-----+----------------------------------+----+
'''
#UDFs for concatenating the array elements & removing duplicates in an array
def concate_elements(val):
    """Flatten one level of nesting in ``val`` and return the items as a list.

    ``val`` is an iterable of sequences (for example the Rows/tuples Spark
    hands to a UDF for an ``array<struct<...>>`` column). The members of every
    inner sequence are concatenated, in order, into a single flat list.

    Returns ``None`` when ``val`` is ``None`` (a null column value) and an
    empty list when ``val`` is empty, instead of raising.
    """
    # Local import: `reduce` is not a builtin on Python 3, and chaining is the
    # linear-time way to concatenate many small sequences.
    from itertools import chain

    if val is None:
        return None
    return list(chain.from_iterable(val))
# Wrap the helpers as Spark UDFs; both return array<string>.
flatten_array = F.udf(concate_elements, T.ArrayType(T.StringType()))
# set() drops duplicates but does not keep the original element order.
# A null array is passed through as null instead of raising inside the UDF.
remove_duplicates = F.udf(
    lambda row: None if row is None else list(set(row)),
    T.ArrayType(T.StringType()),
)
#final results
# The flattening UDF is `flatten_array` (there is no `flattenUdf`), and `size`
# is reached through the `F` alias like the other functions here.
(
    df.select("year", "month", flatten_array("value").alias("flattenvalues"))
    .withColumn("uniquevalues", remove_duplicates("flattenvalues"))
    .withColumn("size", F.size("uniquevalues"))
    .show()
)
'''
+----+-----+--------------------------+--------------------+----+
|year|month|flattenvalues |uniquevalues |size|
+----+-----+--------------------------+--------------------+----+
|2010|01 |[caes, uktf, ohcwa] |[caes, uktf, ohcwa] |3 |
|2011|01 |[caes, uktf, uktf, sathya]|[caes, sathya, uktf]|3 |
+----+-----+--------------------------+--------------------+----+
'''

Here is a solution that'll work in Databricks:
#Import libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
#Define schema
schema1=StructType([
StructField('year',IntegerType(),True),
StructField('month',StringType(),True),
StructField('val',ArrayType(StructType([
StructField('a_id',StringType(),True)
])))
])
#Test data
rowsArr=[
[2010,'01',[{"a_id":"caes"},{"a_id":"rgvtsa"},{"a_id":"btbsdv"}]],
[2010,'01',[{"a_id":"caes"},{"a_id":"uktf"},{"a_id":"ohcwa"}]],
[2008,'10',[{"a_id":"rfve"},{"a_id":"yjndf"},{"a_id":"onbds"}]],
[2008,'10',[{"a_id":"fvds"},{"a_id":"yjndf"},{"a_id":"yesva"}]]
]
#Create dataframe
df1=(spark
.createDataFrame(rowsArr,schema=schema1)
)
#Create database
spark.sql('CREATE DATABASE IF NOT EXISTS testdb')
#Dump it into hive table
# Persist df1 as the managed table testdb.testtable, replacing any existing
# contents. The table schema comes from the DataFrame itself, so no separate
# `schema` writer option is passed (the writer has no such option).
(
    df1
    .write
    .mode('overwrite')
    .saveAsTable('testdb.testtable')
)
#read from hive table
df_ht=(spark
.sql('select * from testdb.testtable')
)
#Perform transformation
# For every (year, month): gather each row's `val` array, merge them into one
# array, drop repeated entries, and report how many entries remain as `num`.
df2 = (
    df_ht
    .groupBy('year', 'month')
    .agg(array_distinct(flatten(collect_list('val'))).alias('val'))
    .select('year', 'month', 'val', size('val').alias('num'))
)
Input DF:
Output DF:

Related

Add column with the first IP address of the subnet

I have PySpark dataframe with column named "subnet". I want to add a column which is the first IP of that subnet. I've tried many solutions including
def get_first_ip(prefix):
    """Return the first address of the IPv4 network described by ``prefix``.

    ``prefix`` must be a plain Python value such as ``"10.10.128.0/17"``.
    The result is an ``ipaddress.IPv4Address``. ``ipaddress.IPv4Network``
    raises for anything it cannot parse as an IPv4 network.
    """
    # Only the first address is needed, so the last one is no longer computed.
    return ipaddress.IPv4Network(prefix)[0]
df.withColumn("first_ip", get_first_ip(F.col("subnet")))
But getting error:
-> 1161 raise AddressValueError("Expected 4 octets in %r" % ip_str)
1162
1163 try:
AddressValueError: Expected 4 octets in "Column<'subnet'>"
I do understand that it is a Column value and that I cannot use it as a simple string here, but how can I solve my problem with PySpark?
I could do the same in pandas and then convert to PySpark, but I'm wondering if there's any other more elegant way?
It's hard to tell what the issue is when we don't know what the input dataframe looks like. But something is wrong with the column values, as @samkart suggested.
Here's an example that I tested:
import ipaddress
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
def get_first_ip(x):
    """Return the first address of IPv4 network ``x`` as a string.

    ``x`` is a network or single address in text form, e.g.
    ``"10.10.128.0/17"`` or ``"10.10.128.123"``. ``None`` (a null column
    value) is passed through as ``None`` instead of raising inside the UDF.
    """
    if x is None:
        return None
    return str(ipaddress.IPv4Network(x)[0])
def get_last_ip(x):
    """Return the last address of IPv4 network ``x`` as a string.

    ``x`` is a network or single address in text form, e.g.
    ``"10.10.128.0/17"`` or ``"10.10.128.123"``. ``None`` (a null column
    value) is passed through as ``None`` instead of raising inside the UDF.
    """
    if x is None:
        return None
    return str(ipaddress.IPv4Network(x)[-1])
# Hand the functions to F.udf directly; a lambda wrapper adds nothing.
first_ip_udf = F.udf(get_first_ip, StringType())
last_ip_udf = F.udf(get_last_ip, StringType())
spark = SparkSession.builder.getOrCreate()
# One plain address and one CIDR network.
data = [
    {"IP": "10.10.128.123"},
    {"IP": "10.10.128.0/17"},
]
df = spark.createDataFrame(data=data)
df = df.withColumn("first_ip", first_ip_udf(F.col("IP")))
df = df.withColumn("last_ip", last_ip_udf(F.col("IP")))
# Print the result so the snippet actually produces the table quoted below.
df.show(truncate=False)
Outputs:
+--------------+-------------+-------------+
|IP |first_ip |last_ip |
+--------------+-------------+-------------+
|10.10.128.123 |10.10.128.123|10.10.128.123|
|10.10.128.0/17|10.10.128.0 |10.10.255.255|
+--------------+-------------+-------------+
You cannot directly apply python native function to a Spark dataframe column. As demonstrated in this answer, you could create a udf from your function.
Since udf is slow for big dataframes, you could use pandas_udf which is a lot faster.
Input:
import ipaddress
import pandas as pd
from pyspark.sql import functions as F
df = spark.createDataFrame([("10.10.128.123",), ("10.10.128.0/17",)], ["subnet"])
Script:
# The decorator is what turns this into a Spark UDF; without it, calling
# get_first_ip("subnet") would run `.apply` on a plain str and fail.
@F.pandas_udf('string')
def get_first_ip(prefix: pd.Series) -> pd.Series:
    """Map each IPv4 network string in ``prefix`` to its first address.

    Receives a pandas Series of network strings and returns a Series of the
    same length holding each network's first address as text.
    """
    return prefix.apply(lambda s: str(ipaddress.IPv4Network(s)[0]))
df = df.withColumn("first_ip", get_first_ip("subnet"))
df.show()
# +--------------+-------------+
# | subnet| first_ip|
# +--------------+-------------+
# | 10.10.128.123|10.10.128.123|
# |10.10.128.0/17| 10.10.128.0|
# +--------------+-------------+

Creating PySpark UDFs from python method with numpy array input, to calculate and return a single float value

As input I have a csv file with int values in it.
spark_df = spark.read.option("header", "false").csv("../int_values.csv")
df = spark_df.selectExpr("_c0 as something")
_df = df.withColumn("values", df.something.cast(FloatType())).select("values")
I also have some python functions designed for numpy array inputs, that I need to apply on the Spark DataFrame.
The example one:
def calc_sum(float_array):
    """Return the sum of ``float_array`` as computed by ``np.sum``."""
    return np.sum(float_array)
Real function:
def calc_rms(float_array):
    """Return the root mean square of the successive differences.

    Takes the differences between neighbouring elements of ``float_array``
    (``np.diff``), squares them, averages the squares and returns the square
    root of that mean.
    """
    return np.sqrt(np.mean(np.diff(float_array)**2))
For the first example you can use the built-in SQL sum, like this:
_df.groupBy().sum().collect()
But, what I need is a standard solution to transform these functions into Spark UDFs
I tried many ways, like:
udf_sum = udf(lambda x : calc_sum(x), FloatType())
_df.rdd.flatMap(udf_sum).collect()
but it always failed with:
TypeError: Invalid argument, not a string or column:
Row(values=1114.0) of type <class 'pyspark.sql.types.Row'>. For column
literals, use 'lit', 'array', 'struct' or 'create_map' function.
Is it possible to transform the data in a way that works with these functions?
DataFrame sample:
In [6]: spark_df.show()
+----+
| _c0|
+----+
|1114|
|1113|
|1066|
|1119|
|1062|
|1089|
|1093|
| 975|
|1099|
|1062|
|1062|
|1162|
|1057|
|1123|
|1141|
|1089|
|1172|
|1096|
|1164|
|1146|
+----+
only showing top 20 rows
Expected output:
A Float value returned from the UDF.
For the Sum function it should be clear.
What you want is groupby and use collect_list to get all integer values into an array column then apply your UDF on that column. Also, you need to explicitly return float from calc_rms:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
def calc_rms(float_array):
    """Return the root mean square of the successive differences as a float.

    ``float_array`` is a sequence of numbers, or of numeric strings such as
    the values of an un-cast CSV column; it is converted to a float array
    first. Returns ``None`` when fewer than two values are given, because no
    difference can be formed.

    The result is a plain Python ``float`` so that a Spark UDF declared with
    ``FloatType`` can serialize it.
    """
    values = np.asarray(float_array, dtype=float)
    if values.size < 2:
        return None
    return float(np.sqrt(np.mean(np.diff(values) ** 2)))
calc_rms_udf = F.udf(calc_rms, FloatType())
# Aggregate over `_df`, the frame whose `values` column holds the float-cast
# numbers; `df` only carries the renamed `something` column, so `_c0` cannot
# be resolved on it.
# NOTE(review): collect_list gives no ordering guarantee, and this metric
# depends on element order -- confirm the order is what you expect.
(
    _df.groupby()
    .agg(F.collect_list("values").alias("values"))
    .select(calc_rms_udf(F.col("values")).alias("rms"))
    .show()
)
#+--------+
#| rms|
#+--------+
#|67.16202|
#+--------+

how to split one spark dataframe column into two columns by conditional when

I would like to replace a column of pyspark dataframe.
the dataframe:
price
90.16|USD
I need:
dollar_price currency
9016 USD
Pyspark code:
new_col = F.when(F.col("price").isNull() == False, F.substring(F.col('price'), 1, F.instr(F.col('retail_value'), '|')-1)).otherwise(null)
new_df = df.withColumn('dollar_price', new_col)
new_col = F.when(F.col("price").isNull() == False, F.substring(F.col('price'), F.instr(F.col('retail_value'), '|')+1, 3)).otherwise(null)
new_df_1 = new_df.withColumn('currency', new_col)
I got error:
TypeError: Column is not iterable
Could you please tell me what I missed ?
I have tried
Split a dataframe column's list into two dataframe columns
but it does not work.
thanks
Try with expr as you are computing value from instr function.
Example:
df.show()
#+---------+
#| price|
#+---------+
#|90.16|USD|
#+---------+
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Split `price` at the '|' separator: the text before it becomes
# `dollar_price`, the three characters after it become `currency`.
# Rows with a null `price` get null in both new columns.
(
    df
    .withColumn(
        "dollar_price",
        when(
            col("price").isNotNull(),
            expr("substring(price,1,instr(price,'|')-1)"),
        ).otherwise(None),
    )
    .withColumn(
        "currency",
        when(
            col("price").isNotNull(),
            expr("substring(price,instr(price,'|')+1,3)"),
        ).otherwise(None),
    )
    .show()
)
#+---------+------------+--------+
#| price|dollar_price|currency|
#+---------+------------+--------+
#|90.16|USD| 90.16| USD|
#+---------+------------+--------+

Strange conversion of pandas dataframe to spark dataframe with defined schema

I'm facing the following problem and couldn't get an answer yet: when converting a pandas dataframe with integers to a pyspark dataframe with a schema that supposes data comes as a string, the values change to "strange" strings, just like the example below. I've saved a lot of important data like that, and I wonder why that happened and if it is possible to "decode" these symbols back to integer forms. Thanks in advance!
import pandas as pd
from pyspark.sql.types import StructType, StructField,StringType
df = pd.DataFrame(data = {"a": [111,222, 333]})
schema = StructType([
StructField("a", StringType(), True)
])
sparkdf = spark.createDataFrame(df, schema)
sparkdf.show()
Output:
--+
+---+
| a|
+---+
| o|
| Þ|
| ō|
+---+
I cannot reproduce the problem on any recent version but the most likely reason is that you incorrectly defined the schema (in combination with enabled Arrow support).
Either cast the input:
df["a"] = df.a.astype("str")
or define the correct schema:
from pyspark.sql.types import LongType
schema = StructType([
StructField("a", LongType(), True)
])

How to replace the Timedelta Pandas function with a pure PySpark function?

I am developing a small script in PySpark that generates a date sequence covering the 36 months before today's date, with each date truncated to the first day of its month. Overall I succeeded at this task,
but only with the help of the Pandas Timedelta function to calculate the time delta.
Is there a way to replace this Pandas Timedelta with a pure PySpark function?
import pandas as pd
from datetime import date, timedelta, datetime
from pyspark.sql.functions import col, date_trunc
today = datetime.today()
data = [((date(today.year, today.month, 1) - pd.Timedelta(36,'M')),date(today.year, today.month, 1))] # I want to replace this Pandas function
df = spark.createDataFrame(data, ["minDate", "maxDate"])
+----------+----------+
| minDate| maxDate|
+----------+----------+
|2016-10-01|2019-10-01|
+----------+----------+
import pyspark.sql.functions as f
# Expand the single (minDate, maxDate) row into one row per month.
# The chain is wrapped in parentheses: a backslash continuation followed by a
# trailing comment is a syntax error.
df = (
    df.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))
    # Build an array by splitting a string of `monthsDiff` commas on ','.
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))
    # posexplode emits (position, value); the position is named `date` here
    # and is used as the month offset in the next step.
    .select("*", f.posexplode("repeat").alias("date", "val"))
    .withColumn("date", f.expr("add_months(minDate, date)"))
    .select('date')
)
# show() returns None, so it is called separately to keep `df` a DataFrame.
df.show(n=50)
+----------+
| date|
+----------+
|2016-10-01|
|2016-11-01|
|2016-12-01|
|2017-01-01|
|2017-02-01|
|2017-03-01|
etc...
+----------+
You can use Pyspark inbuilt trunc function.
pyspark.sql.functions.trunc(date, format)
Returns date truncated to the unit specified by the format.
Parameters:
format – ‘year’, ‘YYYY’, ‘yy’ or ‘month’, ‘mon’, ‘mm’
Imagine I have the dataframe below.
import pyspark.sql.functions as f

# Named `rows` rather than `list` so the builtin is not shadowed.
rows = [(1,)]
df = spark.createDataFrame(rows, ['id'])
# First day of the current month, shifted back 36 months.
df = df.withColumn("start_date", f.add_months(f.trunc(f.current_date(), "month"), -36))
# First day of the current month.
df = df.withColumn("max_date", f.trunc(f.current_date(), "month"))
>>> df.show()
+---+----------+----------+
| id|start_date| max_date|
+---+----------+----------+
| 1|2016-10-01|2019-10-01|
+---+----------+----------+
Here's a link with more details on Spark date functions.
Pyspark date Functions