Build hierarchy based on group by column - pyspark or pandas

I have dataframe
|empid|mgrid|deptid|
|-----|-----|------|
|1    |2    |1     |
|2    |3    |1     |
|5    |6    |1     |
|2    |3    |2     |
|3    |4    |2     |
Expected Output
|deptid|empid|hierarchy|
|------|-----|---------|
|1     |1    |[2,3]    |
|1     |2    |[3]      |
|1     |5    |[6]      |
|2     |2    |[3,4]    |
|2     |3    |[4]      |
My problem: I want to build the hierarchy per deptid. I am using the code below to build it, but it isn't restricted to any column; it walks across the whole dataframe.
import pandas as pd

def walk(df, id, f, r, prev=pd.Series(dtype="int64")):
    # follow the chain f -> r (e.g. empid -> mgrid) starting from `id`
    mgr = df.loc[df[f] == id, r]
    if not mgr.isna().all():
        prev = walk(df, mgr.tolist()[0], f, r, prev)
    return pd.concat([mgr, prev])
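For illustration (a hedged example, not from the original post), calling the helper on the full frame ignores deptid entirely:

# empid 2 appears in both departments, so the chain mixes them: roughly [3, 3, 4]
walk(df, 2, "empid", "mgrid").dropna().tolist()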
I am trying something like this:
df_pandas = df_pandas[["deptid", "empid", "mgrid"]]
df_pandas1 = df_pandas.groupby("deptid")
df_pandas1.assign(
    parent_lineage=lambda x: x["empid"].apply(
        lambda e: walk(x, e, "empid", "mgrid").dropna().astype("string").tolist()
    )
)

As this is a graph problem, I would use networkx to solve it:
import networkx as nx

def get_descendents(g):
    # build a directed empid -> mgrid graph per deptid group
    G = nx.from_pandas_edgelist(g, source='empid', target='mgrid',
                                create_using=nx.DiGraph)
    # for each employee, list every manager reachable above them
    return g['empid'].map(lambda n: list(nx.descendants(G, n)))

df['hierarchy'] = df.groupby('deptid', group_keys=False).apply(get_descendents)
Output:
empid mgrid deptid hierarchy
0 1 2 1 [2, 3]
1 2 3 1 [3]
2 5 6 1 [6]
3 2 3 2 [3, 4]
4 3 4 2 [4]
Your subgraphs (the plotted graphs for deptid 1 and deptid 2 are omitted here).
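For completeness, a minimal sketch (same column names as above, not part of the original answer) of how those per-dept subgraphs can be rebuilt and inspected without plotting:

import networkx as nx

# Sketch only: print each department's manager graph edges.
for deptid, g in df.groupby('deptid'):
    G = nx.from_pandas_edgelist(g, source='empid', target='mgrid',
                                create_using=nx.DiGraph)
    print(deptid, sorted(G.edges))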

Related

Creating MAPTYPE field from multiple columns - Spark SQL

I have a use case wherein multiple keys are distributed across the dataset in JSON format and need to be aggregated into a consolidated result set for further processing.
I have been able to develop code that achieves this using both the Python API (PySpark) and Spark SQL, but the latter is a more convoluted and tedious way of doing it and involves intermediate conversions that could lead to errors in the future.
Using the snippets below, is there a better way to achieve this with Spark SQL, by creating a MAP<STRING,ARRAY<STRING>> from key and value?
Data Preparation
from pyspark.sql.types import *
from pyspark.sql import functions as F
import pandas as pd
from io import StringIO
s = StringIO("""
id|json_struct
1|{"a":["tyeqb","",""],"e":["qwrqc","",""]}
1|{"t":["sartq","",""],"r":["fsafsq","",""]}
1|{"b":["puhqiqh","",""],"e":["hjfsaj","",""]}
2|{"b":["basajhjwa","",""],"e":["asfafas","",""]}
2|{"n":["gaswq","",""],"r":["sar","",""],"l":["sar","",""],"s":["rqqrq","",""],"m":["wrqwrq","",""]}
2|{"s":["tqqwjh","",""],"t":["afs","",""],"l":["fsaafs","",""]}
""")
df = pd.read_csv(s,delimiter='|')
sparkDF = spark.createDataFrame(df)
sparkDF.registerTempTable("INPUT")
sparkDF = sparkDF.withColumn(
    'json_struct',
    F.from_json(F.col('json_struct'), schema=MapType(StringType(), ArrayType(StringType()), True))
)
sparkDF.show(truncate=False)
+---+---------------------------------------------------------------------------------------+
|id |json_struct |
+---+---------------------------------------------------------------------------------------+
|1 |{a -> [tyeqb, , ], e -> [qwrqc, , ]} |
|1 |{t -> [sartq, , ], r -> [fsafsq, , ]} |
|1 |{b -> [puhqiqh, , ], e -> [hjfsaj, , ]} |
|2 |{b -> [basajhjwa, , ], e -> [asfafas, , ]} |
|2 |{n -> [gaswq, , ], r -> [sar, , ], l -> [sar, , ], s -> [rqqrq, , ], m -> [wrqwrq, , ]}|
|2 |{s -> [tqqwjh, , ], t -> [afs, , ], l -> [fsaafs, , ]} |
+---+---------------------------------------------------------------------------------------+
Python API (PySpark) - Implementation
As you can see, the resultant key from explode is natively a STRING type, and since PySpark has create_map (for which I did not find an equivalent within Spark SQL), it can readily be used to generate the final json_struct column, ensuring a single key with a variable-length ARRAY<STRING> value.
sparkDF.select(
    F.col('id'),
    F.explode(F.col('json_struct'))
).withColumn(
    'value', F.filter(F.col('value'), lambda x: x != '')
).withColumn(
    'value', F.concat_ws(',', F.col('value'))
).groupBy(
    'id', 'key'
).agg(
    F.collect_set(F.col('value')).alias('value')
).withColumn(
    'json_struct', F.to_json(F.create_map("key", "value"))
).orderBy(
    'id'
).show(truncate=False)
+---+---+---------------+------------------------+
|id |key|value |json_struct |
+---+---+---------------+------------------------+
|1 |a |[tyeqb] |{"a":["tyeqb"]} |
|1 |e |[hjfsaj, qwrqc]|{"e":["hjfsaj","qwrqc"]}|
|1 |r |[fsafsq] |{"r":["fsafsq"]} |
|1 |b |[puhqiqh] |{"b":["puhqiqh"]} |
|1 |t |[sartq] |{"t":["sartq"]} |
|2 |b |[basajhjwa] |{"b":["basajhjwa"]} |
|2 |n |[gaswq] |{"n":["gaswq"]} |
|2 |t |[afs] |{"t":["afs"]} |
|2 |s |[tqqwjh, rqqrq]|{"s":["tqqwjh","rqqrq"]}|
|2 |e |[asfafas] |{"e":["asfafas"]} |
|2 |l |[sar, fsaafs] |{"l":["sar","fsaafs"]} |
|2 |r |[sar] |{"r":["sar"]} |
|2 |m |[wrqwrq] |{"m":["wrqwrq"]} |
+---+---+---------------+------------------------+
Spark SQL - Implementation
In this implementation, I have to take additional steps to ensure that both the key and value columns are of ARRAY type and of consistent lengths, since map_from_arrays takes arrays as inputs.
Is there a way to bypass these steps and create the same schema as the Python API produces above?
spark.sql("""
    SELECT
        id,
        KEY,
        VALUE,
        TO_JSON(MAP_FROM_ARRAYS(KEY, VALUE)) AS json_struct
    FROM (
        SELECT
            id,
            key,
            ARRAY(COLLECT_SET(value)) AS value        -- <------- ### ensuring VALUE is a nested array
        FROM (
            SELECT
                id,
                SPLIT(k, '|', 1) AS key,              -- <------- ### ensuring KEY is an array
                CONCAT_WS(',', FILTER(v, x -> x != '')) AS value
            FROM (
                SELECT
                    id,
                    EXPLODE(FROM_JSON(json_struct, 'MAP<STRING,ARRAY<STRING>>')) AS (k, v)
                FROM INPUT
            )
        )
        GROUP BY 1, 2
    )
    ORDER BY 1
""").show(truncate=False)
+---+---+-----------------+------------------------+
|id |KEY|VALUE |json_struct |
+---+---+-----------------+------------------------+
|1 |[a]|[[tyeqb]] |{"a":["tyeqb"]} |
|1 |[e]|[[hjfsaj, qwrqc]]|{"e":["hjfsaj","qwrqc"]}|
|1 |[b]|[[puhqiqh]] |{"b":["puhqiqh"]} |
|1 |[r]|[[fsafsq]] |{"r":["fsafsq"]} |
|1 |[t]|[[sartq]] |{"t":["sartq"]} |
|2 |[n]|[[gaswq]] |{"n":["gaswq"]} |
|2 |[b]|[[basajhjwa]] |{"b":["basajhjwa"]} |
|2 |[t]|[[afs]] |{"t":["afs"]} |
|2 |[s]|[[tqqwjh, rqqrq]]|{"s":["tqqwjh","rqqrq"]}|
|2 |[e]|[[asfafas]] |{"e":["asfafas"]} |
|2 |[l]|[[sar, fsaafs]] |{"l":["sar","fsaafs"]} |
|2 |[r]|[[sar]] |{"r":["sar"]} |
|2 |[m]|[[wrqwrq]] |{"m":["wrqwrq"]} |
+---+---+-----------------+------------------------+
Spark SQL, instead of create_map, has map. Your PySpark code could be translated into this:
df = spark.sql("""
    WITH
    TBL2 AS (SELECT id, EXPLODE(FROM_JSON(json_struct, 'MAP<STRING,ARRAY<STRING>>')) FROM INPUT),
    TBL3 AS (SELECT id, key, FLATTEN(COLLECT_SET(FILTER(value, x -> x != ''))) value
             FROM TBL2
             GROUP BY id, key)
    SELECT *, TO_JSON(MAP(key, value)) json_struct
    FROM TBL3
""")
df.show(truncate=0)
# +---+---+---------------+------------------------+
# |id |key|value |json_struct |
# +---+---+---------------+------------------------+
# |1 |a |[tyeqb] |{"a":["tyeqb"]} |
# |1 |e |[qwrqc, hjfsaj]|{"e":["qwrqc","hjfsaj"]}|
# |1 |b |[puhqiqh] |{"b":["puhqiqh"]} |
# |1 |r |[fsafsq] |{"r":["fsafsq"]} |
# |1 |t |[sartq] |{"t":["sartq"]} |
# |2 |b |[basajhjwa] |{"b":["basajhjwa"]} |
# |2 |n |[gaswq] |{"n":["gaswq"]} |
# |2 |s |[rqqrq, tqqwjh]|{"s":["rqqrq","tqqwjh"]}|
# |2 |t |[afs] |{"t":["afs"]} |
# |2 |e |[asfafas] |{"e":["asfafas"]} |
# |2 |l |[fsaafs, sar] |{"l":["fsaafs","sar"]} |
# |2 |r |[sar] |{"r":["sar"]} |
# |2 |m |[wrqwrq] |{"m":["wrqwrq"]} |
# +---+---+---------------+------------------------+
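If a single consolidated MAP<STRING,ARRAY<STRING>> per id is ultimately needed, a possible follow-up sketch (not part of the original answer; it assumes the result above is kept as df) is to regroup the per-key rows with map_from_entries:

from pyspark.sql import functions as F

# Sketch only: collapse the (id, key, value) rows produced above into one map per id.
consolidated = (
    df.groupBy('id')
      .agg(F.map_from_entries(F.collect_list(F.struct('key', 'value'))).alias('key_values'))
      .withColumn('json_struct', F.to_json('key_values'))
)
consolidated.show(truncate=False)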

Spark Scala: How to Pass column value from one table as column condition another Dataframe Creation

I have a use case like this: I have a lookup table which contains formulas, and the original table contains the column values; the final table needs to be created by applying each client's formula from the lookup table to the columns of the original table. The formula changes for each client.
lkp1:
|clnt_id |total_amount |total_avg |
|--------|-------------|----------|
|1       |col+col2     |col2-col1 |
|2       |col+col2+5   |1         |
|3       |2            |14/col3   |
orig_1:
|clnt_id |name  |col1 |col2 |col3 |
|--------|------|-----|-----|-----|
|1       |name1 |1    |2    |4    |
|2       |name2 |1    |4    |5    |
|3       |name4 |3    |5    |7    |
final_1:
|clnt_id |name  |Amount |avg |
|--------|------|-------|----|
|1       |name1 |3      |-2  |
|2       |name2 |10     |1   |
|3       |name4 |2      |2   |
I have achieved the same by using:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.column

var final_1: DataFrame = _
var final_intermediate: DataFrame = _
var cnt = 0

val lookup_1_df = spark.sql("select * from lookup_1")
// collect the per-client formulas to the driver
val lookup_1_Df = lookup_1_df
  .select(column("clnt_id"), column("total_amount"), column("total_avg"))
  .collect
val lookup_1_Df_length = lookup_1_Df.length

for (row <- 0 to lookup_1_Df_length - 1) {
  val clnt_id      = lookup_1_Df(row)(0).toString
  val total_amount = lookup_1_Df(row)(1).toString
  val total_avg    = lookup_1_Df(row)(2).toString

  // substitute this client's formulas into the query text
  val fina_df_frame = "select a.clnt_id, a.name, $total_amount as Amount, $total_avg as avg " +
    "from orig_table_1 a left join lookup_1 b on a.clnt_id = b.clnt_id " +
    "where a.clnt_id = '$clnt_id'"
  val fina_df_frame_replaced = fina_df_frame
    .replace("$clnt_id", clnt_id)
    .replace("$total_amount", total_amount)
    .replace("$total_avg", total_avg)

  final_intermediate = spark.sql(fina_df_frame_replaced)
  final_1 = if (cnt == 0) final_intermediate else final_intermediate.union(final_1)
  cnt = cnt + 1
}
final_1.createOrReplaceTempView("final_1_table")
Here I have shown only a sample data set; my original table contains millions of records and I have 1000+ clients. Hence looping is not an optimal solution, since the above snippet has to run once per client, which I am aware of. Can we do this in a more efficient way? Any suggestions?

spark join with column multiple values in list

I have
Dataset A: uuid, listOfLocationsIds, name
Dataset B: locationId, latitude, longitude
A.listOfLocationsIds can have multiple locationIds.
How can I join A and B on each value in listOfLocationsIds?
So if there are two values in listOfLocationsIds, I would want the join to consider each locationId, something like:
A.join(B, A.listOfLocationsIds[0] == B.locationId, "left")
A.join(B, A.listOfLocationsIds[1] == B.locationId, "left")
Assume dataset A is called df with this content:
+----+-----------------+-----+
|uuid|listOfLocationsId|name |
+----+-----------------+-----+
|1 |[1, 2, 3] |name1|
|2 |[1, 3] |name1|
+----+-----------------+-----+
and dataset B is called df2 with this content:
+----------+--------+---------+
|locationId|latitude|longitude|
+----------+--------+---------+
|2 |5 |7 |
+----------+--------+---------+
And we do an array_contains join:
from pyspark.sql.functions import array_contains, col

df = df.join(df2,
             array_contains(col("listOfLocationsId"), col("locationId")), "left"
)
The final result:
+----+-----------------+-----+----------+--------+---------+
|uuid|listOfLocationsId|name |locationId|latitude|longitude|
+----+-----------------+-----+----------+--------+---------+
|1 |[1, 2, 3] |name1|2 |5 |7 |
|2 |[1, 3] |name1|null |null |null |
+----+-----------------+-----+----------+--------+---------+
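If each matching location should instead produce its own output row, a hedged alternative (not from the original answer; column names assumed as shown above) is to explode the array before joining:

from pyspark.sql.functions import explode, col

# Sketch only: one row per (uuid, locationId) pair, then a plain equi-join.
exploded = df.withColumn("locId", explode(col("listOfLocationsId")))
result = exploded.join(df2, col("locId") == col("locationId"), "left")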
Good luck!

pyspark dataframe column creation

I'm a beginner in pyspark. I have a problem where I have a vector column / list of values.
col = ["True", "False", "True"]
I want to create a column in a dataframe (with 3 rows) from this vector / list of values. E.g., in pandas we can do df['col_name'] = col.
Unfortunately Spark doesn't have a function that works the way the pandas one does, but you can still achieve it by joining. Assuming you have a sorted dataframe and a list of new values that you want as a new column:
from pyspark.sql import SparkSession, functions as func
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([('a', ), ('b', ), ('c', )], ['column'])
# a sequential row number is needed so the two dataframes can be aligned for the join
df = df.withColumn('row_number', func.row_number().over(Window.orderBy(func.lit(''))))
df.show(3, False)
+------+----------+
|column|row_number|
+------+----------+
|a     |1         |
|b     |2         |
|c     |3         |
+------+----------+
You can add a row number and do the joining:
new_add_column = spark.createDataFrame([(True, ), (False, ), (True, )], ['new_create_column'])\
    .withColumn('row_number', func.row_number().over(Window.orderBy(func.lit(''))))
new_add_column.show(3, False)
+-----------------+----------+
|new_create_column|row_number|
+-----------------+----------+
|true |1 |
|false |2 |
|true |3 |
+-----------------+----------+
output = df.join(new_add_column, on='row_number', how='inner')
output.show(3, False)
+----------+------+-----------------+
|row_number|column|new_create_column|
+----------+------+-----------------+
|1 |a |true |
|2 |b |false |
|3 |c |true |
+----------+------+-----------------+
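Note that Window.orderBy(func.lit('')) moves all rows into a single partition and gives no real ordering guarantee. A hedged alternative sketch (not from the original answer; with_row_number is a hypothetical helper and assumes the input has no row_number column yet) uses zipWithIndex instead:

from pyspark.sql import Row

# Sketch only: attach a consecutive 1-based index to a dataframe via zipWithIndex.
def with_row_number(sdf):
    return sdf.rdd.zipWithIndex().map(
        lambda pair: Row(**pair[0].asDict(), row_number=pair[1] + 1)
    ).toDF()

Applied to both dataframes before the join on row_number, this avoids the single-partition window.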

Assign Rank to Row based on Alphabetical Order Using Window Functions in PySpark

I'm trying to assign a rank to the rows of a dataframe using a window function over a string column (user_id), based on alphabetical order. So, for example:
user_id | rank_num
-------------------
A |1
A |1
A |1
B |2
A |1
B |2
C |3
B |2
B |2
C |3
I tried using the following lines of code:
user_window = Window().partitionBy('user_id').orderBy('user_id')
data = (data
        .withColumn('profile_row_num', dense_rank().over(user_window))
       )
But I'm getting something like:
user_id | rank_num
-------------------
A |1
A |1
A |1
B |1
A |1
B |1
C |1
B |1
B |1
C |1
Partitioning by user_id is unnecessary here: it puts each user_id into its own partition, so every group gets rank 1 within that partition. The code below should do what you want:
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window

user_window = Window.orderBy('user_id')
data = data.withColumn('profile_row_num', dense_rank().over(user_window))
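A minimal self-contained sketch of the fix on toy data (the spark session and the sample values here are assumed for illustration, not from the original post):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([('A',), ('B',), ('A',), ('C',), ('B',)], ['user_id'])

# dense_rank over a global ordering: every 'A' -> 1, every 'B' -> 2, every 'C' -> 3
toy.withColumn('rank_num', F.dense_rank().over(Window.orderBy('user_id'))).show()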