Transposing spark dataframe based on a subcolumn

I have a spark dataframe which looks like this:
|-- 0000154d-7585-5eb283ff985c: struct (nullable = true)
| |-- collaborative_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- content_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- curated_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- discovery_score: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- original_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- recipe_id: array (nullable = true)
| | |-- element: long (containsNull = true)
|-- 00005426-2675-68085cd359c7: struct (nullable = true)
| |-- collaborative_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- content_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- curated_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- discovery_score: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- original_rank: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- recipe_id: array (nullable = true)
| | |-- element: long (containsNull = true)
Each column is a user id, e.g. 0000154d-7585-5eb283ff985c, and each row is made up of 15 000 users (they come from json files that each contain 15 000 users).
I want to transpose it such that each user id is a row, and each sub-column (collaborative_rank, content_rank, curated_rank, discovery_score, original_rank and recipe_id) is a column with the array being the value. I'm new to Spark; is there any painless way to do this?
For reference, an input .json file I'm reading from looks like this:
{"0000154d-7585-4096-a71a-5eb283ff985c": {"recipe_id": [1, 2, 3], "collaborative_rank": [1, 2, 3], "curated_rank": [1, 2, 3], "discovery_score": [1]}, "00005426-2675-4940-8394-e8085cd359c7": {"recipe_id": [] ... }

If you don't want to convert it to rdd and perform UDF, you can consider stacking the dataframe.
from pyspark.sql.functions import expr

# Read the input file. NOTE: assumes an active SparkSession is available as
# `spark`. The raw string keeps the Windows backslashes literal.
df = spark.read.json(r'C:\stackoverflow\samples\inp.json')
# Build the argument list for stack(): the number of rows to produce, followed
# by one ('<column name>', `<column>`) pair per column of df.
stack_characteristics = str(len(df.columns))+','+','.join([f"'{v}',`{v}`" for v in df.columns])
# stack() turns every pair into one (userId, vals) row; 'vals.*' then expands
# the struct into one column per sub-field.
df.select(expr(f'''stack({stack_characteristics})''').alias('userId','vals')).\
select('userId', 'vals.*').show()
| userId|collaborative_rank|curated_rank|discovery_score|recipe_id|
|0000154d-7585-409...| [1, 2, 3]| [1, 2, 3]| [1]|[1, 2, 3]|
|00005426-2675-494...| [1, 2, 3]| [1, 2, 3]| [1]|[1, 2, 3]|

AFAIK, this below code may solve your problem.
input json considered,
{"0000154d-7585-4096-a71a-5eb283ff985c": {"recipe_id": [1, 2, 3], "collaborative_rank": [1, 2, 3], "curated_rank": [1, 2, 3], "discovery_score": [1] }}
from pyspark.sql import Row
#read an input data
# NOTE: assumes an active SparkSession is available as `spark`.
df = spark.read.json("/home/sathya/Desktop/stackoverflo/input.json")
#method to extract keys to columns
def extract_json(row):
    """Turn one wide row into one output Row per column of that row.

    Each column name becomes the first value of an output Row, followed by
    every sub-field of that column's struct, in the struct's own field order.
    No sub-field is dropped, however many the struct has.

    Args:
        row: A Row whose columns each hold a struct value.

    Returns:
        A Row containing one inner Row per column; flatMap() iterates over it
        and emits the inner Rows one by one.
    """
    out_array = []
    # presumably asDict() is not recursive here, so each value is still the
    # nested struct (itself a Row) and can be unpacked positionally.
    for user_id, fields in row.asDict().items():
        out_array.append(Row(user_id, *fields))
    return Row(*out_array)
#flatmap columns and extracting the data
rdd = df.rdd.flatMap(extract_json)
#df creation
df1.selectExpr("_1 as user_id","_2 as recipe_id", "_3 as collaborative_rank", "_4 as curated_rank", "_5 as discovery_score").show(truncate=False)
|user_id |recipe_id|collaborative_rank|curated_rank|discovery_score|
|0000154d-7585-4096-a71a-5eb283ff985c|[1, 2, 3]|[1, 2, 3] |[1] |[1, 2, 3] |


I have a text classification problem.
I'm particularly interested in this embedding model in sparknlp because I have a dataset from Wikipedia in 'sq' language. I need to convert sentences of my dataset into embeddings.
I do so by WordEmbeddingsModel, however, after the embeddings are generated I don't know how to prepare them to make ready as an input for an RNN model using keras and tensorflow.
My dataset has two columns 'text' and 'label', until now I was able to do the following steps:
# start spark session
spark = sparknlp.start(gpu=True)
# convert train df into spark df
| text|label|
|Joy Adowaa Buolam...| 0|
|Ajo themeloi "Alg...| 1|
|Buolamwini lindi ...| 1|
|Kur ishte 9 vjeç,...| 0|
|Si një studente u...| 1|
# define sparknlp pipeline
document = DocumentAssembler()\
tokenizer = Tokenizer() \
.setInputCols(["document"]) \
embeddings = WordEmbeddingsModel\
.setInputCols(["document", "token"])\
pipeline = Pipeline(stages=[document, tokenizer, embeddings])
# fit the pipeline to the training data
model = pipeline.fit(spark_train_df)
# apply the pipeline to the training data
result = model.transform(spark_train_df)
| text|label| document| token| embeddings|
|Joy Adowaa Buolam...| 0|[{document, 0, 13...|[{token, 0, 2, Jo...|[{word_embeddings...|
|Ajo themeloi "Alg...| 1|[{document, 0, 13...|[{token, 0, 2, Aj...|[{word_embeddings...|
|Buolamwini lindi ...| 1|[{document, 0, 94...|[{token, 0, 9, Bu...|[{word_embeddings...|
|Kur ishte 9 vjeç,...| 0|[{document, 0, 12...|[{token, 0, 2, Ku...|[{word_embeddings...|
|Si një studente u...| 1|[{document, 0, 15...|[{token, 0, 1, Si...|[{word_embeddings...|
|Buolamwini diplom...| 1|[{document, 0, 11...|[{token, 0, 9, Bu...|[{word_embeddings...|
The schema of result is:
|-- text: string (nullable = true)
|-- label: long (nullable = true)
|-- document: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- token: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- embeddings: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
The output I receive from:
result.schema["embeddings"].dataType is:
ArrayType(StructType([StructField('annotatorType', StringType(), True), StructField('begin', IntegerType(), False), StructField('end', IntegerType(), False), StructField('result', StringType(), True), StructField('metadata', MapType(StringType(), StringType(), True), True), StructField('embeddings', ArrayType(FloatType(), False), True)]), True)

Spark - Merge two columns of array struct type

I have a dataframe of schema -
|-- A: string (nullable = true)
|-- B: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
| | |-- z: double (nullable = true)
|-- C: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
I want to merge column B & C (array_union). But array_union is not working because of different data types of these columns. Structs of B & C have pretty much same columns except z. I don't care about z - whether it is present or not - in their merged output.
What would be a good way to achieve this?
Sure, drop z from the structs in B and then use array_union():
new = (
    df1
    # Rebuild every struct in B without z so that B and C have the same type.
    .withColumn('B', expr("transform(B,s->struct(s.key as key,s.x as x, s.y as y))"))
    # With identical element types the two arrays can be merged.
    .withColumn('D', array_union(col('B'), col('C')))
    # Drop B and C if they are no longer needed.
    .drop('B', 'C')
)
|-- A: string (nullable = false)
|-- D: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
Transform the column 'C' like this and use the array_union after:
import pyspark.sql.functions as f
df = (
    df
    # Rebuild every struct in C with the same fields, in the same order, as the
    # structs in B (key, x, y, z). C has no z, so it is filled with a typed
    # NULL; no temporary helper column is needed.
    .withColumn('C', f.expr("transform(C, s -> struct(s.key AS key, s.x AS x, s.y AS y, cast(NULL AS double) AS z))"))
)

Flatten dataframe with nested struct ArrayType using pyspark

I have a dataframe with this schema
|-- AUTHOR_ID: integer (nullable = false)
|-- NAME: string (nullable = true)
|-- Books: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- BOOK_ID: integer (nullable = false)
| | |-- Chapters: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- NAME: string (nullable = true)
| | | | |-- NUMBER_PAGES: integer (nullable = true)
How can I flatten all columns into one level with PySpark?
Using inline function:
df2 = (
    # inline() explodes the array of structs and turns each struct field into
    # its own column: Books -> BOOK_ID, Chapters.
    df.selectExpr("AUTHOR_ID", "NAME", "inline(Books)")
    # Second level: Chapters -> NAME, NUMBER_PAGES (next to the author's NAME).
    .selectExpr("*", "inline(Chapters)")
    # "*" carried the original nested array along; remove it so only flat
    # columns remain.
    .drop("Chapters")
)
Or explode:
from pyspark.sql import functions as F
df2 = (
    # One row per book, then lift the book's fields to the top level.
    df.withColumn("Books", F.explode("Books"))
    .select("*", "Books.*")
    # One row per chapter, then lift the chapter's fields to the top level.
    .withColumn("Chapters", F.explode("Chapters"))
    .select("*", "Chapters.*")
    # Remove the nested originals so only flat columns remain.
    .drop("Books", "Chapters")
)

Why doesn't dropping a nested column in PySpark work?

Trying to drop a nested column from a dataframe in PySpark doesn't work.
This is the snippet from my code:
`from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType , BooleanType
from pyspark.sql.functions import udf
#simple filter function
def my_filter(x):
    """Return True when ``x`` differs from ``df_new.a.b``, otherwise False.

    ``df_new`` is not a parameter; it is read from the enclosing scope.

    NOTE(review): presumably ``df_new.a.b`` is a Column, in which case the
    comparison does not produce a plain bool - confirm.
    """
    return bool(x != df_new.a.b)
#df.filter(my_filter('id', 'category')).show()
def drop_col(df, struct_nm, delete_struct_child_col_nm):
    """Rebuild the struct column ``struct_nm`` from the child fields that pass ``my_filter``.

    Args:
        df: DataFrame that contains the struct column.
        struct_nm: Name of the struct column to rebuild.
        delete_struct_child_col_nm: Child field to drop. NOTE(review): only the
            commented-out variant uses this argument; the active line decides
            through ``my_filter`` instead.

    Returns:
        ``df`` with ``struct_nm`` replaced by a struct of the kept child fields.
    """
    #fields_to_keep = filter(lambda x: x != delete_struct_child_col_nm, df.select("{}.*".format(struct_nm)).columns)
    # Selecting "<struct>.*" expands the struct, so .columns lists its child names.
    fields_to_keep = filter(lambda x: my_filter(x), df.select("{}.*".format(struct_nm)).columns)
    # Qualify every kept child name with the struct name again.
    fields_to_keep = list(map(lambda x: "{}.{}".format(struct_nm, x), fields_to_keep))
    return df.withColumn(struct_nm, struct(fields_to_keep))
drop_col(df_new, "a", df_new.a.b)`
I used a UDF because
the following line didn't work: neither the != operator nor the tilde (~) works here.
#fields_to_keep = filter(lambda x: x != delete_struct_child_col_nm,"{}.*".format
Someone asked for the schema in the comments, so I am providing it:
|-- a: struct (nullable = false)
| |-- rawEntity: string (nullable = true)
| |-- entityType: string (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- StoreId: string (nullable = true)
| | | |-- AppId: string (nullable = true)
| |-- Timestamps: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- User: string (nullable = true)
| |-- b: array (nullable = true) //trying to drop this
| | |-- element: string (containsNull = true)
| |-- KeywordsFull: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- Keywords: string (nullable = true)
| | |-- element: string (containsNull = true)

Dropping nested column of Dataframe with PySpark

I'm trying to drop some nested columns from structs in a Spark dataframe using PySpark.
I found this for Scala that seems to be doing exactly what I want to, but I'm not familiar with Scala and don't know how to write it in Python.
Example for pyspark:
def drop_col(df, struct_nm, delete_struct_child_col_nm):
    """Return ``df`` with one child field removed from a struct column.

    Args:
        df: DataFrame that contains the struct column.
        struct_nm: Name of the struct column.
        delete_struct_child_col_nm: Name of the child field to remove.

    Returns:
        ``df`` with ``struct_nm`` replaced by a struct of the remaining fields.
    """
    # Selecting "<struct>.*" expands the struct, so .columns lists its child names.
    child_names = df.select("{}.*".format(struct_nm)).columns
    fields_to_keep = [
        "{}.{}".format(struct_nm, child_name)
        for child_name in child_names
        if child_name != delete_struct_child_col_nm
    ]
    return df.withColumn(struct_nm, struct(fields_to_keep))
A method that I found using pyspark is by first converting the nested column into json and then parse the converted json with a new nested schema with the unwanted columns filtered out.
Suppose I have the following schema and I want to drop d, e and j (a.b.d, a.e, a.h.j) from the dataframe:
|-- a: struct (nullable = true)
| |-- b: struct (nullable = true)
| | |-- c: long (nullable = true)
| | |-- d: string (nullable = true)
| |-- e: struct (nullable = true)
| | |-- f: long (nullable = true)
| | |-- g: string (nullable = true)
| |-- h: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- i: string (nullable = true)
| | | |-- j: string (nullable = true)
|-- k: string (nullable = true)
I used the following approach:
Create a new schema for a by excluding d, e and j. A quick way to do this is to manually select the fields that you want from df.select("a").schema and create a new schema from the selected fields using StructType. Or, you can do this programmatically by traversing the schema tree and excluding the unwanted fields, something like:
def exclude_nested_field(schema, unwanted_fields, parent=""):
    """Return a copy of ``schema`` without the fields named in ``unwanted_fields``.

    Args:
        schema: The StructType to filter.
        unwanted_fields: Dot-separated field paths, relative to the top-level
            schema, to leave out (e.g. ``["b.d", "e", "h.j"]``). A field of a
            struct stored inside an array is addressed through the array's
            name (``"h.j"``).
        parent: Dot-separated path of ``schema`` within the top-level schema.
            Only the recursive calls set it.

    Returns:
        A new StructType holding the remaining fields.
    """
    new_schema = []
    for field in schema:
        full_field_name = field.name
        if parent:
            full_field_name = parent + "." + full_field_name
        if full_field_name in unwanted_fields:
            continue
        if isinstance(field.dataType, StructType):
            inner_schema = exclude_nested_field(field.dataType, unwanted_fields, full_field_name)
            new_schema.append(StructField(field.name, inner_schema))
        elif isinstance(field.dataType, ArrayType):
            element_type = field.dataType.elementType
            # Descend into arrays of structs as well; otherwise a path such as
            # "h.j" could never be excluded.
            if isinstance(element_type, StructType):
                element_type = exclude_nested_field(element_type, unwanted_fields, full_field_name)
            new_schema.append(StructField(field.name, ArrayType(element_type)))
        else:
            new_schema.append(StructField(field.name, field.dataType))
    return StructType(new_schema)
new_schema = exclude_nested_field(df.schema["a"].dataType, ["b.d", "e", "h.j"])
Convert a column to json: .withColumn("json", F.to_json("a")).drop("a")
Parse the json-converted a column from step 2 with the new schema found in step 1: .withColumn("a", F.from_json("json", new_schema)).drop("json")
We can now do it natively with Spark version >= 3.1
Although I have no solution for PySpark, maybe it's easy to translate this into Python. Consider a dataframe df with schema:
|-- employee: struct (nullable = false)
| |-- name: string (nullable = false)
| |-- age: integer (nullable = false)
Then if you want e.g. to drop name,
you can do:
// expand the struct to list its child field names, then keep all but one
val fieldsToKeep = df.select($"employee.*").columns
  .filter(_ != "name") // the nested column you want to drop
  .map(n => "employee." + n)
// overwrite column with subset of fields
df.withColumn("employee", struct(fieldsToKeep.head, fieldsToKeep.tail: _*))
Having the below dataframe, the aim is to drop d, e and j.
from pyspark.sql import functions as F
df = spark.createDataFrame([], "a struct<b:struct<c:bigint,d:string>,e:struct<f:bigint,g:string>,h:array<struct<i:string,j:string>>>, k string")
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | | |-- d: string (nullable = true) # <<--- to be dropped
# | |-- e: struct (nullable = true) # <<--- to be dropped
# | | |-- f: long (nullable = true)
# | | |-- g: string (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true) # <<--- to be dropped
# |-- k: string (nullable = true)
e is the easiest:
df = df.withColumn("a", F.col("a").dropFields("e"))
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | | |-- d: string (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true)
# |-- k: string (nullable = true)
In order to drop d, we must go inside b:
df = df.withColumn("a", F.col("a").withField("b", F.col("a.b").dropFields("d")))
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true)
# |-- k: string (nullable = true)
j is inside array, so transform must also be used. It "loops" through every array's elements (in this case, the element is a struct) and transforms it (removes a field).
df = df.withColumn("a", F.col("a").withField(
    "h",
    # h is an array of structs, so the field is removed from each element.
    F.transform(
        F.col("a.h"),
        lambda x: x.dropFields("j")
    )
))
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# |-- k: string (nullable = true)
Pyspark version of Raphaels Scala answer.
This runs at a certain depth, discards everything above that depth and filters on the row below it.
def remove_columns(df, root):
    """Select the children of ``root`` whose names do not start with ``$``.

    Args:
        df: DataFrame to select from.
        root: Path of the struct to expand, ending in ``*``
            (e.g. ``"level1.level2.*"``).

    Returns:
        The result of ``df.select`` on the fully qualified names of the kept
        children.
    """
    # Selecting "<path>.*" expands the struct, so .columns lists its child names.
    cols = df.select(root).columns
    # root[:-1] strips the trailing "*" and keeps the "level1.level2." prefix.
    # Use your own condition here.
    fields_to_keep = [root[:-1] + name for name in cols if not name.startswith("$")]
    return df.select(fields_to_keep)
df = remove_columns(raw_df, root="level1.level2.*")