Dropping nested column of Dataframe with PySpark
I'm trying to drop some nested columns from structs in a Spark dataframe using PySpark.
I found this answer for Scala that seems to do exactly what I want, but I'm not familiar with Scala and don't know how to write it in Python.
https://stackoverflow.com/a/39943812/5706548
Example for pyspark:
from pyspark.sql.functions import struct

def drop_col(df, struct_nm, delete_struct_child_col_nm):
    fields_to_keep = filter(lambda x: x != delete_struct_child_col_nm, df.select("{}.*".format(struct_nm)).columns)
    fields_to_keep = list(map(lambda x: "{}.{}".format(struct_nm, x), fields_to_keep))
    return df.withColumn(struct_nm, struct(fields_to_keep))
A method that I found using pyspark is to first convert the nested column into JSON and then parse the converted JSON with a new nested schema that has the unwanted columns filtered out.
Suppose I have the following schema and I want to drop d, e and j (a.b.d, a.e, a.h.j) from the dataframe:
root
|-- a: struct (nullable = true)
| |-- b: struct (nullable = true)
| | |-- c: long (nullable = true)
| | |-- d: string (nullable = true)
| |-- e: struct (nullable = true)
| | |-- f: long (nullable = true)
| | |-- g: string (nullable = true)
| |-- h: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- i: string (nullable = true)
| | | |-- j: string (nullable = true)
|-- k: string (nullable = true)
I used the following approach:
Create a new schema for a by excluding d, e and j. A quick way to do this is to manually select the fields that you want from df.select("a").schema and create a new schema from the selected fields using StructType. Or, you can do this programmatically by traversing the schema tree and excluding the unwanted fields, something like:
from pyspark.sql.types import StructType, StructField, ArrayType

def exclude_nested_field(schema, unwanted_fields, parent=""):
    new_schema = []
    for field in schema:
        full_field_name = field.name
        if parent:
            full_field_name = parent + "." + full_field_name
        if full_field_name not in unwanted_fields:
            if isinstance(field.dataType, StructType):
                inner_schema = exclude_nested_field(field.dataType, unwanted_fields, full_field_name)
                new_schema.append(StructField(field.name, inner_schema))
            elif isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StructType):
                # recurse into the element struct so fields like "h.j" are excluded as well
                inner_schema = exclude_nested_field(field.dataType.elementType, unwanted_fields, full_field_name)
                new_schema.append(StructField(field.name, ArrayType(inner_schema)))
            else:
                new_schema.append(StructField(field.name, field.dataType))
    return StructType(new_schema)
new_schema = exclude_nested_field(df.schema["a"].dataType, ["b.d", "e", "h.j"])
Convert the a column to JSON: .withColumn("json", F.to_json("a")).drop("a")
Parse the json-converted a column from step 2 with the new schema found in step 1: .withColumn("a", F.from_json("json", new_schema)).drop("json")
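Putting it together, a minimal end-to-end sketch (assuming the example df above and the new_schema computed in step 1):

from pyspark.sql import functions as F

df_dropped = (df
    .withColumn("json", F.to_json("a")).drop("a")                    # step 2
    .withColumn("a", F.from_json("json", new_schema)).drop("json"))  # step 3
df_dropped.printSchema()  # a should no longer contain b.d, e or h.j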
We can now do this natively with Column.dropFields in Spark version >= 3.1:
https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.Column.dropFields.html
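For example, on the question's schema, a.e and a.b.d could be dropped in a single call; per the linked docs, dropFields accepts dotted paths to nested fields (a minimal sketch):

from pyspark.sql import functions as F

# Spark >= 3.1: drop a.e and a.b.d in one expression
df = df.withColumn("a", F.col("a").dropFields("e", "b.d"))

Fields nested inside arrays (such as a.h.j) still need transform, as shown in the walkthrough below.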
Although I have no solution for PySpark, maybe it's easy to translate this into Python. Consider a dataframe df with schema:
root
|-- employee: struct (nullable = false)
| |-- name: string (nullable = false)
| |-- age: integer (nullable = false)
Then if you want e.g. to drop name,
you can do:
val fieldsToKeep = df.select($"employee.*").columns
  .filter(_ != "name") // the nested column you want to drop
  .map(n => "employee." + n)

// overwrite the column with the subset of fields
df
  .withColumn("employee", struct(fieldsToKeep.head, fieldsToKeep.tail: _*))
Given the dataframe below, the aim is to drop d, e and j.
from pyspark.sql import functions as F
df = spark.createDataFrame([], "a struct<b:struct<c:bigint,d:string>,e:struct<f:bigint,g:string>,h:array<struct<i:string,j:string>>>, k string")
df.printSchema()
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | | |-- d: string (nullable = true) # <<--- to be dropped
# | |-- e: struct (nullable = true) # <<--- to be dropped
# | | |-- f: long (nullable = true)
# | | |-- g: string (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true) # <<--- to be dropped
# |-- k: string (nullable = true)
e is the easiest:
df = df.withColumn("a", F.col("a").dropFields("e"))
df.printSchema()
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | | |-- d: string (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true)
# |-- k: string (nullable = true)
In order to drop d, we must go inside b:
df = df.withColumn("a", F.col("a").withField("b", F.col("a.b").dropFields("d")))
df.printSchema()
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# | | | |-- j: string (nullable = true)
# |-- k: string (nullable = true)
j is inside an array, so transform must also be used. It "loops" through every element of the array (in this case, each element is a struct) and transforms it (removes a field).
df = df.withColumn("a", F.col("a").withField(
"h",
F.transform(
F.col("a.h"),
lambda x: x.dropFields("j")
)
))
df.printSchema()
# root
# |-- a: struct (nullable = true)
# | |-- b: struct (nullable = true)
# | | |-- c: long (nullable = true)
# | |-- h: array (nullable = true)
# | | |-- element: struct (containsNull = true)
# | | | |-- i: string (nullable = true)
# |-- k: string (nullable = true)
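Starting from the original dataframe, the three drops can also be chained into a single expression (a sketch equivalent to the step-by-step version above):

df = df.withColumn(
    "a",
    F.col("a")
     .dropFields("e")
     .withField("b", F.col("a.b").dropFields("d"))
     .withField("h", F.transform(F.col("a.h"), lambda x: x.dropFields("j")))
)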
PySpark version of Raphael's Scala answer.
This works at a given depth: it discards everything above that depth and filters the fields at the level below it.
def remove_columns(df, root):
    from pyspark.sql.functions import col
    cols = df.select(root).columns
    fields_filter = filter(lambda x: x[0] != "$", cols)  # use your own lambda here
    fieldsToKeep = list(map(lambda x: root[:-1] + x, fields_filter))
    return df.select(fieldsToKeep)
df = remove_columns(raw_df, root="level1.level2.*")
Related
Spark - Merge two columns of array struct type
I have a dataframe of schema:
root
|-- A: string (nullable = true)
|-- B: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
| | |-- z: double (nullable = true)
|-- C: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
I want to merge columns B & C (array_union). But array_union is not working because of the different data types of these columns. The structs of B & C have pretty much the same fields, except z. I don't care about z, whether it is present or not, in the merged output. What would be a good way to achieve this?
Sure, drop z in B and then array_union():

from pyspark.sql.functions import expr, array_union, col

new = (df1
    .withColumn('B', expr("transform(B, s -> struct(s.key as key, s.x as x, s.y as y))"))  # drop z
    .withColumn('D', array_union(col('B'), col('C')))  # array_union
    .drop('B', 'C')  # drop B and C if not needed
).printSchema()

root
|-- A: string (nullable = false)
|-- D: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- key: string (nullable = true)
| | |-- x: double (nullable = true)
| | |-- y: double (nullable = true)
Transform the column C like this so its element struct matches B's, then use array_union after:

import pyspark.sql.functions as f

df = (df
    .withColumn('z', f.expr("transform(C, element -> cast(1 AS double))"))
    .withColumn('C', f.expr("transform(C, (element, idx) -> struct(element_at(C.key, idx + 1) AS key, element_at(C.x, idx + 1) AS x, element_at(C.y, idx + 1) AS y, element_at(z, idx + 1) AS z))"))
    .drop('z')
)
Flatten dataframe with nested struct ArrayType using pyspark
I have a dataframe with this schema:
root
|-- AUTHOR_ID: integer (nullable = false)
|-- NAME: string (nullable = true)
|-- Books: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- BOOK_ID: integer (nullable = false)
| | |-- Chapters: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- NAME: string (nullable = true)
| | | | |-- NUMBER_PAGES: integer (nullable = true)
How do I flatten all columns into one level with PySpark?
Using the inline function:

df2 = (df.selectExpr("AUTHOR_ID", "NAME", "inline(Books)")
         .selectExpr("*", "inline(Chapters)")
         .drop("Chapters")
       )

Or explode:

from pyspark.sql import functions as F

df2 = (df.withColumn("Books", F.explode("Books"))
         .select("*", "Books.*")
         .withColumn("Chapters", F.explode("Chapters"))
         .select("*", "Chapters.*")
       )
pyspark: rearrange nested array of struct sequence
I've a dataframe in this format and I would like to rearrange the fields inside the item column.
root
|-- order: string (nullable = true)
|-- dt: struct (nullable = true)
|-- item: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- a: long (nullable = true)
| | |-- b: string (nullable = true)
| | |-- c: long (nullable = true)
This is the desired format I'm looking for:
root
|-- order: string (nullable = true)
|-- dt: struct (nullable = true)
|-- item: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- a: long (nullable = true)
| | |-- c: string (nullable = true)
| | |-- b: long (nullable = true)
You can use the transform function:

from pyspark.sql import functions as F

result = df.withColumn(
    "item",
    F.expr("transform(item, x -> struct(x.a as a, x.c as c, x.b as b))")
)
Why doesn't dropping a nested column in pyspark work?
Trying to drop a nested column from a dataframe in pyspark doesn't work. This is the snippet from my code:

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, BooleanType
from pyspark.sql.functions import udf

# simple filter function
@F.udf(returnType=BooleanType())
def my_filter(x):
    if (x != df_new.a.b):
        return True
    else:
        return False

# df.filter(my_filter('id', 'category')).show()

def drop_col(df, struct_nm, delete_struct_child_col_nm):
    # fields_to_keep = filter(lambda x: x != delete_struct_child_col_nm, df.select("{}.*".format(struct_nm)).columns)
    fields_to_keep = filter(lambda x: my_filter(x), df.select("{}.*".format(struct_nm)).columns)
    fields_to_keep = list(map(lambda x: "{}.{}".format(struct_nm, x), fields_to_keep))
    return df.withColumn(struct_nm, struct(fields_to_keep))

drop_col(df_new, "a", df_new.a.b)

I used a UDF because trying the following line didn't work; neither the != symbol nor using a tilde works:

# fields_to_keep = filter(lambda x: x != delete_struct_child_col_nm, df.select("{}.*".format

EDIT: Someone asked for the schema in the comments, so I am providing it:

root
|-- a: struct (nullable = false)
| |-- rawEntity: string (nullable = true)
| |-- entityType: string (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- StoreId: string (nullable = true)
| | | |-- AppId: string (nullable = true)
| |-- Timestamps: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- User: string (nullable = true)
| |-- b: array (nullable = true)  // trying to drop this
| | |-- element: string (containsNull = true)
| |-- KeywordsFull: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- Keywords: string (nullable = true)
| | |-- element: string (containsNull = true)
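For what it's worth, on Spark 3.1+ the dropFields approach shown earlier would be a much simpler way to drop a.b here (a sketch, untested against this exact schema):

from pyspark.sql import functions as F

df_new = df_new.withColumn("a", F.col("a").dropFields("b"))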
How do I add a column to a nested struct in a PySpark dataframe?
I have a dataframe with a schema like:
root
|-- state: struct (nullable = true)
| |-- fld: integer (nullable = true)
I'd like to add columns within the state struct, that is, create a dataframe with a schema like:
root
|-- state: struct (nullable = true)
| |-- fld: integer (nullable = true)
| |-- a: integer (nullable = true)
I tried:
df.withColumn('state.a', val).printSchema()
# root
# |-- state: struct (nullable = true)
# | |-- fld: integer (nullable = true)
# |-- state.a: integer (nullable = true)
Here is a way to do it without using a udf:

# create example dataframe
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, IntegerType

data = [
    ({'fld': 0},)
]

schema = StructType(
    [
        StructField('state',
            StructType(
                [StructField('fld', IntegerType())]
            )
        )
    ]
)

df = sqlCtx.createDataFrame(data, schema)
df.printSchema()
#root
# |-- state: struct (nullable = true)
# | |-- fld: integer (nullable = true)

Now use withColumn() and add the new field using lit() and alias().

val = 1
df_new = df.withColumn(
    'state',
    f.struct(*[f.col('state')['fld'].alias('fld'), f.lit(val).alias('a')])
)
df_new.printSchema()
#root
# |-- state: struct (nullable = false)
# | |-- fld: integer (nullable = true)
# | |-- a: integer (nullable = false)

If you have a lot of fields in the nested struct you can use a list comprehension, using df.schema["state"].dataType.names to get the field names. For example:

val = 1
s_fields = df.schema["state"].dataType.names  # ['fld']
df_new = df.withColumn(
    'state',
    f.struct(*([f.col('state')[c].alias(c) for c in s_fields] + [f.lit(val).alias('a')]))
)
df_new.printSchema()
#root
# |-- state: struct (nullable = false)
# | |-- fld: integer (nullable = true)
# | |-- a: integer (nullable = false)

References: I found a way to get the field names from the Struct without naming them manually from this answer.
Use a transformation such as the following:

import pyspark.sql.functions as f

df = df.withColumn(
    "state",
    f.struct(
        f.col("state.*"),
        f.lit(123).alias("a")
    )
)
Although this is a late answer, for pyspark version 2.x.x the following is supported. Assuming dfOld already contains state and fld as asked in the question:

from pyspark.sql.functions import lit, col, struct

dfOld = dfOld.withColumn("a", lit("value"))
dfNew = dfOld.select("level1Field1", "level1Field2",
                     struct(col("state.fld").alias("fld"), col("a")).alias("state"))

Reference: https://medium.com/@mrpowers/adding-structtype-columns-to-spark-dataframes-b44125409803
Here's a way to do it without a udf.

Initialize example dataframe:

nested_df1 = (spark.read.json(sc.parallelize(["""[
    { "state": {"fld": 1} },
    { "state": {"fld": 2}}
]"""])))

nested_df1.printSchema()
root
|-- state: struct (nullable = true)
| |-- fld: long (nullable = true)

Spark .read.json imports all integers as long by default. If state.fld has to be an int, you will need to cast it.

from pyspark.sql import functions as F

nested_df1 = (nested_df1
    .select(
        F.struct(F.col("state.fld").alias("fld").cast('int')).alias("state")
    ))

nested_df1.printSchema()
root
|-- state: struct (nullable = false)
| |-- col1: integer (nullable = true)

nested_df1.show()
+-----+
|state|
+-----+
|  [1]|
|  [2]|
+-----+

Finally, use .select to get the nested columns you want from the existing struct with the "parent.child" notation, create the new column, then re-wrap the old columns together with the new columns in a struct.

val_a = 3

nested_df2 = (nested_df1
    .select(
        F.struct(
            F.col("state.fld"),
            F.lit(val_a).alias("a")
        ).alias("state")
    )
)

nested_df2.printSchema()
root
|-- state: struct (nullable = false)
| |-- fld: integer (nullable = true)
| |-- a: integer (nullable = false)

nested_df2.show()
+------+
| state|
+------+
|[1, 3]|
|[2, 3]|
+------+

Flatten if needed with "parent.*":

nested_df2.select("state.*").printSchema()
root
|-- fld: integer (nullable = true)
|-- a: integer (nullable = false)

nested_df2.select("state.*").show()
+---+---+
|fld|  a|
+---+---+
|  1|  3|
|  2|  3|
+---+---+
Spark 3.1+

F.col('state').withField('a', F.lit(1))

Example:

from pyspark.sql import functions as F

df = spark.createDataFrame([((1,),)], 'state:struct<fld:int>')
df.printSchema()
# root
# |-- state: struct (nullable = true)
# | |-- fld: integer (nullable = true)

df = df.withColumn('state', F.col('state').withField('a', F.lit(1)))
df.printSchema()
# root
# |-- state: struct (nullable = true)
# | |-- fld: integer (nullable = true)
# | |-- a: integer (nullable = false)
You can use the struct function:

import pyspark.sql.functions as f

df = df.withColumn(
    "state",
    f.struct(
        f.col("state.fld").alias("fld"),
        f.lit(1).alias("a")
    )
)
from pyspark.sql.functions import *
from pyspark.sql.types import *

def add_field_in_dataframe(nfield, df, dt):
    fields = nfield.split(".")
    print(fields)
    n = len(fields)
    addField = fields[0]
    if n == 1:
        return df.withColumn(addField, lit(None).cast(dt))

    nestedField = ".".join(fields[:-1])
    sfields = df.select(nestedField).schema[fields[-2]].dataType.names
    print(sfields)
    ac = col(nestedField)
    if n == 2:
        nc = struct(*([ac[c].alias(c) for c in sfields] + [lit(None).cast(dt).alias(fields[-1])]))
    else:
        nc = struct(*([ac[c].alias(c) for c in sfields] + [lit(None).cast(dt).alias(fields[-1])])).alias(fields[-2])
    print(nc)
    n = n - 1

    while n > 1:
        print("n: ", n)
        fields = fields[:-1]
        print("fields: ", fields)
        nestedField = ".".join(fields[:-1])
        print("nestedField: ", nestedField)
        sfields = df.select(nestedField).schema[fields[-2]].dataType.names
        print(fields[-1])
        print("sfields: ", sfields)
        sfields = [s for s in sfields if s != fields[-1]]
        print("sfields: ", sfields)
        ac = col(".".join(fields[:-1]))
        if n > 2:
            print(fields[-2])
            nc = struct(*([ac[c].alias(c) for c in sfields] + [nc])).alias(fields[-2])
        else:
            nc = struct(*([ac[c].alias(c) for c in sfields] + [nc]))
        n = n - 1

    return df.withColumn(addField, nc)
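For context, a hedged usage sketch (assuming a dataframe df with a state.fld struct as in the question above; the new field name a and its type are illustrative):

from pyspark.sql.types import IntegerType

# adds a null integer field "a" inside the "state" struct
df_new = add_field_in_dataframe("state.a", df, IntegerType())
df_new.printSchema()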