I have this RDD in PySpark and I want to define a schema for it.
Example of one collected row of the RDD:
(('16/12/2006', '17:24:00', 4.216, 0.418, 234.84, 18.4, 0.0, 1.0, 17.0), 0)
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

customSchema = StructType([
StructField("Date", StringType(), True),
StructField("Hour", StringType(), True),
StructField("ActivePower", FloatType(), True),
StructField("ReactivePower", FloatType(), True),
StructField("Voltage", FloatType(), True),
StructField("Instensity", FloatType(), True),
StructField("Sub1", FloatType(), True),
StructField("Sub2", FloatType(), True),
StructField("Sub3", FloatType(), True),
StructField("ID", IntegerType(), True)])
The problem is that the index (the trailing zero) sits outside the tuple of data, and I don't know how to define the schema correctly.
Thank you in advance.
You're almost there; you just need one more StructField to wrap the inner tuple:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

data = [
(('16/12/2006', '17:24:00', 4.216, 0.418, 234.84, 18.4, 0.0, 1.0, 17.0), 0)
]
schema = StructType([
StructField("values", StructType([
StructField("Date", StringType(), True),
StructField("Hour", StringType(), True),
StructField("ActivePower", FloatType(), True),
StructField("ReactivePower", FloatType(), True),
StructField("Voltage", FloatType(), True),
StructField("Instensity", FloatType(), True),
StructField("Sub1", FloatType(), True),
StructField("Sub2", FloatType(), True),
StructField("Sub3", FloatType(), True),
])),
StructField("ID", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)
df.printSchema()
root
|-- values: struct (nullable = true)
| |-- Date: string (nullable = true)
| |-- Hour: string (nullable = true)
| |-- ActivePower: float (nullable = true)
| |-- ReactivePower: float (nullable = true)
| |-- Voltage: float (nullable = true)
| |-- Instensity: float (nullable = true)
| |-- Sub1: float (nullable = true)
| |-- Sub2: float (nullable = true)
| |-- Sub3: float (nullable = true)
|-- ID: integer (nullable = true)
df.show(1, False)
+----------------------------------------------------------+---+
|values |ID |
+----------------------------------------------------------+---+
|[16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0]|0 |
+----------------------------------------------------------+---+
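If you would rather end up with a flat schema afterwards, the nested fields can be pulled back up to the top level with a star expansion on the struct column (a minimal sketch that reuses the df built above):
# Flatten the nested struct: every field inside "values" becomes a top-level column
flat_df = df.select("values.*", "ID")
flat_df.printSchema()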
Related
I cannot take any input. I have a schema:
schm = StructType([
StructField("ID", IntegerType(), True),
StructField("fixed_credit_limit", IntegerType(), True),
StructField("credit_limit", IntegerType(), True),
StructField("due_amount", IntegerType(), True),
StructField("created_date", StringType(), True),
StructField("updated_date", StringType(), True),
StructField("agent_name_id",IntegerType(), True),
StructField("rfid_id", StringType(), True),
])
input = [(13158, 100, 100, 0, '05/29/2021 11:01:31', '05/29/2021 11:01:31', 5, '862b4497-577f-47f9-8725-dd6c397ce408')]
df1 = spark.createDataFrame(input, schm)
I want to take user input for agent_name_id, but it gives the error 'list' object is not callable.
How can I take user input for agent_name_id?
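The error is consistent with the list above being named input, which shadows Python's built-in input() function, so a later call to input(...) hits the list instead. A minimal sketch that avoids the clash (the variable names are only illustrative):
# Rename the list so it no longer shadows the built-in input()
rows = [(13158, 100, 100, 0, '05/29/2021 11:01:31', '05/29/2021 11:01:31', 5,
         '862b4497-577f-47f9-8725-dd6c397ce408')]

# Read the value from the user and cast it to int to match the schema
agent_name_id = int(input("Enter agent_name_id: "))

# agent_name_id is the 7th field (index 6) in each row tuple
rows[0] = rows[0][:6] + (agent_name_id,) + rows[0][7:]
df1 = spark.createDataFrame(rows, schm)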
I have this schema:
schm = StructType([
StructField("User ID", IntegerType(), True),
StructField("Tag", StringType(), True),
StructField("Activated", StringType(), True),
StructField("Created Date", StringType(), True),
StructField("Updated Date", StringType(), True),
StructField("Valid Until", StringType(), True),
StructField("last used", StringType(), True),
StructField("reference", StringType(), True),
StructField("employee code", IntegerType(), True),
StructField("Unique user ID", IntegerType(), True),
])
schm
I want to build a dataframe with 998 rows, but I can't work out how to build the input data for 998 rows (the same value repeated for all 998 rows):
df = create_df(spark, input_data, schm)
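Repeating one template row 998 times is enough; a minimal sketch assuming plain spark.createDataFrame rather than the create_df helper above (the row values are made-up placeholders that match schm):
# One template row matching schm, repeated 998 times
row = (1, "tag-001", "yes", "2021-05-29", "2021-05-29", "2022-05-29",
       "2021-05-29", "ref-001", 42, 1001)
input_data = [row] * 998

df = spark.createDataFrame(input_data, schm)
df.count()   # 998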
I am new to Apache Spark, so forgive me if this is a noob question. I am trying to define a particular schema before reading in the dataset in order to speed up processing. There are a few data types that I am not sure how to define (ArrayType and StructType).
Here is a screenshot of the schema I am working with:
Here is what I have so far:
jsonSchema = StructType([StructField("attribution", ArrayType(), True),
StructField("averagingPeriod", StructType(), True),
StructField("city", StringType(), True),
StructField("coordinates", StructType(), True),
StructField("country", StringType(), True),
StructField("date", StructType(), True),
StructField("location", StringType(), True),
StructField("mobile", BooleanType(), True),
StructField("parameter", StringType(), True),
StructField("sourceName", StringType(), True),
StructField("sourceType", StringType(), True),
StructField("unit", StringType(), True),
StructField("value", DoubleType(), True)
])
My question is: How do I account for the name and url under the attribution column, the unit and value under the averagingPeriod column, etc?
For reference, here is the dataset I am using: https://registry.opendata.aws/openaq/.
Here's an example of array type and struct type. I think it should be straightforward to do this for all other columns.
from pyspark.sql.types import *
jsonSchema = StructType([
StructField("attribution", ArrayType(StructType([StructField("name", StringType()), StructField("url", StringType())])), True),
StructField("averagingPeriod", StructType([StructField("unit", StringType()), StructField("value", DoubleType())]), True),
# ... etc.
])
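For completeness, the remaining struct columns follow the same pattern. The sub-field names below (latitude/longitude for coordinates, utc/local for date) are assumptions based on typical OpenAQ records and should be checked against the actual data:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Assumed sub-fields; verify them against the dataset before relying on this
coordinatesType = StructType([
    StructField("latitude", DoubleType()),
    StructField("longitude", DoubleType())
])
dateType = StructType([
    StructField("utc", StringType()),
    StructField("local", StringType())
])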
I am trying to create a PySpark dataframe from a pandas dataframe.
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType
a_dict = {0: [(0, 9.821), (1, 82.185)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([ StructField("row_num", LongType(), True),StructField("val", list(a_str), True)]) # error
a_df = spark.createDataFrame(a_pd, schema=my_schema)
error:
AssertionError: dataType [StructField(id,IntegerType,true), StructField(prob,DoubleType,true)] should be an instance of <class 'pyspark.sql.types.DataType'>
How do I define a valid schema for a
list of tuples of (int, double)
so that PySpark can understand it?
Thanks.
For a list of values, you have to use ArrayType. Below is your code, reproduced with a few more example rows:
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, ArrayType
a_dict = {0: [(0, 9.821), (1, 82.185)],
1: [(0, 9.821), (1, 8.10), (3, 2.385)],
2: [(0, 9.821), (1, 1.4485), (4, 5.15), (5, 6.104)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
print(a_pd.head())
a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([StructField("row_num", LongType(), True), StructField("val", ArrayType(a_str), True)])  # ArrayType wraps the struct: one element per (id, prob) tuple
a_df = sqlContext.createDataFrame(a_pd, schema=my_schema)
print(a_df.show(truncate=False))
print(a_df.printSchema())
Output:
+-------+------------------------------------------------+
|row_num|val |
+-------+------------------------------------------------+
|0 |[[0, 9.821], [1, 82.185]] |
|1 |[[0, 9.821], [1, 8.1], [3, 2.385]] |
|2 |[[0, 9.821], [1, 1.4485], [4, 5.15], [5, 6.104]]|
+-------+------------------------------------------------+
root
|-- row_num: long (nullable = true)
|-- val: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- prob: double (nullable = true)
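If you later need one row per (id, prob) pair, the array can be exploded; a short usage sketch on the a_df built above:
from pyspark.sql.functions import explode

# One output row per element of the "val" array
a_df.select("row_num", explode("val").alias("pair")) \
    .select("row_num", "pair.id", "pair.prob") \
    .show(truncate=False)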
I'm looking for a method to change a PySpark dataframe column type
from
df.printSchema()
To
Thank you in advance for your help.
You have to replace the column with the new schema. ArrayType takes two parameters, elementType and containsNull.
from pyspark.sql.types import *
from pyspark.sql.functions import udf
x = [("a",["b","c","d","e"]),("g",["h","h","d","e"])]
schema = StructType([StructField("key",StringType(), nullable=True),
StructField("values", ArrayType(StringType(), containsNull=False))])
df = spark.createDataFrame(x,schema = schema)
df.printSchema()
# identity UDF whose return type carries the relaxed containsNull=True
new_schema = ArrayType(StringType(), containsNull=True)
udf_foo = udf(lambda x: x, new_schema)
df.withColumn("values", udf_foo("values")).printSchema()
root
|-- key: string (nullable = true)
|-- values: array (nullable = true)
| |-- element: string (containsNull = false)
root
|-- key: string (nullable = true)
|-- values: array (nullable = true)
| |-- element: string (containsNull = true)
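With containsNull=True the column can now actually hold null elements, which is usually the point of relaxing it; a minimal sketch reusing the imports above (the sample data is made up):
# Under the relaxed schema, a null element inside the array is accepted
relaxed_schema = StructType([
    StructField("key", StringType(), nullable=True),
    StructField("values", ArrayType(StringType(), containsNull=True))
])
spark.createDataFrame([("a", ["b", None, "d"])], schema=relaxed_schema).show()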
Here is a useful example where you can change the type of every column, assuming you want the same type for all of them:
from pyspark.sql.types import Row
from pyspark.sql.functions import *
df = sc.parallelize([
Row(isbn=1, count=1, average=10.6666666),
Row(isbn=2, count=1, average=11.1111111)
]).toDF()
df.printSchema()
df = df.select(*[col(x).cast('float') for x in df.columns])
df.printSchema()
outputs:
root
|-- average: double (nullable = true)
|-- count: long (nullable = true)
|-- isbn: long (nullable = true)
root
|-- average: float (nullable = true)
|-- count: float (nullable = true)
|-- isbn: float (nullable = true)
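If the columns should not all share one type, the same select pattern works with a per-column mapping; a minimal sketch (the type_map here is only illustrative):
# Hypothetical per-column target types
type_map = {"isbn": "int", "count": "int", "average": "float"}

df2 = df.select(*[col(c).cast(t).alias(c) for c, t in type_map.items()])
df2.printSchema()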