I am trying to use Facebook Prophet in Spark in a Zeppelin environment, and I have tried to follow the exact steps from https://github.com/facebook/prophet/issues/517. However, I get errors like the one below, and I am simply not sure what I need to correct or how to debug this.
My data contains a datetime feature called ds, the volume I want to predict, y, and a segment column, and I am trying to build a model for each segment.
File"/hadoop14/yarn/nm/usercache/khasbab/appcache/application_1588090646020_2412/container_e168_1588090646020_2412_01_000001/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o3737.showString.
%livycd.pyspark
from pyspark.sql.types import StructType,StructField,StringType,TimestampType,ArrayType,DoubleType
from pyspark.sql.functions import current_date
from pyspark.sql.functions import pandas_udf, PandasUDFType
from fbprophet import Prophet
from datetime import datetime
import pandas as pd
result_schema = StructType([
StructField('segment', StringType(), True),
StructField('ds', TimestampType(), True),
StructField('trend', ArrayType(DoubleType()), True),
StructField('trend_upper', ArrayType(DoubleType()), True),
StructField('trend_lower', ArrayType(DoubleType()), True),
StructField('yearly', ArrayType(DoubleType()), True),
StructField('yearly_upper', ArrayType(DoubleType()), True),
StructField('yearly_lower', ArrayType(DoubleType()), True),
StructField('yhat', ArrayType(DoubleType()), True),
StructField('yhat_upper', ArrayType(DoubleType()), True),
StructField('yhat_lower', ArrayType(DoubleType()), True),
StructField('multiplicative_terms', ArrayType(DoubleType()), True),
StructField('multiplicative_terms_upper', ArrayType(DoubleType()), True),
StructField('multiplicative_terms_lower', ArrayType(DoubleType()), True),
StructField('additive_terms', ArrayType(DoubleType()), True),
StructField('additive_terms_upper', ArrayType(DoubleType()), True),
StructField('additive_terms_lower', ArrayType(DoubleType()), True),
])
@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def forecast_loans(history_pd):
    # instantiate the model, configure the parameters
    model = Prophet(
        interval_width=0.95,
        growth='linear',
        daily_seasonality=False,
        weekly_seasonality=False,
        yearly_seasonality=True,
        seasonality_mode='multiplicative'
    )
    #history_pd['ds'] = pd.to_datetime(history_pd['ds'], errors = 'coerce', format = '%Y-%m-%d')
    #.apply(lambda x: datetime.strptime(x,'%Y-%m-%d'))
    # fit the model
    model.fit(history_pd.loc[:, ['ds', 'y']])
    # configure predictions
    future_pd = model.make_future_dataframe(
        periods=20,
        freq='W')
    # make predictions
    results_pd = model.predict(future_pd)
    # return predictions
    return pd.DataFrame({
        'segment': history_pd['segment'].values[0],
        'ds': [results_pd.loc[:, 'ds'].values.tolist()],
        'trend': [results_pd.loc[:, 'trend'].values.tolist()],
        'trend_upper': [results_pd.loc[:, 'trend_upper'].values.tolist()],
        'trend_lower': [results_pd.loc[:, 'trend_lower'].values.tolist()],
        'yearly': [results_pd.loc[:, 'yearly'].values.tolist()],
        'yearly_upper': [results_pd.loc[:, 'yearly_upper'].values.tolist()],
        'yearly_lower': [results_pd.loc[:, 'yearly_lower'].values.tolist()],
        'yhat': [results_pd.loc[:, 'yhat'].values.tolist()],
        'yhat_upper': [results_pd.loc[:, 'yhat_upper'].values.tolist()],
        'yhat_lower': [results_pd.loc[:, 'yhat_lower'].values.tolist()],
        'multiplicative_terms': [results_pd.loc[:, 'multiplicative_terms'].values.tolist()],
        'multiplicative_terms_upper': [results_pd.loc[:, 'multiplicative_terms_upper'].values.tolist()],
        'multiplicative_terms_lower': [results_pd.loc[:, 'multiplicative_terms_lower'].values.tolist()],
        'additive_terms': [results_pd.loc[:, 'additive_terms'].values.tolist()],
        'additive_terms_upper': [results_pd.loc[:, 'additive_terms_upper'].values.tolist()],
        'additive_terms_lower': [results_pd.loc[:, 'additive_terms_lower'].values.tolist()]
    })
    #return pd.concat([pd.DataFrame(results_pd),pd.DataFrame(history_pd[['segment']].values[0])], axis = 1)
results = df3.groupBy('segment').apply(forecast_loans)
results.show()
I have tweaked my code to the following and downgraded to pyarrow 0.14, as suggested here: Pandas scalar UDF failing, IllegalArgumentException, and it all worked! I believe downgrading pyarrow to 0.14 was the key for Spark 2.x versions, as commented on Stack Overflow.
The comment says the following: "The issue is not with pyarrow's new release, it is spark which has to upgrade and become compatible with pyarrow. (i am afraid we have to wait for spark 3.0 to use the latest pyarrow)"
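As a sanity check after the downgrade, it can help to confirm which PyArrow version the driver and the executors actually see, since a mismatch between the two Python environments brings the same error back. A minimal sketch, assuming the spark session is already available in the Zeppelin notebook:

import pyarrow
print(pyarrow.__version__)  # driver-side version, should report 0.14.x after the downgrade

def worker_pyarrow_version(_):
    # imported inside the function so it runs in the executor's Python environment
    import pyarrow
    return pyarrow.__version__

print(spark.sparkContext.parallelize([0], 1).map(worker_pyarrow_version).collect())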
%livycd.pyspark
from pyspark.sql.types import StructType,StructField,StringType,TimestampType,ArrayType,DoubleType
from pyspark.sql.functions import current_date
from pyspark.sql.functions import pandas_udf, PandasUDFType
from fbprophet import Prophet
from datetime import datetime
import pandas as pd
result_schema = StructType([
StructField('segment', StringType(), True),
StructField('ds', TimestampType(), True),
StructField('trend', DoubleType(), True),
StructField('trend_upper', DoubleType(), True),
StructField('trend_lower', DoubleType(), True),
StructField('yearly', DoubleType(), True),
StructField('yearly_upper', DoubleType(), True),
StructField('yearly_lower', DoubleType(), True),
StructField('yhat', DoubleType(), True),
StructField('yhat_upper', DoubleType(), True),
StructField('yhat_lower', DoubleType(), True),
StructField('multiplicative_terms', DoubleType(), True),
StructField('multiplicative_terms_upper', DoubleType(), True),
StructField('multiplicative_terms_lower', DoubleType(), True),
StructField('additive_terms', DoubleType(), True),
StructField('additive_terms_upper', DoubleType(), True),
StructField('additive_terms_lower', DoubleType(), True),
])
@pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)
def forecast_loans(df):

    def prophet_model(df, test_start_date):
        df['ds'] = pd.to_datetime(df['ds'])
        # train
        ts_train = (df
                    .query('ds < @test_start_date')
                    .sort_values('ds')
                    )
        # test
        ts_test = (df
                   .query('ds >= @test_start_date')
                   .sort_values('ds')
                   .drop('y', axis=1)
                   )
        print(ts_test.columns)
        # instantiate the model, configure the parameters
        model = Prophet(
            interval_width=0.95,
            growth='linear',
            daily_seasonality=False,
            weekly_seasonality=False,
            yearly_seasonality=True,
            seasonality_mode='multiplicative'
        )
        # fit the model
        model.fit(ts_train.loc[:, ['ds', 'y']])
        # configure predictions
        future_pd = model.make_future_dataframe(
            periods=len(ts_test),
            freq='W')
        # make predictions
        results_pd = model.predict(future_pd)
        results_pd = pd.concat([results_pd, df['segment']], axis=1)
        return pd.DataFrame(results_pd, columns=result_schema.fieldNames())

    # return predictions
    return prophet_model(df, test_start_date='2019-03-31')
results = df3.groupBy('segment').apply(forecast_loans)
Assuming you are using Spark 2.3.x or 2.4.x and PyArrow >= 0.15.0, there is a known compatibility issue between PyArrow and Spark.
The simplest solution is to set the environment variable ARROW_PRE_0_15_IPC_FORMAT=1. The Spark documentation recommends setting it in conf/spark-env.sh, but you can set it in your Linux shell, and it is also possible to set it before creating your spark_session in your Python script or shell.
import os
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
spark_session = ...
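In a YARN deployment like the one in the traceback above, the variable also needs to reach the executor Python workers, not just the driver. A minimal sketch, assuming the session is created inside the script (the Spark properties spark.yarn.appMasterEnv.* and spark.executorEnv.* forward environment variables to the application master and to the executors):

import os
from pyspark.sql import SparkSession

os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"  # driver-side Python

spark_session = (
    SparkSession.builder
    # forward the same variable to the YARN application master and to every executor
    .config("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    .getOrCreate()
)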
Alternatively, you can downgrade PyArrow if that is an option for you, as noted in the other answer.
Related
I am new to Python and Spark.
We are using Azure Databricks, with the help of the PySpark code shown below.
data=spark.sql("SELECT 'Name' as name, 'Number' as number FROM Student")
print(data)
This solution will work for you.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import lit, to_json, struct
data2 = [("Finance",10),
("Marketing",20),
("Sales",30),
("IT",40)
]
schema = StructType([ \
StructField("Name",StringType(),True), \
StructField("number", IntegerType(), True) \
])
df = spark.createDataFrame(data=data2,schema=schema)
df1 = df.withColumn("Student",lit("Student")).select("Student",to_json(struct("Name","number")).alias("Data"))
display(df1)
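For reference, with the imports above in place, display(df1) should show a constant Student column next to a JSON string built from each row, roughly:

# Student | Data
# --------+---------------------------------
# Student | {"Name":"Finance","number":10}
# Student | {"Name":"Marketing","number":20}
# Student | {"Name":"Sales","number":30}
# Student | {"Name":"IT","number":40}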
I'm trying to upload a sample PySpark dataframe to Azure Blob, after converting it to Excel format, but I am getting the error below. Also, below is a snippet of my sample code.
If there is another way to do the same, please let me know.
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
import pandas as ps
#%pip install xlwt
#%pip install openpyxl
#%pip install fsspec
my_data = [
("A","1","M",3000),
("B","2","F",4000),
("C","3","M",4000)
]
schema = StructType([ \
StructField("firstname",StringType(),True), \
StructField("id", StringType(), True), \
StructField("gender", StringType(), True), \
StructField("salary", IntegerType(), True) \
])
df = spark.createDataFrame(data=my_data,schema=schema)
pandasDF = df.toPandas()
pandasDF.to_excel("wasbs://blob.paybledvclient1#civblbstr.blob.core.windows.net/output_file.xlsx")
ValueError: Protocol not known: wasbs
You are directly using the Python library pandas to write the data. It does not work this way. You need to first mount the Azure Blob Storage container and then write the data.
To mount, use following command:
dbutils.fs.mount(
source = "wasbs://<container-name>#<storage-account-name>.blob.core.windows.net",
mount_point = "/mnt/<mount-name>",
extra_configs = {"<conf-key>":dbutils.secrets.get(scope = "<scope-name>", key = "<key-name>")})
To write, use the following commands:
(df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("dbfs:/mnt/azurestorage/filename.csv"))
I cannot take any input. I have a schema.
schm = StructType([
StructField("ID", IntegerType(), True),
StructField("fixed_credit_limit", IntegerType(), True),
StructField("credit_limit", IntegerType(), True),
StructField("due_amount", IntegerType(), True),
StructField("created_date", StringType(), True),
StructField("updated_date", StringType(), True),
StructField("agent_name_id",IntegerType(), True),
StructField("rfid_id", StringType(), True),
])
input = [(13158, 100, 100, 0, '05/29/2021 11:01:31', '05/29/2021 11:01:31', 5, '862b4497-577f-47f9-8725-dd6c397ce408')]
df1 = spark.createDataFrame(input, schm)
I want to take user input for agent_name_id, but it gives the error 'list' object is not callable.
How can I take user input for agent_name_id?
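One detail worth noting (my observation, not from the original post): assigning the row list to a variable named input shadows Python's built-in input() function, so a later call to input() raises 'list' object is not callable. A minimal sketch that avoids the shadowing and injects the user-supplied agent_name_id:

# use a name other than `input` so the built-in input() stays callable
rows = [(13158, 100, 100, 0, '05/29/2021 11:01:31', '05/29/2021 11:01:31', 5,
         '862b4497-577f-47f9-8725-dd6c397ce408')]

agent_name_id = int(input("Enter agent_name_id: "))       # read the value from the user
rows = [r[:6] + (agent_name_id,) + r[7:] for r in rows]   # agent_name_id is the 7th field
df1 = spark.createDataFrame(rows, schm)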
I have this schema:
schm = StructType([
StructField("User ID", IntegerType(), True),
StructField("Tag", StringType(), True),
StructField("Activated", StringType(), True),
StructField("Created Date", StringType(), True),
StructField("Updated Date", StringType(), True),
StructField("Valid Until", StringType(), True),
StructField("last used", StringType(), True),
StructField("reference", StringType(), True),
StructField("employee code", IntegerType(), True),
StructField("Unique user ID", IntegerType(), True),
])
schm
I want to build a dataframe with 998 rows, but the problem is I can't understand how to get input data for 998 rows (the same value for all 998 rows):
df = create_df(spark, input_data, schm)
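For what it's worth, repeating a single tuple is one common way to get identical rows; a minimal sketch with made-up values, assuming create_df is just a thin wrapper around spark.createDataFrame:

# one row matching schm, repeated 998 times
row = (1, "TAG-001", "yes", "2021-05-29", "2021-05-29", "2022-05-29",
       "2021-06-01", "ref-001", 1001, 9001)
input_data = [row] * 998

df = spark.createDataFrame(input_data, schm)   # or create_df(spark, input_data, schm)
print(df.count())                              # 998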
I am new to Apache Spark, so forgive me if this is a noob question. I am trying to define a particular schema before reading in the dataset in order to speed up processing. There are a few data types that I am not sure how to define (ArrayType and StructType).
Here is a screenshot of the schema I am working with:
Here is what I have so far:
jsonSchema = StructType([StructField("attribution", ArrayType(), True),
StructField("averagingPeriod", StructType(), True),
StructField("city", StringType(), True),
StructField("coordinates", StructType(), True),
StructField("country", StringType(), True),
StructField("date", StructType(), True),
StructField("location", StringType(), True),
StructField("mobile", BooleanType(), True),
StructField("parameter", StringType(), True),
StructField("sourceName", StringType(), True),
StructField("sourceType", StringType(), True),
StructField("unit", StringType(), True),
StructField("value", DoubleType(), True)
])
My question is: How do I account for the name and url under the attribution column, the unit and value under the averagingPeriod column, etc?
For reference, here is the dataset I am using: https://registry.opendata.aws/openaq/.
Here's an example of array type and struct type. I think it should be straightforward to do this for all other columns.
from pyspark.sql.types import *
jsonSchema = StructType([
StructField("attribution", ArrayType(StructType([StructField("name", StringType()), StructField("url", StringType())])), True),
StructField("averagingPeriod", StructType([StructField("unit", StringType()), StructField("value", DoubleType())]), True),
# ... etc.
])
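Once the remaining nested fields are filled in the same way, the schema can be handed to the reader so Spark skips schema inference, which is where the speed-up mentioned in the question comes from; a sketch, with the S3 path as a placeholder:

df = (spark.read
      .schema(jsonSchema)
      .json("s3a://openaq-fetches/realtime/..."))   # placeholder path to the OpenAQ JSON files
df.printSchema()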