This is the code I have tried for writing the CSV files.
The Spark DataFrame should be written to a CSV file using pandas inside the grouped-map UDF.
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *
import pandas as pd
import os
import csv

df3 = spark.createDataFrame(
    [("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
    ("key", "value1", "value2")
)

schema = StructType([
    StructField("key", StringType()),
    StructField("avg_value1", DoubleType()),
    StructField("avg_value2", DoubleType()),
    StructField("sum_avg", DoubleType()),
    StructField("sub_avg", DoubleType()),
    StructField("result", StringType())
])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def g(df):
    gr = df['key'].iloc[0]
    x = df.value1.mean()
    y = df.value2.mean()
    w = df.value1.mean() + df.value2.mean()
    z = df.value1.mean() - df.value2.mean()
    fileName = '/mnt/test' + gr + '.csv'
    df.to_csv(fileName, sep='\t')
    a = "Saved"
    return pd.DataFrame([[gr] + [x] + [y] + [w] + [z] + [a]])

df3.groupby("key").apply(g).show()
Output:
+---+----------+----------+-------+-------+------+
|key|avg_value1|avg_value2|sum_avg|sub_avg|result|
+---+----------+----------+-------+-------+------+
| a| 0.0| 21.0| 21.0| -21.0| Saved|
| b| 6.5| -1.5| 5.0| 8.0| Saved|
+---+----------+----------+-------+-------+------+
But the CSV files are not getting created.
Any suggestions would be appreciated.
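If this is running on Databricks (an assumption on my part, suggested by the /mnt path), one likely reason the files never show up is that pandas' to_csv goes through the worker's local file API, which only sees DBFS mounts under the /dbfs FUSE prefix; since the UDF still returns "Saved" without failing, the write probably succeeded, just to a path local to the executor rather than to the mounted storage. A minimal sketch of the change inside g(df):

# sketch, assuming Databricks with a DBFS mount under /mnt:
# the local file API used by pandas sees that mount under /dbfs/mnt/...
fileName = '/dbfs/mnt/test' + gr + '.csv'   # instead of '/mnt/test' + gr + '.csv'
df.to_csv(fileName, sep='\t', index=False)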
Related
I have the data below and need to create a line chart with x = Date and y = Count.
The dataframe below was created from another dataframe with this code:
from pyspark.sql.functions import concat, col, lit

df7 = df7.select("*",
                 concat(col("Month"), lit("/"), col("Year")).alias("Date"))
df7.show()
I've imported matplotlib.pyplot as plt and am still getting errors.
I tried the plotting code in a few variations, such as:
df.plot(x = 'Date', y = 'Count')
df.plot(kind = 'line')
I keep getting this error though:
AttributeError: 'DataFrame' object has no attribute 'plt'/'plot'
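The error itself is expected: .plot comes from pandas (via matplotlib), and a Spark DataFrame has no such attribute. For a small aggregated result, the simplest route is a sketch like the following (column names taken from the question):

# sketch: convert the small aggregated Spark DataFrame to pandas, then plot
import matplotlib.pyplot as plt

pdf = df7.toPandas()                        # only safe for a modest number of rows
pdf.plot(x='Date', y='Count', kind='line')
plt.show()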
Please note that using df_pd = df.toPandas() can be expensive: if you deal with a large number of records (on the scale of millions), you might hit an OOM error on a medium Databricks cluster, or your session could crash because the driver runs out of RAM. Long story short, by using toPandas() you are no longer using Spark-based, distributed computation at all. Alternatively, you can follow the approach below.
So let's start with a simple example:
import time
import datetime as dt
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.functions import dayofmonth, dayofweek
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType

dict2 = [("2021-08-11 04:05:06", 10),
         ("2021-08-12 04:15:06", 17),
         ("2021-08-13 09:15:26", 25),
         ("2021-08-14 11:04:06", 68),
         ("2021-08-15 14:55:16", 50),
         ("2021-08-16 04:12:11", 2),
        ]

schema = StructType([
    StructField("timestamp", StringType(), True),
    StructField("count", IntegerType(), True),
])

# create a Spark dataframe
sqlCtx = SQLContext(sc)
sdf = sqlCtx.createDataFrame(data=dict2, schema=schema)
sdf.printSchema()
sdf.show(truncate=False)

# generate date and timestamp columns
new_df = sdf.withColumn('timestamp', F.to_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss").cast(TimestampType())) \
            .withColumn('date', F.to_date("timestamp", "yyyy-MM-dd").cast(DateType())) \
            .select('timestamp', 'date', 'count')
new_df.show(truncate=False)
#root
# |-- timestamp: string (nullable = true)
# |-- count: integer (nullable = true)
#+-------------------+-----+
#|timestamp |count|
#+-------------------+-----+
#|2021-08-11 04:05:06|10 |
#|2021-08-12 04:15:06|17 |
#|2021-08-13 09:15:26|25 |
#|2021-08-14 11:04:06|68 |
#|2021-08-15 14:55:16|50 |
#|2021-08-16 04:12:11|2 |
#+-------------------+-----+
#+-------------------+----------+-----+
#|timestamp |date |count|
#+-------------------+----------+-----+
#|2021-08-11 04:05:06|2021-08-11|10 |
#|2021-08-12 04:15:06|2021-08-12|17 |
#|2021-08-13 09:15:26|2021-08-13|25 |
#|2021-08-14 11:04:06|2021-08-14|68 |
#|2021-08-15 14:55:16|2021-08-15|50 |
#|2021-08-16 04:12:11|2021-08-16|2 |
#+-------------------+----------+-----+
Now you need to collect() the values of the columns you want to plot, since Pandas is not being used; of course, this is expensive and can take a long time on big data, but it works. You can use either of the following approaches:
# for a large number of records
xlabels = new_df.select("timestamp").rdd.flatMap(list).collect()
ylabels = new_df.select("count").rdd.flatMap(list).collect()

# for a limited number of records
xlabels = [val.timestamp for val in new_df.select('timestamp').collect()]
ylabels = [val.count for val in new_df.select('count').collect()]
To plot:
import matplotlib.pyplot as plt
import matplotlib.dates as md
fig, ax = plt.subplots(figsize=(10,6))
plt.plot(xlabels, ylabels, color='blue', label="event's count") #, marker="o"
plt.scatter(xlabels, ylabels, color='cyan', marker='d', s=70)
plt.xticks(rotation=45)
plt.ylabel('Event counts \n# of records', fontsize=15)
plt.xlabel('timestamp', fontsize=15)
plt.title('Events over time', fontsize=15, color='darkred', weight='bold')
plt.legend(['# of records'], loc='upper right')
plt.show()
Based on the comments, I assume that with lots of records the timestamps printed under the x-axis become unreadable, as in the attached picture.
To resolve this, arrange the x-axis ticks with the following approach, so that they are not drawn on top of each other or crammed side by side:
import pandas as pd
import matplotlib.pyplot as plt
x=xlabels
y=ylabels
#Note 1: if you use Pandas dataFrame after .toPandas()
#x=df['timestamp']
#y=df['count']
##Note 2: if you use Pandas dataFrame after .toPandas()
# convert the datetime column to a datetime type and assign it back to the column
#df.timestamp = pd.to_datetime(df.timestamp)
#verify timestamp column type is datetime64[ns] using print(df.info())
fig, ax = plt.subplots( figsize=(12,8))
plt.plot(x, y)
ax.legend(['# of records'])
ax.set_xlabel('Timestamp')
ax.set_ylabel('Event counts \n# of records')
# beautify the x-labels
import matplotlib.dates as md
plt.gcf().autofmt_xdate()
myFmt = md.DateFormatter('%Y-%m-%d %H:%M:%S.%f')
plt.gca().xaxis.set_major_formatter(myFmt)
plt.show()
plt.close()
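As a side note (my addition, not part of the answer above), if the full %Y-%m-%d %H:%M:%S.%f labels are still too crowded, matplotlib can thin and format the date ticks automatically with AutoDateLocator and ConciseDateFormatter:

# sketch: let matplotlib pick a readable number of date ticks
import matplotlib.pyplot as plt
import matplotlib.dates as md

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, y)
locator = md.AutoDateLocator()                         # chooses tick spacing automatically
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(md.ConciseDateFormatter(locator))
fig.autofmt_xdate()
plt.show()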
I am trying to deploy a simple if-else function specifically using pandas_udf.
Here is the code:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

@pandas_udf("string", PandasUDFType.SCALAR)
def seq_sum1(col1, col2):
    if col1 + col2 <= 6:
        v = "low"
    elif (col1 + col2 > 6) & (col1 + col2 <= 10):
        v = "medium"
    else:
        v = "High"
    return v

# Deploy
df.select("*", seq_sum1('c1', 'c2').alias('new_col')).show(10)
This results in an error:
PythonException: An exception was thrown from a UDF: 'ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().', from <command-1220380192863042>, line 13. Full traceback below:
If I deploy the same code using @udf instead of @pandas_udf, it produces the results as expected.
However, pandas_udf doesn't seem to work.
I know that this kind of functionality can be achieved through other means in Spark (case when, etc.); the point here is that I want to understand how pandas_udf works when dealing with this kind of logic.
Thanks
The UDF should take a pandas Series and return a pandas Series, not take and return individual strings.
import pandas as pd
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T

@F.pandas_udf("string", F.PandasUDFType.SCALAR)
def seq_sum1(col1, col2):
    return pd.Series(
        np.where(
            col1 + col2 <= 6, "low",
            np.where(
                (col1 + col2 > 6) & (col1 + col2 <= 10), "medium",
                "high"
            )
        )
    )

df.select("*", seq_sum1('c1', 'c2').alias('new_col')).show()
+---+---+-------+
| c1| c2|new_col|
+---+---+-------+
| 1| 2| low|
| 3| 4| medium|
| 5| 6| high|
+---+---+-------+
@mck provided the insight, and I ended up using the map function to solve it.
#pandas_udf("string", PandasUDFType.SCALAR)
def seq_sum(col1):
# actual function/calculation goes here
def main(x):
if x < 6:
v = "low"
else:
v = "high"
return(v)
# now apply map function, returning a panda series
result = pd.Series(map(main,col1))
return (result)
df.select("*",seq_sum('column_name').alias('new_col')).show(10)
So I have a large dataset (about 1 TB+) on which I have to do many operations, and I have thought of using PySpark for faster processing. Here are my imports:
import numpy as np
import pandas as pd

try:
    import pyspark
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession, SQLContext
except ImportError as e:
    raise ImportError('PySpark is not Configured')

print(f"PySpark Version : {pyspark.__version__}")

# Creating a Spark-Context
sc = SparkContext.getOrCreate(SparkConf().setMaster('local[*]'))

# Spark Builder
spark = SparkSession.builder \
    .appName('MBLSRProcessor') \
    .config('spark.executor.memory', '10GB') \
    .getOrCreate()

# SQL Context - for SQL Query Executions
sqlContext = SQLContext(sc)
>> PySpark Version : 2.4.7
Now, I want to apply the log10 function to two columns. For demonstration, please consider this data:
data = spark.createDataFrame(pd.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "B": [4, 3, 6, 1, 8]
}))
data.head(5)
>> [Row(A=1, B=4), Row(A=2, B=3), Row(A=3, B=6), Row(A=4, B=1), Row(A=5, B=8)]
This is what I require: log10(A + B), e.g. log10(6 + 4) = 1, for which I have made a function like this:
def add(a, b):
    # this is for demonstration
    return np.sum([a, b])

data = data.withColumn("ADD", add(data.A, data.B))
data.head(5)
>> [Row(A=1, B=4, ADD=5), Row(A=2, B=3, ADD=5), Row(A=3, B=6, ADD=9), Row(A=4, B=1, ADD=5), Row(A=5, B=8, ADD=13)]
But, I can't do the same for np.log10:
def np_log(a, b):
    # actual function
    return np.log10(np.sum([a, b]))

data = data.withColumn("LOG", np_log(data.A, data.B))
data.head(5)
TypeError Traceback (most recent call last)
<ipython-input-13-a5726b6c7dc2> in <module>
----> 1 data = data.withColumn("LOG", np_log(data.A, data.B))
2 data.head(5)
<ipython-input-12-0e020707faae> in np_log(a, b)
1 def np_log(a, b):
----> 2 return np.log10(np.sum([a, b]))
TypeError: loop of ufunc does not support argument 0 of type Column which has no callable log10 method
The best way to do this is to use native Spark functions:
import pyspark.sql.functions as F
import pandas as pd

data = spark.createDataFrame(pd.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "B": [4, 3, 6, 1, 8]
}))
data = data.withColumn("LOG", F.log10(F.col('A') + F.col('B')))
But if you want, you can also use a UDF:
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
import numpy as np
import pandas as pd

data = spark.createDataFrame(pd.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "B": [4, 3, 6, 1, 8]
}))

def udf_np_log(a, b):
    # actual function
    return float(np.log10(np.sum([a, b])))

np_log = F.udf(udf_np_log, FloatType())
data = data.withColumn("LOG", np_log(data.A, data.B))
+---+---+---------+
| A| B| LOG|
+---+---+---------+
| 1| 4| 0.69897|
| 2| 3| 0.69897|
| 3| 6|0.9542425|
| 4| 1| 0.69897|
| 5| 8|1.1139433|
+---+---+---------+
Interestingly, it works for np.sum without a UDF because np.sum just ends up calling the + operator, which is valid for Spark DataFrame columns.
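To make that last point concrete, here is a small sketch (my addition) showing that np.sum over the two columns reduces to a plain Column expression, while the np.log10 ufunc has nothing to call on a Column, which is exactly what the traceback says:

# sketch: np.sum([a, b]) reduces with "+", which pyspark Columns overload,
# so the result is still a native Column expression
import numpy as np
import pyspark.sql.functions as F

expr = np.sum([data.A, data.B])
print(expr)                                   # prints something like Column<'(A + B)'>

# np.log10, by contrast, looks for a log10 method on its argument, which
# Column does not have, hence the "no callable log10 method" error;
# the native F.log10 shown above is the replacement
data.withColumn("LOG", F.log10(data.A + data.B)).show()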
I am trying to create a PySpark DataFrame from a pandas DataFrame.
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
a_dict = {0: [(0, 9.821), (1, 82.185)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([ StructField("row_num", LongType(), True),StructField("val", list(a_str), True)]) # error
a_df = spark.createDataFrame(a_pd, schema=my_schema)
error:
AssertionError: dataType [StructField(id,IntegerType,true), StructField(prob,DoubleType,true)] should be an instance of <class 'pyspark.sql.types.DataType'>
How do I define a valid schema for a list of (int, double) tuples so that it can be understood by PySpark?
Thanks
For a list of values, you have to use ArrayType. Below is your code reproduced with examples.
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, ArrayType

a_dict = {0: [(0, 9.821), (1, 82.185)],
          1: [(0, 9.821), (1, 8.10), (3, 2.385)],
          2: [(0, 9.821), (1, 1.4485), (4, 5.15), (5, 6.104)]}

a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
print(a_pd.head())

a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([StructField("row_num", LongType(), True), StructField("val", ArrayType(a_str), True)])  # ArrayType(a_str) instead of list(a_str)
a_df = sqlContext.createDataFrame(a_pd, schema=my_schema)
print(a_df.show(truncate=False))
print(a_df.printSchema())
Output:
+-------+------------------------------------------------+
|row_num|val |
+-------+------------------------------------------------+
|0 |[[0, 9.821], [1, 82.185]] |
|1 |[[0, 9.821], [1, 8.1], [3, 2.385]] |
|2 |[[0, 9.821], [1, 1.4485], [4, 5.15], [5, 6.104]]|
+-------+------------------------------------------------+
root
|-- row_num: long (nullable = true)
|-- val: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- prob: double (nullable = true)
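One usage note I'll add (not part of the original answer): with val stored as an array of structs, the nested fields can be flattened with explode when you need one row per (id, prob) pair:

# sketch: explode the array-of-struct column into one row per element
import pyspark.sql.functions as F

(a_df
 .select("row_num", F.explode("val").alias("v"))
 .select("row_num", F.col("v.id").alias("id"), F.col("v.prob").alias("prob"))
 .show(truncate=False))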
How can a function be applied to a pandas groupby when it requires parameters from multiple columns of the grouped dataframe and returns two scalar values?
Below is a reproducible example. The last line fetches the f_value and p_value for a single subset.
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
import plotly.express as px

n = 100
df = pd.DataFrame({
    'c': np.random.choice(['CATS', 'DOGS'], n),
    'x': np.random.choice(list('ABCDE'), n),
    'y': np.random.normal(5, 1, n)
})
signal = np.where(df['c'].eq('CATS') & df['x'].eq('A'), 1.1, 0)
df['y'] = df['y'] + signal

def get_ols_fp(df, x, y):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return (f_value, p_value)

# getting f_value and p_value works with a single series.
get_ols_fp(df[df['c'].eq('CATS')], 'x', 'y')
The code above works and fetches the f_value and the p_value. However, the following does not work:
# how could we run the get_ols with a groupby().agg()
df.groupby('c').agg(get_ols_fp('x', 'y'))
The desired output would be a dataframe with one row per level of the 'c' variable ('CATS' and 'DOGS' in this case), one column for the p_value, and another for the f_value.
This works:
def get_ols_fp(df, x=None, y=None):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return pd.Series([f_value, p_value], index=['f_value', 'p_value'])

df.groupby('c').apply(get_ols_fp, x='x', y='y')
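If you want 'c' as an ordinary column rather than the index (which is how I read the desired output), a reset_index() on that result gives exactly one row per level with the two value columns; a small sketch:

# sketch: move the group key out of the index
result = df.groupby('c').apply(get_ols_fp, x='x', y='y').reset_index()
print(result)   # columns: c, f_value, p_value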
I'd do it a little differently.
I don't know if it's the easiest way, but it works.
Example:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols

n = 100
df = pd.DataFrame({
    'c': np.random.choice(['CATS', 'DOGS'], n),
    'x': np.random.choice(list('ABCDE'), n),
    'y': np.random.normal(5, 1, n)
})
signal = np.where(df['c'].eq('CATS') & df['x'].eq('A'), 1.1, 0)
df['y'] = df['y'] + signal

def get_ols_fp(df, x, y):
    formula = y + '~' + x
    model = ols(formula, df).fit()
    f_value = model.fvalue
    p_value = model.f_pvalue
    return (f_value, p_value)

# getting f_value and p_value works with a single series.
# get_ols_fp(df[df['c'].eq('CATS')], 'x', 'y')

df_result = pd.DataFrame([], columns=["c", "f_value", "p_value"])
for c, dd in df.groupby(['c']):
    v = get_ols_fp(dd, 'x', 'y')
    df_result.loc[len(df_result)] = [c, *v]

df_result