Using the numpy busday_count function in PySpark

Given that I have two columns in PySpark that are not null:
df2 = sqlContext.sql("SELECT AssignedDate,primaryid from parts_so where AssignedDate is not null ")
df2 = df2.withColumn('sdate', df2.AssignedDate)
df2 = df2.withColumn('edate', current_timestamp())
When I run
df2 = df2.withColumn('days_outstanding', lit(get_hours2(df2.sdate, df2.edate)))
with
def get_hours2(sdate, edate):
    biz_days = np.busday_count(sdate, edate)
    return biz_days
I get the following error:
object of type 'Column' has no len()

First of all, let's create some random data:
import datetime
import random
import pandas as pd
import numpy as np
sdate = [datetime.datetime.now() + datetime.timedelta(i) for i in range(5)]
edate = [date + datetime.timedelta(random.random()+3) for date in sdate]
data = {
    'sdate': sdate,
    'edate': edate
}
pdf = pd.DataFrame(data)
df = spark.createDataFrame(pdf)
df.show()
+--------------------+--------------------+
| edate| sdate|
+--------------------+--------------------+
|2019-12-06 22:55:...|2019-12-03 08:14:...|
|2019-12-07 19:42:...|2019-12-04 08:14:...|
|2019-12-08 21:26:...|2019-12-05 08:14:...|
|2019-12-09 18:57:...|2019-12-06 08:14:...|
|2019-12-11 04:08:...|2019-12-07 08:14:...|
+--------------------+--------------------+
You cannot use a bare Python function to create another column in PySpark; you have to wrap it in a UDF.
NOTE: Remember to cast the result of the computation to int, because otherwise you may run into problems pickling NumPy types.
import pyspark.sql.types as T
import pyspark.sql.functions as F
@F.udf(returnType=T.IntegerType())
def get_hours2(sdate, edate):
    biz_days = np.busday_count(sdate, edate)
    return int(biz_days)
Finally, we can use the UDF on the created DataFrame.
df = df.withColumn('days_outstanding', get_hours2('sdate', 'edate'))
df.show()
+--------------------+--------------------+----------------+
| edate| sdate|days_outstanding|
+--------------------+--------------------+----------------+
|2019-12-06 22:55:...|2019-12-03 08:14:...| 3|
|2019-12-07 19:42:...|2019-12-04 08:14:...| 3|
|2019-12-08 21:26:...|2019-12-05 08:14:...| 2|
|2019-12-09 18:57:...|2019-12-06 08:14:...| 1|
|2019-12-11 04:08:...|2019-12-07 08:14:...| 2|
+--------------------+--------------------+----------------+
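If the per-row UDF ever becomes a bottleneck, a vectorized pandas UDF is an option. This is only a sketch, assuming Spark 2.3+ with PyArrow installed (the type-hint style shown is the Spark 3 idiom); note that np.busday_count only accepts day-precision dates, so the timestamp columns are truncated to datetime64[D] first:
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.types as T

@F.pandas_udf(T.LongType())
def get_busdays(sdate: pd.Series, edate: pd.Series) -> pd.Series:
    # np.busday_count needs datetime64[D], so truncate the timestamps to days
    start = sdate.values.astype('datetime64[D]')
    end = edate.values.astype('datetime64[D]')
    return pd.Series(np.busday_count(start, end))

df = df.withColumn('days_outstanding', get_busdays('sdate', 'edate'))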
I hope this helps you.

Related

Perform groupBy with array of strings - Scala

I have an array of strings defined in a variable which contains the names of the columns. I would like to perform a group by and get the count.
I am trying the code below, but it throws an error.
val keys = Array("Col1", "Col2")
val grouppedByDf = myDf.groupBy(keys.mkString(",").count
Can you please guide me on what I am doing wrong here?
groupBy accepts a sequence of Columns, so map each name to a Column and expand the array with : _* instead of concatenating the names into a single string:
import org.apache.spark.sql.functions.col
import spark.implicits._

val df = Seq(("βήτα", "άλφα", 20), ("άλφα", "βήτα", 10), ("άλφα", "βήτα", 20), ("βήτα", "άλφα", 10)).toDF("α", "β", "ω")
val keys = Array("α", "β")

df
  .groupBy(keys.map(col(_)): _*)
  .count()
  .show()
+----+----+-----+
| α| β|count|
+----+----+-----+
|βήτα|άλφα| 2|
|άλφα|βήτα| 2|
+----+----+-----+
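Since the rest of this collection is PySpark, for reference the same idea in PySpark (a quick sketch, assuming a DataFrame df with the same columns) is simply to unpack the list of names into groupBy:
from pyspark.sql import functions as F

keys = ["α", "β"]
df.groupBy(*keys).count().show()
# or, mirroring the Scala version, map the names to Columns first
df.groupBy(*[F.col(k) for k in keys]).count().show()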

How to compute hypotenuses with a pandas udf in pyspark

I want to write a pandas UDF which will take two arguments, cathetus1 and cathetus2, from another dataframe and return the hypotenuse.
# this data is a list containing the cathetuses
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1",DoubleType(),True),StructField("cathetus2",DoubleType(),True)])
df = spark.createDataFrame(data=data,schema=schema)
df.show()
# this creates a dataframe that shows only the cathetuses
This is the function I have written so far:
def pandaUdf(cat1, cat2):
    leg1 = []
    leg2 = []
    for i in data:
        x = 0
        leg1.append(i[x])
        leg2.append(i[x+1])
    hypoData.append(np.hypot(leg1, leg2))
    return np.hypot(leg1, leg2)
#example_series = pd.Series(data)
and I'm trying to create a new column in df whose values will be the hypotenuses:
df.withColumn(col('Hypo'), pandaUdf(example_df.cathetus1, example_df.cathetus2)).show()
but this gives me an error --> col should be Column.
I don't understand how to fix this error or why it's there in the first place.
You can apply np.hypot to the two cathetus columns directly, without extracting the individual values.
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.types import *
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1",DoubleType(),True),StructField("cathetus2",DoubleType(),True)])
df = spark.createDataFrame(data=data,schema=schema)
df.show()
"""
+---------+---------+
|cathetus1|cathetus2|
+---------+---------+
| 3.0| 4.0|
| 6.0| 8.0|
| 3.3| 5.6|
+---------+---------+
"""
def hypot(cat1: pd.Series, cat2: pd.Series) -> pd.Series:
    return np.hypot(cat1, cat2)
hypot_pandas_df = F.pandas_udf(hypot, returnType=FloatType())
df.withColumn("Hypo", hypot_pandas_df("cathetus1", "cathetus2")).show()
"""
+---------+---------+----+
|cathetus1|cathetus2|Hypo|
+---------+---------+----+
| 3.0| 4.0| 5.0|
| 6.0| 8.0|10.0|
| 3.3| 5.6| 6.5|
+---------+---------+----+
"""

Cast a date to an integer in pyspark

Is it possible to convert a date column to an integer column in a pyspark dataframe? I tried 2 different ways, but every attempt returns a column with nulls. What am I missing?
from pyspark.sql.types import *
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
# ATTEMPT 1 with cast()
df=df.withColumn("jobStartDateAsInteger1", df['jobStartDate'].cast(IntegerType()))
# ATTEMPT 2 with selectExpr()
df=df.selectExpr("*","CAST(jobStartDate as int) as jobStartDateAsInteger2")
df.show()
A direct cast from date to int comes back as null (as you saw), but you can convert the date to a UNIX timestamp using F.unix_timestamp():
from pyspark.sql.types import *
import pyspark.sql.functions as F
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
df=df.withColumn("jobStartDateAsInteger1", F.unix_timestamp(df['jobStartDate']))
df.show()
+---------+---+------------+-----------+------+------+----------------------+
|firstname|age|jobStartDate|isGraduated|gender|salary|jobStartDateAsInteger1|
+---------+---+------------+-----------+------+------+----------------------+
| James| 34| 2006-01-01| true| M|3000.6| 1136073600|
| Michael| 33| 1980-01-10| true| F|3300.8| 316310400|
| Robert| 37| 1992-07-01| false| M|5000.5| 709948800|
+---------+---+------------+-----------+------+------+----------------------+
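If what you actually want is an integer number of days rather than seconds, a small variant (just a sketch, using the Unix epoch as the reference date; the column name jobStartDateAsDays is illustrative) is F.datediff:
df = df.withColumn("jobStartDateAsDays", F.datediff(df['jobStartDate'], F.lit('1970-01-01')))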

pandas_udf error (applyInPandas) stating tuple format

I have the following Spark DataFrame:
+-------------------+-------+---------+-------+---------+---------------+
| Time| Close| Volume| Open|Num_Ticks| Dollar Volume|
+-------------------+-------+---------+-------+---------+---------------+
|2015-06-01 00:00:00|2109.25|1337694.0| 2109.5| 1.0| 2.8215310695E9|
|2015-06-02 00:00:00|2106.75|1442673.0| 2106.5| 1.0|3.03935134275E9|
|2015-06-03 00:00:00| 2116.0|1310989.0|2116.25| 1.0| 2.774052724E9|
|2015-06-04 00:00:00| 2099.0|1716475.0| 2099.0| 1.0| 3.602881025E9|
|2015-06-05 00:00:00|2092.25|1459933.0| 2092.0| 1.0|3.05454481925E9|
|2015-06-08 00:00:00|2078.25|1290580.0| 2079.0| 1.0| 2.682147885E9|
|2015-06-09 00:00:00| 2080.0|1446234.0| 2080.5| 1.0| 3.00816672E9|
|2015-06-10 00:00:00| 2107.0|1664080.0| 2106.0| 1.0| 3.50621656E9|
|2015-06-11 00:00:00|2109.25|1480391.0|2109.25| 1.0|3.12251471675E9|
|2015-06-12 00:00:00| 2093.0|1130566.0| 2094.0| 1.0| 2.366274638E9|
|2015-06-15 00:00:00| 2084.0|1077154.0|2083.75| 1.0| 2.244788936E9|
|2015-06-16 00:00:00| 2097.5| 790233.0|2097.25| 1.0| 1.6575137175E9|
|2015-06-17 00:00:00|2089.25|1577521.0|2088.75| 1.0|3.29583574925E9|
|2015-06-18 00:00:00|2114.75|1899198.0| 2114.0| 1.0| 4.0163289705E9|
|2015-06-19 00:00:00|2097.75|1236103.0|2097.75| 1.0|2.59303506825E9|
|2015-06-22 00:00:00|2112.75|1095590.0|2113.25| 1.0| 2.3147077725E9|
|2015-06-23 00:00:00| 2116.5| 835219.0| 2117.0| 1.0| 1.7677410135E9|
|2015-06-24 00:00:00| 2099.5|1153248.0| 2099.5| 1.0| 2.421244176E9|
|2015-06-25 00:00:00| 2094.0|1213961.0| 2094.0| 1.0| 2.542034334E9|
|2015-06-26 00:00:00|2095.75|1318744.0|2095.75| 1.0| 2.763757738E9|
+-------------------+-------+---------+-------+---------+---------------+
root
|-- Time: timestamp (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: double (nullable = true)
|-- Open: double (nullable = true)
|-- Num_Ticks: double (nullable = true)
|-- Dollar Volume: double (nullable = true)
and I am applying the following function:
def getDailyVol(pdf, span0=100):
    pdf = pdf.set_index('Time')  # Line #1
    close = pdf.Close
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))  # Line #2
    df0 = df0[df0 > 0]
    df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
    df0 = df0.ewm(span=span0).std()
    return df0
daily_vol = es_dbars.groupBy().applyInPandas(
    getDailyVol, schema='Time timestamp, Close double'
)
However, when I try to show the results it throws the error 'tuple' object has no attribute 'set_index', and if I comment out line #1 it throws an error at line #2 saying that searchsorted cannot be applied, even though I have already loaded all of the following libraries:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import databricks.koalas as ks
import os, shutil
import numpy as np
import matplotlib.pyplot as pl
spark = SparkSession.builder \
.master('local[3]') \
.appName('chapter3') \
.config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config('spark.executor.memory','6gb') \
.getOrCreate()
sc = spark.sparkContext
spark.sql("set spark.sql.shuffle.partitions = 3")
If someone can help me notice what I am doing incorrectly, or tell me if I need to do something else, I would appreciate it.
Edit: I am providing the dataset in a reproducible form as follows:
,Time,Close,Volume,Open,Num_Ticks,Dollar Volume
0,2015-06-01,2109.25,1337694.0,2109.5,1.0,2821531069.5
1,2015-06-02,2106.75,1442673.0,2106.5,1.0,3039351342.75
2,2015-06-03,2116.0,1310989.0,2116.25,1.0,2774052724.0
3,2015-06-04,2099.0,1716475.0,2099.0,1.0,3602881025.0
4,2015-06-05,2092.25,1459933.0,2092.0,1.0,3054544819.25
5,2015-06-08,2078.25,1290580.0,2079.0,1.0,2682147885.0
6,2015-06-09,2080.0,1446234.0,2080.5,1.0,3008166720.0
7,2015-06-10,2107.0,1664080.0,2106.0,1.0,3506216560.0
8,2015-06-11,2109.25,1480391.0,2109.25,1.0,3122514716.75
9,2015-06-12,2093.0,1130566.0,2094.0,1.0,2366274638.0
If you define your apply function (getDailyVol in your case) with two arguments, it won't work as you expect, because PySpark then interprets the first argument as the grouping keys (a tuple) and the second as pdf (a pd.DataFrame). You can read this in the documentation.
To fix this, you would have to define your function as follows:
def getDailyVol(pdf):
    pdf = pdf.set_index('Time')
    close = pdf.Close
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    df0 = df0[df0 > 0]
    df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
    df0 = df0.ewm(span=100).std()
    return df0
Alternatively, you could rewrite it to:
def getDailyVol(df, span0):
    def apply_function(pdf):
        pdf = pdf.set_index('Time')
        close = pdf.Close
        df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
        df0 = df0[df0 > 0]
        df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
        df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
        df0 = df0.ewm(span=span0).std()
        return df0
    return df.groupBy().applyInPandas(apply_function, schema='Time timestamp, Close double')

daily_vol = getDailyVol(df=es_dbars, span0=100)
This would then work for an arbitrary span0. I can't verify that it works 100%, since your example is quite hard to reproduce on my local machine, but I hope this helps.
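As noted above, applyInPandas also accepts a function that takes the grouping keys explicitly, so a third option is to accept the keys tuple as the first parameter. This is again only a sketch (span0 kept fixed at 100); the reset_index at the end converts the resulting Series into the pandas DataFrame shape that applyInPandas expects:
def getDailyVol(keys, pdf):
    # keys is the (empty) tuple of grouping keys; pdf is the pandas DataFrame for the group
    pdf = pdf.set_index('Time')
    close = pdf.Close
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    df0 = df0[df0 > 0]
    df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
    df0 = df0.ewm(span=100).std()
    # applyInPandas expects a pandas DataFrame matching the schema, not a Series
    return df0.reset_index(name='Close')

daily_vol = es_dbars.groupBy().applyInPandas(getDailyVol, schema='Time timestamp, Close double')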

RDD to pandas dataframe

I am trying to convert a Spark RDD in the format below to a pandas dataframe.
['f1\tf2\tf3\tf4\tf5','4.0\tNULL\t183.0\t190.0\tMARRIED']
When I execute the code below, line 3 gives the error: "Error while instantiating 'org.apache.spark.sql.hive.HiveSessionState':"
sparkDF = data.map(lambda x: str(x))
sparkDF2 = sparkDF.map(lambda w: w.split('\t'))
sparkDF3 = sparkDF2.toDF()
Any suggestions would be appreciated!
In pandas:
x = ['f1\tf2\tf3\tf4\tf5', '4.0\tNULL\t183.0\t190.0\tMARRIED']
rdd = sc.parallelize(x)
#rows = [l.split('\t') for l in ','.join(rdd.collect()).split(',')]
#or
rows = rdd.map(lambda x: x.split("\t")).collect()
import pandas as pnd
pnd.DataFrame(rows)
In Spark:
x = ['f1\tf2\tf3\tf4\tf5', '4.0\tNULL\t183.0\t190.0\tMARRIED']
rdd = sc.parallelize(x)
df = rdd.map(lambda x: x.split("\t")).toDF()
df.show()
+---+----+-----+-----+-------+
| _1| _2| _3| _4| _5|
+---+----+-----+-----+-------+
| f1| f2| f3| f4| f5|
|4.0|NULL|183.0|190.0|MARRIED|
+---+----+-----+-----+-------+
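Note that in both variants the header line ('f1' … 'f5') ends up as an ordinary data row. One way to promote it to column names and finish the conversion to pandas (a sketch, reusing the rdd from above) is:
header_line = rdd.first()
columns = header_line.split('\t')
df = (rdd.filter(lambda line: line != header_line)  # drop the header row
         .map(lambda line: line.split('\t'))
         .toDF(columns))
pdf = df.toPandas()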