cast a date to integer pyspark - dataframe

Is it possible to convert a date column to an integer column in a PySpark dataframe? I tried two different ways, but every attempt returns a column of nulls. What am I missing?
from pyspark.sql.types import *
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
# ATTEMPT 1 with cast()
df=df.withColumn("jobStartDateAsInteger1", df['jobStartDate'].cast(IntegerType()))
# ATTEMPT 2 with selectExpr()
df=df.selectExpr("*","CAST(jobStartDate as int) as jobStartDateAsInteger2")
df.show()

A DateType column cannot be cast directly to IntegerType, which is why both attempts return nulls instead of failing. You can instead convert the date to a UNIX timestamp (seconds since 1970-01-01) using F.unix_timestamp():
from pyspark.sql.types import *
import pyspark.sql.functions as F
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
df=df.withColumn("jobStartDateAsInteger1", F.unix_timestamp(df['jobStartDate']))
df.show()
+---------+---+------------+-----------+------+------+----------------------+
|firstname|age|jobStartDate|isGraduated|gender|salary|jobStartDateAsInteger1|
+---------+---+------------+-----------+------+------+----------------------+
| James| 34| 2006-01-01| true| M|3000.6| 1136073600|
| Michael| 33| 1980-01-10| true| F|3300.8| 316310400|
| Robert| 37| 1992-07-01| false| M|5000.5| 709948800|
+---------+---+------------+-----------+------+------+----------------------+
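If you want an integer number of days rather than seconds, a minimal alternative sketch (reusing the same df as above) is F.datediff against the epoch date:
df = df.withColumn("jobStartDateAsDays",
                   F.datediff(F.col("jobStartDate"), F.lit("1970-01-01")))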

Related

Read text file using information in separate dataframe

I have a fixed-width file as below
00120181120xyz12341
00220180203abc56792
00320181203pqr25483
And a corresponding dataframe that specifies the schema, i.e. the column name (_Name) and the column width (_Length):
How can I use PySpark to get the text file dataframe as follows:
#+---+-----+---+
#| C1|   C2| C3|
#+---+-----+---+
#|  0|02018| 11|
#|  0|02018| 02|
#|  0|02018| 12|
#+---+-----+---+
You could:
- collect your column names and lengths;
- use them to build a list of substring indexes for the string parts you need;
- use that list of substring indexes to extract the string parts from every row.
Input:
rdd_data = spark.sparkContext.textFile(r'C:\Temp\sample.txt')
df_lengths = spark.createDataFrame([("1", "C1"), ("5", "C2"), ("2", "C3")], ["_Length", "_Name"])
Script:
from pyspark.sql import Row

lengths = df_lengths.collect()
ranges = [[0, 0]]
for x in lengths:
    ranges.append([ranges[-1][-1], ranges[-1][-1] + int(x["_Length"])])

Cols = Row(*[r["_Name"] for r in lengths])
df = rdd_data.map(lambda x: Cols(*[x[r[0]:r[1]] for r in ranges[1:]])).toDF()
df.show()
# +---+-----+---+
# | C1| C2| C3|
# +---+-----+---+
# | 0|01201| 81|
# | 0|02201| 80|
# | 0|03201| 81|
# +---+-----+---+
Something like this is also possible using only the DataFrame API, provided you have a column you can use inside orderBy for the window function (here the column names themselves, since they happen to sort in file order).
from pyspark.sql import functions as F, Window as W
df_data = spark.read.csv(r"C:\Temp\sample.txt")
df_lengths = spark.createDataFrame([("1", "C1"), ("5", "C2"), ("2", "C3")], ["_Length", "_Name"])
sum_col = F.sum("_Length").over(W.orderBy("_Name")) + 1
df_lengths = (df_lengths
    .withColumn("_Len", F.array((sum_col - F.col("_Length")).cast('int'), "_Length"))
    .groupBy().pivot("_Name").agg(F.first("_Len"))
)
df_data = df_data.select(
    [F.substring("_c0", int(c[0]), int(c[1])) for c in df_lengths.head()]
).toDF(*df_lengths.columns)
df_data.show()
# +---+-----+---+
# | C1| C2| C3|
# +---+-----+---+
# | 0|01201| 81|
# | 0|02201| 80|
# | 0|03201| 81|
# +---+-----+---+
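If the lengths dataframe is small enough to collect, a simpler hedged sketch is to compute the 1-based offsets on the driver in plain Python (starting again from the df_lengths defined in the Input section, and assuming the names sort in file order):
offsets, start = [], 1  # F.substring is 1-based
for row in df_lengths.sort("_Name").collect():
    offsets.append((row["_Name"], start, int(row["_Length"])))
    start += int(row["_Length"])
df_data.select([F.substring("_c0", s, l).alias(n) for n, s, l in offsets]).show()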

How to compute hypotenuses with a pandas udf, pyspark

I want to write a pandas UDF that takes two arguments, cathetus1 and cathetus2, from another dataframe and returns the hypotenuse.
# this data is a list of cathetus pairs.
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1", DoubleType(), True),
                     StructField("cathetus2", DoubleType(), True)])
df = spark.createDataFrame(data=data, schema=schema)
df.show()
# this creates a dataframe showing only the cathetuses.
This is the function I have written so far:
def pandaUdf(cat1, cat2):
    leg1 = []
    leg2 = []
    for i in data:
        x = 0
        leg1.append(i[x])
        leg2.append(i[x+1])
    hypoData.append(np.hypot(leg1, leg2))
    return np.hypot(leg1, leg2)
    #example_series = pd.Series(data)
and I'm trying to create a new column in df whose values will be the hypotenuses:
df.withColumn(col('Hypo'), pandaUdf(example_df.cathetus1, example_df.cathetus2)).show()
but this gives me an error --> col should be Column.
I don't understand how I can fix this error or why it's even there.
You can apply np.hypot on the two cathetus columns directly, without extracting individual values.
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.types import *

data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1", DoubleType(), True),
                     StructField("cathetus2", DoubleType(), True)])
df = spark.createDataFrame(data=data, schema=schema)
df.show()
"""
+---------+---------+
|cathetus1|cathetus2|
+---------+---------+
| 3.0| 4.0|
| 6.0| 8.0|
| 3.3| 5.6|
+---------+---------+
"""
def hypot(cat1: pd.Series, cat2: pd.Series) -> pd.Series:
    return np.hypot(cat1, cat2)

hypot_pandas_df = F.pandas_udf(hypot, returnType=FloatType())
df.withColumn("Hypo", hypot_pandas_df("cathetus1", "cathetus2")).show()
"""
+---------+---------+----+
|cathetus1|cathetus2|Hypo|
+---------+---------+----+
| 3.0| 4.0| 5.0|
| 6.0| 8.0|10.0|
| 3.3| 5.6| 6.5|
+---------+---------+----+
"""

How to yield pandas dataframe rows to spark dataframe

Hi, I'm writing a transformation. I created a some_function(iter) generator to yield Row(id=index, api=row['api'], A=row['A'], B=row['B']), i.e. to yield transformed rows from a pandas dataframe to an RDD and then to a Spark dataframe, but I'm getting errors. (I must use pandas to transform the data, as there is a large amount of legacy code.)
Input Spark DataFrame
respond_sdf.show()
+-------------------------------------------------------------------+
|content |
+-------------------------------------------------------------------+
|{'api': ['api_1', 'api_1', 'api_1'],'A': [1,2,3], 'B': [4,5,6] } |
|{'api': ['api_2', 'api_2', 'api_2'],'A': [7,8,9], 'B': [10,11,12] }|
+-------------------------------------------------------------------+
Expected Spark Dataframe after transformation
transform_df.show()
+-----+---+---+
|  api|  A|  B|
+-----+---+---+
|api_1|  1|  4|
|api_1|  2|  5|
|api_1|  3|  6|
|api_2|  7| 10|
|api_2|  8| 11|
|api_2|  9| 12|
+-----+---+---+
Minimum example code
#### IMPORT PYSPARK ###
import pandas as pd
import pyspark
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType,StringType
spark = pyspark.sql.SparkSession.builder.appName("test") \
    .master('local[*]') \
    .getOrCreate()
sc = spark.sparkContext
####### INPUT DATAFRAME WITH LIST OF JSONS ########################
rdd_list = [["{'api': ['api_1', 'api_1', 'api_1'],'A': [1,2,3], 'B': [4,5,6] }"],
            ["{'api': ['api_2', 'api_2', 'api_2'],'A': [7,8,9], 'B': [10,11,12] }"]]
schema = StructType([StructField('content', StringType(), True)])
jsons = sc.parallelize(rdd_list)
respond_sdf = spark.createDataFrame(jsons, schema)
respond_sdf.show(truncate=False)
####### TRANSFORMATION DATAFRAME ########################
# Pandas transformation function returning a pandas dataframe
def pandas_function(url_json):
    # Complex pandas transformation
    url = url_json[0]
    json = url_json[1]
    df = pd.DataFrame(eval(json))
    return df

# Generator yielding Rows from a pandas dataframe
def some_function(iter):
    # Pandas generator
    pandas_df = pandas_function(iter)
    for index, row in pandas_df.iterrows():
        ## ERROR COMES FROM THIS ROW
        yield Row(id=index, api=row['api'], A=row['A'], B=row['B'])

# Creating the transformed Spark dataframe
schema = StructType([
    StructField('API', StringType(), True),
    StructField('A', IntegerType(), True),
    StructField('B', IntegerType(), True)
])
rdd = respond_sdf.rdd.map(lambda x: some_function(x))
transform_df = spark.createDataFrame(rdd, schema)
transform_df.show()
I'm getting the error below:
raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object <generator object some_function at 0x7f69b43def90> in type <class 'generator'>
Full error:
Py4JJavaError: An error occurred while calling o462.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 37.0 failed 1 times, most recent failure: Lost task 2.0 in stage 37.0 (TID 97, dpc, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "/usr/lib/spark/python/pyspark/sql/session.py", line 612, in prepare
    verify_func(obj)
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1408, in verify
    verify_value(obj)
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1395, in verify_struct
    raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object <generator object some_function at 0x7f69b43def90> in type <class 'generator'>
I'm following advice from the link below:
pySpark convert result of mapPartitions to spark DataFrame
EDIT: In Spark 3.0 there is also a mapInPandas function which should be more efficient because there is no need to group by.
import pyspark.sql.functions as F

def pandas_function(iterator):
    for df in iterator:
        yield pd.concat(pd.DataFrame(x) for x in df['content'].map(eval))

transformed_df = respond_sdf.mapInPandas(pandas_function, "api string, A int, B int")
transformed_df.show()
Another way: using a pandas_udf with apply (note that the GROUPED_MAP pandas UDF style is deprecated since Spark 3.0 in favor of applyInPandas):
import pyspark.sql.functions as F

@F.pandas_udf("api string, A int, B int", F.PandasUDFType.GROUPED_MAP)
def pandas_function(url_json):
    df = pd.DataFrame(eval(url_json['content'][0]))
    return df

transformed_df = respond_sdf.groupBy(F.monotonically_increasing_id()).apply(pandas_function)
transformed_df.show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
+-----+---+---+
Old answer (not very scalable...):
def pandas_function(url_json):
    df = pd.DataFrame(eval(url_json))
    return df

transformed_df = spark.createDataFrame(
    pd.concat(respond_sdf.rdd.map(lambda r: pandas_function(r[0])).collect()))
transformed_df.show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
+-----+---+---+
Thanks to @mck's examples, I found that since Spark 3.0 there is also an applyInPandas function, which returns a Spark dataframe directly.
def pandas_function(url_json):
    df = pd.DataFrame(eval(url_json['content'][0]))
    return df

respond_sdf.groupby(F.monotonically_increasing_id()).applyInPandas(pandas_function, schema="api string, A int, B int").show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
+-----+---+---+

How to combine two dataframes in PySpark AWS Glue

I have two dataframes which get records from a database using AWS Glue, and both databases have the same columns.
I am getting a count per id for dataframe one, and the same for dataframe two.
Right now I am printing the two dataframes separately, which makes them difficult to read as a report.
So I want to combine the two dataframes side by side, i.e. into a single row per id.
For example we have a dataframe result
Data Frame One Output
id,"count(1)"
"02adba80-0b00-4094-8645-wrwer",2527
"0a34803c-64eb-12fd-9940-www",35008
"0a34805f-669c-167f-99a7-44",8
"0a370f68-6c05-1aa6-9b12-55",5
"0a370f69-6c05-1aa0-9b0e-66",8
"0a370f6c-6c0d-1ff6-9b06-77",13
Data Frame Two Output
id,"count(1)"
"02adba80-0b00-4094-8645-wrwer",2527
"0a34803c-64eb-12fd-9940-www",35008
"0a34805f-669c-167f-99a7-44",9
"0a370f68-6c05-1aa6-9b12-55",5
"0a370f69-6c05-1aa0-9b0e-66",9
"0a370f6c-6c0d-1ff6-9b06-77",10
Now, for reporting and better readability purposes, I want this to be saved as below:
idaws,"count(1)aws" idonprem,"count(1)onprem"
"02adba80-0b00-4094-8645-wrwer",2527 "02adba80-0b00-4094-8645-wrwer",2527
"0a34803c-64eb-12fd-9940-www",35008 "0a34803c-64eb-12fd-9940-www",35008
"0a34805f-669c-167f-99a7-44",8 "0a34805f-669c-167f-99a7-44",9
"0a370f68-6c05-1aa6-9b12-55",5 "0a370f68-6c05-1aa6-9b12-55",5
"0a370f69-6c05-1aa0-9b0e-66",8 "0a370f69-6c05-1aa0-9b0e-66",9
"0a370f6c-6c0d-1ff6-9b06-77",13 "0a370f6c-6c0d-1ff6-9b06-77",10
For this I used union, but it does not give the expected result: it adds the records as separate rows instead of placing them in the same row.
Please suggest how this can be done.
Here is my PySpark code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import Join
from awsglue.dynamicframe import DynamicFrame
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## Loading data source one: MySQL
customer_case= glueContext.create_dynamic_frame.from_catalog(database = "a204795-laskhya-report", table_name = "fsp_customer_case", transformation_ctx = "customer_case")
customer_case.toDF().createOrReplaceTempView("customer_case")
## Loading data source two: MySQL
groupdf = glueContext.create_dynamic_frame.from_catalog(database = "a204795-laskhya-report", table_name = "fsp_fsp_group", transformation_ctx = "group")
groupdf.toDF().createOrReplaceTempView("groupdf")
## Combining data sources one and two on the AWS side
resultdfMysql=spark.sql("select p.id, count(1) from customer_case cc join groupdf c on cc.owner_id = c.id join groupdf p on p.id = substr(c.path, 2, instr(substr(c.path, 2), '/') - 1) where p.status='ACTIVE' and c.status='ACTIVE' group by p.id")
OuptutdfMysql=DynamicFrame.fromDF(resultdfMysql, glueContext, "mydfSource")
### Source connection, on-prem side ###
## Loading data source one: Oracle on-prem
customer_caseSourcedf= glueContext.create_dynamic_frame.from_catalog(database = "a204795-source-qa1", table_name = "_ort487a_int_thomsonreuters_com__fsp_customer_case", transformation_ctx = "customer_case")
customer_caseSourcedf.toDF().createOrReplaceTempView("customer_caseSourcedf")
## Loading data source two: Oracle on-prem
groupSourcedf = glueContext.create_dynamic_frame.from_catalog(database = "a204795-source-qa1", table_name = "_ort487a_int_thomsonreuters_com__fsp_fsp_group", transformation_ctx = "group")
groupSourcedf.toDF().createOrReplaceTempView("groupSourcedf")
## Combining data sources one and two on the on-prem side
resultdfOracle=spark.sql("select p.id, count(1) from customer_caseSourcedf cc join groupSourcedf c on cc.owner_id = c.id join groupSourcedf p on p.id = substr(c.path, 2, instr(substr(c.path, 2), '/') - 1) where p.status='ACTIVE' and c.status='ACTIVE' group by p.id")
OuptutdfOracle=DynamicFrame.fromDF(resultdfOracle, glueContext, "mydfOracle")
## Finally we combine dataframes one and two here
resultSourcedfFinal = OuptutdfMysql.join(OuptutdfOracle, 'id', 'left')
##OuptutSourcedf=DynamicFrame.fromDF(resultSourcedfFinal, glueContext, "mydf")
repartitionedSource1 = resultSourcedfFinal.repartition(1)
datasinkSource2 = glueContext.write_dynamic_frame.from_options(frame = repartitionedSource1, connection_type = "s3", connection_options = {"path": "s3://a204795-aryabhata/AGLUE/OEACLE", "compression": "gzip"}, format = "csv", transformation_ctx = "datasink2")
job.commit()
The above code does not give me the desired result and fails with an error like:
IllegalArgumentException: "requirement failed: The number of columns doesn't match.\nOld column names (2): id, count(1)\nNew column names (0): "
You can perform a left join as below, assuming the keys are unique as in the example:
df_a.show()
+--------------------+-----+
| ID|count|
+--------------------+-----+
|02adba80-0b00-409...| 2527|
|0a34803c-64eb-12f...|35008|
|0a34805f-669c-167...| 9|
|0a370f68-6c05-1aa...| 5|
|0a370f69-6c05-1aa...| 9|
|0a370f6c-6c0d-1ff...| 10|
+--------------------+-----+
df_b.show()
+--------------------+-----+
| ID|count|
+--------------------+-----+
|02adba80-0b00-409...| 2527|
|0a34803c-64eb-12f...|35008|
|0a34805f-669c-167...| 8|
|0a370f68-6c05-1aa...| 5|
|0a370f69-6c05-1aa...| 8|
|0a370f6c-6c0d-1ff...| 13|
+--------------------+-----+
df_b = df_b.join(df_a, 'id', 'left')
df_b.show()
+--------------------+-----+-----+
| ID|count|count|
+--------------------+-----+-----+
|0a34803c-64eb-12f...|35008|35008|
|0a34805f-669c-167...| 8| 9|
|0a370f6c-6c0d-1ff...| 13| 10|
|0a370f69-6c05-1aa...| 8| 9|
|02adba80-0b00-409...| 2527| 2527|
|0a370f68-6c05-1aa...| 5| 5|
+--------------------+-----+-----+
Try this:
df_aws = df_aws.withColumnRenamed('count','aws_count')
df_onprem = df_onprem.withColumnRenamed('count','onprem_count')
df = df_aws.join(df_onprem, 'id', 'left')
df.repartition(1).write.csv("s3://a204795-bucket/AGLUE/OEACLE")
job.commit()
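If you would rather keep writing through Glue's writer than df.write, a hedged sketch is to convert the joined dataframe back to a DynamicFrame first, reusing DynamicFrame.fromDF and write_dynamic_frame.from_options exactly as the question's code already does:
joined_dyf = DynamicFrame.fromDF(df.repartition(1), glueContext, "joined")
glueContext.write_dynamic_frame.from_options(frame=joined_dyf, connection_type="s3",
    connection_options={"path": "s3://a204795-bucket/AGLUE/OEACLE"}, format="csv")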

using numpy busday_count function in pyspark

Given that I have two columns in PySpark that are not null:
df2 = sqlContext.sql("SELECT AssignedDate, primaryid FROM parts_so WHERE AssignedDate IS NOT NULL")
df2 = df2.withColumn('sdate', df2.AssignedDate)
df2 = df2.withColumn('edate', current_timestamp())
When I run df2 = df2.withColumn('days_outstanding', lit(get_hours2(df2.sdate, df2.edate))) with
def get_hours2(sdate, edate):
    biz_days = np.busday_count(sdate, edate)
    return biz_days
I get the following error:
object of type 'Column' has no len()
First of all, let's create some random data:
import datetime
import random
import pandas as pd
import numpy as np
sdate = [datetime.datetime.now() + datetime.timedelta(i) for i in range(5)]
edate = [date + datetime.timedelta(random.random()+3) for date in sdate]
data = {
    'sdate': sdate,
    'edate': edate
}
pdf = pd.DataFrame(data)
df = spark.createDataFrame(pdf)
df.show()
+--------------------+--------------------+
| edate| sdate|
+--------------------+--------------------+
|2019-12-06 22:55:...|2019-12-03 08:14:...|
|2019-12-07 19:42:...|2019-12-04 08:14:...|
|2019-12-08 21:26:...|2019-12-05 08:14:...|
|2019-12-09 18:57:...|2019-12-06 08:14:...|
|2019-12-11 04:08:...|2019-12-07 08:14:...|
+--------------------+--------------------+
You cannot use a bare Python function to create another column in PySpark; you have to wrap it in a UDF.
NOTE: Remember to cast the result of the computation to int, because otherwise you may run into problems pickling numpy types.
import pyspark.sql.types as T
import pyspark.sql.functions as F

@F.udf(returnType=T.IntegerType())
def get_hours2(sdate, edate):
    biz_days = np.busday_count(sdate, edate)
    return int(biz_days)
Finally, we can use the UDF on the DataFrame we created (the F.lit wrapper is unnecessary, since the UDF call already returns a Column):
df = df.withColumn('days_outstanding', get_hours2('sdate', 'edate'))
df.show()
+--------------------+--------------------+----------------+
| edate| sdate|days_outstanding|
+--------------------+--------------------+----------------+
|2019-12-06 22:55:...|2019-12-03 08:14:...| 3|
|2019-12-07 19:42:...|2019-12-04 08:14:...| 3|
|2019-12-08 21:26:...|2019-12-05 08:14:...| 2|
|2019-12-09 18:57:...|2019-12-06 08:14:...| 1|
|2019-12-11 04:08:...|2019-12-07 08:14:...| 2|
+--------------------+--------------------+----------------+
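For larger data, a hedged alternative sketch is a vectorized pandas UDF (available since Spark 2.3), which avoids calling Python once per row; it assumes truncating the timestamps to numpy's datetime64[D] day precision is acceptable:
import pandas as pd
import numpy as np

@F.pandas_udf(T.IntegerType())
def get_hours2_vec(sdate: pd.Series, edate: pd.Series) -> pd.Series:
    # truncate timestamps to whole days, then count business days per row
    start = sdate.values.astype('datetime64[D]')
    end = edate.values.astype('datetime64[D]')
    return pd.Series(np.busday_count(start, end))

df = df.withColumn('days_outstanding', get_hours2_vec('sdate', 'edate'))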
I hope this helps you.