How to validate the date format of a column in PySpark? - dataframe

I am really new to PySpark. I want to check whether the column has the correct date format. How do I do it? I have tried, but I am getting an error. Can anyone help me with this?
My code:
df =
     Date        name
0    12/12/2020  a
1    24/01/2019  b
2    08/09/2018  c
3    12/24/2020  d
4    NaN         e
df_out= df.withColumn('output', F.when(F.to_date("Date","dd/mm/yyyy").isNotNull, Y).otherwise(No))
df_out.show()
gives me:
TypeError: condition should be a Column

You can filter out the rows after converting to the date type. (The TypeError in your attempt comes from isNotNull missing its parentheses, so Spark receives a method object rather than a Column.)
Example:
df.show()
#+----------+----+
#| Date|name|
#+----------+----+
#|12/12/2020| a|
#|24/01/2019| b|
#|12/24/2020| d|
#| nan| e|
#+----------+----+
from pyspark.sql.functions import *
df.withColumn("output",to_date(col('Date'),'dd/MM/yyyy')).\
filter(col("output").isNotNull()).\
show()
#+----------+----+----------+
#| Date|name| output|
#+----------+----+----------+
#|12/12/2020| a|2020-12-12|
#|24/01/2019| b|2019-01-24|
#+----------+----+----------+
#without adding a new column
df.filter(to_date(col('Date'),'dd/MM/yyyy').isNotNull()).show()
#+----------+----+
#| Date|name|
#+----------+----+
#|12/12/2020| a|
#|24/01/2019| b|
#+----------+----+
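If you want the Y/N flag from the original question rather than dropping rows, a minimal sketch along the same lines (besides calling isNotNull(), the flag values need to be wrapped in lit(); rows whose Date does not parse come back null and get N):
from pyspark.sql import functions as F
df_out = df.withColumn(
    "output",
    F.when(F.to_date(F.col("Date"), "dd/MM/yyyy").isNotNull(), F.lit("Y"))
     .otherwise(F.lit("N"))
)
df_out.show()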

Related

How do I create a new column that has the count of all the row values that are greater than 0 in PySpark?

Suppose I have a pyspark data frame as:
col1 col2 col3
1 2 -3
2 null 5
4 4 8
1 0 9
I want to add a column called check where it counts the number of values that are greater than 0.
The final output will be:
col1 col2 col3 check
1 2 -3 2
2 null 5 2
4 4 8 3
1 0 9 2
I was trying this, but it didn't help and errors out as below:
df= df.withColumn("check", sum((df[col] > 0) for col in df.columns))
Invalid argument, not a string or column: <generator object
at 0x7f0a866ae580> of type <class 'generator'>. For column literals,
use 'lit', 'array', 'struct' or 'create_map' function.
I don't know if there is a simpler SQL-based solution, but it's pretty straightforward with a udf.
from pyspark.sql.functions import udf, array
from pyspark.sql.types import IntegerType
count_udf = udf(lambda arr: sum([1 for a in arr if a > 0]), IntegerType())
df.withColumn('check', count_udf(array('col1', 'col2', 'col3'))).show()
It won't handle nulls in Python 3 (comparing None with an int raises a TypeError); add a null check (if a and a > 0) in the udf if needed.
Idea: https://stackoverflow.com/a/42540401/496289
Your code reads like a sum over the positive columns rather than a count. If you need the sum instead, then:
count_udf = udf(lambda arr: sum([a for a in arr if a > 0]), IntegerType())
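If you do want the null guard mentioned above, a minimal sketch of the count udf with the check added (the name is just for illustration):
null_safe_count_udf = udf(lambda arr: sum([1 for a in arr if a is not None and a > 0]), IntegerType())
df.withColumn('check', null_safe_count_udf(array('col1', 'col2', 'col3'))).show()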
Create an array column, filter it with a higher-order function (available from Spark 2.4), and finally count the remaining elements.
Example:
df.show(10,False)
#+----+----+----+
#|col1|col2|col3|
#+----+----+----+
#|1   |2   |-3  |
#|2   |null|5   |
#+----+----+----+
df.withColumn("check",expr("size(filter(array(col1,col2), x -> x > 0))")).show(10,False)
#+----+----+----+-----+
#|col1|col2|col3|check|
#+----+----+----+-----+
#|1 |2 |-3 |2 |
#|2 |null|5 |1 |
#+----+----+----+-----+
You can use functools.reduce to sum, over df.columns, a 1/0 indicator of whether each column is > 0, like this:
from pyspark.sql import functions as F
from operator import add
from functools import reduce
df = spark.createDataFrame([
    (1, 2, -3), (2, None, 5), (4, 4, 8), (1, 0, 9)
], ["col1", "col2", "col3"])
df = df.withColumn(
    "check",
    reduce(add, [F.when(F.col(c) > 0, 1).otherwise(0) for c in df.columns])
)
df.show()
#+----+----+----+-----+
#|col1|col2|col3|check|
#+----+----+----+-----+
#| 1| 2| -3| 2|
#| 2|null| 5| 2|
#| 4| 4| 8| 3|
#| 1| 0| 9| 2|
#+----+----+----+-----+

pyspark extra column where dates are transformed to 1, 2, 3

I have a dataframe with dates in the format YYYYMM.
These start from 201801.
I now want to add a column where 201801 = 1, 201802 = 2 and so on, up to the most recent month, which is updated every month.
Kind regards,
wokter
months_between can be used:
from pyspark.sql import functions as F
from pyspark.sql import types as T
# some test data
data = [
    [201801],
    [201802],
    [201804],
    [201812],
    [202001],
    [202010]
]
df = spark.createDataFrame(data, schema=["yyyymm"])
df.withColumn("months", F.months_between(
    F.to_date(F.col("yyyymm").cast(T.StringType()), "yyyyMM"), F.lit("2017-12-01")
).cast(T.IntegerType())).show()
Output:
+------+------+
|yyyymm|months|
+------+------+
|201801| 1|
|201802| 2|
|201804| 4|
|201812| 12|
|202001| 25|
|202010| 34|
+------+------+
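Since yyyymm is already a numeric YYYYMM value, the same index can also be computed with plain arithmetic, without any date parsing (a sketch, assuming the sequence starts at 201801 = 1):
df.withColumn(
    "months",
    ((F.floor(F.col("yyyymm") / 100) - 2018) * 12 + F.col("yyyymm") % 100).cast(T.IntegerType())
).show()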

aggregate a column based on another column's value count in a pyspark dataframe

I would like to do some aggregations for a pyspark hive table.
my table:
id value_tier ($)
105 5
117 5
108 10
110 12
105 10
112 10
I need to get the number of ids that only appear in one "value_tier".
value_tier num
5 1 -- for 117
10 2 -- for 108 and 112
12 1 -- for 110
Here, 105 is not counted because it appears in two value_tiers: 5 and 10.
My SQL query works, but it is long and ugly.
I would like a more elegant one.
Thanks
In the DataFrame API, use groupBy and agg with the collect_list function.
df1.show()
#+---+----------+
#| id|value_tier|
#+---+----------+
#|105| 5|
#|117| 5|
#|108| 10|
#|110| 12|
#|105| 10|
#|112| 10|
#+---+----------+
from pyspark.sql.functions import *
df1.groupBy("id").
agg(concat_ws(',',collect_list(col("value_tier"))).alias("value_tier")).\
filter(size(split(col("value_tier"),",")) <=1).\
groupBy("value_tier").\
agg(count(col("id")).alias("num"),concat_ws(",",collect_list(col("id"))).alias("ids")).\
show()
#+----------+---+-------+
#|value_tier|num| ids|
#+----------+---+-------+
#| 5| 1| 117|
#| 10| 2|112,108|
#| 12| 1| 110|
#+----------+---+-------+
#use collect_set to eliminate duplicates
df1.groupBy("id").
agg(concat_ws(',',collect_set(col("value_tier"))).alias("value_tier")).\
filter(size(split(col("value_tier"),",")) <=1).\
groupBy("value_tier").\
agg(count(col("id")).alias("num"),concat_ws(",",collect_list(col("id"))).alias("ids")).\
show()
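A slightly shorter DataFrame variant (a sketch) counts distinct tiers per id; first() is safe here because every surviving id has exactly one tier:
df1.groupBy("id").\
agg(countDistinct("value_tier").alias("n_tiers"), first("value_tier").alias("value_tier")).\
filter(col("n_tiers") == 1).\
groupBy("value_tier").\
agg(count("id").alias("num")).\
show()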
In SQL, you can use not exists and aggregation:
select value_tier, count(*) as num
from mytable t
where not exists (
    select 1
    from mytable t1
    where t1.id = t.id and t1.value_tier <> t.value_tier
)
group by value_tier
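To run that from PySpark, a quick sketch (assuming the dataframe is registered as a temp view named mytable):
df1.createOrReplaceTempView("mytable")
spark.sql("""
    select value_tier, count(*) as num
    from mytable t
    where not exists (
        select 1
        from mytable t1
        where t1.id = t.id and t1.value_tier <> t.value_tier
    )
    group by value_tier
""").show()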

PySpark Filter between - provide a list of upper and lower bounds, based on groups

I have a PySpark dataframe and would like to filter for rows between an upper bound and lower bound.
Typically, I would just use a filter with between:
import pandas as pd
from pyspark.sql import functions as F
... sql_context creation ...
pdfRaw=pd.DataFrame([{"vehicleID":'A', "Segment":'State Hwy', "speed":68.0},\
{"vehicleID":'B', "Segment":'State Hwy', "speed":76.0}])
dfRaw = sql_context.createDataFrame(pdfRaw).select("vehicleID", "Segment", "speed")
dfRaw.show()
+-----------+------------+-----+
|  vehicleID|     Segment|speed|
+-----------+------------+-----+
|          A|   State Hwy| 68.0|
|          B|   State Hwy| 73.0|
+-----------+------------+-----+
dfRaw.filter(F.col("speed").between(70,75)).show()
+-----------+------------+-----+
|  vehicleID|     Segment|speed|
+-----------+------------+-----+
|          B|   State Hwy| 73.0|
+-----------+------------+-----+
However, I have multiple speed ranges that I would like to filter between.
Speeds_Curious = [
    [25, 30],
    [55, 60],
    [60, 65],
    [70, 75]
]
And I actually want to take it one step further. The upper and lower bounds for the between filter depend on the result of a groupby on a previous dataframe.
df_RoadSegments.groupby('Segment')\
.agg(F.min('SpeedLimit').alias('minSpeed'),\
F.max('SpeedLimit').alias('maxSpeed'))\
.show()
+-----------+----------+----------+
|    Segment|  minSpeed|  maxSpeed|
+-----------+----------+----------+
|      Urban|      25.0|      30.0|
|  State Hwy|      55.0|      60.0|
|I-State Hwy|      60.0|      65.0|
|I-State Hwy|      70.0|      75.0|
+-----------+----------+----------+
So basically I would like to filter a dataframe between values that are available as columns on a different dataframe.
Something like:
dfLimits = df_RoadSegments.groupby('Segment')\
    .agg(F.min('SpeedLimit').alias('minSpeed'),
         F.max('SpeedLimit').alias('maxSpeed'))
dfRaw.groupby('Segment')\
    .filter(F.col("speed")\
        .between(dfLimits.where(dfLimits.Segment=="State Hwy"(??)).select('minSpeed'),\
                 dfLimits.where(dfLimits.Segment=="State Hwy"(??)).select('maxSpeed')))\
    .show()
Any thoughts?
The following approach will get you all the vehicles that are between the min and max speed for the particular segment they belong to.
You can join the two dataframes:
df_joined = dfRaw.join(dfLimits, on="Segment", how="left")
+---------+---------+-----+--------+--------+
| Segment|vehicleID|speed|minSpeed|maxSpeed|
+---------+---------+-----+--------+--------+
|State Hwy| A| 68.0| 55| 60|
|State Hwy| B| 76.0| 55| 60|
+---------+---------+-----+--------+--------+
If you want a further flag of whether the speed is in between the mentioned bounds, then you can write:
flag_df = df_joined.withColumn("flag", F.when((F.col("speed") > F.col("minSpeed")) & (F.col("speed") < F.col("maxSpeed")), 1).otherwise(0))
flag_df.show()
+---------+---------+-----+--------+--------+----+
| Segment|vehicleID|speed|minSpeed|maxSpeed|flag|
+---------+---------+-----+--------+--------+----+
|State Hwy| A| 68.0| 55| 60| 0|
|State Hwy| B| 76.0| 55| 60| 0|
+---------+---------+-----+--------+--------+----+
You can then simply filter on the flag:
df_final = flag_df.filter(F.col("flag") == 1)
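Alternatively, you can skip the flag column and filter the joined dataframe directly; a short sketch using between, which is inclusive on both bounds:
df_in_range = df_joined.filter(F.col("speed").between(F.col("minSpeed"), F.col("maxSpeed")))
df_in_range.show()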

SQL/PySpark: Create a new column consisting of a number of rows in the past n days

Currently, I have a table consisting of an encounter_id and a date field, like so:
+---------------------------+--------------------------+
|encounter_id |date |
+---------------------------+--------------------------+
|random_id34234 |2018-09-17 21:53:08.999999|
|this_can_be_anything2432432|2018-09-18 18:37:57.000000|
|423432 |2018-09-11 21:00:36.000000|
+---------------------------+--------------------------+
encounter_id is a random string.
I'm aiming to create a column which consists of the total number of encounters in the past 30 days.
+---------------------------+--------------------------+---------------------------+
|encounter_id |date | encounters_in_past_30_days|
+---------------------------+--------------------------+---------------------------+
|random_id34234 |2018-09-17 21:53:08.999999| 2 |
|this_can_be_anything2432432|2018-09-18 18:37:57.000000| 3 |
|423432 |2018-09-11 21:00:36.000000| 1 |
+---------------------------+--------------------------+---------------------------+
Currently, I'm thinking of somehow using window functions and specifying an aggregate function.
Thanks for the time.
Here is one possible solution; I added some sample data. It indeed uses a window function, as you suggested yourself. Hope this helps!
import pyspark.sql.functions as F
from pyspark.sql.window import Window
df = sqlContext.createDataFrame(
    [
        ('A','2018-10-01 00:15:00'),
        ('B','2018-10-11 00:30:00'),
        ('C','2018-10-21 00:45:00'),
        ('D','2018-11-10 00:00:00'),
        ('E','2018-12-20 00:15:00'),
        ('F','2018-12-30 00:30:00')
    ],
    ("encounter_id","date")
)
df = df.withColumn('timestamp',F.col('date').astype('Timestamp').cast("long"))
w = Window.orderBy('timestamp').rangeBetween(-60*60*24*30,0)
df = df.withColumn('encounters_past_30_days',F.count('encounter_id').over(w))
df.show()
Output:
+------------+-------------------+----------+-----------------------+
|encounter_id| date| timestamp|encounters_past_30_days|
+------------+-------------------+----------+-----------------------+
| A|2018-10-01 00:15:00|1538345700| 1|
| B|2018-10-11 00:30:00|1539210600| 2|
| C|2018-10-21 00:45:00|1540075500| 3|
| D|2018-11-10 00:00:00|1541804400| 2|
| E|2018-12-20 00:15:00|1545261300| 1|
| F|2018-12-30 00:30:00|1546126200| 2|
+------------+-------------------+----------+-----------------------+
EDIT: If you want days as the granularity, you could first convert your date column to the Date type. The example below assumes that a window of five days means today and the four days before; if it should be today and the past five days, just remove the -1.
import pyspark.sql.functions as F
from pyspark.sql.window import Window
n_days = 5
df = sqlContext.createDataFrame(
    [
        ('A','2018-10-01 23:15:00'),
        ('B','2018-10-02 00:30:00'),
        ('C','2018-10-05 05:45:00'),
        ('D','2018-10-06 00:15:00'),
        ('E','2018-10-07 00:15:00'),
        ('F','2018-10-10 21:30:00')
    ],
    ("encounter_id","date")
)
df = df.withColumn('timestamp',F.to_date(F.col('date')).astype('Timestamp').cast("long"))
w = Window.orderBy('timestamp').rangeBetween(-60*60*24*(n_days-1),0)
df = df.withColumn('encounters_past_n_days',F.count('encounter_id').over(w))
df.show()
Output:
+------------+-------------------+----------+----------------------+
|encounter_id| date| timestamp|encounters_past_n_days|
+------------+-------------------+----------+----------------------+
| A|2018-10-01 23:15:00|1538344800| 1|
| B|2018-10-02 00:30:00|1538431200| 2|
| C|2018-10-05 05:45:00|1538690400| 3|
| D|2018-10-06 00:15:00|1538776800| 3|
| E|2018-10-07 00:15:00|1538863200| 3|
| F|2018-10-10 21:30:00|1539122400| 3|
+------------+-------------------+----------+----------------------+
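If you prefer to express it in plain Spark SQL (the question mentions both), here is a rough equivalent of the first snippet as a sketch; 2592000 is 30 days in seconds, and the dataframe is assumed to be registered as a temp view named encounters:
df.createOrReplaceTempView("encounters")
spark.sql("""
    SELECT encounter_id, date,
           count(encounter_id) OVER (
               ORDER BY unix_timestamp(date)
               RANGE BETWEEN 2592000 PRECEDING AND CURRENT ROW
           ) AS encounters_past_30_days
    FROM encounters
""").show()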