Read text file using information in separate dataframe - dataframe

I have a fixed-width file as below:
00120181120xyz12341
00220180203abc56792
00320181203pqr25483
And a corresponding dataframe that specifies the schema (the column name (_Name) and the column width (_Length)):
How can I use PySpark to get the text file dataframe as follows:
#+---+----+--+
#|C1| C2 |C3|
#+--+-----+--+
#| 0|02018|11|
#| 0|02018|02|
#| 0|02018|12|
#+--+-----+--+

You could:
collect your column names and lengths;
use it to create a list of substrings indexes to be used in extracting string parts that you need;
use the list of substring indexes to extract string parts for every row.
Input:
rdd_data = spark.sparkContext.textFile(r'C:\Temp\sample.txt')
df_lengths = spark.createDataFrame([("1", "C1"), ("5", "C2"), ("2", "C3")], ["_Length", "_Name"])
Script:
from pyspark.sql import Row

# Bring the schema rows to the driver. Fields are sliced in the order the rows
# come back, so this assumes df_lengths lists the columns in file order.
lengths = df_lengths.collect()

# Build cumulative [start, end) slice bounds, one pair per column.
# The leading [0, 0] only seeds the running end offset and is skipped below.
ranges = [[0, 0]]
for x in lengths:
    start = ranges[-1][-1]
    ranges.append([start, start + int(x["_Length"])])

# Row "class" whose field names are the column names from the schema.
Cols = Row(*[r["_Name"] for r in lengths])

# Slice every text line with the precomputed bounds and build the DataFrame.
df = rdd_data.map(lambda x: Cols(*[x[r[0]:r[1]] for r in ranges[1:]])).toDF()
df.show()
# +---+-----+---+
# | C1| C2| C3|
# +---+-----+---+
# | 0|01201| 81|
# | 0|02201| 80|
# | 0|03201| 81|
# +---+-----+---+
Something like this is possible using only DataFrame API, if you have a column which you could use inside orderBy for the window function.
from pyspark.sql import functions as F, Window as W
# Read the file through the CSV reader; the text to slice is taken from column "_c0" below.
# NOTE(review): presumably relies on the lines containing no commas - confirm for real data.
df_data = spark.read.csv(r"C:\Temp\sample.txt")
df_lengths = spark.createDataFrame([("1", "C1"), ("5", "C2"), ("2", "C3")], ["_Length", "_Name"])
# Running total of _Length in _Name order, plus 1, so that subtracting the row's own
# _Length below yields a 1-based start position (F.substring positions are 1-based).
# NOTE(review): ordering by _Name assumes the names sort in the same order as the
# fields appear in the file - confirm for the real schema.
sum_col = F.sum("_Length").over(W.orderBy("_Name")) + 1
df_lengths = (df_lengths
# _Len = [start position, length] for each schema row.
.withColumn("_Len", F.array((sum_col - F.col("_Length")).cast('int'), "_Length"))
# Pivot into a single row with one column per _Name holding that [start, length] pair.
.groupBy().pivot("_Name").agg(F.first("_Len"))
)
df_data = df_data.select(
# One substring expression per pivoted column, built from the single row fetched by head().
[F.substring("_c0", int(c[0]), int(c[1])) for c in df_lengths.head()]
# Rename the extracted columns to the schema names.
).toDF(*df_lengths.columns)
df_data.show()
# +---+-----+---+
# | C1| C2| C3|
# +---+-----+---+
# | 0|01201| 81|
# | 0|02201| 80|
# | 0|03201| 81|
# +---+-----+---+

Related

How to count hypothenuses with pandas udf, pyspark

I want to write a pandas UDF which takes two arguments, cathetus1 and cathetus2, from another dataframe and returns the hypotenuse.
# this data is list where cathetuses are.
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1",DoubleType(),True),StructField("cathetus2",DoubleType(),True)])
df = spark.createDataFrame(data=data,schema=schema)
df.show()
#and this is creating dataframe where only cathetuses are showing.
This is the function I have written so far:
def pandaUdf(cat1, cat2):
leg1 = []
leg2 = []
for i in data:
x = 0
leg1.append(i[x])
leg2.append(i[x+1])
hypoData.append(np.hypot(leg1,leg2))
return np.hypot(leg1,leg2)
#example_series = pd.Series(data)
And I'm trying to create a new column in df whose values will be the hypotenuses:
df.withColumn(col('Hypo'), pandaUdf(example_df.cathetus1,example_df.cathetus2)).show()
But this gives me an error: "col should be Column".
I don't understand how to fix this error or why it occurs.
You can apply np.hypot on the 2 cathetus directly without extracting individual values.
from pyspark.sql import functions as F
from pyspark.sql.types import *
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1",DoubleType(),True),StructField("cathetus2",DoubleType(),True)])
df = spark.createDataFrame(data=data,schema=schema)
df.show()
"""
+---------+---------+
|cathetus1|cathetus2|
+---------+---------+
| 3.0| 4.0|
| 6.0| 8.0|
| 3.3| 5.6|
+---------+---------+
"""
def hypot(cat1: pd.Series, cat2: pd.Series) -> pd.Series:
    """Return the element-wise hypotenuse for two series of cathetus lengths.

    np.hypot is vectorised, so the whole batch is computed in one call
    without looping over individual values.
    """
    return np.hypot(cat1, cat2)
hypot_pandas_df = F.pandas_udf(hypot, returnType=FloatType())
df.withColumn("Hypo", hypot_pandas_df("cathetus1", "cathetus2")).show()
"""
+---------+---------+----+
|cathetus1|cathetus2|Hypo|
+---------+---------+----+
| 3.0| 4.0| 5.0|
| 6.0| 8.0|10.0|
| 3.3| 5.6| 6.5|
+---------+---------+----+
"""

How to yield pandas dataframe rows to spark dataframe

Hi, I'm making a transformation: I have created a generator, some_function(iter), that yields Row(id=index, api=row['api'], A=row['A'], B=row['B']) to pass transformed rows from a pandas dataframe to an RDD and then to a Spark dataframe. I'm getting errors. (I must use pandas to transform the data, as there is a large amount of legacy code.)
Input Spark DataFrame
respond_sdf.show()
+-------------------------------------------------------------------+
|content |
+-------------------------------------------------------------------+
|{'api': ['api_1', 'api_1', 'api_1'],'A': [1,2,3], 'B': [4,5,6] } |
|{'api': ['api_2', 'api_2', 'api_2'],'A': [7,8,9], 'B': [10,11,12] }|
+-------------------------------------------------------------------+
Expected Spark Dataframe after transformation
transform_df.show()
+-------------------+
| api | A | B |
+-------------------+
| api_1 | 1 | 4 |
| api_1 | 2 | 5 |
| api_1 | 3 | 6 |
| api_2 | 7 | 10 |
| api_2 | 8 | 11 |
| api_2 | 9 | 12 |
+-------------------+
Minimum example code
#### IMPORT PYSPARK ###
import pandas as pd
import pyspark
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType,StringType
spark = pyspark.sql.SparkSession.builder.appName("test") \
.master('local[*]') \
.getOrCreate()
sc = spark.sparkContext
####### INPUT DATAFRAME WITH LIST OF JSONS ########################
rdd_list = [["{'api': ['api_1', 'api_1', 'api_1'],'A': [1,2,3], 'B': [4,5,6] }"],
["{'api': ['api_2', 'api_2', 'api_2'],'A': [7,8,9], 'B': [10,11,12] }"]]
schema = StructType([StructField('content', StringType(), True)])
jsons = sc.parallelize(rdd_list)
respond_sdf = spark.createDataFrame(jsons, schema)
respond_sdf.show(truncate=False)
####### TRANSFORMATION DATAFRAME ########################
# Pandas transformation function returning pandas dataframe
def pandas_function(url_json):
# Complex Pandas transformation
url = url_json[0]
json = url_json[1]
df = pd.DataFrame(eval(json))
return df
# Generator returing Row from pandas dataframe
def some_function(iter):
# Pandas generator
pandas_df = pandas_function(iter)
for index, row in pandas_df.iterrows():
## ERROR COMES FROM THIS ROW
yield Row(id=index, api=row['api'], A=row['A'], B=row['B'])
# Creating transformation spark dataframe
schema = StructType([
StructField('API', StringType(), True),
StructField('A', IntegerType(), True),
StructField('B', IntegerType(), True)
])
rdd = respond_sdf.rdd.map(lambda x: some_function(x))
transform_df = spark.createDataFrame(rdd,schema)
transform_df.show()
I'm getting error below:
raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object <generator object some_function at 0x7f69b43def90> in type <class 'generator'>
Full error:
Py4JJavaError: An error occurred while calling o462.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 37.0 failed 1 times, most recent failure: Lost task 2.0 in stage 37.0 (TID 97, dpc, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
process()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
serializer.dump_stream(out_iter, outfile)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
return f(*args, **kwargs)
File "/usr/lib/spark/python/pyspark/sql/session.py", line 612, in prepare
verify_func(obj)
File "/usr/lib/spark/python/pyspark/sql/types.py", line 1408, in verify
verify_value(obj)
File "/usr/lib/spark/python/pyspark/sql/types.py", line 1395, in verify_struct
raise TypeError(new_msg("StructType can not accept object %r in type %s"
TypeError: StructType can not accept object <generator object some_function at 0x7f69b43def90> in type <class 'generator'>
I'm following advice from the link below:
pySpark convert result of mapPartitions to spark DataFrame
EDIT: In Spark 3.0 there is also a mapInPandas function which should be more efficient because there is no need to group by.
import pyspark.sql.functions as F
def pandas_function(iterator):
    """Expand each batch's ``content`` strings into one flat pandas DataFrame.

    Intended for ``DataFrame.mapInPandas``: ``iterator`` yields pandas
    DataFrames with a ``content`` column holding dict literals of equal-length
    lists. One concatenated DataFrame is yielded per non-empty batch.
    """
    # Function-scope import keeps the snippet self-contained.
    from ast import literal_eval

    for df in iterator:
        # pd.concat raises ValueError when given nothing to concatenate,
        # so batches without rows are skipped.
        if df.empty:
            continue
        # NOTE: the original called eval() on the column values. literal_eval
        # accepts only Python literals, so the content cannot execute code.
        yield pd.concat([pd.DataFrame(literal_eval(x)) for x in df['content']])
transformed_df = respond_sdf.mapInPandas(pandas_function, "api string, A int, B int")
transformed_df.show()
Another way: using pandas_udf and apply:
import pyspark.sql.functions as F
# The leading "@" of this decorator had been turned into "#". Without the
# decorator, groupBy(...).apply(pandas_function) does not receive a grouped-map
# pandas UDF.
@F.pandas_udf("api string, A int, B int", F.PandasUDFType.GROUPED_MAP)
def pandas_function(url_json):
    """Turn the ``content`` string of a single-row group into a pandas DataFrame.

    ``url_json`` is the pandas DataFrame for one group; the caller groups by
    a unique id, so only the first row is read.
    """
    from ast import literal_eval

    # NOTE: the original used eval(). literal_eval parses the same dict
    # literals without executing arbitrary code.
    return pd.DataFrame(literal_eval(url_json['content'].iloc[0]))
transformed_df = respond_sdf.groupBy(F.monotonically_increasing_id()).apply(pandas_function)
transformed_df.show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
+-----+---+---+
Old answer (not very scalable...):
def pandas_function(url_json):
    """Parse one ``content`` string (a dict literal of lists) into a pandas DataFrame."""
    from ast import literal_eval

    # NOTE: the original used eval(). literal_eval parses the same dict
    # literals without executing arbitrary code.
    return pd.DataFrame(literal_eval(url_json))
transformed_df = spark.createDataFrame(pd.concat(respond_sdf.rdd.map(lambda r: pandas_function(r[0])).collect()))
transformed_df.show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
+-----+---+---+
Thanks to @mck's examples, I found that since Spark 3.0 there is also an applyInPandas function, which returns a Spark dataframe.
def pandas_function(url_json):
    """Turn the ``content`` string of a single-row group into a pandas DataFrame.

    ``url_json`` is the pandas DataFrame passed in for one group. Only its
    first row is read, selected by position so the lookup does not depend on
    the index labels.
    """
    from ast import literal_eval

    # NOTE: the original used eval(). literal_eval parses the same dict
    # literals without executing arbitrary code.
    return pd.DataFrame(literal_eval(url_json['content'].iloc[0]))
respond_sdf.groupby(F.monotonically_increasing_id()).applyInPandas(pandas_function, schema="api string, A int, B int").show()
+-----+---+---+
| api| A| B|
+-----+---+---+
|api_2| 7| 10|
|api_2| 8| 11|
|api_2| 9| 12|
|api_1| 1| 4|
|api_1| 2| 5|
|api_1| 3| 6|
+-----+---+---+

cast a date to integer pyspark

Is it possible to convert a date column to an integer column in a pyspark dataframe? I tried 2 different ways but every attempt returns a column with nulls. What am I missing?
from pyspark.sql.types import *
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
# ATTEMPT 1 with cast()
df=df.withColumn("jobStartDateAsInteger1", df['jobStartDate'].cast(IntegerType()))
# ATTEMPT 2 with selectExpr()
df=df.selectExpr("*","CAST(jobStartDate as int) as jobStartDateAsInteger2")
df.show()
You can try casting it to a UNIX timestamp using F.unix_timestamp():
from pyspark.sql.types import *
import pyspark.sql.functions as F
# DUMMY DATA
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"1992-07-01","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df=df.withColumn("jobStartDate", df['jobStartDate'].cast(DateType()))
df=df.withColumn("jobStartDateAsInteger1", F.unix_timestamp(df['jobStartDate']))
df.show()
+---------+---+------------+-----------+------+------+----------------------+
|firstname|age|jobStartDate|isGraduated|gender|salary|jobStartDateAsInteger1|
+---------+---+------------+-----------+------+------+----------------------+
| James| 34| 2006-01-01| true| M|3000.6| 1136073600|
| Michael| 33| 1980-01-10| true| F|3300.8| 316310400|
| Robert| 37| 1992-07-01| false| M|5000.5| 709948800|
+---------+---+------------+-----------+------+------+----------------------+

pandas_udf error (applyInPandas) stating tuple format

I have the following spark DataFrame :
+-------------------+-------+---------+-------+---------+---------------+
| Time| Close| Volume| Open|Num_Ticks| Dollar Volume|
+-------------------+-------+---------+-------+---------+---------------+
|2015-06-01 00:00:00|2109.25|1337694.0| 2109.5| 1.0| 2.8215310695E9|
|2015-06-02 00:00:00|2106.75|1442673.0| 2106.5| 1.0|3.03935134275E9|
|2015-06-03 00:00:00| 2116.0|1310989.0|2116.25| 1.0| 2.774052724E9|
|2015-06-04 00:00:00| 2099.0|1716475.0| 2099.0| 1.0| 3.602881025E9|
|2015-06-05 00:00:00|2092.25|1459933.0| 2092.0| 1.0|3.05454481925E9|
|2015-06-08 00:00:00|2078.25|1290580.0| 2079.0| 1.0| 2.682147885E9|
|2015-06-09 00:00:00| 2080.0|1446234.0| 2080.5| 1.0| 3.00816672E9|
|2015-06-10 00:00:00| 2107.0|1664080.0| 2106.0| 1.0| 3.50621656E9|
|2015-06-11 00:00:00|2109.25|1480391.0|2109.25| 1.0|3.12251471675E9|
|2015-06-12 00:00:00| 2093.0|1130566.0| 2094.0| 1.0| 2.366274638E9|
|2015-06-15 00:00:00| 2084.0|1077154.0|2083.75| 1.0| 2.244788936E9|
|2015-06-16 00:00:00| 2097.5| 790233.0|2097.25| 1.0| 1.6575137175E9|
|2015-06-17 00:00:00|2089.25|1577521.0|2088.75| 1.0|3.29583574925E9|
|2015-06-18 00:00:00|2114.75|1899198.0| 2114.0| 1.0| 4.0163289705E9|
|2015-06-19 00:00:00|2097.75|1236103.0|2097.75| 1.0|2.59303506825E9|
|2015-06-22 00:00:00|2112.75|1095590.0|2113.25| 1.0| 2.3147077725E9|
|2015-06-23 00:00:00| 2116.5| 835219.0| 2117.0| 1.0| 1.7677410135E9|
|2015-06-24 00:00:00| 2099.5|1153248.0| 2099.5| 1.0| 2.421244176E9|
|2015-06-25 00:00:00| 2094.0|1213961.0| 2094.0| 1.0| 2.542034334E9|
|2015-06-26 00:00:00|2095.75|1318744.0|2095.75| 1.0| 2.763757738E9|
+-------------------+-------+---------+-------+---------+---------------+
root
|-- Time: timestamp (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: double (nullable = true)
|-- Open: double (nullable = true)
|-- Num_Ticks: double (nullable = true)
|-- Dollar Volume: double (nullable = true)
and I am applying the following function:
# Version from the question: indexes the frame by Time and returns the
# exponentially weighted std (span=span0) of the Close ratio computed below.
def getDailyVol(pdf, span0=100):
pdf = pdf.set_index('Time') # Line #1
close = pdf.Close
df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1)) # Line #2
df0 = df0[df0>0]
df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
df0 = close.loc[df0.index] / close.loc[df0.values].values - 1 # daily returns
df0 = df0.ewm(span=span0).std()
return df0
daily_vol = es_dbars.groupBy().applyInPandas(getDailyVol,
schema='Time timestamp, Close double'
)
However, when I try to show the results it throws the following error: 'tuple' object has no attribute 'set_index'. If I comment out line #1, it throws an error in line #2 stating that searchsorted cannot be applied, even though I have already loaded all the following libraries:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import databricks.koalas as ks
import os, shutil
import numpy as np
import matplotlib.pyplot as pl
spark = SparkSession.builder \
.master('local[3]') \
.appName('chapter3') \
.config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config('spark.executor.memory','6gb') \
.getOrCreate()
sc = spark.sparkContext
spark.sql("set spark.sql.shuffle.partitions = 3")
If someone can help me notice what I am doing incorrectly, or whether I need to do something else, I would appreciate it.
edit: I am providing the dataset in a reproducible way as follows:
,Time,Close,Volume,Open,Num_Ticks,Dollar Volume
0,2015-06-01,2109.25,1337694.0,2109.5,1.0,2821531069.5
1,2015-06-02,2106.75,1442673.0,2106.5,1.0,3039351342.75
2,2015-06-03,2116.0,1310989.0,2116.25,1.0,2774052724.0
3,2015-06-04,2099.0,1716475.0,2099.0,1.0,3602881025.0
4,2015-06-05,2092.25,1459933.0,2092.0,1.0,3054544819.25
5,2015-06-08,2078.25,1290580.0,2079.0,1.0,2682147885.0
6,2015-06-09,2080.0,1446234.0,2080.5,1.0,3008166720.0
7,2015-06-10,2107.0,1664080.0,2106.0,1.0,3506216560.0
8,2015-06-11,2109.25,1480391.0,2109.25,1.0,3122514716.75
9,2015-06-12,2093.0,1130566.0,2094.0,1.0,2366274638.0
If you provide two arguments to your apply function (getDailyVol in your case) your function won't work since PySpark interprets the first argument as keys (tuple) and the second as pdf (pd.DataFrame). You can read this in their documentation.
To fix this, you would have to define your function as follows:
def getDailyVol(pdf):
    """Compute the exponentially weighted volatility of Close returns.

    Takes exactly one parameter so that ``applyInPandas`` passes the group's
    pandas DataFrame (and not the grouping-key tuple) as ``pdf``.

    Returns a pandas DataFrame with columns ``Time`` and ``Close``, matching
    the schema 'Time timestamp, Close double'. ``applyInPandas`` requires a
    DataFrame; the original returned a Series.
    """
    pdf = pdf.set_index('Time')  # Line #1
    close = pdf.Close
    # For every timestamp, the position of the first row at or after
    # (timestamp - 1 day). searchsorted assumes the index is sorted by Time.
    df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))  # Line #2
    # Position 0 means there is no earlier row to compare against.
    df0 = df0[df0 > 0]
    # Map each remaining timestamp to the timestamp of its reference row.
    df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
    df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
    df0 = df0.ewm(span=100).std()
    # Name the values and the index explicitly so the columns are Time, Close.
    return df0.rename('Close').rename_axis('Time').reset_index()
Alternatively, you could rewrite it to:
def getDailyVol(df, span0):
    """Apply the daily-volatility computation to ``df`` for an arbitrary span.

    ``df`` is grouped with ``groupBy()`` and handed to ``applyInPandas``.
    ``span0`` is captured by the inner function through its closure, so the
    function given to ``applyInPandas`` keeps the required one-parameter
    signature.
    """
    def apply_function(pdf):
        pdf = pdf.set_index('Time')  # Line #1
        close = pdf.Close
        # For every timestamp, the position of the first row at or after
        # (timestamp - 1 day). searchsorted assumes the index is sorted by Time.
        df0 = close.index.searchsorted(close.index - pd.Timedelta(days=1))  # Line #2
        # Position 0 means there is no earlier row to compare against.
        df0 = df0[df0 > 0]
        df0 = pd.Series(close.index[df0 - 1], index=close.index[close.shape[0] - df0.shape[0]:])
        df0 = close.loc[df0.index] / close.loc[df0.values].values - 1  # daily returns
        df0 = df0.ewm(span=span0).std()
        # applyInPandas requires a pandas DataFrame matching the schema,
        # so return explicit Time and Close columns instead of a Series.
        return df0.rename('Close').rename_axis('Time').reset_index()

    # Pass the inner function. The original passed getDailyVol itself, which
    # takes (df, span0) and cannot be used as the per-group function.
    return df.groupBy().applyInPandas(apply_function, schema='Time timestamp, Close double')
daily_vol = getDailyVol(df=es_dbars, span0=100)
Which would then work for an arbitrary span0. I can't verify if this works 100% since your example is quite hard to get into my local machine. But hope this helps.

Pyspark -- Filter dataframe based on row values of another dataframe

I have a master dataframe and a secondary dataframe which I want to go through row by row, filter the master dataframe based on the values in each row, run a function on the filtered master dataframe, and save the output.
The output could either be saved in a separate dataframe, or in a new column of the secondary dataframe.
# Master DF
df = pd.DataFrame({"Name": ["Mike", "Bob", "Steve", "Jim", "Dan"], "Age": [22, 44, 66, 22, 66], "Job": ["Doc", "Cashier", "Fireman", "Doc", "Fireman"]})
#Secondary DF
df1 = pd.DataFrame({"Age": [22, 66], "Job": ["Doc", "Fireman"]})
df = spark.createDataFrame(df)
+-----+---+-------+
| Name|Age| Job|
+-----+---+-------+
| Mike| 22| Doc|
| Bob| 44|Cashier|
|Steve| 66|Fireman|
| Jim| 22| Doc|
| Dan| 66|Fireman|
+-----+---+-------+
df1 = spark.createDataFrame(df1)
+---+-------+
|Age| Job|
+---+-------+
| 22| Doc|
| 66|Fireman|
+---+-------+
​
# Filter by values in first row of secondary DF
df_filt = df.filter(
(F.col("Age") == 22) &
(F.col('Job') == 'Doc')
)
# Run the filtered DF through my function
def my_func(df_filt):
    """Return the ``Name`` values of ``df_filt`` joined with '-'.

    The names are collected to the driver before being joined.
    """
    # Each Row from select('Name') has a single field; flatMap unwraps it.
    my_list = df_filt.select('Name').rdd.flatMap(lambda x: x).collect()
    return '-'.join(my_list)
# Output of function
my_func(df_filt)
'Mike-Jim'
# Filter by values in second row of secondary DF
df_filt = df.filter(
(F.col("Age") == 66) &
(F.col('Job') == 'Fireman')
)
# Output of function
my_func(df_filt)
'Steve-Dan'
# Desired output at the end of the iterations
new_df1 = pd.DataFrame({"Age": [22, 66], "Job": ["Doc", "Fireman"], "Returned_value": ['Mike-Jim', 'Steve-Dan']})
Basically, I want to take my Master DF and filter it in certain ways, and run an algorithm on the filtered dataset and get the output for that filtering, then go on to the next set of filtering and do the same.
What is the best way to go about this?
Try this with join, groupBy, concat_ws/array_join and collect_list.
from pyspark.sql import functions as F
df.join(df1,['Age','Job'])\
.groupBy("Age","Job").agg(F.concat_ws('-',F.collect_list("Name")).alias("Returned_value")).show()
#+---+-------+--------------+
#|Age| Job|Returned_value|
#+---+-------+--------------+
#| 22| Doc| Mike-Jim|
#| 66|Fireman| Steve-Dan|
#+---+-------+--------------+