I have a PySpark application that loads data from Kinesis and saves it to S3.
Each batch's processing time is quite stable, but then it can get stuck.
How can I figure out why it happens?
Code sample:
# Column names in schema order; event_to_row() builds each Row in this order.
columns = [x.name for x in schema]
# Row factory with one field per column. The full column name is used:
# indexing a name with [0] would keep only its first character and yield
# meaningless (and possibly duplicated) field names.
Event = Row(*columns)
def get_spark_session_instance(sparkConf):
    """Return the module-wide SparkSession, building it on the first call.

    The session is kept in this module's globals under the key
    "sparkSessionSingletonInstance". Once it exists, later calls return the
    stored object and do not look at ``sparkConf`` again.
    """
    registry = globals()
    key = "sparkSessionSingletonInstance"
    if key in registry:
        return registry[key]
    builder = SparkSession.builder.config(conf=sparkConf)
    registry[key] = builder.getOrCreate()
    return registry[key]
def creating_func():
    """Build the StreamingContext for the Kinesis -> S3 / JDBC job.

    Used as the setup function of ``StreamingContext.getActiveOrCreate``.
    It wires a Kinesis stream to ``process`` through ``foreachRDD`` and
    enables checkpointing under ``checkpoint_prefix + sys.argv[1]``.

    Returns:
        The configured StreamingContext (this function does not start it).
    """

    def timing(message):
        """Print a UTC-timestamped progress marker for *message*."""
        print('timing', str(datetime.utcnow()), message)

    def process_game(df, game, time_part):
        """Write one game's slice of the batch to S3 and to the database.

        Args:
            df: DataFrame already filtered to a single app_id.
            game: Name used in the S3 path, the table name and the log lines.
            time_part: Time-based path suffix shared by the whole batch.
        """
        # s3
        df.write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_1', time_part),
                      compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_grop_1'.format(game))
        df[df['group'] == 2] \
            .write.json("{}/{}/{}/{}".format(path_prefix, game, 'group_2', time_part),
                        compression="gzip", timestampFormat="yyyy-MM-dd'T'HH:mm:ss.SSS")
        timing('{}_grop_2'.format(game))
        # database
        df[df['group'] == 3].select(*db_columns) \
            .write.jdbc(db_connection_string, table="test.{}group_3".format(game), mode='append',
                        properties=db_connection_propetries)
        timing('{}_db'.format(game))

    def event_to_row(event):
        """Parse one JSON event string into an ``Event`` row.

        A truthy 'json_data' value is re-serialised to a JSON string; a
        missing or falsy one is stored unchanged.
        """
        event_dict = json.loads(event)
        json_data = event_dict.get('json_data')
        event_dict['json_data'] = json_data and json.dumps(json_data)
        return Event(*[event_dict.get(x) for x in columns])

    def process(rdd):
        """Handle one micro-batch; empty batches are skipped."""
        if rdd.isEmpty():
            return
        spark_time = datetime.utcnow().strftime('%Y/%m/%d/%H/%M%S_%f')
        rows_rdd = rdd.map(event_to_row)
        spark = get_spark_session_instance(rdd.context.getConf())
        df = spark.createDataFrame(data=rows_rdd, schema=schema)
        df = df.withColumn("ts", df["ts"].cast(TimestampType())) \
            .withColumn("processing_time", lit(datetime.utcnow()))
        # The batch is written several times below, so cache it once ...
        df.cache()
        try:
            print('timing -----------------------------')
            process_game(df[df['app_id'] == 1], 'app_1', spark_time)
            process_game(df[df['app_id'] == 2], 'app_2', spark_time)
        finally:
            # ... and always release it again. Without this every batch
            # leaves its cached blocks behind on the executors.
            df.unpersist()

    sc = SparkContext.getOrCreate()
    # 240 is the batch interval passed to StreamingContext.
    ssc = StreamingContext(sc, 240)
    kinesis_stream = KinesisUtils.createStream(
        ssc, sys.argv[2], 'My-stream-name', "kinesis.us-east-1.amazonaws.com",
        'us-east-1', InitialPositionInStream.TRIM_HORIZON, 240, StorageLevel.MEMORY_AND_DISK_2)
    # Spread each batch over 48 partitions before processing it.
    kinesis_stream.repartition(16 * 3).foreachRDD(process)
    ssc.checkpoint(checkpoint_prefix + sys.argv[1])
    return ssc
if __name__ == '__main__':
    # Entry point: restore the streaming context from its checkpoint
    # directory, or build a fresh one with creating_func, then run it until
    # the job is terminated.
    print('timing', 'cast ts', str(datetime.utcnow()))
    checkpoint_dir = checkpoint_prefix + sys.argv[1]
    ssc = StreamingContext.getActiveOrCreate(checkpoint_dir, creating_func)
    ssc.start()
    ssc.awaitTermination()
Streaming Web UI
Batches info
Identify the process that is taking the time, then use kill -QUIT or jstack to get its stack trace. Look in the source for possible delays, and consider where you can increase log4j logging for more info.
Does the delay increase with the amount of data written? If so, that's the usual "rename is really a copy" problem that S3 has.
Related
When I write the data (100000 rows, 6 columns, 13.09 MB) in CSV format using Spark (3.1.2), it takes around 5 s. But if I include AutoML training and prediction using the Sparkling Water internal backend, the write takes around 12 minutes. I have checked the type of the AutoML prediction output: it is a PySpark DataFrame. Is this behavior caused by the conversion between an H2OFrame and a Spark DataFrame, since Sparkling Water uses a wrapper around the H2OFrame that exposes the RDD/DataFrame API? Is there any way to improve the write speed? Any help on this issue would be really appreciated. I have attached the sample script below.
from pyspark.sql.types import *
from pysparkling import *
from pysparkling.ml import *
import time
import logging
# SparkSession is used below; import it explicitly rather than relying on
# one of the wildcard imports to provide it.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
hc = H2OContext.getOrCreate()

# Logger that writes timestamped DEBUG-and-above records to the console.
my_logger = logging.getLogger(__name__)
my_logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s : %(name)s : %(levelname)s : %(message)s')
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
my_logger.addHandler(stream_handler)

read_location = "XXX"
write_location = "XXX"
# Keyword arguments passed straight to H2OAutoML.
train_config = {
    "labelCol": "XXX",
    "maxModels": 2,
    "excludeAlgos": ["XGBoost", "StackedEnsemble", "GLM", "DeepLearning"],
    "maxRuntimeSecsPerModel": 300,
    "maxRuntimeSecs": 600
}
# bq config
my_logger.info("Reading CSV")
df = spark.read.format("csv").option("header", True).load(read_location)
my_logger.info("Read CSV")
my_logger.info("Printing input DF schema")
df.printSchema()
my_logger.info("Printed input DF schema")

my_logger.info("Starting model training")
autoMLEstimator = H2OAutoML(**train_config)
model = autoMLEstimator.fit(df)
my_logger.info("Model training finished")

my_logger.info("Starting prediction")
outputDf = model.transform(df)
my_logger.info("Prediction done")
my_logger.info("Printing output DF schema")
outputDf.printSchema()
my_logger.info("Printed output DF schema")

# NOTE(review): Spark transformations are evaluated lazily, so the scoring
# defined by model.transform() is presumably only executed by the write
# below and is included in the time measured here. Confirm in the Spark UI.
startTime = time.time()
my_logger.info("Starting to write")
outputDf.write.csv(write_location, header='true')
my_logger.info("Writing finished")
seconds = time.time() - startTime
my_logger.info("Total Time Taken: %s", time.strftime("%H:%M:%S", time.gmtime(seconds)))
I have a function that creates a data frame by running multiprocessing over a df:
Suppose I have 10 rows in my df; the function processor will then process all 10 rows separately. What I want is to concatenate all the outputs of processor into one data frame.
def processor(dff):
    """Process one chunk of the input data frame in a worker process.

    Placeholder for reading data from the chunk and doing all sorts of data
    manipulation for multiprocessing.

    Args:
        dff: The data-frame chunk handed to this worker.

    Returns:
        The processed chunk. With the manipulation still elided this is the
        input itself. The previous ``return df`` referred to a name that does
        not exist inside this function.
    """
    return dff
def main(infile, mdebug):
    """Run ``processor`` over chunks of a CSV file and write each result.

    The file is split into ``lines + 1`` chunks, each chunk is processed in
    a worker pool, and result number n (1-based) is written with
    ``to_csv`` to a file named after n.

    Args:
        infile: Path of the CSV file to process.
        mdebug: Debug level; stored in the module-level ``debug`` and, when
            >= 2, makes the function print the input path.

    Raises:
        SystemExit: With code 2000 when the file cannot be opened or parsed.
    """
    global debug
    debug = mdebug
    try:
        # Count physical lines; the context manager closes the handle again.
        with open(infile) as handle:
            lines = sum(1 for _ in handle)
    except Exception as err:
        # .format() must be applied to the string, not to print()'s None.
        print("Error {} opening file: {}".format(err, infile))
        sys.exit(2000)
    if debug >= 2:
        print(infile)
    try:
        dff = pd.read_csv(infile)
    except Exception as err:
        print("Error {}, opening file: {}".format(err, infile))
        sys.exit(2000)
    # NOTE(review): this asks for more chunks than the frame has rows, so
    # some chunks are presumably empty. Confirm this is intended.
    df_split = np.array_split(dff, (lines+1))
    # Pool() rejects a size below 1, which a file of at most one line hits.
    pool = Pool(max(1, lines - 1))
    try:
        for n, frame in enumerate(pool.imap(processor, df_split), start=1):
            if frame is not None:
                frame.to_csv('{}'.format(n))
    finally:
        # Shut the workers down even when processing or writing fails.
        pool.close()
        pool.join()
if __name__ == "__main__":
    # Script entry point: parse the command line and hand over to main().
    args = parse_args()
    # print "Debug is: {}".format(args.debug)
    if args.debug >= 1:
        # Pass the level to print() itself. Written as `print("..."), x`,
        # Python 3 builds a throw-away tuple and never prints the level.
        print("Running in debug mode: ", args.debug)
    main(infile=args.infile, mdebug=args.debug)
You can use either the DataFrame constructor or concat to solve your problem. The appropriate one to use depends on details of your code that you haven't included.
Here's a more complete example:
import numpy as np
import pandas as pd
from multiprocessing import Pool

# create dummy dataset
dff = pd.DataFrame(np.random.rand(101, 5), columns=list('abcde'))
# process data: one chunk per task, results come back in chunk order
# NOTE(review): on platforms that start workers with "spawn", this part
# should live under `if __name__ == "__main__":`.
with Pool() as pool:
    result = pool.map(processor, np.array_split(dff, 7))
# put it all back together in one dataframe; pandas' concat keeps the
# result a DataFrame with the original index and column labels
result = pd.concat(result)
I run a simple query to get the cookie as a string and the timestamps as an array using PySpark SQL.
I want to pass them to my user-defined function, but the array of timestamps is passed as an array of unicode strings.
Can someone help me figure this out? Thanks.
# The function is used as a column expression in withColumn(), so it must be
# registered as a UDF. As a plain function it would receive Column objects
# and return None instead of a Column.
@udf(returnType=StringType())
def PrintDetails(cookie, timestamps, current_day, current_hourly_threshold, current_daily_threshold):
    """Debug UDF: print the Python type of the first timestamp of a cookie.

    Args:
        cookie: Cookie value of the row.
        timestamps: Collected impression times of that cookie. The query
            builds them with date_format(), which yields strings, so string
            elements are expected here.
        current_day: Impression date passed in as a literal.
        current_hourly_threshold: Hourly threshold passed in as a literal.
        current_daily_threshold: Daily threshold passed in as a literal.

    Returns:
        None, which becomes a null string value in the resulting column.
    """
    print(type(timestamps[0]))
def main(argv):
    """Load the parquet input, collect impression times per cookie, show them.

    Args:
        argv: Command-line arguments (currently unused).
    """
    spark = SparkSession \
        .builder \
        .appName("parquet_test") \
        .config("spark.debug.maxToStringFields", "100") \
        .getOrCreate()
    inputPath = r'D:\Hadoop\Spark\parquet_input_files'
    inputFiles = os.path.join(inputPath, '*.parquet')
    impressionDate = datetime.strptime("2019_12_31", '%Y_%m_%d')
    current_hourly_threshold = 40
    current_daily_threshold = 200
    parquetFile = spark.read.parquet(inputFiles)
    parquetFile.createOrReplaceTempView("parquetFile")
    # Pattern letters matter here: 'yyyy' is the calendar year and 'HH' the
    # 24-hour clock. 'YYYY' is the week-based year (wrong around New Year)
    # and 'hh' is a 12-hour clock with no AM/PM marker.
    # date_format() returns strings, so imp_times is an array of strings.
    cookie_and_time = spark.sql("SELECT cookie, collect_list(date_format(from_unixtime(ts), 'yyyy-MM-dd-HH:mm:ss')) as imp_times FROM parquetFile group by 1 ")
    cookie_df = cookie_and_time.withColumn(
        "cookies",
        PrintDetails(cookie_and_time['cookie'], cookie_and_time['imp_times'],
                     lit(impressionDate), lit(current_hourly_threshold),
                     lit(current_daily_threshold)))
    cookie_df.show()
if __name__ == "__main__":
    # Script entry point: forward the raw command line to main().
    main(argv=sys.argv)
To start with, I am not a developer, but a mere automation engineer who has worked a bit with coding in Java, Python, C#, C++ and C.
I am trying to make a prototype that takes pictures and stores them, triggered by a digital pin on the board. At the moment I can take pictures using a switch, but it is really slow (around 3 seconds per image).
My complete system is going to be like this:
A product passes by on a conveyor and a photo cell triggers the board to take an image and store it. If an operator removes a product(because of bad quality) the image is stored in a different folder.
I started with the snapshot function shipped with Mendel and have tried to get rid of the overhead, but the GStreamer and pipeline stuff confuses me a lot.
If someone could help me understand the supplied code, or show me how to write a minimalistic solution to take an image, I would be grateful :)
I have tried to understand and use project-teachable and examples-camera from Google Coral (https://github.com/google-coral), but with no luck. I have had the best luck with the snapshot tool that uses snapshot.py, which is referenced here: https://coral.withgoogle.com/docs/camera/datasheet/#snapshot-tool
from periphery import GPIO
import time
import argparse
import contextlib
import fcntl
import os
import select
import sys
import termios
import threading
import gi
gi.require_version('Gst', '1.0')
gi.require_version('GstBase', '1.0')
from functools import partial
from gi.repository import GLib, GObject, Gst, GstBase
from PIL import Image
# One-time GObject / GStreamer initialisation, executed at import time.
GObject.threads_init()
Gst.init(None)
# Frame size on_new_sample() passes to PIL when wrapping the raw RGB buffer.
WIDTH = 2592
HEIGHT = 1944
# Not referenced in the visible part of this file.
FILENAME_PREFIX = 'img'
FILENAME_SUFFIX = '.png'
# sysfs node that main() opens and writes to.
AF_SYSFS_NODE = '/sys/module/ov5645_camera_mipi_v2/parameters/ov5645_af'
# The next two nodes are not referenced in the visible part of this file.
CAMERA_INIT_QUERY_SYSFS_NODE = '/sys/module/ov5645_camera_mipi_v2/parameters/ov5645_initialized'
HDMI_SYSFS_NODE = '/sys/class/drm/card0/card0-HDMI-A-1/status'
# No of initial frames to throw away before camera has stabilized
SCRAP_FRAMES = 1
# Source settings; run_pipeline() substitutes them into SRC_CAPS.
SRC_WIDTH = 2592
SRC_HEIGHT = 1944
SRC_RATE = '15/1'
SRC_ELEMENT = 'v4l2src'
# Sink settings; run_pipeline() substitutes them into SINK_CAPS.
SINK_WIDTH = 2592
SINK_HEIGHT = 1944
# The element is named 'appsink' so run_pipeline() can look it up by name
# and connect to its 'new-sample' signal.
SINK_ELEMENT = ('appsink name=appsink sync=false emit-signals=true '
'max-buffers=1 drop=true')
# Possible sinks for the first tee branch; run_pipeline() uses FAKE_SINK.
SCREEN_SINK = 'glimagesink sync=false'
FAKE_SINK = 'fakesink sync=false'
# Caps templates, filled in with str.format() in run_pipeline().
SRC_CAPS = 'video/x-raw,format=YUY2,width={width},height={height},framerate={rate}'
SINK_CAPS = 'video/x-raw,format=RGB,width={width},height={height}'
# Queue element inserted after the source and at the head of each branch.
LEAKY_Q = 'queue max-size-buffers=1 leaky=downstream'
# Pipeline template: the source feeds a tee named 't' with two branches,
# one ending in the screen sink and one converting to the sink caps for
# the appsink.
PIPELINE = '''
{src_element} ! {src_caps} ! {leaky_q} ! tee name=t
t. ! {leaky_q} ! {screen_sink}
t. ! {leaky_q} ! videoconvert ! {sink_caps} ! {sink_element}
'''
def on_bus_message(bus, message, loop):
    """Bus 'message' handler: report warnings and errors, stop when done.

    End-of-stream and error messages quit *loop*. Warnings and errors are
    written to stderr. Every other message type is ignored.

    Returns:
        Always True.
    """
    kind = message.type
    if kind == Gst.MessageType.EOS:
        loop.quit()
        return True
    if kind == Gst.MessageType.WARNING:
        warning, detail = message.parse_warning()
        sys.stderr.write('Warning: %s: %s\n' % (warning, detail))
        return True
    if kind == Gst.MessageType.ERROR:
        error, detail = message.parse_error()
        sys.stderr.write('Error: %s: %s\n' % (error, detail))
        loop.quit()
    return True
def on_new_sample(sink, snapinfo):
    """appsink 'new-sample' handler: save the frame if *snapinfo* wants it.

    Args:
        sink: The appsink that emitted the signal.
        snapinfo: Helper that decides whether to keep the frame
            (``save_frame``) and supplies the output file name
            (``get_filename``).

    Returns:
        Gst.FlowReturn.OK in every case.
    """
    if not snapinfo.save_frame():
        # Throw away the frame
        return Gst.FlowReturn.OK
    sample = sink.emit('pull-sample')
    buf = sample.get_buffer()
    result, mapinfo = buf.map(Gst.MapFlags.READ)
    if result:
        try:
            imgfile = snapinfo.get_filename()
            img = Image.frombytes('RGB', (WIDTH, HEIGHT), mapinfo.data, 'raw')
            try:
                img.save(imgfile)
            finally:
                img.close()
        finally:
            # Always release the mapping, even when building or saving the
            # image fails. Otherwise the buffer stays mapped.
            buf.unmap(mapinfo)
    return Gst.FlowReturn.OK
def run_pipeline(snapinfo):
    """Build the capture pipeline, run it until the main loop quits, tear down.

    Args:
        snapinfo: Helper passed to ``on_new_sample``. It also receives the
            main loop through ``connect_loop`` so it can stop the run.
    """
    src_caps = SRC_CAPS.format(width=SRC_WIDTH, height=SRC_HEIGHT, rate=SRC_RATE)
    sink_caps = SINK_CAPS.format(width=SINK_WIDTH, height=SINK_HEIGHT)
    screen_sink = FAKE_SINK
    description = PIPELINE.format(
        leaky_q=LEAKY_Q,
        src_element=SRC_ELEMENT,
        src_caps=src_caps,
        sink_caps=sink_caps,
        sink_element=SINK_ELEMENT,
        screen_sink=screen_sink)
    pipeline = Gst.parse_launch(description)
    appsink = pipeline.get_by_name('appsink')
    appsink.connect('new-sample', partial(on_new_sample, snapinfo=snapinfo))
    loop = GObject.MainLoop()
    # Set up a pipeline bus watch to catch errors.
    bus = pipeline.get_bus()
    bus.add_signal_watch()
    bus.connect('message', on_bus_message, loop)
    # Connect the loop to the snaphelper
    snapinfo.connect_loop(loop)
    # Run pipeline.
    pipeline.set_state(Gst.State.PLAYING)
    try:
        loop.run()
    except KeyboardInterrupt:
        # Ctrl-C simply ends this capture.
        pass
    except Exception as exc:
        # Still best-effort, but no longer silent.
        sys.stderr.write('Error: main loop aborted: %s\n' % exc)
    finally:
        # Clean up.
        pipeline.set_state(Gst.State.NULL)
        while GLib.MainContext.default().iteration(False):
            pass
        # Pair add_signal_watch() with its removal. This function runs once
        # per picture, so an unreleased watch would accumulate.
        bus.remove_signal_watch()
class SnapHelper:
    """Per-capture state shared with the pipeline callbacks.

    Keeps track of how many initial frames still have to be dropped,
    whether the next frame should be saved, the running index used for
    file names, and the main loop to stop in one-shot mode.
    """

    def __init__(self, sysfs, prefix='img', oneshot=True, suffix='jpg'):
        self.sysfs = sysfs
        self.prefix = prefix
        self.suffix = suffix
        self.oneshot = oneshot
        # A frame is requested from the start exactly when oneshot is set.
        self.snap_it = oneshot
        self.num = 0
        self.scrapframes = SCRAP_FRAMES

    def get_filename(self):
        """Return the next '<prefix>NNNN.<suffix>' path that does not exist.

        ``self.num`` is advanced past every candidate that was tried.
        """
        while True:
            index = self.num
            self.num = index + 1
            candidate = ''.join(
                (self.prefix, str(index).zfill(4), '.', self.suffix))
            if not os.path.exists(candidate):
                return candidate

    #def check_af(self):
        #try:
        #    self.sysfs.seek(0)
        #    v = self.sysfs.read()
        #    if int(v) != 0x10:
        #        print('NO Focus')
        #except:
        #    pass
    # def refocus(self):
    #    try:#
    #        self.sysfs.write('1')
    #        self.sysfs.flush()
    #    except:
    #        pass

    def save_frame(self):
        """Tell the caller whether the current frame should be saved.

        The first ``scrapframes`` calls return False and only count down,
        so the camera can stabilize. After that a pending snap request is
        consumed and, in one-shot mode, the main loop is asked to quit.
        """
        if self.scrapframes > 0:
            self.scrapframes -= 1
            return False
        wanted = True if self.snap_it else False
        if wanted:
            self.snap_it = False
        if self.oneshot:
            self.loop.quit()
        return wanted

    def connect_loop(self, loop):
        """Remember the main loop that save_frame() may have to stop."""
        self.loop = loop
def take_picture(snap):
    """Run one capture pipeline for *snap* and print the elapsed seconds.

    A monotonic clock is read before and after the run. Rounding the start
    time to whole seconds, as before, skewed the result by up to half a
    second.
    """
    start_time = time.monotonic()
    run_pipeline(snap)
    print(time.monotonic() - start_time)
def main():
    """Poll the button on GPIO 138 and take a picture on each rising edge.

    Writes '2' to the sysfs node named by AF_SYSFS_NODE once at start-up,
    then loops forever. A picture is taken whenever the button reads true
    after having read false on the previous poll.
    """
    button = GPIO(138, "in")
    last_state = False
    try:
        with open(AF_SYSFS_NODE, 'w+') as sysfs:
            sysfs.write('2')
            # The file stays open for the whole (endless) loop, so push the
            # buffered write out now. Otherwise it never reaches the node.
            sysfs.flush()
            while 1:
                button_state = button.read()
                if button_state and not last_state:
                    snap = SnapHelper(sysfs, 'test', True, 'jpg')
                    take_picture(snap)
                last_state = button_state
    finally:
        # Release the GPIO line when the loop is left (e.g. Ctrl-C).
        button.close()
if __name__ == "__main__":
    # Script entry point: run the capture loop, then exit with the default
    # status.
    main()
    sys.exit(None)
The output is what I expect, but it is slow.
I switched to a USB-webcam and used the pygame library instead.
I need to run some clustering algorithms in parallel in a Jupyter notebook. The clustering function I want to parallelize works when using multithreading or when run individually. However, it raises
raise Py4JError("{0} does not exist in the JVM".format(name))
when I try multiprocessing. I don't have much experience with multiprocessing, what could I be doing wrong?
Code for clustering:
def clustering(ID, df):
    """Cluster the rows of *df* whose ``type`` equals *ID*.

    Args:
        ID: Value of the ``type`` column selecting the rows to cluster.
            It is also used in the progress messages.
        df: Spark DataFrame with the columns "row", "features" and "type".
            Each "features" value must offer ``toArray()``.
    """
    pandas_df = df.select("row", "features", "type") \
        .where(df.type == ID).toPandas()
    print("process " + str(ID) + ": preparing data for clustering")
    # Expand every feature vector into one pandas column per component.
    feature_series = pandas_df["features"].apply(lambda x: x.toArray())
    objs = [pandas_df, pd.DataFrame(feature_series.tolist())]
    t_df = pd.concat(objs, axis=1)
    print("process " + str(ID) + ": initiating clustering")
    # Placeholder: the clustering algorithm goes here. A bare `c=` followed
    # only by a comment does not parse, so it gets an explicit empty value.
    c = None
    print("process " + str(ID) + " DONE!")
    return
Code for multiprocessing:
import multiprocessing as mp
# Number of clustering processes to start (one per type ID 0..k-1).
k = 4
if __name__ == '__main__':
    pl = []
    for i in range(0, k):
        print("sending process:", i)
        process = mp.Process(target=clustering, args=(i, df))
        # Track the process in the same list that is joined below. The
        # former `jobs` list was never defined.
        pl.append(process)
        process.start()
    for process in pl:
        print("waiting for join from process")
        process.join()
The error was caused by the subprocesses not being able to access the same memory (in which the PySpark DataFrame resided).
I solved it by partitioning the dataset first, moving the access to the PySpark DataFrame into another function, like so:
# Pull only the rows of the requested type to the driver as a pandas
# DataFrame, before any subprocess is involved.
pandas_df = (
    df.select("row", "features", "type")
    .where(df.type == ID)
    .toPandas()
)
And then running the clustering on the separated Pandas dataframes.