how to store Pyspark dataframe into HBase - dataframe

I have code that converts PySpark streaming data into a DataFrame. I need to store this DataFrame in HBase. Please help me write the additional code.
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
def getSparkSessionInstance(sparkConf):
    """Return the shared SparkSession, building it on the first call.

    The session is cached in this module's globals under the key
    'sparkSessionSingletonInstance'; ``sparkConf`` is only used when no
    cached session exists yet.
    """
    cache = globals()
    try:
        return cache['sparkSessionSingletonInstance']
    except KeyError:
        builder = SparkSession.builder.config(conf=sparkConf)
        cache['sparkSessionSingletonInstance'] = builder.getOrCreate()
        return cache['sparkSessionSingletonInstance']
if __name__ == "__main__":
    # Expect exactly two command-line arguments: the socket host and port.
    if len(sys.argv) != 3:
        print("Usage: sql_network_wordcount.py <hostname> <port> ",
              file=sys.stderr)
        sys.exit(-1)
    host, port = sys.argv[1:]

    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 1)
    lines = ssc.socketTextStream(host, int(port))

    def process(time, rdd):
        """Convert one batch of space-separated lines to a DataFrame and show it.

        Each line is split on single spaces and the resulting fields are
        loaded into a DataFrame with the columns "lat" and "lon".
        """
        print("========= %s =========" % str(time))
        try:
            words = rdd.map(lambda line: line.split(" ")).collect()
            if not words:
                # Nothing arrived in this batch; there are no rows to build
                # a DataFrame from.
                return
            spark = getSparkSessionInstance(rdd.context.getConf())
            linesDataFrame = spark.createDataFrame(words, schema=["lat", "lon"])
            linesDataFrame.show()
        except Exception as err:
            # Keep the stream running when a batch is malformed, but report
            # the failure instead of silently discarding it.
            print("Failed to process batch %s: %s" % (time, err),
                  file=sys.stderr)

    lines.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()

You can use the Spark-HBase connector (SHC) to access HBase from Spark. It provides an API for both low-level RDDs and DataFrames.
The connector requires you to define a Schema for HBase table. Below is an example of Schema defined for a HBase table with name as table1, row key as key and a number of columns (col1-col8). Note that the rowkey also has to be defined in details as a column (col0), which has a specific cf (rowkey).
# JSON catalog describing how DataFrame columns map onto the HBase table
# "table1" in the "default" namespace. "col0" is the row key (column family
# "rowkey"); col1-col4 live in family "cf1" and col5-col8 in family "cf2".
# The triple-quoted text is split and re-joined to strip all whitespace,
# leaving a compact single-line JSON string.
catalog = ''.join("""{
    "table":{"namespace":"default", "name":"table1"},
    "rowkey":"key",
    "columns":{
        "col0":{"cf":"rowkey", "col":"key", "type":"string"},
        "col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
        "col2":{"cf":"cf1", "col":"col2", "type":"double"},
        "col3":{"cf":"cf1", "col":"col3", "type":"float"},
        "col4":{"cf":"cf1", "col":"col4", "type":"int"},
        "col5":{"cf":"cf2", "col":"col5", "type":"bigint"},
        "col6":{"cf":"cf2", "col":"col6", "type":"smallint"},
        "col7":{"cf":"cf2", "col":"col7", "type":"string"},
        "col8":{"cf":"cf2", "col":"col8", "type":"tinyint"}
    }
}""".split())
Once the catalog is defined according to the schema of your DataFrame, you can write the DataFrame to HBase using:
# Write the DataFrame through the HBase data source, using the catalog to
# map DataFrame columns onto the table's column families.
hbase_writer = df.write.options(catalog=catalog)
hbase_writer = hbase_writer.format("org.apache.spark.sql.execution.datasources.hbase")
hbase_writer.save()
To Read the data from HBase:
# Load the HBase table described by the catalog back into a DataFrame.
# The catalog is passed with options(**kwargs); option() only accepts a
# (key, value) pair and rejects a `catalog=` keyword argument.
df = (
    spark.read
    .options(catalog=catalog)
    .format("org.apache.spark.sql.execution.datasources.hbase")
    .load()
)
You need to include the Spark-HBase connector package as below while submitting the spark application.
pyspark --packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 --repositories http://repo.hortonworks.com/content/groups/public/

Related

Pandas: combine multiple columns in a BQ table to generate a payload for the FB Conversions API

I am reading from a bigquery table to generate a payload to upload to FB conversions api.
cols=["payload","client_user_agent","event_source_url"]
I am copying the column values directly from the BQ table because I am unable to print the full output of the dataframe in the notebook.
payload="{"pageDetail":{"pageName":"Confirmation","pageContentType":"cart","pageSiteSection":"cart","breadcrumbs":[{"title":"Home","url":"/en/home.html"},{"title":"Cart","url":"/cart"},{"title":"Confirmation","url":"/order-confirmation="}],"pageCategory":"Home","pageCategory1":"Cart","pageCategory2":"Confirmation","proBtbGlobalHeader":false},"orderDetails":{"hceid":"3b94a","orderConfirmed":true,"orderDate":"2021-01-15","orderId":"0123","unique":2,"pricingSummary":{"total":54.01},"items":[{"productId":"0456","quantity":1,"shippingAddress":{"postalCode":"V4N 3X3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}},{"productId":"0789","quantity":1,"fulfillment":{"fulfillmentCost":""},"shippingAddress":{"postalCode":"A4N 3Y3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}}],"billingAddress":{"postalCode":"M$X1A7"}},"event":{"type":"Load","page":"Confirmation","timestamp":1610706772998,"language":"English","url":"https://www"}}"
client_user_agent="Mozilla/5.0"
event_source_url= "https://www.def.com="
I need the values email = payload["orderDetails"]["hceid"] and value = payload["orderDetails"]["pricingSummary"]["total"].
Initially all the payload I wanted was in a single column and I was able to achieve the uploads with the following code
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
# Authenticate the SDK, pull the JSON payload column from BigQuery and send
# one "Purchase" event per returned row.
FacebookAdsApi.init(access_token=access_token)
query='''SELECT JSON_EXTRACT(payload, '$') AS payload FROM `project.dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
payload = df.to_dict(orient="records")
for record in payload:
    # Each record holds the payload as a JSON string; decode it first.
    order = json.loads(record["payload"])["orderDetails"]
    email = order["hcemuid"]
    value = order["pricingSummary"]["total"]
    order_id = order["orderId"]

    user_data = UserData(email)
    custom_data = CustomData(currency='CAD', value=value)
    event = Event(
        event_name='Purchase',
        event_time=int(time.time()),
        user_data=user_data,
        custom_data=custom_data,
        event_id=order_id,
        data_processing_options=[],
    )

    # One request per event, flagged with the test event code.
    event_request = EventRequest(
        events=[event],
        test_event_code='TEST8609',
        pixel_id=pixel_id,
    )
    response = event_request.execute()
    print(response)
Now there are two additional values: client_user_agent, which needs to be part of the user data, and event_source_url, which needs to be part of the event in the code above. They are stored in two separate columns of the GBQ table.
I have tried similar code as above for multiple columns but I am receiving a
TypeError: Object of type Series is not JSON serializable
So I tried concatenating the columns and then creating a JSON-serializable object, but I am still not able to do an upload.
Below is where I am stuck and not sure how to proceed; any input is appreciated.
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
# Attempt to build the upload payload from three BigQuery columns
# (payload, client_user_agent, event_source_url).
FacebookAdsApi.init(access_token=access_token)
query='''SELECT payload AS payload,location.userAgent as client_user_agent,location.referrer as event_source_url FROM `project.Dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
df.reset_index(drop=True, inplace=True)
payload = df.to_dict(orient="records")
print(payload)
## cols = ['payload', 'client_user_agent', 'event_source_url']
## df['combined'] = df[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
## del df["payload"]
## del df["client"]
## del df["source"]
## payload = df.to_dict(orient="records")
# Tried concatenating all columns in the dataframe, but was not able to
# create a valid JSON object for upload.
columns = ['payload', 'client_user_agent', 'event_source_url']
# The pattern is a regular expression anchored at the end of the string, so
# regex=True is passed explicitly; newer pandas versions treat the pattern
# as a literal string by default.
df['payload'] = df['payload'].str.replace(r'}"$', '', regex=True)
payload = df[columns].to_dict(orient='records')
print(payload)
## df = df.drop(columns=columns)
## pd.options.display.max_rows = 4000
# #print(payload)
# for i in payload:
# print(i["payload"])
# k = json.loads(i["payload"])
# email = k["orderDetails"]["hcemuid"]
# print(email)
I am following the instructions from this page: https://developers.facebook.com/docs/marketing-api/conversions-api
I have used the bigquery json_extract_scalar function to extract data from nested column instead of pandas which is a relatively better solution for my scenario.

Writing Data from pandas dataframe to PostgreSQL gives error of 'DataFrame' objects are mutable, thus they cannot be hashed

I am trying to save a data frame that was first imported into pandas from PostgreSQL as dfraw. After some manipulation I create another dataframe, df, and save it back to the same PostgreSQL database using SQLAlchemy. But when I try to save it, I get the error: 'DataFrame' objects are mutable, thus they cannot be hashed
PFB code below
import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
# Connection settings for the PostgreSQL server (placeholder values).
host = "something.something.azure.com"
dbname = "abcd"
user = "abcd"
password = "abcd"
sslmode = "require"
schema = 'xyz'

# Build the connection string and open the connection.
conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)
conn = psycopg2.connect(conn_string)
print("Connection established")
cursor = conn.cursor()

# Read every row of xyz.abc into memory.
cursor.execute("SELECT * FROM xyz.abc;")
rows = cursor.fetchall()

# Column layout of the fetched tuples: three identifier/time columns
# followed by the measurement columns that are converted to numbers.
numeric_columns = ["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]
datetime_columns = ["Timestamp","K"]
all_columns = ["ID","Timestamp","K"] + numeric_columns

dfraw = pd.DataFrame(rows, columns=all_columns)
dfraw[numeric_columns] = dfraw[numeric_columns].apply(pd.to_numeric)
dfraw[datetime_columns] = dfraw[datetime_columns].apply(pd.to_datetime)

# temp1 is a second name for the same DataFrame object, not a copy.
temp1 = dfraw
dfraw = temp1
# Helper functions for data manipulation and imputation
def remZero(df, dropCol):
    """Replace zeros with the previous value in each column outside ``dropCol``.

    Columns that are entirely zero, or contain no zero at all, are left
    unchanged. In every other column each zero takes the value of the
    nearest preceding non-zero entry; zeros at the start of a column have no
    predecessor and stay zero. The name of each modified column is printed.

    Args:
        df: DataFrame to modify in place.
        dropCol: Column labels to skip; passed to ``df.drop(..., axis=1)``,
            so every label must exist in ``df``.

    Returns:
        The same ``df`` object, after modification.
    """
    for k in df.drop(dropCol, axis=1):
        col = df[k]
        zeros = (col == 0).to_numpy()
        if zeros.all() or not zeros.any():
            continue
        print(k)
        # For every row, find the position of the latest non-zero entry at
        # or before it. This index-based forward fill avoids the deprecated
        # ``Series.replace(..., method='ffill')`` and keeps the column dtype.
        positions = np.arange(len(col))
        source = np.maximum.accumulate(np.where(zeros, -1, positions))
        # Leading zeros have no earlier non-zero entry: keep their own value.
        source = np.where(source < 0, positions, source)
        df[k] = col.to_numpy()[source]
    return df
# Column lists excluded from the zero-replacement step.
dropCol = ['Timestamp','K','ID','H','C','S']
dropCol2 = ['Timestamp','K','ID','Shift']
df = remZero(dfraw,dropCol)
from sqlalchemy import create_engine
# NOTE(review): the '#' between the password and the host looks like it was
# meant to be '@' - confirm against the real connection URL.
engine = create_engine('postgresql://abcd:abcd#something.something.azure.com:5432/abcd')
# NOTE: `name` is given the DataFrame object itself instead of a table-name
# string; this is the call that produces the error reported in the question.
df.to_sql(name=df, con=engine, index=False, if_exists='replace')
Error Message
I found a basic error in the code: I had missed putting quotation marks around the data frame name, so the DataFrame object itself was passed as the table name instead of a string. Basic hygiene was missed.
# The table name must be a string; here the table is literally called "df".
df.to_sql(name="df", con=engine, index=False, if_exists='replace')

Return KDB query to a pandas dataframe

I would like to extract data from a KDB database and place into a dataframe. My query runs fine in qpad, no issues; just need to write it into my Pandas dataframe. My code:
from qpython import qconnection
# Create the connection and save the handle to a variable
q = qconnection.QConnection(host = 'wokplpaxvj003', port = 11503, username = 'pelucas', password = 'Dive2600', timeout = 3.0)
# NOTE: the query text is wrapped in {...}, i.e. it defines a lambda and is
# sent without being invoked - this is the behaviour the question is about.
quote_query = '{select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id}'
try:
    # initialize connection
    q.open()
    print(q)
    connection_status = (q.protocol_version, q.is_connected())
    print('IPC version: %s. Is connected: %s' % connection_status)
    df = q.sendSync(quote_query)
    df.info()
finally:
    # Always release the connection, even when the query fails.
    q.close()
It fails on the df.info() raising AttributeError: 'QLambda' object has no attribute 'info' so I guess the call is not successful.
It looks like you've sent only a lambda but with no instruction to execute that lambda. Two options:
Don't make it a lambda
# Send the select statement itself so it is evaluated on the server.
plain_query = 'select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id'
df = q.sendSync(plain_query)
Execute the lambda
# Keep the lambda but invoke it with the trailing [] so the result is returned.
invoked_lambda = '{select from quote_flat where date within (2019.08.14;2019.08.14), amendment_no = (max;amendment_no)fby quote_id}[]'
df = q.sendSync(invoked_lambda)

Apache Beam job (Python) using Tensorflow Transform is killed by Cloud Dataflow

I'm trying to run an Apache Beam job based on Tensorflow Transform on Dataflow, but it's killed. Has anyone experienced that behaviour? This is a simple example with DirectRunner that runs OK locally but fails on Dataflow (after changing the runner accordingly):
import os
import csv
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import textio
from apache_beam.io import tfrecordio
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam import tft_beam_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import apache_beam as beam
# Names of the 2000 numeric input features: 'feature_0' ... 'feature_1999'.
NUMERIC_FEATURE_KEYS = ['feature_' + str(index) for index in range(2000)]
def _create_raw_metadata():
    """Build the DatasetMetadata describing the raw input features.

    Every key in NUMERIC_FEATURE_KEYS gets a ColumnSchema of type
    tf.float32 with an empty axes list and a FixedColumnRepresentation.
    """
    column_schemas = {
        key: dataset_schema.ColumnSchema(
            tf.float32, [], dataset_schema.FixedColumnRepresentation())
        for key in NUMERIC_FEATURE_KEYS
    }
    schema = dataset_schema.Schema(column_schemas)
    return dataset_metadata.DatasetMetadata(schema)
def preprocessing_fn(inputs):
    """Scale every numeric feature to the [0, 1] range.

    Calls tft.scale_to_0_1 once per key in NUMERIC_FEATURE_KEYS and returns
    a dict mapping each key to its scaled tensor.
    """
    return {key: tft.scale_to_0_1(inputs[key]) for key in NUMERIC_FEATURE_KEYS}
def main():
    """Run the analyze/transform pipeline on a random 100 x 2000 dataset.

    Writes the raw metadata, the transform function and the transformed
    examples (gzip-suffixed TFRecords) under a timestamped folder in /tmp.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_dir = '/tmp/tmp-folder-{}'.format(timestamp)
    RUNNER = 'DirectRunner'

    with beam.Pipeline(RUNNER) as p, beam_impl.Context(temp_dir=output_dir):
        raw_data_metadata = _create_raw_metadata()
        metadata_path = os.path.join(output_dir, 'rawdata_metadata')
        _ = (raw_data_metadata
             | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(metadata_path, pipeline=p))

        # Synthetic input: 100 rows of 2000 random values in [0, 100).
        samples = np.random.rand(100,2000)*100
        records = [dict(zip(NUMERIC_FEATURE_KEYS, row)) for row in samples]
        raw_data = p | 'CreateTestDataset' >> beam.Create(records)
        raw_dataset = (raw_data, raw_data_metadata)

        # Analyze pass: compute the transform function from the full dataset.
        transform_fn = raw_dataset | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn)
        _ = transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir)

        # Transform pass: apply the computed transform to the same dataset.
        transformed_data, transformed_metadata = (
            (raw_dataset, transform_fn) | 'Transform' >> beam_impl.TransformDataset())
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        train_path = os.path.join(output_dir, 'train')
        _ = (transformed_data
             | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                 train_path, file_name_suffix='.gz', coder=transformed_data_coder))


if __name__ == '__main__':
    main()
Also, my production code (not shown) fails with the message: The job graph is too large. Please try again with a smaller job graph, or split your job into two or more smaller jobs.
Any hint?
The restriction on the pipeline description size is documented here:
https://cloud.google.com/dataflow/quotas#limits
There is a way around that, instead of creating stages for each tensor that goes into tft.scale_to_0_1 we could fuse them by first stacking them together, and then passing them into tft.scale_to_0_1 with 'elementwise=True'.
The result will be the same, because the min and max are computed per 'column' instead of across the whole tensor.
This would look something like this:
# Stack all features into one tensor so a single scale_to_0_1 call handles
# them, then split the scaled result back into one tensor per feature key.
feature_columns = [inputs[key] for key in NUMERIC_FEATURE_KEYS]
stacked = tf.stack(feature_columns, axis=1)
scaled_stacked = tft.scale_to_0_1(stacked, elementwise=True)
outputs.update(zip(NUMERIC_FEATURE_KEYS, tf.unstack(scaled_stacked, axis=1)))

'Insert SparkSession DataFrame' automatically converting data from integer to float in DSX

I have a csv file which I insert to IBM Data Science Experience using "SparkSession DataFrame". All content in the csv file (other than the headers) are integers.
The dataframe works as expected through certain Machine Learning models until I try to create a Linear Regression Classification, where I get this error:
TypeError: Cannot cast array data from dtype('float64') to dtype('U32') according to the rule 'safe'
I believe this means that the data is no longer an integer and is being treated as a float.
How can I resolve this? Is there anything that can be done when you import the file to make sure it stays as an integer? See example below where I tried to add in a second option for format.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Read the CSV with a header row. The 'format' option below is the attempt
# described in the question to force integer columns.
# NOTE(review): 'format' is presumably not an option the CSV reader
# recognises - confirm against the reader documentation.
df = spark.read\
    .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
    .option('header', 'true')\
    .option('format', 'int32')\
    .load(bmos.url('name', 'name.csv'))
df.take(5)
@charles-gomes is correct. Here's a complete example where my file tinyinttest.csv is in an object store container called TestingSandbox.
Contents of tinyinttest.csv is:
name,val
a,1
b,2
code:
from pyspark.sql import SparkSession
import ibmos2spark
# Object storage credentials (placeholder values).
credentials = dict(
    auth_url='https://identity.open.softlayer.com',
    project_id='xxx',
    region='xxx',
    user_id='xxx',
    username='xxx',
    password='xxx',
)
configuration_name = 'xxx'
bmos = ibmos2spark.bluemix(sc, credentials, configuration_name)

spark = SparkSession.builder.getOrCreate()
# inferSchema asks the CSV reader to detect column types from the data.
csv_reader = (
    spark.read
    .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')
    .option('header', 'true')
    .option('inferSchema', 'true')
)
df = csv_reader.load(bmos.url('TestingSandbox', 'tinyinttest.csv'))
df.schema
output:
StructType(List(StructField(name,StringType,true),StructField(val,IntegerType,true)))