spark streaming with json file - apache-spark-sql

I want to read json data from a folder location through spark streaming.
I assume my json data is
{"transactionId":111,"customerId":1,"itemId": 1,"amountPaid": 100}
I want the output in Spark SQL table as:--
transactionId customerId itemId amountPaid
111 1 1 100
my code is :
package org.training.spark.streaming
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Duration
import org.apache.spark.sql.functions.udf
import org.training.spark.streaming.sqlstreaming.Persons
object jsonread {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setMaster("local").setAppName("jsonstreaming")
val sc = new SparkContext(sparkConf)
// Create the context
val ssc = new StreamingContext(sc, Seconds(40))
val lines = ssc.textFileStream("src/main/resources/fileStreaming")
lines.foreachRDD(rdd=>rdd.foreach(println))
val words = lines.flatMap(_. split(","))
words.foreachRDD(rdd=>rdd.foreach(println))
val sqc = new SQLContext(sc);
import sqc.implicits._
words.foreachRDD { rdd =>
val persons = rdd.map(_.split(":")).map(p => (p(0), p(1))).toDF()
persons.registerTempTable("data")
val jsontable = sqc.sql("SELECT * from data")
jsontable.show
}
ssc.start()
ssc.awaitTermination()
}
}

Json Data:
{"transactionId":"111","customerId":"1","itemId": "1","amountPaid": "100"}
Pyspark Code to read from above json data:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import IntegerType, LongType, DecimalType,StructType, StructField, StringType
from pyspark.sql import Row
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql import Window
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
ssc = StreamingContext(sc, 5)
stream_data = ssc.textFileStream("/filepath/")
def readMyStream(rdd):
if not rdd.isEmpty():
df = spark.read.json(rdd)
print('Started the Process')
print('Selection of Columns')
df = df.select('transactionId','customerId','itemId','amountPaid').where(col("transactionId").isNotNull())
df.show()
stream_data.foreachRDD( lambda rdd: readMyStream(rdd) )
ssc.start()
ssc.stop()

Related

Read web content into a dataframe without writing to a file

I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1,len(str1)):
if len(str1[ii])>0:
df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each string into a dataframe
if not df1.empty:
df2 = df1.iloc[:,0:3] #Get the first five columns
if df2.iloc[0,-1] != 'M': #Don't append the ones with missing data
dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as request:
response = request.get(url, timeout=30)
if response.status_code != 200:
print(response.raise_for_status())
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)

Selenium Date Format

I'm pulling data with selenium and saving this data to the database. Although the relevant column in the database is date and the field is filled in the relevant site, the database is empty, as '0000-00-00'
The code for the area I'm scraping.
if "Test" in description_list:
index_no = description_list.index("Test")
try:
first_registration = value_list[index_no]
except:
first_registration =
An example of the date I am trying to engrave; 07/28. I appreciate your help.
from xml.etree.ElementTree import QName
import bs4
import urllib.request
import pandas as pd
from datetime import datetime
from tkinter import E
import pymysql
import mysql.connector
import configparser
import re
import numpy as np
import time
import concurrent.futures
# import erequests
# import lxml
from multiprocessing import Pool
# from multiprocessing import Process, Lock
from multiprocessing import Process
from datetime import datetime
from tqdm import tqdm # progress bar
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime
from sqlalchemy import create_engine
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.options import Options
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="m_n"
)
mycursor = mydb.cursor()
sql = "SELECT ad_link FROM adlinks_d"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 1
y = 5
#def fonksiyon(i):
# global x
# global y
number = np.arange(x,y)
for i in tqdm(number):
ad_link = dataframe.links[i]
fireFoxOptions = Options()
fireFoxOptions.binary_location = r'C:\Program Files\Firefox Developer Edition\firefox.exe'
fireFoxOptions.add_argument("--headless")
fireFoxOptions.add_argument('--disable-gpu')
fireFoxOptions.add_argument('--no-sandbox')
driver = webdriver.Firefox(options=fireFoxOptions)
sleep_time = 1
driver.get(ad_link)
time.sleep(sleep_time)
ad_source = driver.page_source
ad_soup = BeautifulSoup(ad_source, 'html.parser')
mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
cars_data = pd.DataFrame({
'brand_and_model': brand_and_model,
'model_version': model_version,
},
index=[0])
df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
df1.insert(0, "brand_and_model", brand_and_model)
df2_3 = pd.concat([df2, df3])
df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
df2_3 = df2_3.rename_axis(None, axis=1)
df_last = pd.concat([df1, df2_3], axis=1)
df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
lambda x: x.apply(','.join, 1))
now = datetime.now()
datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
df_last['download_date_time'] = datetime_string
config = configparser.RawConfigParser()
config.read(filenames='my.properties')
scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m_n', charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
cursor = scrap_db.cursor()
sql = """CREATE TABLE S(
brand_and_model VARCHAR(32),
first_registration DATE(6),
download_date_time DATE(6)
)"""
#cursor.execute(sql)
for row_count in range(0, df_last.shape[0]):
chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
brand_and_model = ""
first_registration = ""
download_date_time = ""
lenght_of_chunk = len(chunk[0])
if "brand_and_model" in cars_data:
try:
brand_and_model = chunk[0][0]
except:
brand_and_model = ""
if chunk[0][lenght_of_chunk - 1] != "":
download_date_time = chunk[0][lenght_of_chunk - 1]
if (brand_and_model == ' '):
control = "false"
else:
control = "true"
if control == "true":
mySql_insert_query = "INSERT INTO S (brand_and_model,first_registration,download_date_time) VALUES (%s,%s,%s)"
val = (
brand_and_model, location, first_registration, download_date_time)
cursor = scrap_db.cursor()
cursor.execute(mySql_insert_query, val)
scrap_db.commit()
print(cursor.rowcount, "Record inserted successfully into *S* table")
driver.close()

Printing json result in an request - pyspark

I would appreciate some help on this, having this API call below, how could I print the json results?
import requests
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.functions import udf
class Test:
def Test1():
r = requests.get("https://mylink")
if r.status_code != 200:
print("Error calling geo-cache: {} - {}".format(r.status_code, r.text))
else:
return r.json()
Thanks in advance!

delete s3 object using pyspark

i need delete object
import logging
import boto3
from botocore.exceptions import ClientError
def delete_object(bucket_name, object_name):
# Delete the object
s3 = boto3.client('s3')
try:
s3.delete_object(Bucket=bucket_name, Key=object_name)
except ClientError as e:
logging.error(e)
return False
return True
a = delete_object("dgaray-bucket","consolidado.dat")
generates error
Command failed with exit code 1
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
def delete_object(bucket_name, object_name):
# Delete the object
s3 = boto3.client('s3')
s3.delete_object(Bucket=bucket_name, Key=object_name)
a = delete_object("name-bucket","directory/file.dat")
I failed because of the spark session.

Table or view not found - Spark Scala

I'm beginner with big data and i'm working with spark-scala. I work with dataframes and to make things clear to me I used multiple scala objects to write my code. all the classes have main methods to run the code. The first scala object is used to load files into dataframes and the other scala objects make statistics computations. this is some of the code of the first one
object LoadFiles {
//classes for datasets
case class T(X: Option[String], P: Option[String],Y:Option[String])
println("Load File 1 into dataframe")
def main(args: Array[String]){
val sc = new SparkContext("local[*]", "LoadFiles1")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val warehouseLocation = new File("spark-warehouse").getAbsolutePath
val spark = SparkSession
.builder()
.appName("Spark Hive Example")
.config("spark.sql.warehouse.dir", warehouseLocation)
.enableHiveSupport()
.getOrCreate()
import sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType,StructField,StringType};
val dataframe1 = sc.textFile("file1.ttl").map(_.split(" |\\ . ")).map(p =>
T(Try(p(0).toString()).toOption,Try(p(1).toString()).toOption,Try(p(2).toString()).toOption)).toDF()
dataframe1
.write
.partitionBy("Predicate")
.mode(SaveMode.Overwrite)
.saveAsTable("dataframe1")
}}
The other scala objects are used to make many computations from the loaded dataframes and create other dataframes
this is the second one
object Statistics1 {
def main(args: Array[String]) {
val sc = new SparkContext("local[*]", "LoadFiles1")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val warehouseLocation = new File("spark-warehouse").getAbsolutePath
val spark = SparkSession
.builder()
.appName("Spark Hive Example")
.config("spark.sql.warehouse.dir", warehouseLocation)
.enableHiveSupport()
.getOrCreate()
import sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType,StructField,StringType};
// subject query
val Query1 = spark.sql("SELECT X As Res, P as Pred, COUNT(Y) As nbrFROM dataframe1 GROUP BY X, P")
.write
.mode(SaveMode.Overwrite)
.saveAsTable("stat1") }}
I got the error Exception in thread "main" org.apache.spark.sql.AnalysisException: Table or view not found: dataframe1; line 1 pos 75
How can I fix this ?