delete s3 object using pyspark - amazon-s3

I need to delete an S3 object.
import logging
import boto3
from botocore.exceptions import ClientError

def delete_object(bucket_name, object_name):
    # Delete the object
    s3 = boto3.client('s3')
    try:
        s3.delete_object(Bucket=bucket_name, Key=object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

a = delete_object("dgaray-bucket", "consolidado.dat")
This generates the error:
Command failed with exit code 1

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

def delete_object(bucket_name, object_name):
    # Delete the object
    s3 = boto3.client('s3')
    s3.delete_object(Bucket=bucket_name, Key=object_name)

a = delete_object("name-bucket", "directory/file.dat")
It fails, I think because of the Spark session.
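For what it's worth, the boto3 call itself is normally fine inside a Glue job; a bare "Command failed with exit code 1" often just hides the real exception (for example an access-denied ClientError when the job role lacks s3:DeleteObject). A minimal sketch that surfaces the underlying error in the Glue logs; the bucket and key below are placeholders:

import logging
import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)

def delete_object(bucket_name, object_name):
    """Delete a single S3 object and log the real failure reason, if any."""
    s3 = boto3.client('s3')
    try:
        s3.delete_object(Bucket=bucket_name, Key=object_name)
    except ClientError as e:
        # Surface the actual error (e.g. AccessDenied) instead of a bare exit code
        logger.error("delete_object failed for s3://%s/%s: %s", bucket_name, object_name, e)
        raise
    return True

delete_object("name-bucket", "directory/file.dat")  # placeholder names, for illustration only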

Related

Selenium Date Format

I'm pulling data with Selenium and saving it to the database. Although the relevant column in the database is a DATE and the field is filled in on the site, the value stored in the database is empty: '0000-00-00'.
This is the code for the area I'm scraping:
if "Test" in description_list:
index_no = description_list.index("Test")
try:
first_registration = value_list[index_no]
except:
first_registration =
An example of the date I am trying to save: 07/28. I appreciate your help.
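For reference, a minimal sketch of turning a scraped value like 07/28 into something a MySQL DATE column will accept, assuming the string is month/year (MM/YY); if it is actually month/day, the format string would change accordingly:

from datetime import datetime

raw_value = "07/28"  # e.g. the scraped first_registration value, assumed to be MM/YY

try:
    # "%m/%y" parses "07/28" as July 2028; the day defaults to the 1st
    parsed = datetime.strptime(raw_value, "%m/%y")
    first_registration = parsed.strftime("%Y-%m-%d")  # "2028-07-01", a valid MySQL DATE literal
except ValueError:
    first_registration = None  # insert NULL rather than letting MySQL coerce it to '0000-00-00'

The full script, for context: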
from xml.etree.ElementTree import QName
import bs4
import urllib.request
import pandas as pd
from datetime import datetime
from tkinter import E
import pymysql
import mysql.connector
import configparser
import re
import numpy as np
import time
import concurrent.futures
# import erequests
# import lxml
from multiprocessing import Pool
# from multiprocessing import Process, Lock
from multiprocessing import Process
from tqdm import tqdm  # progress bar
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.options import Options
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="m_n"
)
mycursor = mydb.cursor()
sql = "SELECT ad_link FROM adlinks_d"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 1
y = 5
#def fonksiyon(i):
#    global x
#    global y
number = np.arange(x, y)
for i in tqdm(number):
    ad_link = dataframe.links[i]
    fireFoxOptions = Options()
    fireFoxOptions.binary_location = r'C:\Program Files\Firefox Developer Edition\firefox.exe'
    fireFoxOptions.add_argument("--headless")
    fireFoxOptions.add_argument('--disable-gpu')
    fireFoxOptions.add_argument('--no-sandbox')
    driver = webdriver.Firefox(options=fireFoxOptions)
    sleep_time = 1
    driver.get(ad_link)
    time.sleep(sleep_time)
    ad_source = driver.page_source
    ad_soup = BeautifulSoup(ad_source, 'html.parser')
    mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
    cars_data = pd.DataFrame({
        'brand_and_model': brand_and_model,
        'model_version': model_version,
    }, index=[0])
    df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
    df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
    df1.insert(0, "brand_and_model", brand_and_model)
    df2_3 = pd.concat([df2, df3])
    df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
    df2_3 = df2_3.rename_axis(None, axis=1)
    df_last = pd.concat([df1, df2_3], axis=1)
    df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
        lambda x: x.apply(','.join, 1))
    now = datetime.now()
    datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
    df_last['download_date_time'] = datetime_string
    config = configparser.RawConfigParser()
    config.read(filenames='my.properties')
    scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m_n', charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)
    cursor = scrap_db.cursor()
    sql = """CREATE TABLE S(
        brand_and_model VARCHAR(32),
        first_registration DATE,
        download_date_time DATE
    )"""
    #cursor.execute(sql)
    for row_count in range(0, df_last.shape[0]):
        chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
        brand_and_model = ""
        first_registration = ""
        download_date_time = ""
        lenght_of_chunk = len(chunk[0])
        if "brand_and_model" in cars_data:
            try:
                brand_and_model = chunk[0][0]
            except:
                brand_and_model = ""
        if chunk[0][lenght_of_chunk - 1] != "":
            download_date_time = chunk[0][lenght_of_chunk - 1]
        if (brand_and_model == ' '):
            control = "false"
        else:
            control = "true"
        if control == "true":
            mySql_insert_query = "INSERT INTO S (brand_and_model,first_registration,download_date_time) VALUES (%s,%s,%s)"
            val = (brand_and_model, first_registration, download_date_time)
            cursor = scrap_db.cursor()
            cursor.execute(mySql_insert_query, val)
            scrap_db.commit()
            print(cursor.rowcount, "Record inserted successfully into *S* table")
    driver.close()

It runs, but a 0xc0000409 error occurs during operation.

I'm practicing with a program that reads and displays Excel files.
I'm posting the entire code below.
I'm using Python 3.9 with PyCharm, and PyQt5 5.15.7.
Please help.
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QTableWidget, QTableWidgetItem, QHeaderView, QHBoxLayout, QVBoxLayout, QPushButton
from PyQt5.QtCore import Qt
import pandas as pd
from pympler import muppy

all_objects = muppy.get_objects()

class MyApp(QWidget):
    def __init__(self):
        super().__init__()
        self.window_width, self.window_height = 700, 500
        self.resize(self.window_width, self.window_height)
        layout = QVBoxLayout()
        self.setLayout(layout)
        self.table = QTableWidget()
        layout.addWidget(self.table)
        self.button = QPushButton('&Load Data')
        self.button.clicked.connect(lambda _, xl_path=excel_file_patch, sheet_name=worksheet_name: self.loadExcelData(xl_path, sheet_name))
        layout.addWidget(self.button)

    def loadExcelData(self, excel_file_dir, worksheet_name):
        df = pd.read_excel(excel_file_dir, worksheet_name)
        if df.size == 0:
            return
        df.fillna('', inplace=True)
        self.table.setRowCount(df.shape[0])
        self.table.setColumnCount(df.shape[1])
        self.table.setHorizontalHeaderLabels(df.columns)

if __name__ == '__main__':
    excel_file_patch = 'data.xlsx'
    worksheet_name = 'Sales'
    app = QApplication(sys.argv)
    myApp = MyApp()
    myApp.show()
    try:
        sys.exit(app.exec())
    except SystemExit:
        print('closing Window...')
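For reference, loadExcelData above only sizes the table and sets the header labels; a minimal sketch of also filling the cells from the DataFrame, assuming it is appended at the end of loadExcelData:

        # Populate each cell of the QTableWidget from the DataFrame values
        for row in range(df.shape[0]):
            for col in range(df.shape[1]):
                item = QTableWidgetItem(str(df.iloc[row, col]))
                self.table.setItem(row, col, item)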

boto3 waiter to check the file availability in s3 bucket

from __future__ import print_function
import urllib.parse
import boto3
import json

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # TODO implement
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    object_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
    try:
        waiter = s3.get_waiter('object_exists')
        waiter.wait(Bucket=source_bucket, Key="<dirname>" + str(object_key),
                    WaiterConfig={
                        'Delay': 2,
                        'MaxAttempts': 5})
        print("Object s3://{}/{} arrived!".format(source_bucket, object_key))
    except Exception as e:
        print(e)
        print('Error getting object')
        raise e
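A minimal sketch of exercising the handler locally with a hand-built S3 event; the bucket and key below are placeholders, and in production the event comes from the S3 trigger:

# Hypothetical test event carrying only the fields the handler reads
test_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "my-test-bucket"},
                "object": {"key": "incoming/sample.csv"}
            }
        }
    ]
}

# With Delay=2 and MaxAttempts=5 the waiter polls for roughly 10 seconds
# before raising a WaiterError if the object never shows up.
lambda_handler(test_event, None)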

Invoke Sagemaker Endpoint using Spark (EMR Cluster)

I am developing a Spark application on an EMR cluster. The flow of the project goes like this:
The dataframe is repartitioned based on an ID.
A SageMaker endpoint needs to be invoked on each partition to get the result.
But when doing that, I am getting this error:
cPickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
The code is as follows:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkConf
import itertools
import json
import boto3
import time
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from io import BytesIO as StringIO

client = boto3.client('sagemaker-runtime')

def invoke_endpoint(json_data):
    ansJson = json.dumps(json_data)
    response = client.invoke_endpoint(EndpointName="<EndpointName>", Body=ansJson, ContentType='text/csv', Accept='Accept')
    resultJson = json.loads(str(response['Body'].read().decode('ascii')))
    return resultJson

def execute(list_of_url):
    final_iterator = []
    urlist = []
    json_data = {}
    for url in list_of_url:
        final_iterator.append((url.ID, url.Prediction))
        urlist.append(url.ID)
    json_data['URL'] = urlist
    ressultjson = invoke_endpoint(json_data)
    return iter(final_iterator)

### Attributes to be added to Spark Conf
conf = (SparkConf().set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true").set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true"))
scT = SparkContext(conf=conf)
scT.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
hadoopConf = scT._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.awsAccessKeyId", "<AccessKeyId>")
hadoopConf.set("fs.s3a.awsSecretAccessKeyId", "<SecretAccessKeyId>")
hadoopConf.set("fs.s3a.endpoint", "s3-us-east-1.amazonaws.com")
hadoopConf.set("com.amazonaws.services.s3.enableV4", "true")
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

sql = SparkSession(scT)
csv_df = sql.read.csv('s3 path to my csv file', header=True)
#print('Total count is', csv_df.count())
csv_dup_df = csv_df.dropDuplicates(['ID'])
print('Total count is', csv_dup_df.count())
windowSpec = Window.orderBy("ID")
result_df = csv_dup_df.withColumn("ImageID", F.row_number().over(windowSpec) % 80)
final_df = result_df.withColumn("Prediction", lit(str("UNKOWN")))
df2 = final_df.repartition("ImageID")
df3 = df2.rdd.mapPartitions(lambda url: execute(url)).toDF()
df3.coalesce(1).write.mode("overwrite").save("s3 path to save the results in csv format", format="csv")
print(df3.rdd.glom().collect())
##Ok
print("Work is Done")
Can you tell me how to rectify this issue?
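For context, this PicklingError usually means something non-picklable was captured by the function Spark ships to the executors; here the module-level boto3 client (which holds sockets and locks) is referenced inside execute. A common workaround, sketched below under that assumption, is to create the client inside the partition function so each executor builds its own:

import json
import boto3

def execute(list_of_url):
    # Create the SageMaker runtime client inside the partition function,
    # on the executor, so nothing non-picklable is captured in the closure.
    client = boto3.client('sagemaker-runtime')
    final_iterator = []
    urlist = []
    for url in list_of_url:
        final_iterator.append((url.ID, url.Prediction))
        urlist.append(url.ID)
    ansJson = json.dumps({'URL': urlist})
    response = client.invoke_endpoint(
        EndpointName="<EndpointName>",   # placeholder, as in the question
        Body=ansJson,
        ContentType='text/csv',
        Accept='Accept')
    resultJson = json.loads(response['Body'].read().decode('ascii'))
    return iter(final_iterator)

# df3 = df2.rdd.mapPartitions(execute).toDF()  # used exactly as before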

spark streaming with json file

I want to read JSON data from a folder location through Spark Streaming.
Assume my JSON data is:
{"transactionId":111,"customerId":1,"itemId": 1,"amountPaid": 100}
I want the output in a Spark SQL table as:
transactionId  customerId  itemId  amountPaid
          111           1       1         100
My code is:
package org.training.spark.streaming

import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Duration
import org.apache.spark.sql.functions.udf
import org.training.spark.streaming.sqlstreaming.Persons

object jsonread {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local").setAppName("jsonstreaming")
    val sc = new SparkContext(sparkConf)
    // Create the context
    val ssc = new StreamingContext(sc, Seconds(40))
    val lines = ssc.textFileStream("src/main/resources/fileStreaming")
    lines.foreachRDD(rdd => rdd.foreach(println))
    val words = lines.flatMap(_.split(","))
    words.foreachRDD(rdd => rdd.foreach(println))
    val sqc = new SQLContext(sc)
    import sqc.implicits._
    words.foreachRDD { rdd =>
      val persons = rdd.map(_.split(":")).map(p => (p(0), p(1))).toDF()
      persons.registerTempTable("data")
      val jsontable = sqc.sql("SELECT * from data")
      jsontable.show
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
JSON data:
{"transactionId":"111","customerId":"1","itemId": "1","amountPaid": "100"}
PySpark code to read the above JSON data:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import IntegerType, LongType, DecimalType, StructType, StructField, StringType
from pyspark.sql import Row
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql import Window

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
ssc = StreamingContext(sc, 5)
stream_data = ssc.textFileStream("/filepath/")

def readMyStream(rdd):
    if not rdd.isEmpty():
        df = spark.read.json(rdd)
        print('Started the Process')
        print('Selection of Columns')
        df = df.select('transactionId', 'customerId', 'itemId', 'amountPaid').where(col("transactionId").isNotNull())
        df.show()

stream_data.foreachRDD(lambda rdd: readMyStream(rdd))
ssc.start()
ssc.stop()
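As a side note, with a fixed schema the same pipeline can be written with Structured Streaming instead of DStreams; a minimal sketch, assuming the JSON files land under /filepath/ and console output is enough for testing:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName("jsonstreaming").getOrCreate()

# Explicit schema: streaming JSON sources cannot infer it automatically
schema = StructType([
    StructField("transactionId", StringType()),
    StructField("customerId", StringType()),
    StructField("itemId", StringType()),
    StructField("amountPaid", StringType()),
])

stream_df = (spark.readStream
             .schema(schema)
             .json("/filepath/"))   # folder watched for new JSON files

query = (stream_df
         .select("transactionId", "customerId", "itemId", "amountPaid")
         .writeStream
         .format("console")         # print each micro-batch to stdout
         .outputMode("append")
         .start())

query.awaitTermination()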