I'm pulling data with Selenium and saving it to the database. Although the relevant column in the database is of type DATE and the field is filled in on the site, the value saved in the database is empty, i.e. '0000-00-00'.
Here is the code for the field I'm scraping:
if "Test" in description_list:
index_no = description_list.index("Test")
try:
first_registration = value_list[index_no]
except:
first_registration = ""
An example of the date I am trying to insert: 07/28. I appreciate your help.
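Whatever 07/28 actually stands for, a value in that shape has to be converted to the YYYY-MM-DD form before the INSERT, otherwise MySQL stores 0000-00-00. A minimal sketch of the conversion, purely under the assumption that 07/28 means month/two-digit-year (adjust the strptime format if it is really something else):

from datetime import datetime

raw = "07/28"  # value scraped from the page
parsed = datetime.strptime(raw, "%m/%y")  # assumed month/two-digit-year format
first_registration = parsed.strftime("%Y-%m-%d")  # e.g. '2028-07-01', a literal MySQL DATE accepts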
from xml.etree.ElementTree import QName
import bs4
import urllib.request
import pandas as pd
from datetime import datetime
from tkinter import E
import pymysql
import mysql.connector
import configparser
import re
import numpy as np
import time
import concurrent.futures
# import erequests
# import lxml
from multiprocessing import Pool
# from multiprocessing import Process, Lock
from multiprocessing import Process
from tqdm import tqdm # progress bar
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.options import Options  # Firefox Options; only Firefox is driven below
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="m_n"
)
mycursor = mydb.cursor()
sql = "SELECT ad_link FROM adlinks_d"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 1
y = 5
#def fonksiyon(i):
# global x
# global y
number = np.arange(x,y)
for i in tqdm(number):
ad_link = dataframe.links[i]
fireFoxOptions = Options()
fireFoxOptions.binary_location = r'C:\Program Files\Firefox Developer Edition\firefox.exe'
fireFoxOptions.add_argument("--headless")
fireFoxOptions.add_argument('--disable-gpu')
fireFoxOptions.add_argument('--no-sandbox')
driver = webdriver.Firefox(options=fireFoxOptions)
sleep_time = 1
driver.get(ad_link)
time.sleep(sleep_time)
ad_source = driver.page_source
ad_soup = BeautifulSoup(ad_source, 'html.parser')
mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
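# brand_and_model, model_version, the equipment_key/equipment_value lists and the all_key/all_value lists used below are presumably parsed out of mainresults; that extraction code is not included in this excerpt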
cars_data = pd.DataFrame({
'brand_and_model': brand_and_model,
'model_version': model_version,
},
index=[0])
df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
df1.insert(0, "brand_and_model", brand_and_model)
df2_3 = pd.concat([df2, df3])
df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
df2_3 = df2_3.rename_axis(None, axis=1)
df_last = pd.concat([df1, df2_3], axis=1)
df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
lambda x: x.apply(','.join, 1))
now = datetime.now()
datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
df_last['download_date_time'] = datetime_string
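# Note: MySQL DATE/DATETIME columns only accept literals such as 'YYYY-MM-DD' or
# 'YYYY-MM-DD HH:MM:SS'. A string in the "%Y%m%d_%H%M%S" shape used above (or a raw
# "07/28") is not a valid date literal, which is why the column ends up as 0000-00-00;
# something like now.strftime("%Y-%m-%d %H:%M:%S") would be accepted as-is.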
config = configparser.RawConfigParser()
config.read(filenames='my.properties')
scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m_n', charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
cursor = scrap_db.cursor()
sql = """CREATE TABLE S(
brand_and_model VARCHAR(32),
first_registration DATE,
download_date_time DATETIME
)"""
#cursor.execute(sql)
for row_count in range(0, df_last.shape[0]):
chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
brand_and_model = ""
first_registration = ""
download_date_time = ""
length_of_chunk = len(chunk[0])
if "brand_and_model" in cars_data:
try:
brand_and_model = chunk[0][0]
except:
brand_and_model = ""
if chunk[0][length_of_chunk - 1] != "":
download_date_time = chunk[0][length_of_chunk - 1]
if (brand_and_model == ""):  # skip rows where no brand/model was extracted
control = "false"
else:
control = "true"
if control == "true":
mySql_insert_query = "INSERT INTO S (brand_and_model,first_registration,download_date_time) VALUES (%s,%s,%s)"
val = (brand_and_model, first_registration, download_date_time)  # three values, matching the three %s placeholders
cursor = scrap_db.cursor()
cursor.execute(mySql_insert_query, val)
scrap_db.commit()
print(cursor.rowcount, "Record inserted successfully into *S* table")
driver.close()
I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1,len(str1)):
if len(str1[ii])>0:
df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each string into a dataframe
if not df1.empty:
df2 = df1.iloc[:,0:3] #Keep only the first three columns
if df2.iloc[0,-1] != 'M': #Don't append the ones with missing data
dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as request:
response = request.get(url, timeout=30)
response.raise_for_status()  # raises immediately if the request failed
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)
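Since pandas can read directly from a URL, an even shorter variant, reusing the url string built above and assuming no custom headers, retries, or status handling are needed, is to skip requests entirely:

import pandas as pd

df = pd.read_csv(url)  # pandas fetches and parses the CSV itself
print(df)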
I want to attach a screenshot to my HTML report, but I haven't found any good resource on how to use the conftest.py file. I created the conftest.py file inside the pytest folder with the following code:
import pytest
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
pytest_html = item.config.pluginmanager.getplugin("html")
outcome = yield
report = outcome.get_result()
extra = getattr(report, "extra", [])
image="D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
if report.when == "call":
# always add url to report
extra.append(pytest_html.extras.url("http://www.example.com/"))
extra.append(pytest_html.extras.image(image))
xfail = hasattr(report, "wasxfail")
if (report.skipped and xfail) or (report.failed and not xfail):
# only add additional html on failure
# extra.append(pytest_html.extras.html("<div>Additional HTML</div>"))
extra.append(pytest_html.extras.image(image))
report.extra = extra
And my test.py file is:
import time
from os import getenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from dotenv import load_dotenv
from Login_actions import Login_activities
from Insights_actions import Insights_activities
from Locators import Locators
import pytest, os
from datetime import datetime
class Test_Insights():
@pytest.fixture
def test_setup(self):
#make new directory for downloads
new_dir = r"D:\Selenium\Insights\{timestamp}".format(timestamp=datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
# print(new_dir)
if not os.path.exists(new_dir):
os.makedirs(new_dir)
self.saved_dir=new_dir
prefs = {"download.default_directory": new_dir, "download.directory_upgrade": True, "download.prompt_for_download": False}
#intiating chrome browser instance
options=Options()
options.add_argument('--start-maximized')
# options.add_argument('--headless')
options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
#load credentials
load_dotenv()
self.username = getenv("TOP_USERNAME")
self.password = getenv("TOP_PWD")
#exiting ceremonies
yield
self.driver.close()
self.driver.quit()
print("Test executed")
def test_check_in(self, test_setup):
driver=self.driver
# login_url="https://tilt-sso.preprod.crto.in/" separate login page
# url="https://tilt-orange360.preprod.crto.in/insights/home"
url="https://tilt-sso.preprod.crto.in/auth?code=5515f8b0-4b64-4da4-b506-e6a6a3f81b23&scope=cn%20dn%20mail%20uid%20umsId&state=eyJyZWRpcmVjdF91cmkiOiJcL2hvbWUiLCJub25jZSI6IktaTFBxczU5T3lQUWJaRUp0OFhBQWZvZDNueDhPaENDbGlJWVRqZ08ifQ%3D%3D"
driver.get(url)
try:
welcome_text = driver.find_element(by=By.XPATH, value="//div[contains(text(),'Criteo')]")
assert welcome_text
login_actions = Login_activities(driver)
login_actions.enter_username(self.username)
login_actions.enter_password(self.password)
login_actions.login()
page_load_wait = WebDriverWait(driver, timeout=30).until(
EC.url_to_be("https://tilt-orange360.preprod.crto.in/insights/home"))
if (page_load_wait):
WebDriverWait(driver, timeout=20).until(
EC.visibility_of_element_located((By.XPATH, Locators.welcome_text)))
WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.XPATH, Locators.run_insight)))
insights_actions = Insights_activities(driver)
insights_actions.insights_search("Check-In")
insights_actions.search_partners("BOOKINGIT")
insights_actions.smart_date_30days()
insights_actions.submit_insights()
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, Locators.success_mesg)))
# submit_verify = driver.find_element(by=By.XPATH, value=Locators.success_mesg)
# assert(submit_verify)
print("Submission successful")
insights_actions.download_file()
time.sleep(20)
print(self.saved_dir)
arr = []
arr += [file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]
while not arr:
time.sleep(5)
arr = [file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]  # rescan until the download shows up
if arr:
print("File in the directory: " + arr[0])
print("Insights completed. File downloaded successfully")
else:
print("File not available")
raise NoSuchElementException
except:
if driver.find_element(by=By.XPATH,value=Locators.error_page):
driver.get_screenshot_as_file('{dir}/error_page.png'.format(dir=self.saved_dir))
print("500 Internal server error")
Error_page=driver.current_url
print("The error page: "+Error_page)
raise NoSuchElementException
I do not know why it is not working. The document https://pytest-html.readthedocs.io/en/latest/user_guide.html#enhancing-reports does not have much information. I really need help here, please.
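Two details worth double-checking, stated as assumptions since the post does not show how pytest is invoked: the hook file has to be named exactly conftest.py and sit next to test.py (or in a parent directory) so pytest picks it up automatically, and the report extras only appear when pytest-html is actually generating a report, e.g. when running pytest test.py --html=report.html.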
I'm practicing writing a program that reads and displays Excel files. The entire code is below.
I'm using Python 3.9 with PyCharm, and PyQt5 5.15.7.
Please help!
import sys
from PyQt5.QtWidgets import QApplication, QWidget, QTableWidget, QTableWidgetItem, QHeaderView, QHBoxLayout, QVBoxLayout, QPushButton
from PyQt5.QtCore import Qt
import pandas as pd
from pympler import muppy
all_objects = muppy.get_objects()
class MyApp(QWidget):
def __init__(self):
super().__init__()
self.window_width, self.window_height = 700, 500
self.resize(self.window_width, self.window_height)
layout = QVBoxLayout()
self.setLayout(layout)
self.table = QTableWidget()
layout.addWidget(self.table)
self.button = QPushButton('&load Data')
self.button.clicked.connect(lambda _, xl_path=excel_file_patch, sheet_name=worksheet_name: self.loadExcelData(xl_path, sheet_name))
layout.addWidget(self.button)
def loadExcelData(self, excel_file_dir, worksheet_name):
df = pd.read_excel(excel_file_dir, worksheet_name)
if df.size == 0:
return  # nothing to display
df.fillna('', inplace=True)
self.table.setRowCount(df.shape[0])
self.table.setColumnCount(df.shape[1])
self.table.setHorizontalHeaderLabels(df.columns)
if __name__ == '__main__':
excel_file_patch = 'data.xlsx'
worksheet_name = 'Sales'
app = QApplication(sys.argv)
myApp = MyApp()
myApp.show()
try:
sys.exit(app.exec())
except SystemExit:
print('closing Window...')
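One thing the posted loadExcelData never does is write the cell values into the table, so even with the fixes above the widget stays blank. A minimal sketch of the missing loop, placed inside loadExcelData right after setHorizontalHeaderLabels, under the assumption that every cell should simply be shown as text:

for row in range(df.shape[0]):
    for col in range(df.shape[1]):
        value = str(df.iloc[row, col])  # QTableWidgetItem expects a string
        self.table.setItem(row, col, QTableWidgetItem(value))
self.table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)  # optional: stretch columns to the window width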
Hello, I have many TFRecord files. I use Python and TensorFlow and want to plot all the labels in one histogram.
Each TFRecord entry is a pair of (image, label), so how can I extract all of the labels?
I have tried extracting labels and successfully plotted several batches:
all_label = []
for image, label in ds_train.take(10):
all_label.append(label)
sns.distplot(all_label)
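To cover every label rather than just ten batches, one option is a full pass over the dataset. This is only a sketch, assuming ds_train yields batches of (image, label) pairs (as the snippet's .take(10) of batches suggests) and that each label is a scalar integer; unbatch() flattens the batches so labels can be collected one by one:

import numpy as np
import seaborn as sns

all_labels = []
for _, label in ds_train.unbatch():  # iterate the whole dataset, not just .take(10)
    all_labels.append(int(label.numpy()))  # scalar label tensor -> Python int

sns.histplot(np.array(all_labels), discrete=True)  # histplot is the current seaborn API; distplot is deprecated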
Maybe something like this.
import re
#import pdftotext
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text
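A quick usage sketch for the function above (the PDF path is just a placeholder, not a file referenced in the original post):

text = convert_pdf_to_txt(r'C:\Users\Finance10K.pdf')  # hypothetical input file
print(text[:500])  # preview the first 500 extracted characters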
with open('C:\\Users\\Finance10K.txt') as f:
clean_cont = f.read().splitlines()
clean_cont
doc=[i.replace('\xe2\x80\x9c','') for i in clean_cont ]
doc=[i.replace('\xe2\x80\x9d','') for i in doc ]
doc=[i.replace('\xe2\x80\x99s','') for i in doc ]
docs = [x for x in doc if x != ' ']
docss = [x for x in docs if x != '']
doc
docs
docss
financedoc=[re.sub("[^a-zA-Z]+", " ", s) for s in docss]
financedoc
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
#%pylab
#%matplotlib inline
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
vect=CountVectorizer(ngram_range=(1,1),stop_words='english')
fin=vect.fit_transform(financedoc)
fin
pd.DataFrame(fin.toarray(),columns=vect.get_feature_names())
lda=LatentDirichletAllocation(n_components=5)
lda_dtf=lda.fit_transform(fin)  # fit once and keep the document-topic matrix
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features,
sorting=sorting, topics_per_chunk=5, n_words=10)
#from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
zit=pyLDAvis.sklearn.prepare(lda,fin,vect)
pyLDAvis.show(zit)
I am trying to scrape the "number of items sold" on eBay, but for some reason I cannot. I already have the title and price; all I need is total_sold_price, which I am unable to obtain. Every time I run my code, I just get a blank for total_sold_price.
try:
title_selenium = driver.find_element_by_xpath('//*[@id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[@id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
My entire code: https://pastebin.com/bu8HgCDZ
Thank you so much.
Fixed it for you. You need to make the soup call inside your loop.
Note: I am using this path '../chromedriver', please change it to your path before running the code.
Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Chrome('../chromedriver')
driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=1')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.maximize_window()
tempList = []
for link in soup.find_all('a', href=True):
if 'itm' in link['href']:
print(link['href'])
tempList.append(link['href'])
array_length = len(tempList)
for i in range(array_length):
driver.get(tempList[i])
timeout = 5
try:
element_present = EC.presence_of_element_located((By.XPATH, '//*[@id="itemTitle"]'))
WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
print("Timed out waiting for page to load")
try:
title_selenium = driver.find_element_by_xpath('//*[@id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[@id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
#you need to call soup here due to your loop structure
soup = BeautifulSoup(driver.page_source, 'lxml')
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
print("title: ", title_selenium)
print("price: ", price_selenium)
print("total_sold_price: ", total_sold_price_BeautifulSoup)
print("\n")
i+=1
driver.close()
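A side note, since the Selenium version is not stated in either snippet: in Selenium 4 the find_element_by_xpath helpers were removed, so the same lookups would be written with By, for example:

from selenium.webdriver.common.by import By

title_selenium = driver.find_element(By.XPATH, '//*[@id="itemTitle"]').text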