How to wait 30 second after 20 requests selenium scraping - selenium

Hello, I have a CSV file with 300 rows.
After 10 requests, the website stops giving me results.
How can I pause my script for 3 minutes after every 10 requests?
Thank you.
my code :
# Scrape one text value per CSV row with Selenium and save the results to a CSV file.
# NOTE(review): indentation was lost when this snippet was pasted; the nesting is
# presumably with -> for -> try/except, with the DataFrame built after the loop.
# One scraped value (or the 'Element not found' marker) is appended per row.
societelist =[]
import csv
with open('1.csv') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
# A new Firefox instance is started for each row.
browser = webdriver.Firefox(options=options)
# NOTE(review): "myurl" contains no {} placeholder, so format() leaves it unchanged;
# presumably a stand-in for the real URL template -- confirm.
browser.get("myurl".format(row[0]))
# Fixed 20-second wait after requesting the page.
time.sleep(20)
try:
societe = browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/div[1]/span[2]').text
except NoSuchElementException:
# Record a marker instead of failing when the element is missing.
societe = 'Element not found'
societelist.append(societe)
print (row[0])
browser.quit()
# zip() over a single list yields 1-tuples, giving a one-column frame named 'societe'.
df = pd.DataFrame(list(zip(societelist)), columns=['societe'])
# NOTE(review): to_csv presumably returns None when given a path, so `data`
# would not hold anything useful -- confirm.
data = df.to_csv('X7878.csv', index=False)

Use:
import csv

# Throttling: after every REQUESTS_PER_BATCH pages, wait BATCH_PAUSE_SECONDS
# before continuing, so the site keeps returning results.
REQUESTS_PER_BATCH = 10
BATCH_PAUSE_SECONDS = 180

# One scraped value (or the 'Element not found' marker) per CSV row.
societelist = []
with open('1.csv') as csvfile:
    reader = csv.reader(csvfile)
    # Count requests from 1 so the batch test below reads naturally.
    for request_no, row in enumerate(reader, start=1):
        browser = webdriver.Firefox(options=options)
        try:
            browser.get("myurl".format(row[0]))
            time.sleep(20)
            try:
                societe = browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/div[1]/span[2]').text
            except NoSuchElementException:
                societe = 'Element not found'
        finally:
            # Always close the browser, even if loading the page fails,
            # so no Firefox processes are left behind.
            browser.quit()
        societelist.append(societe)
        print(row[0])
        if request_no % REQUESTS_PER_BATCH == 0:
            time.sleep(BATCH_PAUSE_SECONDS)

# Written once, after every row has been processed.
df = pd.DataFrame(list(zip(societelist)), columns=['societe'])
df.to_csv('X7878.csv', index=False)
Alternate solution to write each line of text to Excel after scraping instead of writing all text at once.
import csv
import win32com.client as win32

# Throttling: after every REQUESTS_PER_BATCH pages, wait BATCH_PAUSE_SECONDS
# before continuing, so the site keeps returning results.
REQUESTS_PER_BATCH = 10
BATCH_PAUSE_SECONDS = 180

# Launch excel
excel = win32.Dispatch('Excel.Application')
excel.Visible = 1
wb = excel.Workbooks.Add()
ws = wb.Sheets(1)

# Read csv and scrape webpage
with open('1.csv') as csvfile:
    reader = csv.reader(csvfile)
    # Number rows from 1 so the counter doubles as the Excel row.
    for row_no, row in enumerate(reader, start=1):
        browser = webdriver.Firefox(options=options)
        try:
            browser.get("myurl".format(row[0]))
            time.sleep(20)
            try:
                societe = browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/div[1]/div[2]/div[1]/span[2]').text
            except NoSuchElementException:
                societe = 'Element not found'
        finally:
            # Always close the browser, even if loading the page fails.
            browser.quit()
        # Write the input text and the scraped value side by side, one row
        # at a time, so partial results are visible while the script runs.
        ws.Cells(row_no, 1).Value = row[0]
        ws.Cells(row_no, 2).Value = societe
        print(row[0], societe)
        if row_no % REQUESTS_PER_BATCH == 0:
            time.sleep(BATCH_PAUSE_SECONDS)

# If you want to save the file programmatically and close excel.
path = r'C:\Users\jarodfrance\Documents\X7878.xlsx'
wb.SaveAs(path)
wb.Close()
excel.Quit()

Related

Storing string to clipboard

Hello I am trying to make an automation where I can iterate through the rows in a df column and copy and paste them one at a time to excel. I would like to include a loop to where I can press enter and it will copy the next cell. I have this code written for reference but it is not working.
import pandas as pd
import openpyxl
import pyperclip as pc
import pyautogui as pg
# Load a workbook into a DataFrame and try to push 'Age' values through the clipboard.
# NOTE(review): indentation was lost when this snippet was pasted; the nesting is inferred.
Excel_File = r'/Users/martinflores/Desktop/Control.xlsx'
df = pd.read_excel(Excel_File)
# x, y and z are assigned here but not used anywhere in the visible code.
x= df['Age']
y = df['Name']
z = df['Count']
# Iterates over the rows of df, copies the row's 'Age' value with pc.copy and returns cp.
# NOTE(review): if `return cp` sits inside the loop, only the first row is ever processed -- confirm.
# NOTE(review): pyperclip.copy presumably returns None, so cp carries no text -- confirm.
def main():
for index, row in df.iterrows():
string = row['Age']
cp = pc.copy(string)
return cp
# Wait 3 seconds before acting (presumably time to focus the target window -- confirm).
pg.sleep(3)
# NOTE(review): pyperclip.paste presumably takes no argument and only returns the
# clipboard text rather than typing it into the active window -- confirm.
pc.paste(main())
pg.press('down')
I thought my main function would save the string to the clipboard and that I could then paste it with either pg.hotkey('ctrl','v') or pc.paste(main()), but it doesn't do anything. Also, I am not sure if it matters, but I am developing this code on IOS at the moment.

selenium webdriver send keys pycharm

I have data in an excel sheet, first Column has a number, and second Column has text. My program works with text but not with numbers.
import xlrd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Type each value from column 0 of an .xls sheet into the YouTube search box.
# NOTE(review): indentation was lost when this snippet was pasted; the nesting is inferred.
# Path to the ChromeDriver executable handed to Selenium.
PATH = "C:/Program Files (x86)/chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.youtube.com")
print(driver.title)
# The search box is looked up once and reused for every row below.
search = driver.find_element_by_name("search_query")
workbook = xlrd.open_workbook("mohammed2.xls")
sheet = workbook.sheet_by_name("sheet3")
rowCount = sheet.nrows
colCount = sheet.ncols
print(rowCount)
print(colCount)
# Starts at row 1, skipping row 0 (presumably a header row -- confirm).
for curr_row in range(1, rowCount):
# Column 0 is read here; the read of column 1 is commented out.
numpValue = sheet.cell_value(curr_row, 0)
#name = sheet.cell_value(curr_row, 1)
# The raw cell value is sent as-is; per the accompanying text this is the call
# that fails when the cell holds a number (a float).
search.send_keys(numpValue)
time.sleep(3)
search.send_keys(Keys.RETURN)
search.clear()
time.sleep(3)
# NOTE(review): unclear whether the next two lines belong to the loop or are a
# separate attempt; they send the value converted with str() instead.
search.clear()
search.send_keys(str(numpValue))
It seems sendKeys doesn't accept a float, and the value read from the number column comes back in float format.

Fastest way to read a large excel file into databricks

So I have been having some issues reading large Excel files into Databricks using PySpark and pandas. Spark seems to be really fast at CSV and TXT but not Excel,
e.g.
# pandas route: read the requested sheets, skipping skip_rows rows, and cast every value to str.
df2=pd.read_excel(excel_file, sheetname=sheets,skiprows = skip_rows).astype(str)
# Spark route: read through the com.crealytics.spark.excel data source, starting at
# cell A1 of the quoted sheet name, with header handling and schema inference turned off.
df = spark.read.format("com.crealytics.spark.excel").option("dataAddress", "\'" + sheet + "\'" + "!A1").option("useHeader","false").option("maxRowsInMemory",1000).option("inferSchema","false").load(filePath)
We have found the fastest way to read in an excel file to be one which was written by a contractor:
from openpyxl import load_workbook
import csv
from os import sys
# Export every worksheet of an Excel workbook to its own CSV file under tempDir,
# then load that directory back into a Spark DataFrame.
# `path`, `tempDir` and `spark` are not defined in this snippet and are expected
# to be provided by the surrounding notebook.
excel_file = "/dbfs/{}".format(path)
sheets = []
# read_only=True streams the sheet instead of loading the whole workbook.
workbook = load_workbook(excel_file, read_only=True, data_only=True)
all_worksheets = workbook.get_sheet_names()
for worksheet_name in all_worksheets:
    print("Export " + worksheet_name + " ...")
    try:
        worksheet = workbook.get_sheet_by_name(worksheet_name)
    except KeyError:
        print("Could not find " + worksheet_name)
        sys.exit(1)
    # newline='' lets the csv module control line endings itself, as the
    # csv documentation recommends for files passed to csv.writer.
    with open("/dbfs/{}/{}.csv".format(tempDir, worksheet_name), 'w', newline='') as your_csv_file:
        wr = csv.writer(your_csv_file, quoting=csv.QUOTE_ALL)
        headerDone = False
        for row in worksheet.iter_rows():
            # The first row gets the column title "worksheet_name"; every
            # later row is prefixed with the actual sheet name.
            first_cell = worksheet_name if headerDone else "worksheet_name"
            headerDone = True
            # Only one row is held in memory at a time.
            wr.writerow([first_cell] + [cell.value for cell in row])

# Sometimes python gets a bit ahead of itself and
# tries to do this before it's finished writing the csv
# and fails
retryCount = 0
retryMax = 20
while retryCount < retryMax:
    try:
        df2 = spark.read.format("csv").option("header", "true").load(tempDir)
        if df2.count() != 0:
            # Data is available: stop retrying.
            break
        print("Retrying load from CSV")
    except Exception:
        print("Thew an error trying to read the file")
    # Count and wait after a failed read as well as after an empty one, so a
    # persistent error gives up after retryMax attempts instead of spinning forever.
    retryCount = retryCount + 1
    time.sleep(10)
The reason it is fast is that it only stores one row of the Excel sheet in memory at a time as it loops. I tried appending the rows to a list, but this made it very slow.
The issue with the above method is that writing to CSV and re-reading it doesn't seem very robust. It's possible that the CSV could be read while it is still being written, so it would load successfully but with data missing.
Is there any other way of making this fast, such as using Cython, so that I can append the rows to a list without incurring a memory penalty and put them directly into Spark via createDataFrame?

isnull() and dropna() not working for pandas 0.22 when using xlwings to get dataframe

Desperate about this mystery. I just upgraded pandas to 0.22 (from 0.18) and, mysteriously, dropna and isnull no longer work when the DataFrame comes from xlwings. I can see that myTemp still gives me the correct True and False values, yet
unwindDF gives me all of the df_raw rows, just with every value replaced by NaN and NaT. There is a similar issue for noPx.
This is the case even if I manually assign np.nan to a cell. Yet, surprisingly, when I create a simple df towards the end of the same file, myTest1
works fine. Why? Is there something special about xlwings with pandas 0.22?
My code is below and my xlsx file is in the image.
import pythoncom
import pandas as pd
import xlwings as xw
import numpy as np
# Read a range from an Excel workbook into a DataFrame via xlwings, then try isnull()/dropna() on it.
# NOTE(review): indentation was lost when this snippet was pasted; the nesting is inferred.
# Location of the workbook and the sheet to read.
folder_path = 'S:/Order/all PNL files/'
excel_name='pnlTest.xlsx'
pnl_excel_path = folder_path + excel_name
sheetName = 'Sheet1'
pythoncom.CoInitialize()
app = None
bk = None
# Look through the running Excel instances for one that already has the workbook open.
app_count = xw.apps.count
for i in range(app_count):
try:
app = xw.apps[i]
temp = app.books[excel_name]
bk = temp
print()
print("Using Opened File")
# NOTE(review): the bare except hides every error, not only "book not open in this instance".
except:
print()
# No running instance had the workbook: start Excel and open the file from disk.
if bk == None:
print("Open New Excel App")
app = xw.App()
bk = xw.Book(pnl_excel_path)
# Switch calculation to manual and turn off screen updating on the workbook's app.
bk.app.calculation = 'manual'
bk.app.screen_updating = False
sht = bk.sheets[sheetName]
# Last row reached by moving down from A1.
last_row_index = sht.range('A1').end('down').row
# Read A1:M<last row> into a DataFrame; header=1 / index=0 presumably mean
# one header row and no index column -- confirm against the xlwings converter docs.
df_raw = sht.range('A1:M' + str(last_row_index)).options(pd.DataFrame, header=1,
index=0).value
# Boolean mask of rows whose UNWD_DT is null, and the rows selected with the same mask.
myTemp = df_raw['UNWD_DT'].isnull()
unwindDF = df_raw[df_raw['UNWD_DT'].isnull()]
# Force a NaN into Curr_Px at label 10, and into position (10, 11)
# (presumably the same cell -- confirm).
df_raw.loc[10,'Curr_Px']=np.nan
df_raw.iloc[10,11]=np.nan
noPx=df_raw[df_raw['Curr_Px'].isnull()]
# Control case: the same isnull() filtering on a small frame built directly in pandas.
df = pd.DataFrame({'a':[0,0,1,1], 'b':[0,1,0,1],'c':[np.nan,1,0,np.nan]})
myTemp1=df['c'].isnull()
myTest1=df[df['c'].isnull()]
# In-place drop of rows with fewer than 2 non-NA values (pandas thresh semantics);
# the second call therefore runs on the already-filtered frame.
df_raw.dropna(thresh=2,inplace=True)
df_raw2=df_raw.dropna(thresh=2)

Rhino: Not All Arguments Converted During String Formatting

I am attempting to execute a code using Rhino Python and I am having some issues with the following TypeError:
Message: not all arguments converted during string formatting
The code I have written is meant to read point coordinates from a file "newpoints.csv" and use them as arguments for Rhino Python's 'AddLine' function.
#!/usr/bin/env python
import rhinoscriptsyntax as rs
# Read point coordinates from a CSV file and draw a Rhino line between consecutive pairs of points.
# NOTE(review): indentation was lost when this snippet was pasted; the nesting is inferred.
file = open("C:\\Users\\Seshane Mahlo\\Documents\\MSc Thesis\\newpoints.csv", "r")
lines = file.readlines()
file.close()
# Number of lines read from the file.
ab = len(lines)
# seq, startvals and stopvals are assigned here but not used in the visible code.
seq = range(0, ab-1, 2)
coordinates = []
startvals = []
stopvals = []
# Parse the first two comma-separated fields of each line as floats.
for line in lines:
coords = line.split(',')
xcoord = float(coords[0])
ycoord = float(coords[1])
# NOTE(review): the point is a 2-tuple; the accompanying answer attributes the
# TypeError to AddLine needing 3-D points.
point = (xcoord, ycoord)
coordinates.append(point)
# Even indices are line starts, odd indices are line ends.
# NOTE(review): the accompanying answer questions these stop values.
starts = range(0, ab-2, 2)
ends = range(1, ab+1, 2)
# zip() stops at the shorter of the two ranges.
for i,j in zip(starts, ends):
strt = coordinates[i]
stp = coordinates[j]
rs.AddLine(start=strt,end=stp)
I think there is a small mistake in your code here:
starts = range(0, ab-2, 2)
ends = range(1, ab-1, 2)
which should be
starts = range(0, ab-1, 2)
ends = range(1, ab, 2)
because range() stops before its stop argument, so the last index it yields is always less than stop.
But what is actually causing the error is that you are trying to add a line, which is defined by two 3D points, using 2-tuples (x, y).
To fix this, change:
point = (xcoord, ycoord)
to
point = (xcoord, ycoord, 0)
or whatever you want your z-coordinate to be.