WebScraping a changing webpage search results? - pandas

I'm trying to get data from a search result, but every time I try to pass a specific link to Beautiful Soup I get errors, and I think it's because the webpage isn't the same every time you visit it. I'm not even sure what this is called, so I can't search for it; any help would be appreciated.
This is the link to the search results, but if you visit it without having already made a search, it won't show the results:
https://www.clarkcountycourts.us/Portal/Home/WorkspaceMode?p=0
Instead, if you copy and paste it, it takes you to this page to make a search:
https://www.clarkcountycourts.us/Portal/ and then you have to click Smart Search.
So for simplicity's sake, let's say we search for "Robinson", and I need to take the table data and export it to an Excel file. I can't give Beautiful Soup a link because it isn't valid, I believe? How would I go about this challenge?
Even pulling up the tables with a simple table view doesn't give any info about the data from our search for "Robinson", such as Case Number or File Date, to create a pandas data frame.
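As a quick sanity check (a hedged sketch; the exact redirect behavior is my assumption), you can confirm that a bare GET on the results URL never carries the results, because they live in a server-side session created by the search:
import requests

# Without the session cookies that the search page sets, the server serves
# the empty search workspace instead of the results table.
r = requests.get("https://www.clarkcountycourts.us/Portal/Home/WorkspaceMode?p=0")
print("Case Number" in r.text)  # expected to be False on a fresh session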
//EDIT//
So far, thanks to @Arundeep Chohan, this is what I've got. Huge shout out for the awesome help!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import pandas as pd

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(20)  # implicit wait of up to 20 seconds for elements to appear
driver.get("https://www.clarkcountycourts.us/Portal/Home/Dashboard/29")

search_box = driver.find_element_by_id("caseCriteria_SearchCriteria")
search_box.send_keys("Robinson")

# Code to complete the captcha: switch into the reCAPTCHA iframe and click the checkbox
WebDriverWait(driver, 15).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']")))
WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "//span[@id='recaptcha-anchor']"))).click()
driver.switch_to.default_content()  # necessary to switch out of the iframe for the submit button
time.sleep(5)  # gives time to click submit to results

driver.find_element_by_id("btnSSSubmit").click()
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'html.parser')
df = pd.read_html(str(soup))[0]
print(df)

options = Options()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
driver.maximize_window()
wait=WebDriverWait(driver,10)
driver.get('https://www.clarkcountycourts.us/Portal/')
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"a.portlet-buttons"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"input#caseCriteria_SearchCriteria"))).send_keys("Robinson")
wait.until(EC.frame_to_be_available_and_switch_to_it((By.XPATH,"//iframe[@title='reCAPTCHA']")))
elem=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"div.recaptcha-checkbox-checkmark")))
driver.execute_script("arguments[0].click()", elem)
driver.switch_to.default_content()
x = input("Waiting for recaptcha done")
wait.until(EC.element_to_be_clickable((By.XPATH,"(//input[@id='btnSSSubmit'])[1]"))).click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
df = pd.read_html(str(soup))[0]
print(df)
This should be the minimum to get to your page, if you want to know. There's an iframe to deal with and a spinner to deal with. After this, just use pandas to grab the table.
(edit): They added a proper reCAPTCHA, so plug a solver in where I added my pause input.
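If the spinner gets in the way, here is a hedged sketch of the usual explicit wait (the spinner selector below is a guess; inspect the page for the real class):
# Wait until the loading spinner disappears before reading the results table.
wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, "div.loading-spinner")))
df = pd.read_html(driver.page_source)[0]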
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
Outputs:
Waiting for manual date to be entered. Enter YES when done.
Unnamed: 0_level_0 ... Date of Birth
Case Number ... File Date
Case Number ... File Date
0 NaN ... NaN
1 NaN ... Cases (1) Case NumberStyle / DefendantFile Da...
2 Case Number ... File Date
3 08A575873 ... 11/17/2008
4 NaN ... NaN
5 NaN ... Cases (1) Case NumberStyle / DefendantFile Da...
6 Case Number ... File Date
7 08A575874 ... 11/17/2008
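The raw read_html frame carries nested header rows and summary rows from the portal's nested tables. A minimal cleanup sketch (the "Case Number" marker is taken from the output above; the Excel filename is just an example):
# Drop all-NaN rows and the repeated header rows, keeping only real case rows.
df = df.dropna(how="all")
df = df[df.iloc[:, 0] != "Case Number"]  # repeated header rows show "Case Number" in the first column
df.to_excel("robinson_results.xlsx", index=False)  # export for the Excel requirement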

Related

Selenium Python3 - AttributeError: 'str' object has no attribute 'tag_name'

Newbie to automation with Selenium/Python here. I'm getting blocked automating a sign-up form. The drop-down is a required element, but I'm getting the following error...
AttributeError: 'list' object has no attribute 'tag_name'
I've posted my code below and can't find any answer online as to why this would be. Any/all help greatly appreciated.
from re import X
from socket import timeout
from locale import currency
from operator import index
from unicodedata import name
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
import csv
import time

web = webdriver.Chrome("/home/nosra/chromedriver")
df = pd.read_excel('info1.xlsx')
url = ["https://docs.google.com/forms/d/e/1FAIpQLScGMoYVsxtsQ0Je4RTYEZndWrKkdt5jJwXBcMAcOia2WuIRtA/viewform?usp=sf_link"]
for link in url:
    for i in df.index:
        web.get(link)
        entry = df.loc[i]
        name = web.find_element(By.XPATH, "//*[@id='mG61Hd']/div[2]/div/div[2]/div/div/div/div[2]/div/div[1]/div/div[1]/input")
        time.sleep(1)
        name.send_keys(entry['name'])
        time.sleep(1)
        lastName = web.find_element(By.XPATH, "//*[@id='mG61Hd']/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input")
        time.sleep(2)
        lastName.send_keys(entry['lastName '])
        select_C = Select(web.find_elements(By.XPATH, "//*[@id='mG61Hd']/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div[1]/div[1]/div[3]/span"))
        select_C.select_by_index(1)
select_C is defined wrong: you should pass a single WebElement to Select (so you need web.find_element instead of web.find_elements), and your XPath should fetch a select node, not a span. In fact, the Select class is not applicable here at all, because Google Forms renders its drop-downs as divs rather than native select elements. Just click on
web.find_element('xpath', '//div[@role="option"]').click()
to open the drop-down menu, and then
web.find_element('xpath', '//div[@data-value="Tunisia" and @aria-selected="false"]').click()
to select the option.
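Put together, a minimal sketch of that interaction with explicit waits (selectors reused from the snippets above; "Tunisia" is just the example option value):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

wait = WebDriverWait(web, 10)
# Open the Google Forms drop-down (a div, not a native <select>) ...
wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@role="option"]'))).click()
# ... then click the desired option once the list has rendered.
wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@data-value="Tunisia" and @aria-selected="false"]'))).click()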

Scraping Amazon dropdown list using Selenium; Dynamic scraping

I am scraping amazon.ae. I was trying to scrape the size of clothes (e.g. jeans) by selecting from the drop-down menu. My code is as follows, but I got an error. Please help.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
url='https://www.amazon.ae/Jack-Jones-Glenn-Original-Pants/dp/B07JQB87KL/ref=sr_1_5?crid=M8QQKGLLZ1O9&keywords=jeans&qid=1657289288&sprefix=jeans%2Caps%2C232&sr=8-5&th=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
d=driver.find_element_by_name("dropdown_selected_size_name").click()
select=Select(d)
select.select_by_index(1)
#Error: AttributeError: 'NoneType' object has no attribute 'tag_name'
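The error comes from the chained .click(): click() returns None, so Select(d) is handed None rather than a WebElement. Also, Select only works on a real select tag, not the visible styled span. A hedged fix sketch (the native_dropdown_selected_size_name id is my assumption about Amazon's hidden native select; verify it in the page source):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
# Pass the element itself to Select; don't chain .click(), which returns None.
dropdown = driver.find_element(By.ID, "native_dropdown_selected_size_name")
Select(dropdown).select_by_index(1)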

How to get all stocks from the specified URL in selenium headless mode?

Below I describe the issue I have.
Description
I want to simple fetch all stocks from the URL: https://www.di.se/bors/large-cap/
I do this from a very slow computer with a small screen (15.1"); also, zoom is set to 150% in Windows.
I want to do this in Selenium headless mode using Java.
Problem
Not all stocks are visible, either on screen or in the inspector.
I try to fetch all stocks by the line:
driver.findElement(By.tagName("body")).getText();
This command doesn't return all stocks. If I go to the end of the page and page up to the end of the stock list, I can see "getting more data", or in my language, Swedish, "hämtar mer data", at the end of the stock list; i.e. the complete list with all stocks should end with Wihlborgs Fastigheter.
Inspect of current element gives:
<p class="instrument-table__load-more-info">Hämtar mer data...</p>
To update the page with more stocks, I have to scroll the page.
Question
How to fetch all stocks in headless mode in Java?
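The list is lazy-loaded, so the driver has to keep scrolling until the "Hämtar mer data..." row stops producing new entries. The answers in this thread are in Python, so here is a hedged Python sketch of the usual scroll-until-stable loop (the same pattern translates directly to Java's JavascriptExecutor):
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # headless mode; the flag name varies by Chrome version
driver = webdriver.Chrome(options=options)
driver.get("https://www.di.se/bors/large-cap/")

# Scroll to the bottom repeatedly until the page height stops growing,
# which means the lazy loader has no more rows to fetch.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give "Hämtar mer data..." time to fetch the next chunk
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

print(driver.find_element(By.TAG_NAME, "body").text)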
Can you simply download your data from the Yahoo Finance API?
import pandas as pd
import yfinance as yf

start = '2018-06-30'
end = '2020-06-30'
tickers = ['MSFT', 'AAPL', 'GOOG']

price_data = []
for ticker in tickers:
    # Download each ticker's price history and collect it for later use.
    data = yf.download(ticker, start, end)
    data = data.reset_index()
    price_data.append(data)
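To work with everything as one frame afterwards (a small follow-up sketch, not part of the original answer):
# Stack the per-ticker frames into a single dataframe keyed by ticker symbol.
df = pd.concat(price_data, keys=tickers, names=['Ticker', 'Row'])
print(df.head())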

What is the correct soup.find() command?

I am trying to web-scrape the race name ('The Valley R2') and the horse name ('Ronniejay') from the following website: https://www.punters.com.au/form-guide/form-finder/e2a0f7e13bf0057b4c156aea23019b18.
What is the correct soup.find() code to do this?
My code to get the race name:
from bs4 import BeautifulSoup
import requests
source = requests.get('https://www.punters.com.au/form-guide/form-finder/e2a0f7e13bf0057b4c156aea23019b18').text
soup = BeautifulSoup(source,'lxml')
race = soup.find('h3')
print(race)
The website renders its content with JavaScript, which requests doesn't execute. We can use Selenium as an alternative to scrape the page.
Install it with: pip install selenium.
Download the correct ChromeDriver from here.
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
URL = "https://www.punters.com.au/form-guide/form-finder/e2a0f7e13bf0057b4c156aea23019b18"
driver = webdriver.Chrome(r"C:\path\to\chromedriver.exe")
driver.get(URL)
# Wait for page to fully render
sleep(5)
soup = BeautifulSoup(driver.page_source, "lxml")
race_name = soup.select_one(".form-result-group__event span").text
horse_name = "".join(
    x for x in soup.select_one(".form-result__competitor-name").text if x.isalpha()
)
print(race_name)
print(horse_name)
driver.quit()
Output:
The Valley R2
Ronniejay
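The fixed sleep(5) works but is brittle on a slow connection; a hedged variant using an explicit wait (reusing the .form-result-group__event selector from the answer above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until the results actually render instead of sleeping a fixed 5 seconds.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".form-result-group__event span"))
)
soup = BeautifulSoup(driver.page_source, "lxml")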

SQL not working within a Python function; when printed, it reflects as a string

I am trying to run SQL code to retrieve data from IBM DB2 within a Python function that retrieves data from SAP GUI and, based on certain criteria, pulls the data from IBM DB2. When I print the DB2 connection, it works; however, the SQL code is being printed as a string. Note that I have not included the entire SAP login code, as it would be very long. When I run the same script separately, it works fine, retrieving the required data. Any idea why it is treating it like a string and not a SQL script?
Printed output:
<ibm_db_dbi.Connection object at 0x000002B8DF807588>
Select * from DBA.M82 T82
WHERE T82.EID IN 324809
Code is:
import win32com.client
import sys
import subprocess
import time
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime, timedelta
from multiprocessing import Process
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from time import sleep
from selenium.webdriver.support.ui import Select
import ibm_db
import ibm_db_dbi as db

def sap_login():
    dsn = "DRIVER={{IBM DB2 ODBC DRIVER}};" + \
          "DATABASE=;" + \
          "HOSTNAME=;" + \
          "PORT=;" + \
          "UID=;" + \
          "PWD=;"
    hdbc = db.connect(dsn, "", "")
    # session and i come from the omitted SAP GUI scripting portion
    e_id = session.findById("wnd[0]/usr/cntlBCALV_GRID_DEMO_0100_CONT1/shellcont/shell").GetCellValue(i, "ZEMP_CODE")
    sql = """ Select *
    from DBA.M82 T82
    WHERE T82.EID in {}""".format(e_id)
    print(sql)
    fsdd = pd.read_sql(sql, hdbc)

sap_login()
A note: you would need to convert your list to a tuple while using format to pull data for multiple values, so the IN clause gets its parentheses.
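A minimal sketch of that fix (table and column names reused from the question; the question-mark parameter style is my assumption about ibm_db_dbi, so verify its paramstyle):
e_ids = [324809, 324810]
# tuple(e_ids) formats as "(324809, 324810)", which is valid IN-clause syntax;
# beware a single id, though: a one-element tuple renders with a trailing comma.
sql = "SELECT * FROM DBA.M82 T82 WHERE T82.EID IN {}".format(tuple(e_ids))
# Safer alternative: parameter markers instead of string formatting.
placeholders = ",".join("?" * len(e_ids))
sql = "SELECT * FROM DBA.M82 T82 WHERE T82.EID IN ({})".format(placeholders)
fsdd = pd.read_sql(sql, hdbc, params=e_ids)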