Unable to parse data correctly in BeautifulSoup

Unable to parse data correctly in BeautifulSoup - beautifulsoup

Below is a snippet of the code I am using in order to parse data off a webpage
link1 = "https://www.codechef.com/status/" + sys.argv[1] + "?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link1)
s = response.read()
soup = BeautifulSoup(s)
l = soup.findAll('tr',{'class' : 'kol'})
Here is the URL of an example page that gets stored in the variable link1
https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO
Now, the problems is that the variable l always gets an empty list even though there are entries in the table generated by the HTML tags I am trying to find.
Please help me out with this.
EDIT
Complete Code
from BeautifulSoup import BeautifulSoup
import urllib2
import os
import sys
import subprocess
import time
import HTMLParser
import requests
html_parser = HTMLParser.HTMLParser()
link = "https://www.codechef.com/status/"+sys.argv[1]+"?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link)
s = response.read()
soup = BeautifulSoup(s)
try:
l = soup.findAll('div',{'class' : 'pageinfo'})
for x in l:
str_val = str(x.contents)
pos = str_val.find('of')
i = pos+3
x = 0
while i < len(str_val):
if str_val[i] >= str(0) and str_val[i] <= str(9):
x = x*10 + int(str_val[i])
i += 1
except:
x = 1
print x
global lis
lis = list()
break_loop = 0
for i in range(0,x):
print i
if break_loop == 1:
break
if i == 0:
link1 = link
else:
link1 = "https://www.codechef.com/status/"+sys.argv[1]+"?page="+str(i)+"&sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
# opener = urllib2.build_opener()
# opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# response = opener.open(link1)
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
req = requests.get(link1, headers={'User-Agent': useragent})
# s = response.read()
soup = BeautifulSoup(req.content)
l = soup.findAll('tr',{'class' : r'\"kol\"'})
print l
for val in l:
lang_val = val.find('td',{'width' : '70'})
lang = lang_val.renderContents().strip()
print lang
try:
data = val.find('td',{'width' : '51'})
data_val = data.span.contents
except:
break
if lang != 'PHP':
break_loop = 1
break
if len(data_val) > 1 and html_parser.unescape(data_val[2]) != '100':
continue
str_val = str(val.td.contents)
p = 0
j = 0
while p < len(str_val):
if str_val[p] >= str(0) and str_val[p] <= str(9):
j = j*10 + int(str_val[p])
p += 1
lis.insert(0,str(j))
if len(lis) > 0:
try:
os.mkdir(sys.argv[1]+"_php")
except:
pass
count = 1
for data in lis:
cmd = "python parse_data_final.py "+data+" > "+sys.argv[1]+"_php/"+sys.argv[1]+"_"+str(count)+".php"
subprocess.call(cmd, shell=True)
count += 1

Your code doesn't work because because your class is wrong, try it with:
l = soup.findAll('tr',{'class' : r'\"kol\"'})
You can also get the tags like this:
l = soup.find('table', {'class': 'dataTable'}).tbody
Also, you should probably be using requests depending on which version of python you're using. Here's an example:
import requests
from bs4 import BeautifulSoup
url = "https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
req = requests.get(url, headers={'User-Agent': useragent})
soup = BeautifulSoup(req.content, "html.parser")
#l = soup.findAll('tr',{'class' : r'\"kol\"'})
l = soup.find('table', {'class': 'dataTable'}).tbody

Related

Url Redirect Pyhton

I want to redirect the page to another url, but it's not happening. I would appreciate your help.
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
}
response = requests.get(ad_link, headers=headers, allow_redirects=True)
ic(response.status_code)
#ic(response.history)
if response.history:
ic("Request was redirected")
for resp in response.history:
ic(resp.status_code, resp.url)
ic('For Control')
ic("Final destination:")
ic(response.status_code, response.all_links)
ic(ad_link)
else:
ic("Request was not redirected")
ic(response)
time.sleep(sleep_time)
I wrote a loop like this, the content is 200 but it doesn't open the url to the screen. It doesn't run the for here at all.
I go to a url and log in and then I want it to take me to a link from the column named 'ad_link' in the database. Because I don't want to have to open that session all the time. I want to browse the links in my database with an open session.
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="m"
)
mycursor = mydb.cursor()
urllib3.disable_warnings()
sql = "SELECT ad_link FROM test"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 0
y = 5
#def fonksiyon(i):
#global x
#global y
number = np.arange(x,y)
for i in tqdm(number):
ad_link = dataframe.links[i] #ad_link = dataframe["links"][i]
print(ad_link)
Display = []
prefs = {"profile.managed_default_content_settings.images": 2} # this is to not load images
sx = random.randint(1000, 1500)
sn = random.randint(3000, 4500)
options = Options()
options = webdriver.ChromeOptions()
time.sleep(5)
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("detach", True)
capabilities = options.to_capabilities()
os.environ['WDM_SSL_VERIFY'] = '0'
options.add_experimental_option("prefs", prefs)
wsize = "--window-size=" + str(sx - 10) + ',' + str(sn - 10)
options.add_argument(str(wsize))
options.add_argument("prefs", )
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
options.add_argument(['--headless', '--disable-gpu', '--window-size=1920,1080', '--no-sandbox', '--disable-dev-shm-usage'])
service = Service(executable_path = r'C:\Users\Wiveda\chromedriver.exe')
test = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
sleep_time = 5
test.get(ad_link)
time.sleep(sleep_time)
ad_source = test.page_source
ad_soup = BeautifulSoup(ad_source, 'lxml')
mainresults = ad_soup.find_all('div', {'class': 'cBox u-inherit '})
try:
WebDriverWait(test, timeout=10).until(
lambda d: d.find_element(By.XPATH, "//button[#class='sc-bczRLJ-accept-btn']")).click()
WebDriverWait(test, timeout=10).until(
lambda d: d.find_element(By.XPATH, "//p[#class='phone-container']")).click()
tel_number = test.find_element(By.XPATH, "//p[#class='phone-container']").text
ic(tel_number)
except:
tel_number = 'Not Found Tel Number'
ic(tel_number)
time.sleep(1)
search_words = ""
try:
web_text = test.find_element(By.XPATH, "/html/body/div[6]/div/div[2]/div[3]/div[1]")
words = ["Import", "x", "y", "z"]
search_words = [word for word in words if re.findall(word, web_text)]
text_words = ''
if search_words:
for i, word in enumerate(search_words):
if i < len(search_words) - 1:
text_words += f"{word}, "
else:
text_words += f"{word}."
ic(f"\nCannot send mail because it contains the word.Index : {text_words}")
ic(re.findall)
print("İf tamamlandı")
print("Try tamamlandı")
except Exception:
text_words = "Not Found Words"
ic(text_words)
time.sleep(1)
#mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
try:
brand_and_model = ad_soup.find("h1", {"class": ('h u-word')}).get_text()
except:
brand_and_model = ' '
try:
model_version = ad_soup.find("div", {"class": ('list-title')}).get_text()
except:
model_version = ' '
try:
location = ad_soup.find("p", {"class": ('seller-address')}).get_text()
except:
location = ' '
try:
url_id = ad_soup.find(" ", {"class": ('')}).get_text()
except:
url_id = ''
cars_data = pd.DataFrame({
'brand_and_model': brand_and_model,
'model_version': model_version,
'location': location,
'tel_number': tel_number,
'url_id': url_id,
},
index=[0])
try:
table_pre = ad_soup.find("div", {"class": "cBox cBox--content cBox-body"}) # 1 (6 in one)
all_div = table_pre.findAll("div", {"class": ('key-feature__content')}) # 6 (2 in one)
all_title = table_pre.findAll("div", {"class": ('key-feature__label')}) # 6
all_results = table_pre.findAll("div", {"class": ('key-feature__value')}) # 6
except:
pass
description_list = []
value_list = []
try:
div_length = len(all_div)
except:
div_length = 6
for i in range(div_length):
try:
description_list.append(all_title[i].text)
description_list = list(map(lambda x: x.replace(" ", "_"), description_list))
value_list.append(all_results[i].text)
except:
description_list.append('')
value_list.append('')
all_key = []
all_value = []
try:
pdiv = ad_soup.find_all('div', {'class': 'bullet-list'})
except:
pass
equipment_key = []
try:
equipment_key_length = len(pdiv)
except:
equipment_key_length = 1
equipment_value = []
try:
dd_ul_li_length = len(pdiv)
except:
dd_ul_li_length = 1
df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
df1 = pd.DataFrame(list(zip(description_list, value_list)), columns=['description_list', 'value_list'])
df1 = df1.set_index('description_list').T.reset_index(drop=True)
df1 = df1.rename_axis(None, axis=1)
df1['link'] = ad_link
df1.insert(0, "brand_and_model", brand_and_model)
df1.insert(1, "model_version", model_version)
df1.insert(2, "location", location)
df1.insert(5, "tel_number", tel_number)
df2_3 = pd.concat([df2, df3])
df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
df2_3 = df2_3.rename_axis(None, axis=1)
df_last = pd.concat([df1, df2_3], axis=1)
df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
lambda x: x.apply(','.join, 1))
now = datetime.now()
datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
df_last['ad_link'] = ad_link
df_last['download_date_time'] = datetime_string
config = configparser.RawConfigParser()
config.read(filenames='my.properties')
scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m',
charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
cursor = scrap_db.cursor()
sql = """CREATE TABLE CARS(
brand_and_model VARCHAR(32),
model_version VARCHAR(64),
location VARCHAR(64),
tel_number VARCHAR(32),
mileage VARCHAR(32),
first_registration DATE(7),
ad_link VARCHAR(256),
download_date_time DATE(32),
search words VARCHAR(64)
url_id int(9)
)"""
#cursor.execute(sql) #Save data to the table
for row_count in range(0, df_last.shape[0]):
chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
brand_and_model = ""
model_version = ""
location = ""
tel_number = ""
mileage = ""
first_registration = ""
ad_link = ""
download_date_time = ""
url_id = ""
lenght_of_chunk = len(chunk[0])
if "brand_and_model" in cars_data:
try:
brand_and_model = chunk[0][0]
except:
brand_and_model = ""
if "model_version" in cars_data:
try:
model_version = chunk[0][1]
except:
model_version = ""
if "location" in cars_data:
try:
location = chunk[0][2]
except:
location = ""
if "tel_number" in cars_data:
try:
tel_number = chunk[0][5]
except:
tel_number = ""
if "Kilometerstand" in description_list:
index_no = description_list.index("Kilometerstand")
try:
mileage = value_list[index_no]
except:
mileage = ""
if "Erstzulassung" in description_list:
index_no = description_list.index("Erstzulassung")
try:
first_registration = value_list[index_no]
except:
first_registration = ""
if chunk[0][lenght_of_chunk - 2] != "":
ad_link = chunk[0][lenght_of_chunk - 2] # ad_link
if chunk[0][lenght_of_chunk - 1] != "":
download_date_time = chunk[0][lenght_of_chunk - 1] # datetime_string
if (brand_and_model == ' '):
control = "false"
else:
control = "true"
if control == "true":
mySql_insert_query = "INSERT INTO CARS(brand_and_model,model_version,location,tel_number,mileage,first_registration,ad_link,download_date_time,url_id) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
val = (brand_and_model, model_version, location, tel_number, mileage, first_registration, ad_link, download_date_time, url_id)
time.sleep(5)
cursor = scrap_db.cursor()
cursor.execute(mySql_insert_query, val)
scrap_db.commit()
ic(cursor.rowcount, "Record inserted successfully into *CARS* table")
if (tel_number == 'Not Found Tel Number') and (text_words == 'Not Found Words'):
control = "true"
else:
control = "pass"
time.sleep(10)
if control == "true":
ic("Mail Sending")
test.find_element(By.XPATH, "/div[2]/div[2]/div/div[4]/div/span").click()
test.implicitly_wait(5)
eMl = test.find_element(By.XPATH, "/html/div[1]/form/div[1]/div/input")
test.implicitly_wait(3)
eMl.click()
time.sleep(10)
eMl.send_keys("v#gmail.com")
time.sleep(8)
passw = WebDriverWait(test, 20).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/form/div[2]/div/input")))
test.implicitly_wait(3)
passw.click()
time.sleep(1)
passw.send_keys('W' + Keys.ENTER)
time.sleep(5)
test.find_element(By.XPATH,'/html/div/div[2]/a').click()
time.sleep(3)
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
}
response = requests.get(ad_link, headers=headers, allow_redirects=True, timeout=2.50)
ic(response.status_code)
if response.history:
for step in response.history:
ic("Request was redirected")
for resp in response.history:
ic(resp.status_code, resp.url)
ic('For Control')
ic("Final destination:")
ic(response.status_code, response.all_links)
ic(ad_link)
else:
ic("Request was not redirected")
ic(response)
time.sleep(sleep_time)
ic('destination_status')

Selenium Issue with Exec?

I am running a webscraper with selenium to get some data on the NBA. I have urls to get to the websites for each of the 30 teams, but when I run the code it only gets through a few of the urls and then crashes with the errors below being shown:
#web scraper
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import pandas as pd
import os
class NBAScraper:
def __init__(self):
#part 1
url = "https://www.nba.com/teams"
HTML = requests.get(url)
soup = BeautifulSoup(HTML.text, 'html.parser')
text = str(soup.find_all("a", "Anchor_anchor__cSc3P TeamFigureLink_teamFigureLink__uqnNO"))
ids = []
for i in range(0, 30):
hr = text.find("stats")
ids.append(text[(hr+11):(hr+21)])
text = text[(hr+22):]
#part 2
names = []
for j in range(0, 30):
url2 = "https://www.nba.com/stats/team/"+str(ids[j])+"/advanced"
HTML2 = requests.get(url2)
soup2 = BeautifulSoup(HTML2.text, 'html.parser')
##div class="TeamHeader_name__MmHlP
name = str(soup2.find("div", "TeamHeader_name__MmHlP"))
ni = name.find("div>")
ni2 = name.find("<!")
name1 = name[(ni+4):ni2]
name = name[ni2:]
ni3 = name.find("<div>")
name = name[(ni3+5):]
ni4 = name.find("</div>")
name2 = name[:ni4]
n = name1 + " " + name2
names.append(n)
##tbody class="Crom_body__UYOcU"
#part 3
offrtg = []
defrtg = []
reb = []
tov = []
efg = []
for k in range(0, 30):
self.driver = webdriver.Chrome()
url3 = "https://www.nba.com/stats/team/"+str(ids[k])+"/advanced"
self.driver.get(url3)
rndrhtml = self.driver.page_source
self.driver.close()
#self.driver.quit()
soup3 = BeautifulSoup(rndrhtml, 'html.parser')
ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
for d in range(0, 13):
di = ovrall.find("<td>")
ovrall = ovrall[(di+4):]
#conditions
if d == 2:
di2 = ovrall.find("</td>")
offrtg.append(float(ovrall[:di2]))
elif d == 3:
di2 = ovrall.find("</td>")
defrtg.append(float(ovrall[:di2]))
elif d == 10:
di2 = ovrall.find("</td>")
reb.append(float(ovrall[:di2]))
elif d == 11:
di2 = ovrall.find("</td>")
tov.append(float(ovrall[:di2]))
elif d == 12:
di2 = ovrall.find("</td>")
efg.append(float(ovrall[:di2]))
#writing to excel
os.remove(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx")
d = {'Name': names, 'OFFRTG': offrtg, 'DEFRTG': defrtg, 'REB': reb,
'TOV': tov, 'EFG': efg}
df = pd.DataFrame(data=d)
df.to_excel(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx", sheet_name="STATS")
NBAScraper()
I tried to play around with the closing and quitting functions for the driver, or put the driver in a separate function and run it outside the class, but none of that worked. I realized through some testing that even if it's not inside a loop, selenium will throw the error for a url but run it fine the second time. I tried using implicit waits to solve this but to no avail.
Traceback (most recent call last):
File "C:\Program Files\Spyder\pkgs\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\users\jackm\spyder\nba.py", line 104, in <module>
NBAScraper()
File "c:\users\jackm\spyder\nba.py", line 71, in __init__
ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
AttributeError: 'NoneType' object has no attribute 'find_all'

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

Hope I am asking this the right way - just confused with what's going on: I have my working script (below). I'm trying to take the URLs from a spreadsheet, rather than copy and paste them in - basically, creating urlsA from column N on the sheet connected.
I've tested it out - I can print urlsA to terminal no problem, so I know the Sheet connection is working. I just can't seem to use them when I try to run the full script. I'm receiving this error:
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)

I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
As you might be pulling a list of lists, hence, this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error as a list is not a valid URL.

Needed to flatten urlsA after seeing it was an array of arrays. Using this, then calling flatten fixed the issue:
def flatten(l):
fl = []
for sublist in l:
for item in sublist:
fl.append(item)
return fl

ValueError: 4 columns passed, passed data had 1 columns

'NEW LEARNER'
If data in column from webpage is not an integer, I cannot append the row to my data frame.
[webpage data as seen by this image] (https://i.stack.imgur.com/KBjRU.png)
Here is my code:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
ticker = input("Type your ticker symbol: ")
def get_balance_sheet_from_yfinance_web(ticker):
url = f"https://finance.yahoo.com/quote/%7Bticker%7D/balance-sheet?p=%7Bticker%7D"
header = {'Connection': 'keep-alive',
'Expires': '-1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
r = requests.get(url, headers=header)
html = r.text
soup = BeautifulSoup(html, "html.parser")
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
col = []
pd.set_option('max_colwidth', None)
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
df = pd.DataFrame(columns=col)
pd.set_option('max_colwidth', None)
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
i = 0
idx = ""
val = []
for h in div.find_all('span'):
if i == 0:
idx = h.get_text()
else:
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
i += 1
row = pd.DataFrame([val], columns=col, index=[idx])
df = df.append(row)
df.to_csv(f'{ticker}.csv')
return df
print(get_balance_sheet_from_yfinance_web(ticker))
I have tried replace('-', 0)

requests_htlml infinite scrolling on div instead of entire page

Hello I am trying to get all the links from below web page. This page loads new product when we scroll down and I am trying to get the links for all the products by scrolling to the bottom of the page. I am using scrolldown method of requests_html after following this post however it only fetches links of the products that are visible without scroll. The problem is it is scrolling down the complete page instead of the product frame. If you see the below image the products are loaded only when you scroll at the bottom of the products frame.
I also tried seleniumwire(check below code) but it does the same thing, scrolls to the bottom of the page where no products are loaded. How ca I only scroll the products div?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from seleniumwire import webdriver
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/74.0.3729.169 Safari/537.36 '
}
driver = webdriver.Chrome(executable_path="/src/resources/chromedriver")
driver.implicitly_wait(30)
product_links = []
try:
SCROLL_PAUSE_TIME = 2
def interceptor(request):
del request.headers['Referer'] # Delete the header first
request.headers['Referer'] = header
# Set the interceptor on the driver
driver.request_interceptor = interceptor
# All requests will now use 'some_referer' for the referer
driver.get(baseurl)
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
# r = requests.get(driver.page_source, headers=header)
print(driver.page_source)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# product_list = soup.find_all('div', class_='col-item productInfoDiv ')
#
# for itemprop in product_list:
# for link in itemprop.find_all('a', href=True):
# product_links.append("{}{}".format(baseurl, link['href']))
#
# product_links_uniq = set(product_links)
#
# print(product_links_uniq)
finally:
driver.quit()
from requests_html import HTML, HTMLSession
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
session = HTMLSession()
page = session.get(baseurl)
page.html.render(scrolldown=50, sleep=3)
html = HTML(html=page.text)
#noticeName = html.find('a href')
all_links = html.links
for ln in all_links:
print(ln)
print(len(all_links))
filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))

You could just mimic the POST requests the page does and keep requesting batches of 20 results, extracting the links, until you have gathered the total specified number of results.
import requests
import math
from bs4 import BeautifulSoup as bs
def add_product_links(soup):
product_links.extend(['https://www.medplusmart.com' + i['href']
for i in soup.select('.productInfoDiv > div:nth-child(1) > [href^=\/product]')])
return
product_links = []
n = 0
results_per_page = 20
page = 1
data = {
'sortField': '',
'startIndex': n,
'productCategoryId': 'MART_20002',
'startPrice': '',
'endPrice': '',
'minPrice': '0',
'maxPrice': '2650',
'excludeNoStock': 'N',
'pCatName': 'personal-care_10102',
'catName': 'skin-care_20002',
'productIdString': '',
'Brand Search': ''
}
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
r = s.get(
'https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
soup = bs(r.content, 'lxml')
data['productIdString'] = soup.select_one('#productIdString')['value']
num_results = int(soup.select_one('#totalProductFound')['value'])
num_pages = math.ceil(num_results / results_per_page)
add_product_links(soup)
s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})
while True:
if page > num_pages:
break
data['startIndex'] = n
r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
soup = bs(r.content, 'lxml')
add_product_links(soup)
n += results_per_page
page += 1
print(len(product_links))

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Unable to parse data correctly in BeautifulSoup - beautifulsoup

Related

Url Redirect Pyhton

Selenium Issue with Exec?

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

ValueError: 4 columns passed, passed data had 1 columns

requests_htlml infinite scrolling on div instead of entire page

Categories

Resources