Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print? - pandas

Hope I am asking this the right way - I'm just confused about what's going on: I have my working script (below). I'm trying to take the URLs from a spreadsheet rather than copy and paste them in - basically, creating urlsA from column N of the connected sheet.
I've tested it out - I can print urlsA to the terminal no problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script; I'm receiving the "No connection adapters were found" error.
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')

    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')

    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')

    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')

    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)

I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
as you might be pulling a list of lists. Hence, this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error, since a list is not a valid URL.
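To illustrate (a quick sketch of my own, not from the post): handing requests a list instead of a string reproduces the same error, because requests stringifies its argument and "['https://...']" is not a usable URL.

import requests

row = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100']
try:
    requests.get(row)  # the list is stringified to "['https://...']"
except requests.exceptions.RequestException as e:
    print(e)  # -> No connection adapters were found for "['https://...']"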

I needed to flatten urlsA after seeing it was an array of arrays. Adding this helper and then calling flatten fixed the issue:
def flatten(l):
    fl = []
    for sublist in l:
        for item in sublist:
            fl.append(item)
    return fl
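For reference, this is roughly how the fix slots in (my sketch, assuming batch_get returns one single-cell list per row):

ShUrls = wk.batch_get(('N3:N',))[0]  # e.g. [['https://...'], ['https://...'], ...]
urlsA = flatten(ShUrls)              # flat list of URL strings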

Related

ValueError: 4 columns passed, passed data had 1 columns

I'm a new learner. If the data in a column from the webpage is not an integer, I cannot append the row to my DataFrame.
[webpage data as seen in this image](https://i.stack.imgur.com/KBjRU.png)
Here is my code:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
ticker = input("Type your ticker symbol: ")
def get_balance_sheet_from_yfinance_web(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    header = {'Connection': 'keep-alive',
              'Expires': '-1',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
              }
    r = requests.get(url, headers=header)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)
    col = []
    pd.set_option('max_colwidth', None)
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)
    pd.set_option('max_colwidth', None)
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        i = 0
        idx = ""
        val = []
        for h in div.find_all('span'):
            if i == 0:
                idx = h.get_text()
            else:
                num = int(h.get_text().replace(",", "")) * 1000
                val.append(num)
            i += 1
        row = pd.DataFrame([val], columns=col, index=[idx])
        df = df.append(row)
    df.to_csv(f'{ticker}.csv')
    return df

print(get_balance_sheet_from_yfinance_web(ticker))
print(get_balance_sheet_from_yfinance_web(ticker))
I have tried replace('-', 0)
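A minimal sketch of the kind of guard I would try (my assumption, not a confirmed fix; to_number is a hypothetical helper): convert each cell's text to an int while treating placeholder cells such as "-" as 0, so every row ends up with one value per column and the column-count mismatch goes away.

def to_number(text):
    # Treat "-" or empty cells as 0 instead of letting int() raise ValueError
    cleaned = text.replace(",", "").strip()
    if cleaned in ("-", ""):
        return 0
    return int(cleaned) * 1000

print(to_number("12,345"))  # 12345000
print(to_number("-"))       # 0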

I can't get text from attribute span in bs4

When I try to get the text, I get output like this:
price = item.find('span').text
AttributeError: 'NoneType' object has no attribute 'text'
code:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url = 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')[0]
for item in table:
    price = item.find('span').text
    print(price)
Try:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url= 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')
for item in table:
    price = item.find('span', class_='float_lang_base_2')
    print(price.text)
1.1753
1.1752
- 0.4
Or if you require the field:
for item in table:
    field = item.find('span', class_='float_lang_base_1')
    price = item.find('span', class_='float_lang_base_2')
    print(field.text, ':', price.text)
Prev. Close : 1.1753
Open : 1.1752
1-Year Change : - 0.4
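As a small side note (my own sketch, not part of the answer): the original AttributeError came from rows where item.find('span') returned None, so checking for None before reading .text also avoids the crash.

for item in table:
    span = item.find('span', class_='float_lang_base_2')
    if span is not None:
        print(span.text)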

Python selenium/soup not scrolling and printing all job containers on LinkedIn

Here's the problem statement: The base_site link below takes us to a job search URL.
There are small containers that show jobs on the left pane of the webpage.
The problem is that with this code I can only see 7 containers in the output.
For example, it shows the first seven job result locations, whereas I am expecting all of them to be shown. I am using scrollIntoView for this, but that doesn't seem to help either.
What is it that I'm missing?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep
def get_driver():
    options = Options()
    options.add_argument("user-data-dir=C:\\Users\\abc\\AppData\\Local\\Google\\Chrome\\User Data")
    path = 'C:\\Program Files (x86)\\Google\\chromedriver.exe'
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(path, options=options)

    text_search = 'Product Development Engineer'
    location_search = 'california'
    # base_site = 'https://www.linkedin.com/jobs'
    base_site = 'https://www.linkedin.com/jobs/search/?currentJobId=2638809245&f_E=3%2C4&f_JT=F&f_SB2=3&f_TPR=r60' \
                '4800&geoId=102095887&keywords=product%20development%20engineer&location=California%2C%20United%20States&sortBy=R'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                             "70.0.3538.102 Safari/537.36 Edge/18.19582"}
    driver.get(base_site)
    parsing_job_data(driver, base_site, headers)

def parsing_job_data(driver, base_site, headers):
    try:
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        results = soup.find_all('div', class_="job-card-container relative job-card-list job-card-container--clickable "
                                              "job-card-list--underline-title-on-hover jobs-search-results-list__list-"
                                              "item--active jobs-search-two-pane__job-card-container--viewport-tracking"
                                              "-0")
        sleep(1)
        each_container = soup.select('[class*="occludable-update"]', limit=20)
        for container in each_container:
            element = driver.find_element_by_class_name("artdeco-entity-lockup__caption")
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            element.click()
            job_title = container.find('a', class_='disabled ember-view job-card-container__link job-card-list__title').text
            location = container.find('li', class_='job-card-container__metadata-item').text
            job_title = job_title.strip()
            location = location.strip()
            print(job_title, ', ', location)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    get_driver()
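The following approach sidesteps Selenium entirely and pages through LinkedIn's guest jobs endpoint directly with httpx and trio, collecting every job card into a pandas DataFrame: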
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}

async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

allin = []

async def worker(channel):
    async with channel:
        async for num in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(headers)
                params = {
                    "currentJobId": "2638809245",
                    "f_E": "3,4",
                    "f_JT": "F",
                    "f_SB2": "3",
                    "f_TPR": "r604800",
                    "geoId": "102095887",
                    "keywords": "product development engineer",
                    "location": "California, United States",
                    "sortBy": "R",
                    "position": "1",
                    "pageNum": "0",
                    "start": num
                }
                r = await client.get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search', params=params)
                soup = await get_soup(r.text)
                goal = [(x.h3.get_text(strip=True), x.select_one('.job-search-card__location').get_text(strip=True))
                        for x in soup.select('.base-search-card__info')]
                allin.extend(goal)

async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(2):
                nurse.start_soon(worker, receiver.clone())
            async with sender:
                for num in range(0, 450, 25):
                    await sender.send(num)

    df = pd.DataFrame(allin, columns=["Title", "Location"])
    print(df)
    #df.to_csv('result.csv', index=False)

if __name__ == "__main__":
    trio.run(main)
Output:
Title Location
0 Packaging Process Engineer Fremont, CA
1 Project Engineer Oakland, CA
2 Process Engineer- Materials and Fibers Santa Clarita, CA
3 Senior Product Design Engineer Carson, CA
4 Design Engineer Sacramento, CA
.. ... ...
436 Software Development Engineer Irvine, CA
437 Software Development Engineer Sunnyvale, CA
438 Software Development Engineer San Luis Obispo, CA
439 Software Development Engineer - Luna Irvine, CA
440 Software Development Engineer Irvine, CA
[441 rows x 2 columns]

Why not all of the coordinates are generated while running Geopy to scrape yellow pages?

The output is a CSV file with a list of businesses including name, address, telephone and coordinates. For some reason only some of the coordinates are generated; the missing ones, when run individually with geopy, do return coordinates, so geopy can potentially find all of them but sometimes skips. I thought it might need some time between API calls and added threading, but that didn't solve the issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ypscraper#gmail.com")
main_list = []
def extract(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup.find_all('div', class_='listing__content__wrap--flexed jsGoToMp')

def transform(articles):
    for item in articles:
        name = item.find('a', class_='listing__name--link listing__link jsListingName').text
        try:
            street = item.find('span', {'itemprop': 'streetAddress'}).text
        except:
            street = ''
        try:
            city = item.find('span', {'itemprop': 'addressLocality'}).text
        except:
            city = ''
        try:
            province = item.find('span', {'itemprop': 'addressRegion'}).text
        except:
            province = ''
        try:
            postCode = item.find('span', {'itemprop': 'postalCode'}).text
        except:
            postCode = ''
        try:
            phone = item.find('li', class_='mlr__submenu__item').text.strip()
        except:
            phone = ''
        try:
            def search_geo():
                global location
                location = geolocator.geocode(street + ' ' + city)
                print(street + ' ' + city)

            thread = threading.Thread(target=search_geo)
            thread.start()
            thread.join()
            slatitude = location.latitude
        except:
            slatitude = ''
        try:
            thread = threading.Thread(target=search_geo)
            thread.start()
            thread.join()
            slongitude = location.longitude
        except:
            slongitude = ''
        business = {
            'name': name,
            'street': street,
            'city': city,
            'province': province,
            'postCode': postCode,
            'phone': phone,
            'slongitude': slongitude,
            'slatitude': slatitude
        }
        main_list.append(business)
    return

def load():
    df = pd.DataFrame(main_list)
    df.to_csv('repairshopsbc', index=False)

for x in range(1, 2):
    print(f'Getting page {x}')
    articles = extract(f'https://www.yellowpages.ca/search/si/{x}/car+repair/British+Columbia+BC')
    transform(articles)
    time.sleep(5)

load()
print('Saved to CSV')
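Since the question already suspects the Nominatim API needs more time between calls, here is a minimal sketch (my own, not from the post) using geopy's built-in RateLimiter to space out requests instead of threads; the safe_geocode helper is hypothetical:

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="ypscraper#gmail.com")
# Wrap geocode so successive calls are at least 1 second apart (Nominatim's usage policy)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

def safe_geocode(street, city):
    location = geocode(f"{street} {city}")  # returns None when nothing is found
    if location is None:
        return '', ''
    return location.latitude, location.longitude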

Unable to parse data correctly in BeautifulSoup

Below is a snippet of the code I am using in order to parse data off a webpage
link1 = "https://www.codechef.com/status/" + sys.argv[1] + "?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link1)
s = response.read()
soup = BeautifulSoup(s)
l = soup.findAll('tr',{'class' : 'kol'})
Here is the URL of an example page that gets stored in the variable link1
https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO
Now, the problem is that the variable l always ends up as an empty list, even though there are entries in the table generated by the HTML tags I am trying to find.
Please help me out with this.
EDIT
Complete Code
from BeautifulSoup import BeautifulSoup
import urllib2
import os
import sys
import subprocess
import time
import HTMLParser
import requests

html_parser = HTMLParser.HTMLParser()
link = "https://www.codechef.com/status/"+sys.argv[1]+"?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link)
s = response.read()
soup = BeautifulSoup(s)
try:
    l = soup.findAll('div',{'class' : 'pageinfo'})
    for x in l:
        str_val = str(x.contents)
        pos = str_val.find('of')
        i = pos+3
        x = 0
        while i < len(str_val):
            if str_val[i] >= str(0) and str_val[i] <= str(9):
                x = x*10 + int(str_val[i])
            i += 1
except:
    x = 1
print x

global lis
lis = list()
break_loop = 0
for i in range(0,x):
    print i
    if break_loop == 1:
        break
    if i == 0:
        link1 = link
    else:
        link1 = "https://www.codechef.com/status/"+sys.argv[1]+"?page="+str(i)+"&sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
    # opener = urllib2.build_opener()
    # opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # response = opener.open(link1)
    useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    req = requests.get(link1, headers={'User-Agent': useragent})
    # s = response.read()
    soup = BeautifulSoup(req.content)
    l = soup.findAll('tr',{'class' : r'\"kol\"'})
    print l
    for val in l:
        lang_val = val.find('td',{'width' : '70'})
        lang = lang_val.renderContents().strip()
        print lang
        try:
            data = val.find('td',{'width' : '51'})
            data_val = data.span.contents
        except:
            break
        if lang != 'PHP':
            break_loop = 1
            break
        if len(data_val) > 1 and html_parser.unescape(data_val[2]) != '100':
            continue
        str_val = str(val.td.contents)
        p = 0
        j = 0
        while p < len(str_val):
            if str_val[p] >= str(0) and str_val[p] <= str(9):
                j = j*10 + int(str_val[p])
            p += 1
        lis.insert(0,str(j))

if len(lis) > 0:
    try:
        os.mkdir(sys.argv[1]+"_php")
    except:
        pass

count = 1
for data in lis:
    cmd = "python parse_data_final.py "+data+" > "+sys.argv[1]+"_php/"+sys.argv[1]+"_"+str(count)+".php"
    subprocess.call(cmd, shell=True)
    count += 1
Your code doesn't work because your class is wrong; try it with:
l = soup.findAll('tr',{'class' : r'\"kol\"'})
You can also get the tags like this:
l = soup.find('table', {'class': 'dataTable'}).tbody
Also, you should probably be using requests, depending on which version of Python you're using. Here's an example:
import requests
from bs4 import BeautifulSoup
url = "https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
req = requests.get(url, headers={'User-Agent': useragent})
soup = BeautifulSoup(req.content, "html.parser")
#l = soup.findAll('tr',{'class' : r'\"kol\"'})
l = soup.find('table', {'class': 'dataTable'}).tbody
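As a follow-up sketch (my addition, not part of the answer): once the tbody has been located, the individual rows and cells can be pulled out with find_all:

table = soup.find('table', {'class': 'dataTable'})
if table is not None and table.tbody is not None:
    for row in table.tbody.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        print(cells)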