Is it possible to web scrape the Pythagorean win/loss record column using Beautiful Soup and save the data to Excel? - pandas

This is my first scrape. I have tried watching some videos online and using Google, but I haven't had much success with this website. Perhaps someone can help me.
This is what I have started with...
from lxml import html
from bs4 import BeautifulSoup
import requests
import pandas as pd
req = requests.get('https://www.baseball-reference.com/leagues/MLB-standings.shtml')
soup = BeautifulSoup(req.text, "lxml")
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
my_dict = dict(zip(team, W_L))
df = pd.DataFrame(my_dict)
writer = pd.ExcelWriter('my1st_webscrape.xlsx')
df.to_excel(writer,'Sheet1')
writer.save()
I would just like the Pythagorean win/loss column, please. Thanks!

Search for all td tags that include data-stat corresponding to record_pythag and extract .text from each:
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
Note that this assumes every matched tag contains text. If a lookup returns None instead of a tag, calling .text on it raises an AttributeError (the 'NoneType' error), in which case you could wrap the extraction in a try/except block.
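For instance, a minimal sketch of that defensive pattern, iterating row by row so a missing cell is skipped rather than crashing (the row-wise loop is just an illustration, not part of the original answer):
W_L = []
for row in soup.find_all('tr'):
    # find() returns None when a row has no Pythagorean W-L cell
    cell = row.find('td', {'data-stat': 'record_pythag'})
    try:
        W_L.append(cell.text)
    except AttributeError:
        # skip rows without that cell instead of crashing
        pass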
I imagine the dictionary you want to create uses the team name abbreviations as the keys, in which case you can assemble a list of them like so:
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
Then you can create your dictionary:
my_dict = dict(zip(team, W_L))
And pass it to a dataframe:
df = pd.DataFrame(my_dict)
EDIT
It turns out you actually need Selenium in order to pull the information you are interested in. Selenium allows you to automate a web browser and it will load the full page source HTML. You will also need to convert the dictionary to a pandas Series prior to passing it to the DataFrame constructor. See revised code below (note that you will need to download the selenium chrome driver, or whichever web browser driver you are interested in using):
from bs4 import BeautifulSoup
import requests, os
import pandas as pd
from selenium import webdriver
os.chdir('path to the folder containing your chromedriver')
header = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}
options = webdriver.ChromeOptions(); options.add_argument("--start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get('https://www.baseball-reference.com/leagues/MLB-standings.shtml')
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
my_dict = dict(zip(team, W_L))
df = pd.DataFrame(pd.Series(my_dict))
writer = pd.ExcelWriter('my1st_webscrape.xlsx')
df.to_excel(writer,'Sheet1')
writer.save()
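As a side note, writer.save() is deprecated in newer pandas releases; if you hit a warning there, a minimal sketch of the same export using ExcelWriter as a context manager (the sample values below are invented and only stand in for the scraped data):
import pandas as pd

# stand-in for the scraped team -> Pythagorean W-L mapping
df = pd.DataFrame(pd.Series({'NYY': '60-45', 'BOS': '58-47'}))

# the context manager saves and closes the workbook automatically,
# so no explicit writer.save() call is needed
with pd.ExcelWriter('my1st_webscrape.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')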

Related

beautiful soup unable to scrape website contents

Hi, I am trying to do a simple web scrape of this website: https://www.sayurbox.com/p/Swallow%20Tepung%20Agar%20Agar%20Tinggi%20Serat%207%20gram
My code is this:
def userAgent(URL):
    ua = UserAgent()
    USER_AGENT = ua.random
    headers = {"User-Agent": str(USER_AGENT), "Accept-Encoding": "*", "Connection": "keep-alive"}
    resp = requests.get(URL, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        print(f'{URL}')
    else:
        print(f'error 200:{URL}')
        urlError = pd.DataFrame({'url': [URL],
                                 'date': [dateNow]})
        urlError.to_csv('errorUrl/errorUrl.csv', mode='a', index=False, header=False)
    return soup

soup = userAgent(url)
productTitle = soup.find_all('div', {"class": "InfoProductDetail__shortDesc"})
However, it is unable to find anything. Is there something wrong with my code? I tried adding time.sleep to wait for the page to load, but it still does not work. Help will be greatly appreciated.
Your code is fine, but the page is dynamic: the data is generated by JavaScript, which requests and BeautifulSoup can't execute, so you need an automation tool such as Selenium. The following code works:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://www.sayurbox.com/p/Swallow%20Tepung%20Agar%20Agar%20Tinggi%20Serat%207%20gram'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
title = soup.select_one('.InfoProductDetail__shortDesc').text
price = soup.select_one('span.InfoProductDetail__price').text
print(title)
print(price)
Output:
Swallow Tepung Agar Agar Tinggi Serat 7 gram
7.900
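Rather than a fixed time.sleep(5), you could wait explicitly for the element to appear. A minimal sketch using Selenium's WebDriverWait (the selector comes from the answer above; the 15-second timeout is an arbitrary choice):
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

url = 'https://www.sayurbox.com/p/Swallow%20Tepung%20Agar%20Agar%20Tinggi%20Serat%207%20gram'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)

# block until the product description is present, or 15 seconds pass
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.InfoProductDetail__shortDesc'))
)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(soup.select_one('.InfoProductDetail__shortDesc').text)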

bs4 can't get specific results

I am trying to get specific data from a website, but it sits under a class that is used multiple times. My thought was to search for the next-largest enclosing class and then use bs4 again to narrow the results further. However, I get this error:
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
This is my code:
import requests
from bs4 import BeautifulSoup

def main():
    responce()

def responce():
    r = requests.get('https://robinhood.com/stocks/WISH')
    soup = BeautifulSoup(r.content, 'html.parser')
    responce = soup.find_all(class_="css-ktio0g")
    responce = responce.find_all(class_="css-6e9xj2")
    print(responce)

main()
import requests
from bs4 import BeautifulSoup
from pprint import pp

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [x.text for x in soup.select('span.css-ktio0g')]
    pp(goal)

main('https://robinhood.com/stocks/WISH')
Output:
['Piotr Szulczewski',
'—',
'San Francisco, California',
'2010',
'7.25B',
'—',
'—',
'155.46M',
'$12.39',
'$11.44',
'$11.97',
'76.70M',
'$32.85',
'$7.52',
'— per share',
'Expected Aug 11, After Hours',
'Sign up for a Robinhood Account to buy or sell ContextLogic stock and '
'options commission-free.']
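As for the AttributeError in the question: soup.find_all returns a ResultSet, essentially a list of tags, so you cannot call find_all on it directly; call it on each element instead. A minimal sketch of that pattern, keeping the two class names from the question (whether those auto-generated class names actually nest that way on the live page is a separate issue):
import requests
from bs4 import BeautifulSoup

r = requests.get('https://robinhood.com/stocks/WISH')
soup = BeautifulSoup(r.content, 'html.parser')

# find_all returns a ResultSet; iterate over it and search inside each tag
for outer in soup.find_all(class_='css-ktio0g'):
    for inner in outer.find_all(class_='css-6e9xj2'):
        print(inner.text)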

Append href links into a dataframe list, getting all required info, but only links from last page appears

Using Beautiful Soup and pandas, I am trying to append all the links on a site into a list with the following code. I am able to scrape all pages with the relevant information in the table, and the code mostly works, but only the links from the last page appear in the output, which is not what I expected. In the end, I'd like to append all 40 links (next to the required info) across the 2 pages into the table. I am trying with 2 pages first, although there are 618 pages in total. Do you have any advice on how to adjust the code so that each page's links are appended to the table? Many thanks in advance.
import pandas as pd
import requests
from bs4 import BeautifulSoup

hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
dfs = []
for page_number in range(2):
    http = "http://example.com/&Page={}".format(page_number + 1)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.text, 'html.parser')
    table = soup.find('table')
    df_list = pd.read_html(url.text)
    df = pd.concat(df_list)
    dfs.append(df)

links = []
for tr in table.findAll("tr"):
    trs = tr.findAll("td")
    for each in trs:
        try:
            link = each.find('a')['href']
            links.append(link)
        except:
            pass

df['Link'] = links
final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv', index=False, encoding='utf-8-sig')
The problem is with your logic: you only add the Link column to the last df because that code sits outside your page loop. Collect the links within the page loop, add them to df there, and then append the df to your dfs list:
import pandas as pd
import requests
from bs4 import BeautifulSoup

hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
dfs = []
for page_number in range(2):
    http = "http://example.com/&Page={}".format(page_number + 1)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.text, 'html.parser')
    table = soup.find('table')
    df_list = pd.read_html(url.text)
    df = pd.concat(df_list)

    links = []
    for tr in table.findAll("tr"):
        trs = tr.findAll("td")
        for each in trs:
            try:
                link = each.find('a')['href']
                links.append(link)
            except:
                pass

    df['Link'] = links
    dfs.append(df)

final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv', index=False, encoding='utf-8-sig')
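If you are on pandas 1.5 or newer, read_html can also pull the hrefs out for you via its extract_links argument, which avoids the separate BeautifulSoup pass. A rough sketch for a single page (which column holds the links depends on the actual table, so treat the column handling as an assumption):
import pandas as pd
import requests

hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
resp = requests.get("http://example.com/&Page=1", headers=hdr)

# with extract_links="body", every body cell becomes a (text, href) tuple
df = pd.read_html(resp.text, extract_links="body")[0]

# split one column (here assumed to be the first) back into text and link
first_col = df.columns[0]
df['Link'] = df[first_col].str[1]
df[first_col] = df[first_col].str[0]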

input custom text in youtube text field using selenium python

I'm making a text scraper for YouTube: I want to enter a search term, search for videos, and collect data about them. I'm having trouble entering the data in the text field. Can anyone suggest a method to do that?
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
soup = BeautifulSoup(driver.page_source, 'lxml')  # Use the page as source
page = driver.get('https://freight.rivigo.com/dashboard/home')
import sys
from importlib import reload
reload
elem = driver.find_element_by_tag_name("body")
no_of_pagedowns = 120
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.5)
    no_of_pagedowns -= 1
soup = BeautifulSoup(driver.page_source, 'lxml')
In between this code I want to enter custom text into an input field, let's say "comedy", and get data for that search. I'm stuck on how to input the data, and I'm quite new to this, so any sort of help will be appreciated.
That page is NOT pointing to YouTube. Check out the working code sample below for an idea of what you can do with the YouTube API.
# https://medium.com/greyatom/youtube-data-in-python-6147160c5833
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from youtube_data import youtube_search
test = youtube_search("Nine Inch Nails")
test.keys()
test['commentCount'][:5]
df = pd.DataFrame(data=test)
df.head()
df1 = df[['title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']]
df1.columns = ['Title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']
df1.head()
#import numpy as np
#numeric_dtype = ['viewCount','commentCount','likeCount','dislikeCount','favoriteCount']
#for i in numeric_dtype:
# df1[i] = df[i].astype(int)
NIN = df1[df1['channelTitle']=='Nine Inch Nails']
NIN.head()
NIN = NIN.sort_values(ascending=False,by='viewCount')
plt.bar(range(NIN.shape[0]),NIN['viewCount'])
plt.xticks(range(NIN.shape[0]),NIN['Title'],rotation=90)
plt.ylabel('viewCount in 100 millions')
plt.show()
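If you do want to type a query such as "comedy" into YouTube's own search box with Selenium, a minimal sketch is below (the name attribute search_query is an assumption about YouTube's current markup and may change):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.youtube.com')

# wait for the search input, type the query, and submit with Enter
box = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.NAME, 'search_query'))
)
box.send_keys('comedy')
box.send_keys(Keys.RETURN)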

Extract specific columns from a given webpage

I am trying to read a web page using Python and save the data in CSV format to be imported as a pandas dataframe.
I have the following code, which extracts the links from all the pages; instead, I am trying to read certain column fields.
for i in range(10):
    url = 'https://pythonexpress.in/workshop/' + str(i).zfill(3)

    import urllib2
    from bs4 import BeautifulSoup

    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page)
        for anchor in soup.find_all('div', {'class': 'col-xs-8'})[:9]:
            print i, anchor.text
    except:
        pass
Can I save these 9 columns as pandas dataframe?
df.columns=['Organiser', 'Instructors', 'Date', 'Venue', 'Level', 'participants', 'Section', 'Status', 'Description']
This returns the correct results for the first 10 pages - but it takes a lot of time for 100 pages. Any suggestions to make it faster?
import urllib2
from bs4 import BeautifulSoup

finallist = list()
for i in range(10):
    url = 'https://pythonexpress.in/workshop/' + str(i).zfill(3)
    try:
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page)
        mylist = list()
        for anchor in soup.find_all('div', {'class': 'col-xs-8'})[:9]:
            mylist.append(anchor.text)
        finallist.append(mylist)
    except:
        pass

import pandas as pd
df = pd.DataFrame(finallist)
df.columns = ['Organiser', 'Instructors', 'Date', 'Venue', 'Level', 'participants', 'Section', 'Status', 'Description']
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df['participants'] = df['participants'].astype(int)
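On the speed question: most of the time is spent waiting on the network, so fetching pages concurrently helps far more than tweaking the parsing. A Python 3 sketch using requests and a thread pool (same URL pattern and selector as above; the worker count of 10 is arbitrary):
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape(i):
    url = 'https://pythonexpress.in/workshop/' + str(i).zfill(3)
    try:
        page = requests.get(url, timeout=10).text
        soup = BeautifulSoup(page, 'html.parser')
        return [a.text for a in soup.find_all('div', {'class': 'col-xs-8'})[:9]]
    except requests.RequestException:
        return None

# fetch up to 10 pages at a time; pool.map keeps the input order
with ThreadPoolExecutor(max_workers=10) as pool:
    rows = [r for r in pool.map(scrape, range(100)) if r and len(r) == 9]

df = pd.DataFrame(rows, columns=['Organiser', 'Instructors', 'Date', 'Venue', 'Level',
                                 'participants', 'Section', 'Status', 'Description'])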