Pandas: Creating multiple new columns from function with multiple output values - pandas

Im trying to scrape a website for multiple values regarding a list of books. The links to the book pages are stored in a dataframe. Now I need a function that iterates those links and adds the book values to new columns in the dataframe. I don't want to request the page again every time I'm scraping a new book value, so I want to do it all in one function.
The problem is the function then returns multiple values (e.g. book_title and book_rating) which I don't know how to best add to the dataframe.
I tried the following, which I know can't work but I'm stuck:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
#function to get the book page
def get_book_page(page):
# Construct the URL
books_page_url = page
# Get the HTML page content using requests
response = rq.get(books_page_url, headers = headers)
# Ensure that the response is valid
if response.status_code != 200:
print('Status code:', response.status_code)
raise Exception('Failed to fetch web page ' + books_page_url)
# Construct a beautiful soup document
doc = bs(response.content, "html.parser")
return doc
#function to scrape the book title
def scrape_book_title(book_content):
try:
title_tag = book_content.find("h1", class_="bc-heading bc-color-base bc-size-large bc-text-bold").text.strip()
except:
title_tag = "fehlt"
return title_tag
#function to scrape the book rating
def scrape_book_rating(book_content):
star_tag = book_content.find("li", class_="bc-list-item ratingsLabel")
try:
rating_tag = star_tag.find("span", class_="bc-text bc-pub-offscreen").text.strip()
except:
rating_tag = "fehlt"
return rating_tag
#function I'm trying to fix
def get_book_title(links):
bs_page = get_book_page(links)
bs_content = bs_page.find("ul", class_="bc-list bc-spacing-s2 bc-color-secondary bc-list-nostyle")
book_title = scrape_book_title(bs_content)
book_rating = scrape_book_rating(bs_content)
return book_title, book_rating
#here I would like to add the columns "A_Titel" and "A_Rating" with the values of "book_title" and "book_rating"
df['A_Titel'], df['A_Rating'] = df.apply(lambda x: get_book_title(x.Link), axis=1)

Related

Using BeautifulSoup to exploit a URL and its dependent pages and store results in csv?

This code does not crash, which is good. However, it generates and empty icao_publications.csv f. I want to populate icao_publications.csv with all record on all the pages from the URL and capture all the pages. The dataset should be about 10,000 rows or their about in all.
I want to get these 10,000 or so rows in the csv file.
import requests, csv
from bs4 import BeautifulSoup
url = 'https://www.icao.int/publications/DOC8643/Pages/Search.aspx'
with open('Test1_Aircraft_Type_Designators.csv', "w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["Manufacturers", "Model", "Type_Designator", "Description", "Engine_Type", "Engine_Count", "WTC"])
while True:
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
for row in soup.select('table tbody tr'):
writer.writerow([c.text if c.text else '' for c in row.select('td')])
if soup.select_one('li.paginate_button.active + li a'):
url = soup.select_one('li.paginate_button.active + li a')['href']
else:
break
Here you go:
import requests
import pandas as pd
url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
resp = requests.post(url).json()
df = pd.DataFrame(resp)
df.to_csv('aircraft.csv',encoding='utf-8',index=False)
print('Saved to aircraft.csv')

How to improve the speed of getting request content via the request module

The below functions extract content from 'http://thegreyhoundrecorder.com.au/form-guides/' and append all content to a list. The function works fine, although the speed at which the content is scraped from the website is slow. This line tree = html.fromstring(page.content) in particular slows down the process. Is there a way I can improve on the speed of my request.
import lxml
from lxml import html
import requests
import re
import pandas as pd
from requests.exceptions import ConnectionError
greyhound_url = 'http://thegreyhoundrecorder.com.au/form-guides/'
def get_page(url):
"""fxn take page url and return the links to the acticle(Field) we
want to scrape in a list.
"""
page = requests.get(url)
tree = html.fromstring(page.content)
my_list = tree.xpath('//tbody/tr/td[2]/a/#href') # grab all link
print('Length of all links = ', len(my_list))
my_url = [page.url.split('/form-guides')[0] + str(s) for s in my_list]
return my_url
def extract_data(my_url):
"""
fxn take a list of urls and extract the needed infomation from
greyhound website.
return: a list with the extracted field
"""
new_list = []
try:
for t in my_url:
print(t)
page_detail = requests.get(t)
tree_1 = html.fromstring(page_detail.content)
title = ''.join(tree_1.xpath('//div/h1[#class="title"]/text()'))
race_number = tree_1.xpath("//tr[#id = 'tableHeader']/td[1]/text()")
Distance = tree_1.xpath("//tr[#id = 'tableHeader']/td[3]/text()")
TGR_Grade = tree_1.xpath("//tr[#id = 'tableHeader']/td[4]/text()")
TGR1 = tree_1.xpath("//tbody/tr[#class='fieldsTableRow raceTipsRow']//div/span[1]/text()")
TGR2 = tree_1.xpath("//tbody/tr[#class='fieldsTableRow raceTipsRow']//div/span[2]/text()")
TGR3 = tree_1.xpath("//tbody/tr[#class='fieldsTableRow raceTipsRow']//div/span[3]/text()")
TGR4 = tree_1.xpath("//tbody/tr[#class='fieldsTableRow raceTipsRow']//div/span[4]/text()")
clean_title = title.split(' ')[0].strip()
#clean title and extract track number
Track = title.split(' ')[0].strip()
#clean title and extract track date
date = title.split('-')[1].strip()
#clean title and extract track year
year = pd.to_datetime('now').year
#convert date to pandas datetime
race_date = pd.to_datetime(date + ' ' + str(year)).strftime('%d/%m/%Y')
#extract race number
new_rn = []
for number in race_number:
match = re.search(r'^(.).*?(\d+)$', number)
new_rn.append(match.group(1) + match.group(2))
new_list.append((race_date,Track,new_rn,Distance,TGR_Grade,TGR1,TGR2,TGR3,TGR4))
return new_list
except ConnectionError as e:
print('Connection error, connect to a stronger network or reload the page')

Beautiful Soup List of Words

I am trying to find certain words within a website. Right now my code can only check for one word but I want it to be able to check for multiple words, (say instead of just checking for 'dog', i want it to check for ["dog","cat","adult"]
#Import Packages
import requests
from bs4 import BeautifulSoup
def count_words(url, the_word):
r = requests.get(url, allow_redirects=False)
soup = BeautifulSoup(r.content, 'lxml')
words = soup.find(text=lambda text: text and the_word in text)
print(words)
def main():
url = 'https://patch.com/illinois/alsip-crestwood/pet-adoption-alsip-crestwood-area-see-latest-
dogs-cats-more'
word= 'dog'
count = count_words(url, word)
print(url, count, word)
if __name__ == '__main__':
main()
Basically I do not know how to pass in a list of words instead of one singular string!
I believe you're making it a bit too complicated than what is actually necessary. Try something like this:
url = "https://patch.com/illinois/alsip-crestwood/pet-adoption-alsip-crestwood-area-see-latest-dogs-cats-more"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
pets = ["dog","cat"]
for pet in pets:
print(pet, len(soup.find_all(text=lambda text: text and pet in text)))
Output:
dog 13
cat 76

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URL's from Database and also need ID's associated with the URL so that I can pass it in the ITEMS pipeline and store in the table along with items.
I am using "make_requests_from_url(row[1])" to pass the start URL's "start_urls = []" which forms the list of starting URL's. The id's row[0] is what I need to pass to Items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
name = 'amz_ProductInfo_Spider'
nextPageNumber = 2
allowed_domains = ['amazon.in']
start_urls = []
url_fid = []
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
self.cursor = self.connection.cursor()
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield self.make_requests_from_url(row[1])
I have tried with comparing "response.url" in parse method but that changes as spider moves on to next page.
Not sure how can I achieve this. any direction is appreciated.
It's not clear why do you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
url_id = response.meta["url_id"]

How to get value of a cell in html page when click to a link in list link?

I have a list about 5000 link.
Ex 2 in 5000 link:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get value of column Time of Day and row Finish of links.
Ex:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I got user infor of a website, that website has id, class. But table in https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 have not id, class in table. So I can't.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup
list_user = []
for userID in range(1, 100000):
link = "https://example.com/member.php?u=" + str(userID)
html = urlopen(link)
bsObj = BeautifulSoup(html, "lxml")
user_name = bsObj.find("div", {"id":"main_userinfo"}).h1.get_text()
list_user.append(user_name)
print("username", userID, "is: ", user_name)
with open("result.txt", "a") as myfile:
myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table and you want the second column (td) of the last row. You can use last:child to select the last row; which should be used in conjunction with tbody type selector, and > child combinator, so as not to get header row. You can use nth-of-type to specify the td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where not found e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to lists/data structure, which can be output as a dataframe at end and written out as csv. Or you may wish to stick with your current method
Python:
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']
with requests.Session() as s:
for url in urls:
r = s.get(url)
soup = bs(r.content, 'lxml')
name = soup.select_one('title').text
timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
print(name, timing)
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup
f = open("input.ecopark","r")
f_content = f.readlines()
f.close()
for url in f_content:
r = requests.get(url.rstrip())
soup = BeautifulSoup(r.text, 'html.parser')
result = soup.select("table tbody tr td")
x = ""
for i in result:
if not x:
if i.get_text() == "Finish":
x = 1
continue
if x:
print(url.rstrip()+ " "+i.get_text())
break