Better way to clean product description using BeautifulSoup?

I have written the following code to fetch a product description from a site using BeautifulSoup:
def get_soup(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content
            return BeautifulSoup(html, "html.parser")
    except Exception as ex:
        print("error from " + url + ": " + str(ex))

def get_product_details(url):
    try:
        soup = get_soup(url)
        prod_details = dict()
        desc_list = soup.select('p ~ ul')
        prod_details['description'] = ''.join(desc_list)
        return prod_details
    except Exception as ex:
        logger.warning('%s - %s', ex, url)

if __name__ == '__main__':
    get_product_details("http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html")
In the above code I am trying to convert the description (a list) to a string, but I get the issue below:
[WARNING] aprisin.py:82 get_product_details() : sequence item 0: expected str instance, Tag found - http://www.aprisin.com.sg/p-748-littletikespoptunesguitar.html
Output of the description without converting it to a string:
[<ul>
<li>Freestyle</li>
<li>Play along with 5 pre-set tunes: </li>
</ul>, <ul>
<li>Each string will play a note</li>
<li>Guitar has a whammy bar</li>
<li>2-in-1 volume control and power button </li>
<li>Simple and easy to use </li>
<li>Helps develop music appreciation </li>
<li>Requires 3 "AA" alkaline batteries (included)</li>
</ul>]

You are passing a list of Tag objects instead of strings to join(). join() works with a list of strings. Use one of the following changes for the join call:
prod_details['description'] = ''.join([tag.get_text() for tag in desc_list])
or, if each tag wraps a single string (note that tag.string returns None when a tag has more than one child, which is the case for these <ul> elements, so get_text() is the safer choice here):
prod_details['description'] = ''.join([tag.string for tag in desc_list])
In case you want the description along with the HTML markup, you can use one of the following:
# this will preserve the html tags and indentation.
prod_details['description'] = ''.join([tag.prettify() for tag in desc_list])
or
# this will return the html content as a string.
prod_details['description'] = ''.join([str(tag) for tag in desc_list])
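Putting it together, a minimal sketch of the corrected function (assuming the same imports, logger and get_soup from the question; here the <ul> texts are joined with newlines rather than an empty string, purely for readability):
def get_product_details(url):
    try:
        soup = get_soup(url)
        prod_details = dict()
        desc_list = soup.select('p ~ ul')
        # get_text() turns each Tag into plain text, so join() receives strings
        prod_details['description'] = '\n'.join(tag.get_text(" ", strip=True) for tag in desc_list)
        return prod_details
    except Exception as ex:
        logger.warning('%s - %s', ex, url)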

desc_list is a list of bs4.element.Tag. You should convert the tag to a string:
desc_list = soup.select('p ~ ul')
prod_details['description'] = str(desc_list[0])

You're trying to join a list of Tags, but the join method needs str arguments. Try:
''.join([str(i) for i in desc_list])

Related

Why does my web scraping function not export the data?

I am currently web scraping a few pages from a list of URLs. I have provided the code below.
pages = {
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375", #More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388", #More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435", #More than one page
}

products = []
prices = []
images = []
urls = []

def export_data():
    logging.info("exporting data to pandas dataframe")
    supervalu = pd.DataFrame({
        'img_url' : images,
        'url' : urls,
        'product' : products,
        'price' : prices
    })
    logging.info("sorting data by price")
    supervalu.sort_values(by=['price'], inplace=True)
    output_json = 'supervalu.json'
    output_csv = 'supervalu.csv'
    output_dir = Path('../../json/supervalu')
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info("exporting data to json")
    supervalu.to_json(output_dir / output_json)
    logging.info("exporting data to csv")
    supervalu.to_csv(output_dir / output_csv)

def get_data(div):
    raw_data = div.find_all('div', class_='ga-product')
    raw_images = div.find_all('img')
    raw_url = div.find_all('a', class_="ga-product-link")
    product_data = [data['data-product'] for data in raw_data]
    new_data = [d.replace("\r\n","") for d in product_data]
    for name in new_data:
        new_names = re.search(' "name": "(.+?)"', name).group(1)
        products.append(new_names)
    for price in new_data:
        new_prices = re.search(' "price": ''"(.+?)"', price).group(1)
        prices.append(new_prices)
    for image in raw_images:
        new_images = image['data-src']
        images.append(new_images)
    for url in raw_url:
        new_url = url['href']
        urls.append(new_url)

def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        exit()
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    try:
        load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
        if load_more_text == 'Load more':
            next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
            logging.info("Scraping next page: {}".format(next_page))
            scrape_page(next_page)
        else:
            export_data()
    except:
        logging.warning("No more next pages to scrape")
        pass

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)
The main issue appears during the try/except handling of the next page. Since not all of the pages provided have the appropriate snippet, an AttributeError is raised, hence I have wrapped that block in a try/except. I want to scrape the pages that don't have a next page anyway, skip past the error, and continue looping through the rest of the pages until a next page arises. All of the pages appear to be looped through, but I never get the data exported. If I try the following code:
try:
    load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
    if load_more_text == 'Load more':
        next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)
except:
    logging.warning("No more next pages to scrape")
    pass
else:
    export_data()
This is the closest I have gotten to the desired outcome. The above code works and the data gets exported, but not all of the pages end up in the export, because a new dataframe is created every time a chain of next pages ends: the code iterates through the list, finds a next page, scrapes that chain of pages, and then builds a new dataframe that overwrites the previous data.
I'm hoping that someone can give me some guidance, as I have been stuck on this part of my personal project and I'm not sure how to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and I have received my desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
    next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
    logging.info("Scraping next page: {}".format(next_page))
    scrape_page(next_page)
else:
    export_data()
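If the goal is one export that covers every start page, another option (a rough sketch, not tested against the site) is to drop the export_data() call from scrape_page() altogether and export once after the main loop has finished, so the dataframe is built from the accumulated lists a single time:
for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)

# build and write the dataframe once, after all pages and "load more" chains are done
export_data()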

Webscraping customer review - Invalid selector error using XPath

I am trying to extract the user id, rating and review from the following site using Selenium, and it is showing an "invalid selector" error. I think the XPath I have defined to get the review text is the reason for the error, but I am unable to resolve the issue. The site link is below:
teslamotor review
The code that I have used is the following:
# Class for review webscraping from the consumeraffairs.com site
class CarForumCrawler():
    def __init__(self, start_link):
        self.link_to_explore = start_link
        self.comments = pd.DataFrame(columns = ['rating','user_id','comments'])
        self.driver = webdriver.Chrome(executable_path=r'C:/Users/mumid/Downloads/chromedriver/chromedriver.exe')
        self.driver.get(self.link_to_explore)
        self.driver.implicitly_wait(5)
        self.extract_data()
        self.save_data_to_file()

    def extract_data(self):
        ids = self.driver.find_elements_by_xpath("//*[contains(@id,'review-')]")
        comment_ids = []
        for i in ids:
            comment_ids.append(i.get_attribute('id'))
        for x in comment_ids:
            # Extract the rating for each user on a page
            user_rating = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[1]/div/img')[0]
            rating = user_rating.get_attribute('data-rating')
            # Extract the user id for each user on a page
            userid_element = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[2]/div[2]/strong')[0]
            userid = userid_element.get_attribute('itemprop')
            # Extract the message for each user on a page
            user_message = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]]/div[3]/p[2]/text()')[0]
            comment = user_message.text
            # Add rating, userid and comment for each user to the dataframe
            self.comments.loc[len(self.comments)] = [rating,userid,comment]

    def save_data_to_file(self):
        # save the dataframe content to a CSV file
        self.comments.to_csv('Tesla_rating-6.csv', index = None, header=True)

    def close_spider(self):
        # end the session
        self.driver.quit()

try:
    url = 'https://www.consumeraffairs.com/automotive/tesla_motors.html'
    mycrawler = CarForumCrawler(url)
    mycrawler.close_spider()
except:
    raise
The error that I am getting is as follows:
Also, the XPath that I tried to trace is from the following HTML:
You are seeing the classic error of...
as find_elements_by_xpath('//*[@id="' + x +'"]]/div[3]/p[2]/text()')[0] would select text nodes; instead you need to pass an XPath expression that selects elements.
You need to change it to (the stray extra ] after the id predicate is also removed):
user_message = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[3]/p[2]')[0]
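For context, a minimal illustration of the difference (using a hypothetical id "review-1" and the same Selenium driver as in the question):
# invalid for Selenium: the expression yields text nodes, not elements
# self.driver.find_elements_by_xpath('//*[@id="review-1"]/div[3]/p[2]/text()')

# valid: select the <p> element, then read its text in Python
elem = self.driver.find_elements_by_xpath('//*[@id="review-1"]/div[3]/p[2]')[0]
comment = elem.text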
References
You can find a couple of relevant detailed discussions in:
invalid selector: The result of the xpath expression "//a[contains(@href, 'mailto')]/@href" is: [object Attr] getting the href attribute with Selenium

All the links in the same row

I'm building a database with 2 columns: event_name and event_URL. It doesn't get the name and puts all the URLs in the event_URL column. Screenshot: https://prnt.sc/fru1tr
Code:
import urllib2
from bs4 import BeautifulSoup
import psycopg2

page = urllib2.urlopen('https://www.meetup.com/find/outdoors-adventure/?allMeetups=false&radius=50&userFreeform=London%2C+&mcId=c1012717&change=yes&sort=default')
soup = BeautifulSoup(page, 'lxml')
events = soup.find('ul', class_='j-groupCard-list searchResults tileGrid tileGrid--3col tileGrid_atMedium--2col tileGrid_atSmall--1col')
A = []
B = []

try:
    conn = psycopg2.connect("dbname='meetup' user='postgres' host='localhost' password='root'")
except:
    print 'Unable to connect to the database.'

cur = conn.cursor()

for event in events.findAll('li'):
    text = event.findAll('h3')
    if len(text) != 0:
        A.append(text[0].find(text = True))
    url = event.find('a', href=True)
    if len(url) != 0:
        B.append(url['href'])

cur.execute("""INSERT INTO outdoors_adventure(event_name,event_url) VALUES(%s,%s)""", (tuple(A),tuple(B)))
conn.commit()
del A[:]
del B[:]
If the indentation is right in the posted code, the problem might be in the nested for-loop: for every event, you append all of the links on the page to the "B" list. You could try:
for event in events.findAll('li'):
    text = event.findAll('h3')
    if len(text) != 0:
        A.append(text[0].find(text = True))

for link in events.findAll('li'):
    url = link.find('a', href=True)
    if len(url) != 0:
        B.append(url['href'])
Or better, keep the event-name and event-URL search in a single for-loop, fetching first the text and then the URL of each event (a sketch of this is shown at the end of this answer).
EDIT: You can simplify the name extraction by using:
for event in events.findAll('li'):
    text = event.h3.string.strip()
    if len(text) != 0:
        A.append(text)
    url = event.find('a', href=True)
    ...
Let me know if that does the trick for you (it does on my side).
EDIT2: The problem might just be the fact that the extracted string starts with tabs (maybe that's why your DB seems "not to show the name" - it's there, but you only see the tabs in the preview?). Just use strip() to remove them.
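For reference, a minimal sketch of the single-loop approach suggested above, inserting one row per event so each name stays paired with its URL (it reuses the table and connection from the question and is not tested against the live page):
for event in events.findAll('li'):
    h3 = event.find('h3')
    link = event.find('a', href=True)
    if h3 and link:
        name = h3.get_text(strip=True)  # strip=True also removes the leading tabs
        cur.execute("INSERT INTO outdoors_adventure(event_name, event_url) VALUES (%s, %s)",
                    (name, link['href']))
conn.commit()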

Update response.body in scrapy(without reload)

I use Scrapy and Selenium for crawling. My site uses AJAX for pagination, so the URL does not change and response.body does not change either. I want to click with Selenium (for pagination), get self.driver.page_source, and use it instead of response.body.
So I wrote this code:
res = scrapy.http.TextResponse(url=self.driver.current_url, body=self.driver.page_source,
                               encoding='utf-8')
print(str(res))  # nothing to print!
for quote in res.css("#ctl00_ContentPlaceHolder1_Grd_Dr_DXMainTable > tr.dxgvDataRow_Office2003Blue"):
    i = i + 1
    item = dict()
    item['id'] = int(quote.css("td.dxgv:nth-child(1)::text").extract_first())
And there is no error!
You can replace the body of the original response in Scrapy by using the response.replace() method:
def parse(self, response):
    response = response.replace(body=driver.page_source)
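A slightly fuller sketch of how this could fit the code from the question (assuming the Selenium driver is reachable as self.driver and has already clicked through to the desired page):
def parse(self, response):
    # swap in the Selenium-rendered HTML, then use the usual Scrapy selectors
    response = response.replace(body=self.driver.page_source)
    for quote in response.css("#ctl00_ContentPlaceHolder1_Grd_Dr_DXMainTable > tr.dxgvDataRow_Office2003Blue"):
        item = dict()
        item['id'] = int(quote.css("td.dxgv:nth-child(1)::text").extract_first())
        yield item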

Convert </br> to end line

I'm trying to extract some text using BeautifulSoup, using the get_text() function for this purpose.
My problem is that the text contains </br> tags and I need to convert them to line breaks. How can I do this?
You can do this using the BeautifulSoup object itself, or any element of it:
for br in soup.find_all("br"):
    br.replace_with("\n")
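For example, a small self-contained sketch of that replacement:
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>first line<br/>second line</p>", "html.parser")
for br in soup.find_all("br"):
    br.replace_with("\n")
print(soup.get_text())  # -> "first line\nsecond line"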
As the official docs say:
You can specify a string to be used to join the bits of text together: soup.get_text("\n")
A regex should do the trick (this pattern also matches <br/> and </br>):
import re
s = re.sub(r'<\s*/?br\s*/?>', '\n', yourTextHere)
Hope this helps!
You can also use get_text(separator = '\n', strip = True):
from bs4 import BeautifulSoup
bs=BeautifulSoup('<td>some text<br>some more text</td>','html.parser')
text=bs.get_text(separator = '\n', strip = True)
print(text)
>>
some text
some more text
It works for me.
Adding to Ian's and dividebyzero's post/comments, you can do this to efficiently filter/replace many tags in one go:
for elem in soup.find_all(["a", "p", "div", "h3", "br"]):
    elem.replace_with(elem.text + "\n\n")
Instead of replacing the tags with \n, it may be better to just add a \n to the end of all of the tags that matter.
To steal the list from @petezurich:
for elem in soup.find_all(["a", "p", "div", "h3", "br"]):
    elem.append('\n')
If you call element.text you'll get the text without br tags.
Maybe you need to define your own custom method for this purpose:
def clean_text(elem):
    text = ''
    for e in elem.descendants:
        if isinstance(e, str):
            text += e.strip()
        elif e.name == 'br' or e.name == 'p':
            text += '\n'
    return text

# get page content
soup = BeautifulSoup(request_response.text, 'html.parser')
# get your target element
description_div = soup.select_one('.description-class')
# clean the data
print(clean_text(description_div))