Send Discord Embed of Product Stock Parsed from HTML with BS4 and Requests - beautifulsoup

So I have the code:
import requests
import discord
from bs4 import BeautifulSoup

class MyClient(discord.Client):
    async def on_ready(self):
        print('Logged on as {0}!'.format(self.user))

    async def on_message(self, message):
        if message.channel.id == 678447420643868674:
            if "test" in message.content:
                r = requests.get('https://www.jimmyjazz.com/mens/footwear/adidas-solar-hu-nmd/BB9528')
                soup = BeautifulSoup(r.text, 'html.parser')
                embed = discord.Embed(color=0x00ff00)
                embed.title = "test"
                for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
                    if "piunavailable" in anchor_tag['class']:
                        embed.description = f"Size {anchor_tag.text} OOS"
                        await message.channel.send(embed=embed)
                    else:
                        embed.description = f"Size {anchor_tag.text} in stock!"
                        await message.channel.send(embed=embed)

client = MyClient()
client.run('NjY2MDMyMDc0NjM3MTgwOTQ4.XkjBLg.I3dtsL2nkVh_bafTlycSwBApQfU')
And that sends the item stock as an embed for each size:
https://gyazo.com/7a7c868d00a99fc3798a3c24feb9ea7e
How would I change the code to make it send for every size in one embed instead of an embed per size?
Thanks :)

Embeds in Discord can have fields, which you add with the embed.add_field() function, e.g. embed.add_field(name="Field1", value="hi", inline=False).
Embeds have a few size limits (copied from https://discordjs.guide/popular-topics/embeds.html#notes):
A field's name is limited to 256 characters and its value to 1024 characters
There can be up to 25 fields
In addition, the sum of all characters in an embed structure must not exceed 6000 characters
Because of this you will likely have to split your product stock across multiple embeds when it exceeds 25 fields or 6000 characters, by keeping a counter for each and, whenever a limit is reached, sending the current embed and starting a new one.
Here is a partial example (I've not tested it, but the logic should be correct):
r = requests.get('https://www.jimmyjazz.com/mens/footwear/adidas-solar-hu-nmd/BB9528')
soup = BeautifulSoup(r.text, 'html.parser')
charCount = 0
fieldCount = 0
embed = discord.Embed(color=0x00ff00)
embed.title = "test"
for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
    anchor_text = anchor_tag.text
    charCount += len(anchor_text)
    if charCount >= 6000 or fieldCount >= 25:
        charCount = len(anchor_text)
        fieldCount = 0
        await message.channel.send(embed=embed)
        embed = discord.Embed(color=0x00ff00)
        embed.title = "test"
    if "piunavailable" in anchor_tag['class']:
        embed.add_field(name=f"Size {anchor_text}", value="Out Of Stock")
    else:
        embed.add_field(name=f"Size {anchor_text}", value="In stock!")
    fieldCount += 1
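Note that when the loop finishes there will usually be a final, partly filled embed that has not been sent yet. Sticking with the same message object as above, a minimal follow-up after the loop would be:

# send whatever fields are left over once the loop is done
if fieldCount > 0:
    await message.channel.send(embed=embed)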

Add the sizes as fields like this, sending the embed once after the loop:
for anchor_tag in soup.find_all(class_="box_wrapper")[0].findChildren():
    if "piunavailable" in anchor_tag['class']:
        embed.add_field(name=f"Size {anchor_tag.text}", value=":x:", inline=True)
    else:
        embed.add_field(name=f"Size {anchor_tag.text}", value=":white_check_mark: | ATC", inline=True)
await message.channel.send(embed=embed)

Related

Why does my web scraping function not export the data?

I am currently web scraping a few pages stored in a list. I have the following code:
pages = {
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375", #More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388", #More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435", #More than one page
}
products = []
prices = []
images = []
urls = []
def export_data():
    logging.info("exporting data to pandas dataframe")
    supervalu = pd.DataFrame({
        'img_url' : images,
        'url' : urls,
        'product' : products,
        'price' : prices
    })
    logging.info("sorting data by price")
    supervalu.sort_values(by=['price'], inplace=True)
    output_json = 'supervalu.json'
    output_csv = 'supervalu.csv'
    output_dir = Path('../../json/supervalu')
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info("exporting data to json")
    supervalu.to_json(output_dir / output_json)
    logging.info("exporting data to csv")
    supervalu.to_csv(output_dir / output_csv)
def get_data(div):
    raw_data = div.find_all('div', class_='ga-product')
    raw_images = div.find_all('img')
    raw_url = div.find_all('a', class_="ga-product-link")
    product_data = [data['data-product'] for data in raw_data]
    new_data = [d.replace("\r\n","") for d in product_data]
    for name in new_data:
        new_names = re.search(' "name": "(.+?)"', name).group(1)
        products.append(new_names)
    for price in new_data:
        new_prices = re.search(' "price": ''"(.+?)"', price).group(1)
        prices.append(new_prices)
    for image in raw_images:
        new_images = image['data-src']
        images.append(new_images)
    for url in raw_url:
        new_url = url['href']
        urls.append(new_url)
def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        exit()
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    try:
        load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
        if load_more_text == 'Load more':
            next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
            logging.info("Scraping next page: {}".format(next_page))
            scrape_page(next_page)
        else:
            export_data()
    except:
        logging.warning("No more next pages to scrape")
        pass

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)
The main issue appears during the try/except handling of the next page. Since not all of the pages provided have the appropriate snippet, an AttributeError will arise, which is why I wrapped that statement in a try/except block. I want to scrape the pages that don't have a next page anyway, and keep looping over the rest of the pages until a next page arises. All of the pages appear to be looped through, but I never get the data exported. If I try the following code:
try:
    load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
    if load_more_text == 'Load more':
        next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)
except:
    logging.warning("No more next pages to scrape")
    pass
else:
    export_data()
This is the closest I have gotten to the desired outcome. The above code works and the data gets exported, but not all of the pages get exported, because a new dataframe is created every time a pagination chain ends: the code iterates through the list, finds a next page, scrapes that next page's pages, and then creates a new dataframe, overwriting the previous data.
I'm hoping that someone would give me some guidance on what to do as I have been stuck on this part of my personal project and I'm not so sure on how I am supposed to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and I have received my desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
    next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
    logging.info("Scraping next page: {}".format(next_page))
    scrape_page(next_page)
else:
    export_data()
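If you would rather write the output only once, an alternative (untested, using the same module-level lists and functions as above) is to drop export_data() from scrape_page() entirely and call it a single time after the top-level loop, since products, prices, images and urls accumulate across every page anyway:

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)

# export everything in one go after all pages (and their pagination) have been scraped
export_data()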

Scraping a web page with a "more" button...with beautifulsoup

I'm trying to scrape information from this website: "http://vlg.film/"
I'm not only interested in the first 15 titles, but in all of them. When clicking the 'Show More' button a couple of times, the extra titles show up in the "inspect element" window, but the URL stays the same, i.e. "https://vlg.film/". Does anyone have any bright ideas? I am fairly new to this. Thanks
import requests as re
from bs4 import BeautifulSoup as bs

url = ("https://vlg.film/")
page = re.get(url)
soup = bs(page.content, 'html.parser')
wrap = soup.find_all('div', class_="column column--20 column--main")
for det in wrap:
    link = det.a['href']
    print(link)
Looks like you can simply add the pagination to the URL. The trick is knowing when you have reached the end. Playing around with it, it appears that once you reach the end, it repeats the first page. So all you need to do is keep appending the links to a list, and when you start to repeat a link, have it stop.
import requests as re
from bs4 import BeautifulSoup as bs

next_page = True
page_num = 1
links = []

while next_page == True:
    url = ("https://vlg.film/")
    payload = {'PAGEN_1': '%s' %page_num}
    page = re.get(url, params=payload)
    soup = bs(page.content, 'html.parser')
    wrap = soup.find_all('div', class_="column column--20 column--main")
    for det in wrap:
        link = det.a['href']
        if link in links:
            next_page = False
            break
        links.append(link)
    page_num += 1

for link in links:
    print(link)
Output:
/films/ainbo/
/films/boss-level/
/films/i-care-a-lot/
/films/fear-of-rain/
/films/extinct/
/films/reckoning/
/films/marksman/
/films/breaking-news-in-yuba-county/
/films/promising-young-woman/
/films/knuckledust/
/films/rifkins-festival/
/films/petit-pays/
/films/life-as-it-should-be/
/films/human-voice/
/films/come-away/
/films/jiu-jitsu/
/films/comeback-trail/
/films/cagefighter/
/films/kolskaya/
/films/golden-voices/
/films/bad-hair/
/films/dragon-rider/
/films/lucky/
/films/zalozhnik/
/films/findind-steve-mcqueen/
/films/black-water-abyss/
/films/bigfoot-family/
/films/alone/
/films/marionette/
/films/after-we-collided/
/films/copperfield/
/films/her-blue-sky/
/films/secret-garden/
/films/hour-of-lead/
/films/eve/
/films/happier-times-grump/
/films/palm-springs/
/films/unhinged/
/films/mermaid-in-paris/
/films/lassie/
/films/sunlit-night/
/films/hello-world/
/films/blood-machines/
/films/samsam/
/films/search-and-destroy/
/films/play/
/films/mortal/
/films/debt-collector-2/
/films/chosen-ones/
/films/inheritance/
/films/tailgate/
/films/silent-voice/
/films/roads-not-taken/
/films/jim-marshall/
/films/goya-murders/
/films/SUFD/
/films/pinocchio/
/films/swallow/
/films/come-as-you-are/
/films/kelly-gang/
/films/corpus-christi/
/films/gentlemen/
/films/vic-the-viking/
/films/perfect-nanny/
/films/farmageddon/
/films/close-to-the-horizon/
/films/disturbing-the-peace/
/films/trauma-center/
/films/benjamin/
/films/COURIER/
/films/aeronauts/
/films/la-belle-epoque/
/films/arctic-dogs/
/films/paradise-hills/
/films/ditya-pogody/
/films/selma-v-gorode-prizrakov/
/films/rainy-day-in-ny/
/films/ty-umeesh-khranit-sekrety/
/films/after-the-wedding/
/films/the-room/
/films/kuda-ty-propala-bernadett/
/films/uglydolls/
/films/smert-i-zhizn-dzhona-f-donovana/
/films/sinyaya-bezdna-2/
/films/just-a-gigolo/
/films/i-am-mother/
/films/city-hunter/
/films/lets-dance/
/films/five-feet-apart/
/films/after/
/films/100-things/
/films/greta/
/films/CORGI/
/films/destroyer/
/films/vice/
/films/ayka/
/films/van-gogh/
/films/serenity/
This is a pretty simple website to extract data from. Create a list of page URLs for however many pages you want to extract, then use a for loop to iterate over each page and extract the data.
import requests as re
from bs4 import BeautifulSoup as bs

urls = ["http://vlg.film/ajax/index_films.php?PAGEN_1={}".format(x) for x in range(1,11)]
for url in urls:
    page = re.get(url)
    soup = bs(page.content, 'html.parser')
    wrap = soup.find_all('div', class_="column column--20 column--main")
    print(url)
    for det in wrap:
        link = det.a['href']
        print(link)

How to receive multiple messages in python-telegram-bot?

I am sending multiple images at a time to a bot in Telegram. I am trying to create a conversational chatbot using python-telegram-bot.
Here is my code:
def main():
    updater = Updater("1141074258:Axxxxxxxxxxxxxxxxxxxxxxxxg", use_context=True)
    dp = updater.dispatcher
    conv_handler = ConversationHandler(
        entry_points = [CommandHandler('start',start)],
        states = {
            CHOSEN_OPTION: [MessageHandler(Filters.regex('^(Option2|Option3|Option4)$'),choose_option)],
            PRODUCTS: [MessageHandler(Filters.text | Filters.photo,products)],
            Option2: [MessageHandler(Filters.text,option2)],
            Option3: [MessageHandler(Filters.text,option3)],
            Option4: [CommandHandler('create', create_order)]
        },
        fallbacks=[CommandHandler('cancel', cancel)]
    )
    dp.add_handler(conv_handler)
    updater.start_polling()
    updater.idle()

if __name__ == '__main__':
    main()
@run_async
def products(update, context):
    logger.info("update is %s", update)
    input_message = update.message.text
    if input_message:
        data['products'] = input_message
        logger.info("product text is:%s", input_message)
    elif update.message.photo:
        photo_list = []
        bot = context.bot
        length = len(update.message.photo)
        for photo in range(0, length):
            ident = update.message.photo[photo].file_id
            getFile = context.bot.get_file(ident)
            photo_list.append(getFile['file_path'])
        data['products_image'] = photo_list
    update.message.reply_text("Please type name.")
    return Option3
If I send 2 images at the same time, I get one image in different sizes (3 times). How can I receive the actual two messages?
If the update contains a photo, return PRODUCTS so the handler runs again for the other photos; otherwise get the text and return whatever state you want.
@run_async
def products(update, context):
    logger.info("update is %s", update)
    if update.message.photo:
        # do what you want with your photos
        return PRODUCTS
    if update.message.text:
        # getting product text
        return Option3
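Each photo in an album arrives as a separate update, so returning PRODUCTS keeps the conversation in the same state and lets this handler collect the photos one by one. A rough, untested sketch of the "do what you want with your photos" part (the data dict and state names come from the question; taking the last entry of update.message.photo, i.e. the largest size, is my own assumption):

@run_async
def products(update, context):
    if update.message.photo:
        # each photo message carries several sizes; take the largest (last) one
        largest = update.message.photo[-1]
        file_path = context.bot.get_file(largest.file_id)['file_path']
        # accumulate paths across updates instead of overwriting them
        data.setdefault('products_image', []).append(file_path)
        return PRODUCTS  # stay in this state to catch the remaining photos
    if update.message.text:
        data['products'] = update.message.text
        update.message.reply_text("Please type name.")
        return Option3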

How to fix extracting text from different elements

I'm setting up a new Scrapy spider that I have developed, and I am running it on Windows 10.
My problem is extracting text from different elements. The text sometimes sits in a strong tag or a p tag, sometimes the element has a class, sometimes an id, but I need to pull it all together into one row of text.
Please check out these links to the site:
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=404&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193193&fromFeatured=1
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=0&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=202434
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=1218&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193194&fromFeatured=1
https://prnt.sc/nkl1vc,
https://prnt.sc/nkl1zy,
https://prnt.sc/nkl247,
# -*- coding: utf-8 -*-
import scrapy

class OtcnetSpider(scrapy.Spider):
    name = 'otcnet'
    # allowed_domains = ['otcnet.org']
    start_urls = ['https://exhibits.otcnet.org/otc2019/Public/Exhibitors.aspx?Index=All&ID=26006&sortMenu=107000']

    def parse(self, response):
        links = response.css('a.exhibitorName::attr(href)').extract()
        for link in links:
            ab_link = response.urljoin(link)
            yield scrapy.Request(ab_link, callback=self.parse_p)

    def parse_p(self, response):
        url = response.url
        Company = response.xpath('//h1/text()').extract_first()
        if Company:
            Company = Company.strip()
        Country = response.xpath('//*[@class="BoothContactCountry"]/text()').extract_first()
        State = response.xpath('//*[@class="BoothContactState"]/text()').extract_first()
        if State:
            State = State.strip()
        Address1 = response.xpath('//*[@class="BoothContactAdd1"]/text()').extract_first()
        City = response.xpath('//*[@class="BoothContactCity"]/text()').extract_first()
        if City:
            City = City.strip()
        zip_c = response.xpath('//*[@class="BoothContactZip"]/text()').extract_first()
        Address = str(Address1)+' '+str(City)+' '+str(State)+' '+str(zip_c)
        Website = response.xpath('//*[@id="BoothContactUrl"]/text()').extract_first()
        Booth = response.css('.eBoothControls li:nth-of-type(1)::text').extract_first().replace('Booth: ','')
        Description = ''
        Products = response.css('.caption b::text').extract()
        Products = ', '.join(Products)
        vid_bulien = response.css('.aa-videos span.hidden-md::text').extract_first()
        if vid_bulien == "Videos":
            vid_bulien = "Yes"
        else:
            vid_bulien = "No"
        Video_present = vid_bulien
        Conference_link = url
        Categories = response.css('.ProductCategoryLi a::text').extract()
        Categories = ', '.join(Categories)
        Address = Address.replace('None','')
        yield {
            'Company':Company,
            'Country':Country,
            'State':State,
            'Address':Address,
            'Website':Website,
            'Booth':Booth,
            'Description':Description,
            'Products':Products,
            'Video_present':Video_present,
            'Conference_link':Conference_link,
            'Categories':Categories
        }
I expect the output to be a single-row description assembled from the different elements.
According to this post and @dimitre-novatchev's excellent answer, you need to find a node-set intersection:
$ns1 for your page is:
//p[@class="BoothProfile"]/following-sibling::p
$ns2 is:
//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p
As a result you need to process these p elements:
//p[@class="BoothProfile"]/following-sibling::p[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]
You can use this Scrapy code:
for p_elem in response.xpath('//p[@class="BoothProfile"]/following-sibling::p[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]'):
    # using string() to stringify <p>
    Description += p_elem.xpath('string(.)').extract_first()
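In the parse_p method above this loop slots in right after Description = ''. A slightly more defensive, untested variant (splitting the long XPath into a variable and joining the parts with spaces are my own choices, not part of the original answer):

profile_xpath = ('//p[@class="BoothProfile"]/following-sibling::p'
                 '[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)'
                 ' = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]')

description_parts = []
for p_elem in response.xpath(profile_xpath):
    # string(.) flattens each <p>, including any <strong> children, into plain text
    description_parts.append(p_elem.xpath('string(.)').extract_first(default='').strip())
Description = ' '.join(part for part in description_parts if part)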

All the links in the same row

I'm building a database with 2 columns: event_name and event_URL. It doesn't get the name, and it puts all the URLs into the event_URL column. Screenshot: https://prnt.sc/fru1tr
Code:
import urllib2
from bs4 import BeautifulSoup
import psycopg2

page = urllib2.urlopen('https://www.meetup.com/find/outdoors-adventure/?allMeetups=false&radius=50&userFreeform=London%2C+&mcId=c1012717&change=yes&sort=default')
soup = BeautifulSoup(page, 'lxml')
events = soup.find('ul', class_='j-groupCard-list searchResults tileGrid tileGrid--3col tileGrid_atMedium--2col tileGrid_atSmall--1col')
A = []
B = []

try:
    conn = psycopg2.connect("dbname='meetup' user='postgres' host='localhost' password='root'")
except:
    print 'Unable to connect to the database.'
cur = conn.cursor()

for event in events.findAll('li'):
    text = event.findAll('h3')
    if len(text) != 0:
        A.append(text[0].find(text = True))
    url = event.find('a', href=True)
    if len(url) != 0:
        B.append(url['href'])

cur.execute("""INSERT INTO outdoors_adventure(event_name,event_url) VALUES(%s,%s)""", (tuple(A),tuple(B)))
conn.commit()
del A[:]
del B[:]
If the indentation in the posted code is right, the problem might be in the nested for-loop: for every event, you append all the links on the page to the "B" list. You could try:
for event in events.findAll('li'):
    text = event.findAll('h3')
    if len(text) != 0:
        A.append(text[0].find(text = True))

for link in events.findAll('li'):
    url = link.find('a', href=True)
    if len(url) != 0:
        B.append(url['href'])
Or better, keep the event-name and event-URL search in a single for-loop, fetching first the text and then the URL of each event (see the sketch at the end of this answer).
EDIT: You can simplify the name extraction by using:
for event in events.findAll('li'):
    text = event.h3.string.strip()
    if len(text) != 0:
        A.append(text)
    url = event.find('a', href=True)
    ...
Let me know if that does the trick for you (it does on my side).
EDIT2: The problem might simply be that the extracted string starts with tabs (maybe that's why your DB seems "not to show the name" - it's there, but you only see the tabs in the preview?). Just use strip() to remove them.
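Putting the single-loop idea and strip() together, an untested sketch might look like this (it keeps the original Python 2 style; inserting one row per event, rather than one row of tuples, is my own assumption about the intended table layout):

for event in events.findAll('li'):
    name_tag = event.h3
    link_tag = event.find('a', href=True)
    if name_tag and link_tag:
        # strip() removes the leading tabs/newlines around the event name
        A.append(name_tag.get_text().strip())
        B.append(link_tag['href'])

# one row per event, rather than a single row holding every URL
for name, url in zip(A, B):
    cur.execute("INSERT INTO outdoors_adventure(event_name, event_url) VALUES (%s, %s)", (name, url))
conn.commit()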