Issue requesting when scraping images from google using 'src' tag, how to scrape images from google using beautiful soup? - beautifulsoup

I am a newbie to python at best. I have been attempting to make a function to download a specific number of images from a desired google image search into a specific folder in your google drive. But I have hit a snag that I am unable to fix; please can someone point out where I am going wrong or point me in the right direction to fix it. I believe the issue is im = requests.get(link) (line 36). So far, I have the following:
# Mount Google Drive so the downloaded images can be saved into it.
from google.colab import drive
drive.mount('/content/gdrive')

# module import
import requests
from bs4 import BeautifulSoup

# Gather the search parameters from the user.
query = input("Images of:")
print("Number of images:")
NumberOfImages = int(input())
FolderLocation = input("Input Folder Location:")
image_type = "ActiOn"

# Build the Google Images search URL (spaces in the query become '+').
query = query.split()
query = '+'.join(query)
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
# Browser-like User-Agent so Google serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Fetch and parse the results page, then collect every <img> tag.
request = requests.get(url, headers=headers)
soup = BeautifulSoup(request.text, 'html.parser')
images = soup.find_all('img')

# Save up to NumberOfImages images.
tik = 0
for image in images:
    # FIX: was `if tik <= NumberOfImages`, which saved one extra image.
    if tik >= NumberOfImages:
        break
    link = image['src']
    # FIX: Google inlines many thumbnails as base64 "data:" URIs in the
    # 'src' attribute; requests.get() cannot fetch those (this is the
    # failure at the original `im = requests.get(link)` line), so only
    # real http(s) URLs are downloaded.
    if not link.startswith('http'):
        continue
    name = query + "_" + str(tik)
    print(link, name)
    # Fetch before opening the file so a failed request does not leave
    # an empty .jpg behind.
    im = requests.get(link)
    with open(FolderLocation + "/" + name + ".jpg", 'wb') as f:
        f.write(im.content)
    print("Writing " + name + " to file")
    tik += 1
Is this an issue with requesting the 'src' links from google, or is there something else I am missing out?
Any help would be much appreciated. Thanks.

In order to scrape the full-res image URL using requests and beautifulsoup you need to scrape data from the page source (CTRL+U) code via regex.
Find all <script> tags:
soup.select('script')
Match images data via regex from the <script> tags:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
Match desired images (full res size) via regex:
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data_json)
Extract and decode them using bytes() and decode(), and specify how many elements you want to extract with list() slicing [:20] (grabs the first 20 images):
for fixed_full_res_image in matched_google_full_resolution_images[:20]:
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Code and full example in the online IDE that also downloads images:
import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup

# Browser-like headers so Google serves the regular (non-blocked) HTML page.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Search parameters: image search ("tbm": "isch"), English UI, first page.
params = {
    "q": "pexels cat",
    "tbm": "isch",
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')


def get_images_data():
    """Scrape and print image metadata, thumbnail URLs and full-resolution
    image URLs from the fetched Google Images page."""
    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    # The image URLs themselves live inside inline <script> tags rather than
    # the visible HTML, so pull them out of the page source with regex.
    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # Round-trip through json.dumps()/json.loads(); calling json.loads()
    # directly raises "Expecting property name enclosed in double quotes".
    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # Two unicode-escape passes: the first still leaves escaped
        # characters behind; the second resolves them.
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # Strip the thumbnail entries so the full-resolution regex below cannot
    # accidentally match them.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nFull Resolution Images:')  # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # Same double-decode trick as for the thumbnails above.
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

        # Uncomment to also download the original images to Bs4_Images/:
        # print(f'Downloading {index} image...')
        # opener = urllib.request.build_opener()
        # opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        # urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(original_size_img, f'Bs4_Images/original_size_img_{index}.jpg')


get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
other results ...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
other results ...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
other results ...
'''
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to deal with regex, bypass blocks from Google, and maintain it over time if something crashes. Instead, you only need to iterate over structured JSON and get the data you want.
Example code to integrate:
import os, json  # json is only used for pretty-printing the output
from serpapi import GoogleSearch


def get_google_images():
    """Fetch Google Images results through SerpApi and pretty-print them."""
    search = GoogleSearch({
        "api_key": os.getenv("API_KEY"),  # SerpApi key taken from the environment
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch"  # image search
    })
    results = search.get_dict()
    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))


get_google_images()
---------------
'''
[
... # other images
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.

Related

How to collect all comments with scrapy?

I have to use a data scraper to scrape all the comments from newspaper articles. I have very little experience with any kind of coding. A very kind person on reddit gave me this code:
import json

import scrapy


class NewsCommentsSpider(scrapy.Spider):
    """Fetch reader comments for each article id listed in news.txt and
    save the comments that have at least three replies to <article_id>.json."""

    name = "newscomments"
    # Browser-like User-Agent so the comments endpoint answers normally.
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }

    def start_requests(self):
        """Yield one comments-API request per article id (one id per line)."""
        with open("news.txt") as file:
            article_ids = [line.rstrip() for line in file]
        for article_id in article_ids:
            url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
            yield scrapy.Request(
                url=url,
                callback=self.parse_comments,
                headers=self.headers,
                meta={"article_id": article_id},
            )

    def parse_comments(self, response):
        """Keep comments with >= 3 replies and dump them to <article_id>.json."""
        payload = json.loads(response.text)
        valid_comments = [
            comment
            for comment in payload["payload"]["page"]
            if comment["replies"]["totalCount"] >= 3
        ]
        with open(f"{response.meta.get('article_id')}.json", "w") as f:
            json.dump(valid_comments, f)
I tested it, and it works! However, I think he only designed it to download comments with three or more replies, which was my original query. So I was wondering if anyone here can help me change the variables in what's written here so that it will download all the comments — not just the ones that got replies, but the ones that got replies as well as the ones that didn't.
Quick aside: the data I got from this also contained a lot of repeated words — for example, it repeated the title of the article before every comment, and there were words like "userid" in front of every username. This made it kind of difficult to read, and I was wondering if anyone here could help change the code so it downloads less information; all I really need is the comments, the usernames, and the dates the comments were made.
Thanks a bunch!
Here's the code once again:
import json

import scrapy


class NewsCommentsSpider(scrapy.Spider):
    """Scrape Daily Mail reader comments for every article id in news.txt,
    keeping only comments that received at least three replies."""

    name = "newscomments"
    # Pretend to be a desktop browser when calling the comments endpoint.
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }

    def start_requests(self):
        """Stream article ids from news.txt and request each one's comments."""
        with open("news.txt") as file:
            for line in file:
                article_id = line.rstrip()
                url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
                yield scrapy.Request(
                    url=url,
                    callback=self.parse_comments,
                    headers=self.headers,
                    meta={"article_id": article_id},
                )

    def parse_comments(self, response):
        """Filter the JSON payload and dump qualifying comments to a file."""
        comments = json.loads(response.text)["payload"]["page"]
        valid_comments = []
        for comment in comments:
            if comment["replies"]["totalCount"] >= 3:
                valid_comments.append(comment)
        with open(f"{response.meta.get('article_id')}.json", "w") as f:
            json.dump(valid_comments, f)

Login in to Amazon using BeautifulSoup

I am working on a script to scrape some information off Amazon's Prime Now grocery website. However, I am stumbling on the first step in which I am attempting to start a session and login to the page.
I am fairly positive that the issue is in building the 'data' object. There are 10 input's in the html but the data object I have constructed only has 9, with the missing one being the submit button. I am not entirely sure if it is relevant as this is my first time working with BeautifulSoup.
Any help would be greatly appreciated! All of my code is below, with the last if/else statement confirming that it has not worked when I run the code.
import requests
from bs4 import BeautifulSoup

# URL of the Amazon Prime Now sign-in form (carries the OpenID redirect state).
site = 'https://primenow.amazon.com/ap/signin?clientContext=133-1292951-7489930&openid.return_to=https%3A%2F%2Fprimenow.amazon.com%2Fap-post-redirect%3FsiteState%3DclientContext%253D131-7694496-4754740%252CsourceUrl%253Dhttps%25253A%25252F%25252Fprimenow.amazon.com%25252Fhome%252Csignature%253DIFISh0byLJrJApqlChzLdkc2FCEj3D&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=amzn_houdini_desktop_us&openid.mode=checkid_setup&marketPlaceId=A1IXFGJ6ITL7J4&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&pageId=amzn_pn_us&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=3600'

# One session so cookies persist between the GET and the login POST.
session = requests.Session()
session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.61 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': site
}

# Fetch the login page and parse it.
login_page = session.get(site)
soup = BeautifulSoup(login_page.text, 'lxml')

# Collect every named <input> of the form as the POST payload; the hidden
# fields carry the tokens Amazon expects to be echoed back. Inputs without
# a value (e.g. the submit button) are skipped.
data = {}
for field in soup.find('form').find_all('input'):
    try:
        data[field['name']] = field['value']
    except:
        pass

# Add the credentials on top of the scraped fields.
data['email'] = 'my email'
data['password'] = 'my password'

# Submit the form and decide success from the resulting page title.
post_resp = session.post(site, data=data)
post_soup = BeautifulSoup(post_resp.content, 'lxml')
if post_soup.find_all('title')[0].text == 'Your Account':
    print('Login Successfull')
else:
    print('Login Failed')

Retrieving URLs using multiple search words in a CSV file from Google search. The CSV file is empty when I run the code below

I have a list of companies in csv columns for which I don't have URLs. I would like to search google and add the URLs of each company to csv file. I would like to Google search only the exact company name.
Lists = companiesnames.
from bs4 import BeautifulSoup
import csv
import requests
import urllib.parse  # FIX: `import urllib` alone does not bind urllib.parse

# FIX: send a browser-like User-Agent. Without one Google serves a
# bot-blocked page whose HTML contains none of the expected elements,
# which is why the results CSV came out empty.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582'
}

# Read the company names from the first column of database.csv ...
with open("database.csv", "r", newline="") as f_input:
    csv_reader = csv.reader(f_input, delimiter=";", quotechar="|")
    company_names = [row[0] for row in csv_reader]

# ... and write one row per Google result link to results.csv.
with open("results.csv", "w", newline="") as f_output:
    csv_writer = csv.writer(f_output, delimiter=";")
    for company in company_names:  # renamed from `list`, which shadowed the builtin
        search_url = "https://www.google.com/search?&q={}".format(urllib.parse.quote_plus(company, safe='/'))
        google = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(google.content, "html.parser")
        for r in soup.find_all('a', class_='r'):
            # href is prefixed with "/url?q=" (7 chars) in Google's markup.
            csv_writer.writerow([company, r.a.text, r.a['href'][7:]])
Lists = [Accolade wines, Da loc, New viet dairy,..]
It's because you haven't specified a user-agent. Learn more about user-agent and HTTP request headers.
Basically, the user-agent identifies the browser, its version number, and its host operating system. It represents the person (browser) in a Web context and lets servers and network peers identify whether the request comes from a bot or not.
In short, you need to send a fake user-agent so Google would treat your request as a "real" user visit otherwise, it will block a request eventually and you'll receive a completely different HTML that contains an error message with completely different selectors and elements. Check what is your user-agent.
Pass user-agent in headers:
# A real desktop browser's User-Agent string; Google inspects this header
# to decide whether to serve the normal results page.
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
# YOUR_URL is a placeholder — substitute the actual search URL.
requests.get(YOUR_URL, headers=headers)
Code and example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml, csv

# Browser-like User-Agent so Google serves the normal results page.
headers = {
    'User-agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Companies to look up — one Google search per name.
company_list = ['facebook', 'whatsapp', 'alibaba', 'sony']

with open('companies_file.csv', mode='w') as csv_file:
    # Column headers for the output file.
    fieldnames = ['Company name', 'Company URL']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    company_data = []
    for company in company_list:
        html = requests.get(f'https://www.google.com/search?q={company}&hl=en&gl=us', headers=headers).text
        soup = BeautifulSoup(html, 'lxml')
        # First organic result's link is taken as the company URL.
        company_url = soup.select_one('.yuRUbf a')['href']
        print(f'Writing data from {company} company.')
        # Keys must match the fieldnames above so DictWriter accepts the row.
        company_data.append({
            'Company name': company,
            'Company URL': company_url
        })

    # Write every collected entry to the CSV file.
    for data in company_data:
        writer.writerow(data)
-----------
'''
Writing data from facebook company.
...
# CSV output:
Company name,Company URL
facebook,https://www.facebook.com/
whatsapp,https://www.whatsapp.com/?lang=en
alibaba,https://www.alibaba.com/
sony,https://www.sony.com/
'''
Alternatively, you can do the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that everything is already done for the end-user and the only thing that needs to be done on your part is to iterate over structured JSON and get the data you want, rather than figuring out what things don't work as they should.
Code to integrate:
from serpapi import GoogleSearch
import os, csv

# Companies to look up — one SerpApi search per name.
company_list = ['facebook', 'whatsapp', 'alibaba', 'sony']

with open('serpapi_companies_solution.csv', mode='w') as csv_file:
    fieldnames = ['Company name', 'Company URL']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    company_data = []
    for company in company_list:
        # SerpApi does the scraping; we only read structured JSON back.
        search = GoogleSearch({
            "api_key": os.getenv("API_KEY"),
            "engine": "google",
            "google_domain": "google.com",
            "q": f"{company}",
            "hl": "en",
            "gl": "us"
        })
        results = search.get_dict()
        print(f"Writing data from {company} company.")
        # First organic result is taken as the company's site.
        company_data.append({
            "Company name": company,
            "Company URL": results["organic_results"][0]["link"]
        })

    for data in company_data:
        writer.writerow(data)
--------------
'''
Writing data from facebook company.
Writing data from whatsapp company.
...
# output from CSV:
Company name,Company URL
facebook,https://www.facebook.com/
whatsapp,https://www.whatsapp.com/?lang=en
alibaba,https://www.alibaba.com/
sony,https://www.sony.com/
'''
Disclaimer, I work for SerpApi.

How do I parse the data hidden inside html code using bs4

I need to parse this number from a website that has been updated. I cannot access the data-component part of the html code.
HTML CODE
I've tried xpath parsing and bs4
url = "https://www.muthead.com/20/players/10111309/upgrades/"
# Fetch the page and parse it with the lxml backend.
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
# NOTE: .text raises AttributeError when find() returns None — the rating
# div is rendered by JavaScript, so it is absent from the fetched HTML.
hello = soup.find("div", class_="average rating-list__RatingValue-ubw14i-3 jzOWLB").text
print(hello)
I get the following error:
hello = soup.find("div",class_="average rating-list__RatingValue-ubw14i-3 jzOWLB").text
builtins.AttributeError: 'NoneType' object has no attribute 'text'
I need to scrape the 77 inside the HTML code, inside the &lt;div class="average rating-list__RatingValue-ubw14i-3 jzOWLB"&gt; 77 &lt;/div&gt;
It seems like bs4 can't look inside the main container i get no code after
<div data-component="player-upgrades" data-props="{externalId": 10111309, "gameSlug": "20", "basePath": "/20/players/10111309/upgrades/"}
The content on the website is rendered using javascript so you there is barely any use in using BeautifulSoup in your case. That being said I would recommend getting all of the player stats directly from the API by using the following code:
import requests

# Query the site's JSON API directly instead of scraping the JS-rendered page.
url = "https://www.muthead.com/api/mutdb/player_item/?expand=game%2Cposition%2Cprogram%2Cteam%2Cupgrade_tiers&external_id=10111309&game__slug=20"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0"}
player = requests.get(url, headers=headers).json()
# Second entry of the first result's stats list holds the wanted value (77).
spd = player["results"][0]["stats"][1]["value"]
print(spd)
Alternatively, if you are planning to use many links you might want to use the following code:
import requests

# Derive the API parameters from the public page URL, which has the shape
# https://www.muthead.com/<game_slug>/players/<external_id>/upgrades/
url_raw = "https://www.muthead.com/20/players/10111309/upgrades/"
parts = url_raw.split("/")
game_slug = parts[3]
external_id = parts[5]

# Same JSON API endpoint as before, now built from the extracted pieces.
url = "https://www.muthead.com/api/mutdb/player_item/?expand=game%2Cposition%2Cprogram%2Cteam%2Cupgrade_tiers&external_id={}&game__slug={}".format(external_id, game_slug)
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0"}
req_raw = requests.get(url, headers=headers).json()
# Second entry of the first result's stats list holds the wanted value (77).
spd = req_raw["results"][0]["stats"][1]["value"]
print(spd)
where you just need to replace url_raw with the URL of the game you are scraping and the script will automatically send a request to the API (without you having to find the API link)
(do note both codes output 77)
Hope this helps!

Why is Beautifulsoup find_all not returning complete results?

I am trying to parse an Amazon search results page. I want to access the data contained in an <li> tag with <id=result_0>, <id=result_1>, <id=result_2>, etc. The find_all('li') function only returns 4 results (up to result_3), which I thought was odd, since when viewing the webpage in my browser, I see 12 results.
When I print parsed_html, I see it contains all the way to result_23. Why isn't find_all returning all 24 objects? A snippet of my code is below.
import requests

# Old BeautifulSoup 3 import with a fallback to the bs4 package name.
try:
    from BeautifulSoup import bsoup
except ImportError:
    from bs4 import BeautifulSoup as bsoup

# FIX: both string literals below were broken across physical lines in the
# original paste (a SyntaxError); they are re-joined into single literals.
search_url = 'https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Dstripbooks&field-keywords=data+analytics'
response = requests.get(search_url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"})

# FIX: name the parser explicitly — bs4 warns and may pick a different
# parser per machine when it is omitted.
parsed_html = bsoup(response.text, 'html.parser')

# NOTE(review): the browser shows more results than this parse finds;
# presumably Amazon serves different HTML depending on the request headers —
# confirm by diffing response.text against the browser's page source.
results_tags = parsed_html.find_all('div', attrs={'id': 'atfResults'})
results_html = bsoup(str(results_tags[0]), 'html.parser')
results_html.find_all('li')
For what it's worth, the results_tags object also only contains the 4 results. Which is why I am thinking the issue is in the find_all step, rather than with the BeautifulSoup object.
If anyone can help me figure out what is happening here and how I can access all of the search results on this webpage, I will really appreciate it!!
import requests, re
# bs4 renamed the package; fall back to the modern import path.
try:
    from BeautifulSoup import bsoup
except ImportError:
    from bs4 import BeautifulSoup as bsoup

# Trimmed search URL — the irrelevant ref=... part has been deleted.
search_url = 'https://www.amazon.com/s/?url=search-%20alias%3Dstripbooks&field-keywords=data+analytics'
# The added 'Accept' header makes Amazon return the full result listing.
page = requests.get(search_url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
parsed_html = bsoup(page.text, 'lxml')
# Result items share the 's-result-item' class, so match on class not id.
lis = parsed_html.find_all('li', class_='s-result-item')
len(lis)
out:
25
Can access the li elements directly through class instead of id. This will print the text from each li element.
# Select result items by their shared class (instead of per-item ids) and
# print each one's text content.
results_tags = parsed_html.find_all('li', attrs={'class': 's-result-item'})
for result in results_tags:
    print(result.text)