How to collect all comments with scrapy? - scrapy

I have to use a data scraper to scrape all the comments from newspaper articles. I have very little experience with any kind of coding. A very kind person on reddit gave me this code:
import json
import scrapy
class NewsCommentsSpider(scrapy.Spider):
    """Download reader comments for the Daily Mail articles listed in ``news.txt``.

    ``news.txt`` holds one article id per line. For every id the
    reader-comments endpoint is requested, and the comments that have at
    least ``min_replies`` replies are written to ``<article_id>.json``.
    """

    name = "newscomments"
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }
    # Minimum number of replies a comment needs in order to be saved.
    # Set to 0 to keep every comment, including those without replies.
    min_replies = 3

    def start_requests(self):
        """Yield one comments request per non-blank line of ``news.txt``."""
        with open("news.txt") as file:
            # Drop blank lines (e.g. a trailing newline at the end of the
            # file) so no request is sent with an empty article id.
            article_ids = [line.strip() for line in file if line.strip()]
        # The file is closed before any request is yielded.
        for article_id in article_ids:
            url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
            yield scrapy.Request(
                url=url,
                callback=self.parse_comments,
                headers=self.headers,
                # Carry the id along so the callback can name the output file.
                meta={"article_id": article_id},
            )

    def parse_comments(self, response):
        """Save the comments of one article that meet the reply threshold.

        The response body is parsed as JSON; the comments are read from
        ``payload -> page`` and each comment's reply count from
        ``replies -> totalCount``.
        """
        comments_dict = json.loads(response.text)
        valid_comments = [
            comment
            for comment in comments_dict["payload"]["page"]
            if comment["replies"]["totalCount"] >= self.min_replies
        ]
        article_id = response.meta["article_id"]
        with open(f"{article_id}.json", "w") as f:
            json.dump(valid_comments, f)
I tested it, and it works! However, I think he only designed it to download comments with three or more replies, which was my original query. So I was wondering if anyone here can help me change what's written here so that it will download all the comments: not just the ones that got replies, but also the ones that didn't.
Quick aside: the data I got from this also contained a lot of repeated words. For example, it repeated the title of the article before every comment, and there were words like "userid" in front of every username. This made it kind of difficult to read, and I was wondering if anyone here could help change the code so it downloads less information; all I really need is the comments, the usernames and the dates the comments were made.
Thanks a bunch!
Here's the code once again:
import json
import scrapy
class NewsCommentsSpider(scrapy.Spider):
    """Download reader comments for the Daily Mail articles listed in ``news.txt``.

    ``news.txt`` holds one article id per line. For every id the
    reader-comments endpoint is requested, and the comments that have at
    least ``min_replies`` replies are written to ``<article_id>.json``.
    """

    name = "newscomments"
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }
    # Minimum number of replies a comment needs in order to be saved.
    # Set to 0 to keep every comment, including those without replies.
    min_replies = 3

    def start_requests(self):
        """Yield one comments request per non-blank line of ``news.txt``."""
        with open("news.txt") as file:
            # Drop blank lines (e.g. a trailing newline at the end of the
            # file) so no request is sent with an empty article id.
            article_ids = [line.strip() for line in file if line.strip()]
        # The file is closed before any request is yielded.
        for article_id in article_ids:
            url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
            yield scrapy.Request(
                url=url,
                callback=self.parse_comments,
                headers=self.headers,
                # Carry the id along so the callback can name the output file.
                meta={"article_id": article_id},
            )

    def parse_comments(self, response):
        """Save the comments of one article that meet the reply threshold.

        The response body is parsed as JSON; the comments are read from
        ``payload -> page`` and each comment's reply count from
        ``replies -> totalCount``.
        """
        comments_dict = json.loads(response.text)
        valid_comments = [
            comment
            for comment in comments_dict["payload"]["page"]
            if comment["replies"]["totalCount"] >= self.min_replies
        ]
        article_id = response.meta["article_id"]
        with open(f"{article_id}.json", "w") as f:
            json.dump(valid_comments, f)

Related

Scrapy - Splash - Not rendering everything on the site

I am trying to scrape the odds comparison site from www.racingpost.com
Example from racingpost -> these sites are only working until the race is over, so if you can not see it anymore, pick a race that is still to come :)
So I scraped this site for some info using different spiders, but it seems the odds from the bookmakers are not rendered by splash - at least I can not see the odds in my local splash or the html returned.
I tried:
Increasing the wait time up to 20sec
deactivating the private mode
using scroll down
But it is still not rendering.
How do I scrape these odds?
I tried some solutions from answers here on stackoverflow, the last code I tried was this one:
class DailyoddSpider(scrapy.Spider):
    """Fetch a Racing Post odds-comparison page through Splash's ``execute`` endpoint."""

    name = 'dailyodd'
    allowed_domains = ['www.racingpost.com']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'

    # Lua script handed to Splash: turns private mode off, opens the URL,
    # waits 5 seconds and returns the page HTML.
    script = '''
function main(splash, args)
splash.private_mode_enabled = false
url = args.url
assert(splash:go(url))
assert(splash:wait(5))
return splash:html()
end
'''

    def start_requests(self):
        """Yield a single SplashRequest for the odds-comparison page."""
        # NOTE(review): no `parse` method is defined in this excerpt.
        race_url = "https://www.racingpost.com/racecards/394/southwell-aw/2022-03-05/804308/odds-comparison"
        splash_args = {
            'lua_source': self.script
        }
        yield SplashRequest(
            url=race_url,
            callback=self.parse,
            endpoint='execute',
            args=splash_args,
        )

Issue requesting when scraping images from google using 'src' tag, how to scrape images from google using beautiful soup?

I am a newbie to python at best. I have been attempting to make a function to download a specific number of images from a desired google image search into a specific folder in your google drive. But I have hit a snag that I am unable to fix; please can someone point out where I am going wrong or point me in the right direction to fix it. I believe the issue is im = requests.get(link) (line 36). So far, I have the following:
# mount the drive
from google.colab import drive
drive.mount('/content/gdrive')
#module import
import requests
from bs4 import BeautifulSoup
# define parameters of search
query = input("Images of:")
print("Number of images:")
NumberOfImages = int(input())
FolderLocation = input("Input Folder Location:")
image_type = "ActiOn"
# Join the words of the query with '+' for use in the query string.
query = query.split()
query = '+'.join(query)
url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# soup
request = requests.get(url, headers=headers)
soup = BeautifulSoup(request.text, 'html.parser')
images = soup.find_all('img')

# loop to save
tik = 0  # number of images written so far
for image in images:
    # Stop after exactly NumberOfImages files (the former `<=` test wrote
    # one image too many).
    if tik >= NumberOfImages:
        break
    # Not every <img> has a src attribute; .get() avoids a KeyError.
    link = image.get('src')
    # requests can only fetch absolute http(s) URLs, so skip missing,
    # relative and inline "data:" sources instead of crashing on them.
    if not link or not link.startswith(('http://', 'https://')):
        continue
    name = query + "_" + str(tik)
    print(link, name)
    # Download first, so a failed request does not leave an empty file.
    im = requests.get(link)
    if not im.ok:
        continue
    with open(FolderLocation + "/" + name + ".jpg", 'wb') as f:
        f.write(im.content)
    print("Writing " + name + " to file")
    tik += 1
Is this an issue with requesting the 'src' links from google, or is there something else I am missing out?
Any help would be much appreciated. Thanks.
In order to scrape the full-res image URL using requests and beautifulsoup you need to scrape data from the page source (CTRL+U) code via regex.
Find all <script> tags:
# Keep the result: the regex step that follows reads `all_script_tags`.
all_script_tags = soup.select('script')
Match images data via regex from the <script> tags:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
Match desired images (full res size) via regex:
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data_json)
Extract and decode them using bytes() and decode(), and choose how many elements you want to extract with list slicing, e.g. [:20] (grabs the first 20 images):
# Decode the escaped URL text of the first 20 full-resolution matches.
for fixed_full_res_image in matched_google_full_resolution_images[:20]:
    # First unicode-escape pass over the matched text.
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    # Second pass over the result of the first one; the value is decoded twice.
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Code and full example in the online IDE that also downloads images:
import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent with the search request.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
# Query-string parameters for the search request.
# NOTE(review): presumably "tbm": "isch" selects image search, "hl" the
# interface language and "ijn" the result page index — confirm.
params = {
    "q": "pexels cat",
    "tbm": "isch",
    "hl": "en",
    "ijn": "0",
}
html = requests.get("https://www.google.com/search", params=params, headers=headers)
# Parse the response text with the lxml parser; `soup` is read by get_images_data().
soup = BeautifulSoup(html.text, 'lxml')
def get_images_data():
    """Print image metadata, thumbnail URLs and full-resolution URLs.

    Reads the module-level ``soup``. The metadata comes from CSS selectors
    on the result elements; the thumbnail and full-resolution URLs are
    extracted with regular expressions from the text of the page's
    ``<script>`` tags. Everything is printed; nothing is returned.
    """
    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')
    # these steps could be refactored into something more compact
    all_script_tags = soup.select('script')
    # https://regex101.com/r/48UZhY/4
    # Concatenate the argument text of every AF_initDataCallback(...) call.
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps it will throw an error:
    # "Expecting property name enclosed in double quotes"
    # NOTE(review): json.loads(json.dumps(s)) returns the same str for a str
    # input, so this pair looks like a no-op — confirm before removing.
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)
    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)
    # https://regex101.com/r/NnRg27/1
    # Join the matches, then split again on ', ' to get the thumbnail URL list.
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')
    print('Google Image Thumbnails:') # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
        # after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)
    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))
    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)
    print('\nFull Resolution Images:') # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        # Decoded twice, like the thumbnails above.
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)
        # ------------------------------------------------
        # Download original images
        # print(f'Downloading {index} image...')
        # opener=urllib.request.build_opener()
        # opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        # urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(original_size_img, f'Bs4_Images/original_size_img_{index}.jpg')
get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
other results ...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
other results ...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
other results ...
'''
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to deal with regex, bypass blocks from Google, and maintain it over time if something crashes. Instead, you only need to iterate over structured JSON and get the data you want.
Example code to integrate:
import os, json # json for pretty output
from serpapi import GoogleSearch
def get_google_images():
    """Run a SerpApi search and pretty-print its ``images_results`` as JSON."""
    search = GoogleSearch({
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch",
    })
    image_results = search.get_dict()['images_results']
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    print(json.dumps(image_results, indent=2, ensure_ascii=False))
get_google_images()
---------------
'''
[
... # other images
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.

I send a POST request with Scrapy and the response is 'too frequently', but when I send the same request with Postman the response is what I want

**
This is the code of my Scrapy spider. I also send the same request with Postman. No matter how many times I send it with Postman, I receive the data that I want. But when I send it with Scrapy, the data I receive is always 'too frequently, forbid visit'. There may be many causes, but I want to know what the possible causes are.
**
'
class TestSpider(scrapy.Spider):
    """Request the Lagou position JSON endpoint once and print the reply."""

    name = 'test'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false']

    def start_requests(self):
        """Yield a single FormRequest for the first start URL."""
        first_url = self.start_urls[0]
        yield FormRequest(first_url, callback=self.parse)

    def parse(self, response):
        """Print the text of the response."""
        body = response.text
        print(body)
'
You need to show the website that you are an actual user, not a bot.
Try sending a user-agent in the headers:
yield FormRequest(
url=self.start_urls[0],
callback=self.parse,
headers={ 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',}
)

Error 403 Forbidden not User-Agent

I've tried looking at previous posts on the same subject but none of the solutions seem to be working and I'd like to confirm that there is indeed nothing I can do to get around this.
I'm a journalist trying to download permit data from off the planning authority's website. I could do this no problem up till a few months ago but the website has been changed and after adapting my code to the new site, I now seem to be getting an Error 403 every time I try to follow links on the site.
Any help would be greatly appreciated.
My code -not the best looking or most efficient, but I'm self taught and use coding mainly for scraping data for work - stats on the page: http://www.pa.org.mt/padecisionSearch?date=1/31/2018%2012:00:00%20AM
In the bit of code I have pasted beneath I am trying to access each link permit link (first one on page: http://www.pa.org.mt/PACaseDetails?Systemkey=200414&CaseType=PA/10351/17%27) in order to scrape permit details.
While I can generate the link addresses without a problem (they are accessible by clicking the link), sending a request to the address returns:
b'\r\nForbidden\r\n\r\nForbidden URL\r\nHTTP Error 403. The request URL is forbidden.\r\n\r\n'
I've tried changing the User-Agent, and I've also tried to put in a timer between requests but nothing seems to have any effect.
Any suggestions would be very welcome
My code:
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup
from datetime import date, timedelta as td
import pandas as pd
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import urllib
from urllib.parse import urljoin

# Walk every table cell of the decision-search page and fetch each linked
# permit page within one HTTP session.
# NOTE(review): list2, case_status, applicant, architect, application_type,
# case_category, case_officer, case_officer2, date_approved and
# application_link are not defined in this excerpt; presumably they are
# lists created earlier in the script.
with requests.Session() as s:
    #s.headers.update(head)
    r= s.get("http://www.pa.org.mt",data=None, headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"})
    page = (s.get("http://www.pa.org.mt/padecisionSearch?date=1/31/2018%2012:00:00%20AM", data=None, headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/63.0.3239.132 Safari/537.36"}).content)
    soup = BeautifulSoup(page, 'html.parser')
    search_1 = soup.find_all('table')
    for item in search_1:
        item1 = item.find_all('tr')
        for item2 in item1:
            item3 = item2.find_all('td', class_ = 'fieldData')
            for element in item3:
                list2.append(element.text)
                zejt_number = (len(list2)/6)
                zi = element.find_all('a')
                if len(zi) == 0 and ((len(list2)-1)%5 == 0 or len(list2) == 1):
                    # No link in this cell: append empty placeholders.
                    case_status.append("")
                    applicant.append("")
                    architect.append("")
                    application_type.append("")
                    case_category.append("")
                    case_officer.append("")
                    case_officer2.append("")
                    date_approved.append("")
                    application_link.append("")
                elif len(zi) != 0:
                    for li in zi:
                        # The hrefs are relative ("../PACaseDetails?...").
                        # Plain string concatenation left a "/../" in the
                        # URL, and the server answered that with 403.
                        # urljoin resolves the relative path correctly.
                        hyperlink = urljoin("http://www.pa.org.mt/", li.get('href'))
                        application_link.append(hyperlink)
                        print(hyperlink)
                        z = (s.get(hyperlink, data=None, headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}).content)
                        print(z)
First of all, your code is a bit messy. Is it all of your code, or just a part of it? E.g. you are importing pandas twice. Nevertheless, the main reason this is not working is the hyperlinks you are generating:
for li in zi:
hyperlink = "http://www.pa.org.mt/"+li.get('href')
print(hyperlink)
the result looks like this:
http://www.pa.org.mt/../PACaseDetails?Systemkey=200414&CaseType=PA/10351/17'
This link won't work. A quick workaround would be to edit the hyperlink before you do the request:
for li in zi:
hyperlink = "http://www.pa.org.mt/"+li.get('href')
hyperlink = hyperlink.replace('../', '')
print(hyperlink)
z = (s.get(hyperlink, data=None, headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}).content)
print(z)
the hyperlinks now should look like this:
http://www.pa.org.mt/PACaseDetails?Systemkey=200414&CaseType=PA/10351/17'
and the request should pass through.

Why is Beautifulsoup find_all not returning complete results?

I am trying to parse an Amazon search results page. I want to access the data contained in an <li> tag with <id=result_0>, <id=result_1>, <id=result_2>, etc. The find_all('li') function only returns 4 results (up to result_3), which I thought was odd, since when viewing the webpage in my browser, I see 12 results.
When I print parsed_html, I see it contains all the way to result_23. Why isn't find_all returning all 24 objects? A snippet of my code is below.
import requests
try:
from BeautifulSoup import bsoup
except ImportError:
from bs4 import BeautifulSoup as bsoup
# The URL and the User-Agent were broken across physical lines inside a
# single quoted literal, which is a syntax error; adjacent string literals
# rebuild each value.
search_url = ('https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-'
              'alias%3Dstripbooks&field-keywords=data+analytics')
response = requests.get(search_url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"})
parsed_html = bsoup(response.text)
# Narrow the document down to the element with id "atfResults", then
# re-parse that fragment and list its <li> tags.
results_tags = parsed_html.find_all('div',attrs={'id':'atfResults'})
results_html = bsoup(str(results_tags[0]))
results_html.find_all('li')
For what it's worth, the results_tags object also only contains the 4 results. Which is why I am thinking the issue is in the find_all step, rather than with the BeautifulSoup object.
If anyone can help me figure out what is happening here and how I can access all of the search results on this webpage, I will really appreciate it!!
import requests, re
try:
from BeautifulSoup import bsoup
except ImportError:
from bs4 import BeautifulSoup as bsoup
# Fetch the search page with User-Agent and Accept headers, parse it with
# lxml and count the <li> tags that carry the 's-result-item' class.
search_url = 'https://www.amazon.com/s/?url=search-%20alias%3Dstripbooks&field-keywords=data+analytics' #delete the irrelevant part from url
response = requests.get(search_url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" }) # add 'Accept' header
parsed_html = bsoup(response.text, 'lxml')
lis = parsed_html.find_all('li', class_='s-result-item' ) # use class to find li tag
len(lis)
out:
25
Can access the li elements directly through class instead of id. This will print the text from each li element.
# Select every <li> with the 's-result-item' class and print its text.
results_tags = parsed_html.find_all('li',attrs={'class':'s-result-item'})
for r in results_tags:
    print(r.text)