Scrapy | How to get a response from a request without urllib? - scrapy

I believe there is a better way to get a response using scrapy.Request than the way I do it now:
...
import urllib.request
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
...
class MatchResultsSpider(scrapy.Spider):
    """Spider for match results that also pulls one value from player detail pages."""

    name = 'match_results'
    allowed_domains = ['site.com']
    start_urls = ['url.com']

    def get_detail_page_data(self, detail_url):
        """Download ``detail_url`` with urllib and return one text value from it.

        NOTE: this is a synchronous download made directly with urllib, so it
        does not go through Scrapy's own request handling.

        Args:
            detail_url: Absolute URL of the detail page.

        Returns:
            The text of the first ``<dd>`` element that carries a
            ``data-first_name`` attribute, or ``None`` when there is none.
        """
        req = urllib.request.Request(
            detail_url,
            data=None,
            headers={
                # The header name must be "User-Agent"; with an underscore it is
                # sent as a different header and urllib's default agent is used.
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Referer': 'site.com',
            }
        )
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(req) as page:
            body = page.read()
        response = HtmlResponse(url=detail_url, body=body)
        target = Selector(response=response)
        # XPath attribute tests are written with "@", not "#".
        return target.xpath('//dd[@data-first_name]/text()').extract_first()
I get all the information inside the parse function.
But in one place I need to get a small piece of data from inside a detail page.
# Lineups: one <tbody> per team; every row except the "Coach" row is a player.
lineup_team_tables = lineups_container.xpath('.//tbody')
for i, table in enumerate(lineup_team_tables):
    # lineup players
    line_up = []
    lineup_players = table.xpath('./tr[not(contains(string(), "Coach"))]')
    for lineup_player in lineup_players:
        line_up_entries = {}
        # XPath selects attributes with "@" ("#" is not valid XPath syntax).
        lineup_player_url = lineup_player.xpath('.//a/@href').extract_first()
        line_up_entries['player_id'] = get_id(lineup_player_url)
        line_up_entries['jersey_num'] = lineup_player.xpath('./td[@class="shirtnumber"]/text()').extract_first()
        # The link taken from the row is resolved against the current page URL.
        abs_lineup_player_url = response.urljoin(lineup_player_url)
        line_up_entries['position_id_detail'] = self.get_detail_page_data(abs_lineup_player_url)
        line_up.append(line_up_entries)
    # team_lineup['line_up'] = line_up
    # Store the finished list once per team table (index i).
    self.write_to_scuard(i, 'line_up', line_up)
Can I get the data from another page using scrapy.Request(detail_url, callback_func)?
Thanks for your help!

Too much extra code. Use the simple Scrapy parsing scheme:
class ********(scrapy.Spider):
name = '*******'
domain = '****'
allowed_domains = ['****']
start_urls = ['https://******']
# Per-spider settings: the browser-like User-Agent and default headers
# applied to the requests this spider sends.
custom_settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'DEFAULT_REQUEST_HEADERS': {
        # HTTP header names are hyphenated; an underscore spelling would be
        # sent as a different header that servers do not recognise.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    },
}
def parse(self, response):
(At this point you already have the downloaded HTML response for start_urls = ['https://******'].)
yield scrapy.Request(url, callback=self.parse_details)
Then you can parse further (nested) in that callback, and hand control back to the parse callback:
def parse_details(self, response):
************
yield scrapy.Request(url_2, callback=self.parse)

Related

How to collect all comments with scrapy?

I have to use a data scraper to scrape all the comments from newspaper articles. I have very little experience with any kind of coding. A very kind person on reddit gave me this code:
import json
import scrapy
class NewsCommentsSpider(scrapy.Spider):
    """Fetch reader comments for every article id listed in ``news.txt``.

    Each article's comments are requested as JSON and the kept comments are
    written to ``<article_id>.json``.
    """

    name = "newscomments"
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }
    # Minimum number of replies a comment needs in order to be kept.
    # The default keeps the original behaviour; run the spider with
    # ``-a min_replies=0`` to keep every comment.
    min_replies = 3

    def start_requests(self):
        """Yield one comments request per non-empty line of ``news.txt``."""
        with open("news.txt") as file:
            article_ids = [line.strip() for line in file]
        for article_id in article_ids:
            if not article_id:
                # A blank line would otherwise produce a URL with no article id.
                continue
            url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
            yield scrapy.Request(
                url=url,
                callback=self.parse_comments,
                headers=self.headers,
                meta={"article_id": article_id},
            )

    def parse_comments(self, response):
        """Filter the comments in the JSON response and dump them to a file.

        The comments are read from ``payload -> page`` of the response body;
        a comment is kept when ``replies -> totalCount`` reaches the threshold.
        """
        comments_dict = json.loads(response.text)
        # Spider arguments given with -a arrive as strings, hence the int().
        threshold = int(self.min_replies)
        valid_comments = [
            comment
            for comment in comments_dict["payload"]["page"]
            if comment["replies"]["totalCount"] >= threshold
        ]
        with open(f"{response.meta.get('article_id')}.json", "w") as f:
            json.dump(valid_comments, f)
I tested it, and it works! However, I think he only designed it to download comments with three or more replies, which was my original query. So I was wondering if anyone here can help me change the variables in what's written here so that it will download all the comments: not just the ones that got replies, but the ones that got replies as well as the ones that didn't.
Quick aside: the data I got from this also contained a lot of repeated words; for example, it repeated the title of the article before every comment, and there were words like "userid" in front of every username. This made it kind of difficult to read, and I was wondering if anyone here could help change the code so it downloads less information. All I really need is the comments, the usernames and the dates the comments were made.
Thanks a bunch!
Here's the code once again:
import json
import scrapy
class NewsCommentsSpider(scrapy.Spider):
    """Fetch reader comments for every article id listed in ``news.txt``.

    Each article's comments are requested as JSON and the kept comments are
    written to ``<article_id>.json``.
    """

    name = "newscomments"
    headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36." }
    # Minimum number of replies a comment needs in order to be kept.
    # The default keeps the original behaviour; run the spider with
    # ``-a min_replies=0`` to keep every comment.
    min_replies = 3

    def start_requests(self):
        """Yield one comments request per non-empty line of ``news.txt``."""
        with open("news.txt") as file:
            article_ids = [line.strip() for line in file]
        for article_id in article_ids:
            if not article_id:
                # A blank line would otherwise produce a URL with no article id.
                continue
            url = f"https://www.dailymail.co.uk/reader-comments/p/asset/readcomments/{article_id}?max=500&order=desc"
            yield scrapy.Request(
                url=url,
                callback=self.parse_comments,
                headers=self.headers,
                meta={"article_id": article_id},
            )

    def parse_comments(self, response):
        """Filter the comments in the JSON response and dump them to a file.

        The comments are read from ``payload -> page`` of the response body;
        a comment is kept when ``replies -> totalCount`` reaches the threshold.
        """
        comments_dict = json.loads(response.text)
        # Spider arguments given with -a arrive as strings, hence the int().
        threshold = int(self.min_replies)
        valid_comments = [
            comment
            for comment in comments_dict["payload"]["page"]
            if comment["replies"]["totalCount"] >= threshold
        ]
        with open(f"{response.meta.get('article_id')}.json", "w") as f:
            json.dump(valid_comments, f)

How to get Scrapy to parse CSS

I am following this guide to scrape movie titles from my local cinema website. I am using Scrapy Spider and CSS parsing to get this done. Within the HTML for the site, each movie title is constructed like this:
<div class="col-md-12 movie-description">
<h2>Minions: The Rise of Gru</h2>
...
Here is my code that attempts to scrape this info
import scrapy
class CinemaSpider(scrapy.Spider):
    """Spider that emits one item per movie title found on the start page."""

    name = "cinema"
    allowed_domains = ["cannonvalleycinema10.com"]
    start_urls = ["https://cannonvalleycinema10.com/"]

    def parse(self, response):
        """Yield ``{'name': <title>}`` for each ``h2`` text in a description block."""
        title_nodes = response.css(".col-md-12.movie-description h2::text")
        for title in title_nodes.extract():
            item = {'name': title}
            yield item
The cinema's website is here. I have tried all sorts of different combinations for what would get the titles I'm looking for to be added to my json file but can't figure it out.
If it helps, I am running this code:
scrapy runspider .\cinema_scrape.py -o movies.json
I am in the proper directory, too.
The page is loaded dynamically, so you have to use Scrapy together with the JSON that the page requests in the background:
import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
import json
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """POST the repertoire form to the ajax endpoint and print every title."""

    name = 'test'
    url = 'https://cabbtheatres.intensify-solutions.com/embed/ajaxGetRepertoire'

    # Session cookie captured from the browser; defined here but not attached
    # to the request below.
    cookies = {
        'PHPSESSID': 'i8l12572hvd3a702d4nfj3vbg0',
    }

    # Headers sent with the form POST.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie': 'PHPSESSID=i8l12572hvd3a702d4nfj3vbg0',
        'Origin': 'https://cabbtheatres.intensify-solutions.com',
        'Referer': 'https://cabbtheatres.intensify-solutions.com/embed?location=3663456',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Form fields posted to the endpoint.
    data = {
        'location': '3663456',
        'date': '2022-07-30',
        'lang': 'en',
        'soon': '',
    }

    def start_requests(self):
        """Send the single form POST whose JSON reply lists the movies."""
        repertoire_request = scrapy.FormRequest(
            url=self.url,
            method='POST',
            formdata=self.data,
            headers=self.headers,
            callback=self.parse_item,
        )
        yield repertoire_request

    def parse_item(self, response):
        """Print the ``title`` of every entry under ``data`` in the JSON reply."""
        payload = response.json()
        for entry in payload['data']:
            print(entry['title'])
output:
Minions: The Rise of Gru
Thor Love and Thunder
DC League of Super-Pets
Elvis(2022)
Mrs. Harris Goes to Paris
Where the Crawdads Sing
Top Gun: Maverick
Nope

BeautifulSoup: How to get only one part of <p> output?

I am trying to find the solution how to get only the price without text from the paragraph.
from bs4 import BeautifulSoup
import requests
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
# Without a timeout requests can wait forever on an unresponsive server.
p = requests.get(
    url='https://www.tia-mobiteli.hr/detaljan-prikaz.aspx?gid=11-appise_64wheu',
    headers=headers,
    timeout=10,
)
# Stop on an HTTP error instead of trying to parse an error page.
p.raise_for_status()
soup = BeautifulSoup(p.content, 'lxml')
# Text of the first <p> inside the price widget.
price = soup.find('div', class_='widget widget-info widget-price').p.text
price2 = price.strip()
print(price2)
My output is:
Naša najniža cijena za gotovinsko/virmansko plaćanje: 3.649,00 kn
I want to get only:
3.649,00 kn
Or if it is possible:
3649.00
The price is inside <b> tag:
import requests
from bs4 import BeautifulSoup
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
# Without a timeout requests can wait forever on an unresponsive server.
p = requests.get(
    url="https://www.tia-mobiteli.hr/detaljan-prikaz.aspx?gid=11-appise_64wheu",
    headers=headers,
    timeout=10,
)
# Stop on an HTTP error instead of trying to parse an error page.
p.raise_for_status()
soup = BeautifulSoup(p.content, "lxml")
# The amount is read from the <b> tag inside the price widget.
price = soup.find("div", class_="widget widget-info widget-price").b.text
# Take the first whitespace-separated token, drop the "." thousands
# separators and turn the decimal "," into "." before converting.
price = float(price.split()[0].replace(".", "").replace(",", "."))
print(price)
Prints:
3649.0
You can use the parse module, which acts like a reverse format().
Usage:
import parse
...
# Match the fixed sentence and capture the amount in the "{}" slot.
matched = parse.parse('Naša najniža cijena za gotovinsko/virmansko plaćanje: {} kn', price2)
# Drop the "." thousands separators and use "." as the decimal mark.
amount_text = matched[0].replace('.', '').replace(',', '.')
float(amount_text)

How to change the header just for a specific request in scrapy spider?

I am trying to build a web crawler using scrapy. I want to change useragent for a single request in the spider. I tried the below code but the user agent is not being updated during the crawl process.
def start_requests(self):
    """Return the single search request, sent with a custom User-Agent header."""
    custom_headers = {
        "User-Agent": "Googlebot-Image/1.0"
    }
    return [
        Request(
            "url",
            callback=self.parse_search,
            meta={'xpaths': self.xpaths},
            headers=custom_headers,
        )
    ]
Your code works perfectly (see my code). But some middleware on your side may affect your User-Agent header:
class UserAgentSpider(scrapy.Spider):
    """Request the same page once per User-Agent and save each response body."""

    name = 'useragent_spider'
    # "title" names the output file; "value" is the User-Agent header sent.
    user_agents = [
        {'title': 'Galaxy S9', 'value': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'},
        {'title': 'iPhone', 'value': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/69.0.3497.105 Mobile/15E148 Safari/605.1'},
        {'title': 'Edge', 'value': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'},
    ]

    def start_requests(self):
        """Yield one request per configured User-Agent."""
        for user_agent in self.user_agents:
            yield scrapy.Request(
                url="https://www.myip.com/",
                headers={
                    'user-agent': user_agent['value'],
                },
                cb_kwargs={
                    'user_agent': user_agent['title']
                },
                callback=self.parse,
                # The same URL is requested several times, so the duplicate
                # filter has to be bypassed.
                dont_filter=True,
            )

    def parse(self, response, user_agent):
        """Write the raw response body to ``Samples/<user_agent>.htm``.

        Args:
            response: The downloaded page.
            user_agent: Title of the User-Agent used, passed via ``cb_kwargs``.
        """
        import os  # local import: the snippet has no module-level import block

        # open() does not create missing directories, so make sure it exists.
        os.makedirs("Samples", exist_ok=True)
        with open(f"Samples/{user_agent}.htm", 'wb') as f:
            f.write(response.body)

I send a POST request with Scrapy and the response is 'too frequently', but when I send the same request with Postman the response is what I want

**
This is the code of my Scrapy spider. I also send the same request with Postman. No matter how many times I send it, I receive the data that I want. But when I send it with Scrapy, the data I receive is always 'too frequently, forbid visit'. There may be many causes, but I want to know what the possible causes are.
**
'
class TestSpider(scrapy.Spider):
    """Request the job-listing JSON URL once and print the raw response text."""

    name = 'test'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false']

    def start_requests(self):
        """Yield a single FormRequest for the first start URL."""
        target_url = self.start_urls[0]
        yield FormRequest(target_url, callback=self.parse)

    def parse(self, response):
        """Print the response body as text."""
        body_text = response.text
        print(body_text)
'
You need to show the website that you are an actual user, not a bot.
Try sending a user-agent in the headers:
yield FormRequest(
url=self.start_urls[0],
callback=self.parse,
headers={ 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',}
)