How to handle scrapy status 400 - scrapy

Hello, I developed a scraper that crawls https://crypto.jobs and scrapes its job listings. My crawler works perfectly on the first page of the site, but when I request the next page it returns status code 400:
HTTP status code is not handled or not allowed
import scrapy
import w3lib.html

class CryptojobsSpider(scrapy.Spider):
    name = 'cryptojobs'
    allowed_domains = ['crypto.jobs']
    start_urls = ['https://crypto.jobs/']

    def parse(self, response):
        jobs = response.css('.table-jobs').xpath('.//tbody/tr[@class="highlighted"]')
        for job in jobs:
            job_url = job.css('.job-url::attr(href)').get()
            url = response.urljoin(job_url)
            if url:
                yield scrapy.Request(url=url, callback=self.single_parse)
        nextUrl = response.css('ul.pagination li:last-of-type a::attr(href)').get()
        if nextUrl:
            nextPageUrl = response.urljoin(nextUrl)
            headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
            yield scrapy.Request(url=nextPageUrl, callback=self.parse,
                                 headers=headers,
                                 method='GET')

    def single_parse(self, response):
        items = dict()
        print(response)
I have already changed the user agent in settings.py and also tried passing a User-Agent header on the request itself.
Does anyone have an idea how I can achieve this?
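For debugging a response like this, here is a hedged sketch of the knobs mentioned above plus the standard Scrapy ways of letting a 400 response reach the callback so its body can be inspected (the header values are examples; whether any of this is enough for crypto.jobs specifically is untested):

# settings.py -- project-wide: a realistic User-Agent, and allow 400s through
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'
HTTPERROR_ALLOWED_CODES = [400]

# in parse(): alternatively allow the 400 per-request via meta, and send a
# Referer header, which some sites expect on paginated requests
yield scrapy.Request(
    url=nextPageUrl,
    callback=self.parse,
    headers={'Referer': response.url},
    meta={'handle_httpstatus_list': [400]},
)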

Related

Looping through pages of Web Page's Request URL with Scrapy

I'm looking to adapt this tutorial (https://medium.com/better-programming/a-gentle-introduction-to-using-scrapy-to-crawl-airbnb-listings-58c6cf9f9808) to scrape this site of tiny home listings: https://tinyhouselistings.com/.
The tutorial uses the request URL to get a very complete and clean JSON file, but it does so for the first page only. It seems that looping through the 121 pages of my tinyhouselistings request URL should be pretty straightforward, but I have not been able to get anything to work. The tutorial does not loop through the pages of the request URL, but rather uses scrapy-splash, run inside a Docker container, to get all the listings. I am willing to try that, but I just feel like it should be possible to loop through this request URL.
This outputs only the first page of the tinyhouselistings request URL for my project:
import scrapy

class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    allowed_domains = ['tinyhouselistings.com']
    start_urls = ['http://www.tinyhouselistings.com']

    def start_requests(self):
        url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page=1'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        _file = "tiny_listings.json"
        with open(_file, 'wb') as f:
            f.write(response.body)
I've tried this:
class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    allowed_domains = ['tinyhouselistings.com']
    start_urls = ['']

    def start_requests(self):
        url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page='
        for page in range(1, 121):
            self.start_urls.append(url + str(page))
            yield scrapy.Request(url=start_urls, callback=self.parse)
But I'm not sure how to then pass start_urls to parse so as to write the response to the json being written at the end of the script.
Any help would be much appreciated!
Remove allowed_domains = ['tinyhouselistings.com'], because otherwise requests to thl-prod.global.ssl.fastly.net will be filtered out by Scrapy's offsite middleware.
Since you are using the start_requests method, you do not need start_urls; you only need one of the two.
import json
import scrapy

class TinyhouselistingsSpider(scrapy.Spider):
    name = 'tinyhouselistings'
    listings_url = 'https://thl-prod.global.ssl.fastly.net/api/v1/listings/search?area_min=0&measurement_unit=feet&page={}'

    def start_requests(self):
        page = 1
        yield scrapy.Request(url=self.listings_url.format(page),
                             meta={"page": page},
                             callback=self.parse)

    def parse(self, response):
        resp = json.loads(response.body)
        for ad in resp["listings"]:
            yield ad
        page = int(response.meta['page']) + 1
        if page < int(resp['meta']['pagination']['page_count']):
            yield scrapy.Request(url=self.listings_url.format(page),
                                 meta={"page": page},
                                 callback=self.parse)
From the terminal, run the spider like this to save the scraped data to a JSON file:
scrapy crawl tinyhouselistings -o output_file.json

How to use python requests with scrapy?

I am trying to use requests to fetch a page then pass the response object to a parser, but I ran into a problem:
def start_requests(self):
    yield self.parse(requests.get(url))

def parse(self, response):
    #pass
builtins.AttributeError: 'generator' object has no attribute 'dont_filter'
You first need to download the page's response and then convert that string to an HtmlResponse object:
from scrapy.http import HtmlResponse
resp = requests.get(url)
response = HtmlResponse(url="", body=resp.text, encoding='utf-8')
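For illustration, a minimal standalone sketch of that approach (the URL is just a placeholder): fetch with python-requests, wrap the text in an HtmlResponse, and then the usual .css()/.xpath() selectors work on it.

import requests
from scrapy.http import HtmlResponse

# fetch with python-requests, then wrap it so Scrapy selectors can be used on it
resp = requests.get('https://example.com')  # placeholder URL
response = HtmlResponse(url=resp.url, body=resp.text, encoding='utf-8')

print(response.css('title::text').get())
print(response.xpath('//a/@href').getall())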
What you need to do is:
Get the page with python-requests and save it to a variable different from the Scrapy response:
r = requests.get(url)
Replace the Scrapy response body with your python-requests text:
response = response.replace(body=r.text)
That's it. Now you have a Scrapy response object with all the data from python-requests available.
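For illustration, a minimal sketch of how that swap might look inside a spider (the start URL is a placeholder, and re-fetching the same URL with python-requests is just an example scenario):

import requests
import scrapy

class SwapBodySpider(scrapy.Spider):
    name = 'swap_body'
    start_urls = ['https://example.com']  # placeholder URL

    def parse(self, response):
        # re-fetch the page with python-requests, then swap its HTML into the
        # Scrapy response so the usual selectors operate on the requests text
        r = requests.get(response.url)
        response = response.replace(body=r.text)
        yield {'title': response.css('title::text').get()}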
yield returns a generator, so Scrapy iterates over it before the request has fetched the data; you can remove the yield and it should work. I have tested it with a sample URL:
def start_requests(self):
    self.parse(requests.get(url))

def parse(self, response):
    #pass

Scrapy skipped one page during parsing

Here is my spider, which I run from a script to parse the content of my local DokuWiki:
import hashlib
import re

import keyring
import scrapy
from scrapy.crawler import CrawlerProcess

DEBUG = True
if DEBUG:
    f_debug = open('debug.log', 'w')

md5s = []

class DokuWikiMd5Spider(scrapy.Spider):
    name = 'dokuwikispider'
    start_urls = ['https://dokuwiki.mjcc.lasil.ru/doku.php']
    visited = []
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

    @staticmethod
    def get_page_name(url):
        url = url.replace("https://dokuwiki.mjcc.lasil.ru/doku.php?", '')
        if 'id=start&do=search' in url:
            # because credentials are in the URL, here we cut only the page name
            # https://dokuwiki.mjcc.lasil.ru/doku.php?id=start&do=search&id=%D0%BF%D0%BE%D1%81%D1%82%D0%B0%D0%B2%D1%89%D0%B8%D0%BA%D0%B8_%D0%B8_%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D1%8B&q=&p=PASSWORD&u=admin
            m = re.findall('id=([^&]+)', url)
            return m[1]
        else:
            m = re.search('id=([^&]+)', url)
            return m.group(1)

    def parse(self, response):
        password = keyring.get_password('dokuwiki', 'admin')
        return scrapy.FormRequest.from_response(
            response,
            formdata={'u': 'admin', 'p': password},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login succeeded before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
        if DEBUG:
            f_debug.write("parsing: {}\n".format(response.url))
        text = response.text
        # cut everything except page content, not to depend on wiki settings when comparing
        m = re.findall('.*(<!-- wikipage start -->.*<!-- wikipage stop -->).*', text, re.DOTALL)
        text = m[0][0]
        # with open(r'F:\TEMP\test.html','w') as f:
        #     f.write(text)
        md5 = hashlib.md5()
        md5.update(text.encode('utf-8'))
        md5s.append({'url': self.get_page_name(response.url), 'md5': md5.hexdigest()})
        yield {'url': self.get_page_name(response.url), 'md5': md5.hexdigest()}
        for next_page in response.xpath('//a/@href'):
            next_url = next_page.extract()
            if DEBUG:
                f_debug.write("\t?next page: {}\n".format(next_url))
            if 'doku.php?id=' in next_url:
                # to process every page name only one time
                next_page_name = self.get_page_name(next_url)
                if next_page_name not in self.visited:
                    if DEBUG:
                        f_debug.write("\t\t!\n")
                    self.visited.append(next_page_name)
                    yield response.follow("https://dokuwiki.mjcc.lasil.ru/{}&u=admin&p={}".format(next_url, keyring.get_password('dokuwiki', 'admin')), self.after_login)

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(DokuWikiMd5Spider)
process.start()  # the script will block here until the crawling is finished
So in the debug messages I can see that the spider crawled the page 'wiki_backup':
2019-01-28 19:49:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://dokuwiki.mjcc.lasil.ru//doku.php?id=wiki_backup&u=admin&p=PASSWORD> (referer: https://dokuwiki.mjcc.lasil.ru//doku.php?id=%D1%81%D0%BE%D0%B7%D0%B4%D0%B0%D0%BD%D0%B8%D0%B5_%D0%B8_%D0%BF%D1%80%D0%BE%D0%B2%D0%B5%D1%80%D0%BA%D0%B0_%D0%B1%D1%8D%D0%BA%D0%B0%D0%BF%D0%BE%D0%B2&u=admin&p=PASSWORD)
And I can see its content in the crawled method, as you can see in the screenshot.
But that page wasn't parsed even once, as you can see in debug.log:
root@F91_Moin20:/home/ishayahu # cat debug.log | grep wiki_backup
?next page: /doku.php?id=wiki_backup
The problem was in the way the spider checks whether authentication failed. It (as in the tutorial) searches for the words "authentication failed", but because the same words appeared in the page content, the spider thought there was an authentication error and stopped processing the page.
There should be another way to check whether authentication really failed.
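One possibility (a sketch, not a tested fix): instead of searching the whole body for a phrase that can also appear in page content, check for an element that only exists in the logged-in state, e.g. DokuWiki's logout link. The selector is an assumption and may differ between templates.

def after_login(self, response):
    # assumption: a logged-in DokuWiki page contains a "do=logout" link;
    # adjust the selector to whatever your template actually renders
    if not response.xpath('//a[contains(@href, "do=logout")]'):
        self.logger.error("Login failed")
        return
    # ... continue scraping as before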

Can I use a scrapy POST request without a callback?

I need to update the location on a site that uses a radio button. This can be done with a simple POST request. The problem is that the output of this request is
window.location='http://store.intcomex.com/en-XCL/Products/Categories?r=True';
Since that is not a valid URL, Scrapy redirects to PageNotFound and closes the spider.
2017-09-17 09:57:59 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://store.intcomex.com/en-XCL/ServiceClient/PageNotFound> from <POST https://store.intcomex.com/en-XCL//User/SetNewLocation>
Here is my code:
def after_login(self, response):
    # inspect_response(response, self)
    url = "https://store.intcomex.com/en-XCL//User/SetNewLocation"
    data = {"id": "xclf1"}
    yield scrapy.FormRequest(url, formdata=data, callback=self.location)
    # inspect_response(response, self)

def location(self, response):
    yield scrapy.Request(url='http://store.intcomex.com/en-XCL/Products/Categories?r=True', callback=self.categories)
The question is: how can I redirect Scrapy to a valid URL after executing the POST request that changes the location? Is there some argument that indicates the target URL, or can I execute it without a callback and yield the correct URL on the next line?
Thanks.
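One possible workaround (a sketch, untested against this site) is to stop Scrapy's RedirectMiddleware from following that 302 and to issue the follow-up GET yourself; dont_redirect and handle_httpstatus_list are standard Request.meta keys:

def after_login(self, response):
    url = "https://store.intcomex.com/en-XCL//User/SetNewLocation"
    data = {"id": "xclf1"}
    # don't follow the 302 to PageNotFound; the raw 302 response is handed to
    # the callback, and the target category page is then requested explicitly
    yield scrapy.FormRequest(url, formdata=data, callback=self.location,
                             meta={'dont_redirect': True,
                                   'handle_httpstatus_list': [302]})

def location(self, response):
    yield scrapy.Request(url='http://store.intcomex.com/en-XCL/Products/Categories?r=True',
                         callback=self.categories)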

Why do Scrapy CrawlSpider rules not work? It just crawls the start URL page

I am using Scrapy to crawl a site. There are no errors, but it only crawls the start URL page; it does not follow the rules and continue deeper. I am sure the start URL page has a next link matching "/plc.*", and the settings are the defaults.
Please see the code:
import logging
import random
import time

from scrapy import Request, FormRequest
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestSpider(CrawlSpider):
    name = 'Test'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }

    rules = (
        Rule(LinkExtractor(allow=r"/gsx.*"), callback="parse_item", process_request="test"),
        Rule(LinkExtractor(allow=r'/plc.*'), follow=True, process_request="test"),
        Rule(LinkExtractor(allow=r'/pla.*'), follow=True, process_request="test"),
    )

    def start_requests(self):
        yield Request("http://www.example.com/login.json", meta={'cookiejar': 1}, headers=self.headers, callback=self.post_login)

    def post_login(self, response):
        print 'Preparing login'
        return [FormRequest.from_response(response,
                                          meta={'cookiejar': 1},
                                          headers=self.headers,
                                          formdata={
                                              'account': 'test',
                                              'password': 'test',
                                              'redirect': 'http://www.example.com/',
                                              'remember': 'true'
                                          },
                                          callback=self.after_login
                                          )]

    def after_login(self, response):
        for url in self.start_urls:
            yield Request(url, meta={'cookiejar': 1}, headers=self.headers)

    def test(self, request):
        return Request(request.url, meta={'cookiejar': 1}, headers=self.headers, callback=self.parse_item)

    def parse_item(self, response):
        print response.url
        logging.info('URL:%s' % response.url)
        time.sleep(random.uniform(3, 5))
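One thing worth checking (a sketch, not a confirmed diagnosis for this site): test() builds a brand-new Request with callback=self.parse_item, which replaces the callback CrawlSpider installs on rule-extracted requests, so those responses never go back through the machinery that extracts and follows further links. Returning the original request with only its meta/headers changed, e.g. via request.replace(), keeps follow=True rules working:

def test(self, request):
    # keep the CrawlSpider-internal callback so follow=True rules keep
    # extracting links from the downloaded pages
    request.meta['cookiejar'] = 1
    return request.replace(headers=self.headers)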