I am trying to run a spider from within a Python script.
This is my folder structure:
scraping.py
blogs/
    __init__.py
    blogs/
        scrapy.cfg
        __init__.py
        items.py
        settings.py
        spiders/
            __init__.py
            spider_blog.py
This is the test scraping.py snippet (it belongs to a class):
def spider_blog(self):
    parse()
And this is what I have in spider_blog.py:
import scrapy

from blogs.items import PitchforkItem  # assuming the item is defined in the project's items.py


class PitchforkSpider(scrapy.Spider):
    name = "pitchfork_reissues"
    allowed_domains = ["pitchfork.com"]

    # creates objects for each URL listed here
    start_urls = [
        "http://pitchfork.com/reviews/best/reissues/?page=1",
        "http://pitchfork.com/reviews/best/reissues/?page=2",
        "http://pitchfork.com/reviews/best/reissues/?page=3",
    ]

    def parse(self, response):
        for sel in response.xpath('//div[@class="album-artist"]'):
            item = PitchforkItem()
            item['artist'] = sel.xpath('//ul[@class="artist-list"]/li/text()').extract()
            item['reissue'] = sel.xpath('//h2[@class="title"]/text()').extract()
            yield item
What is the proper syntax for importing parse() from spider_blog.py as a module into scraping.py?
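Note that parse() is not normally imported and called directly; Scrapy calls it for every downloaded response while it drives the crawl. A common way to start the spider from a script is CrawlerProcess, which also appears in the answers further down. Here is a minimal sketch, where the class name Scraper, the dotted import path, and the feed file name are only guesses based on the structure above:

from scrapy.crawler import CrawlerProcess

# adjust the dotted path to match your actual package layout
from blogs.blogs.spiders.spider_blog import PitchforkSpider


class Scraper:  # stand-in for the asker's existing class
    def spider_blog(self):
        # CrawlerProcess starts the reactor and runs the spider;
        # Scrapy then calls PitchforkSpider.parse() for every response
        process = CrawlerProcess(settings={"FEEDS": {"reissues.json": {"format": "json"}}})
        process.crawl(PitchforkSpider)
        process.start()  # blocks until the crawl finishes


if __name__ == "__main__":
    Scraper().spider_blog()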
I am trying to visit several URLs on a page and follow each response to the next parser to grab another set of URLs. From that page I also need to grab the next-page URLs, but I wanted to try building them by parsing and manipulating the page string and then passing the result on as the next page. However, the scraper crawls but returns nothing, not even the output from the final parser where I load the item.
Note: I know I can grab the next page fairly simply with an if-statement on the href, but I wanted to try something different in case I ever face a situation where I have to build the URLs this way.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader


class ZooplasItem(scrapy.Item):
    stuff = Field()


class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def start_request(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
            )

    def parse(self, response):
        container = response.xpath("//ul[@class='list-inline list-unstyled']//li")
        for links in container:
            urls = links.xpath(".//a/@href").get()
            yield response.follow(
                urls, callback=self.parse_places
            )

    def parse_places(self, response):
        container = response.xpath("//ul[@class='listing-results clearfix js-gtm-list']//li")
        for links in container:
            urls = links.xpath('(//div[@class="listing-results-right clearfix"]//a)[position() mod 3=1]//@href').get()
            yield response.follow(
                urls, callback=self.parse_listings
            )

        if response.xpath("//div[@id='content']//div//h1//text()").extract_first():
            page_on = response.xpath("//div[@id='content']//div//h1//text()").extract_first()
            name_of_page = page_on.split()[-1]
        else:
            pass

        if response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first():
            url_link = response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first()
            url_link = url_link.split('/')
            last_page = url_link[-1].split('=')[-1]
        else:
            pass

        all_pages = []
        for index, n in enumerate(url_link):
            for page_name, page_num in zip(name_of_page, last_page):
                if index == 5:
                    url_link[index] = page_name
                    testit = '/'.join(url_link)
                    equal_split = testit.split('=')
                    for another_i, n2 in enumerate(equal_split):
                        if another_i == 3:
                            for range_val in range(1, page_num + 1):
                                equal_split[another_i] = str(2)
                                all_pages.append('='.join(equal_split))

        for urls in all_pages:
            yield response.follow(
                urls, callback=self.parse.places
            )

    def parse_listings(self, response):
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default.output_processor = TakeFirst()
        loader.add_xpath("//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'zoopla.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ZooplasSpider)
process.start()
I know the way of grabbing the URLs works, as I have tried it on a single URL using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany', 'France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j = 0
a = url.split('/')  # presumably elided from the snippet: the URL split into segments, as in the spider
for index, n in enumerate(a):
    for l_stuff, p_stuff in zip(list_of_stuff, pages_of_stuff):
        if index == 5:
            a[index] = l_stuff
            testit = '/'.join(a)
            equal_split = testit.split('=')
            for another_i, n2 in enumerate(equal_split):
                if another_i == 3:
                    for range_val in range(1, p_stuff + 1):
                        equal_split[another_i] = str(range_val)
                        print('='.join(equal_split))
This is the same as the code used above, just with different variable names. It outputs the following links, and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
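As an aside, the same kind of page URL can also be built without splitting on '/' and '=' by position, using the standard library's urllib.parse. This is only a sketch of an alternative (the helper name build_page_urls is made up), not what the scraper above or the answer below does:

from urllib.parse import urlparse, urlunparse, parse_qs, urlencode


def build_page_urls(base_url, country, last_page):
    # swap the country path segment and the pn= query parameter explicitly
    parsed = urlparse(base_url)
    path_parts = parsed.path.rstrip('/').split('/')
    path_parts[-1] = country
    query = parse_qs(parsed.query)
    for page in range(1, last_page + 1):
        query['pn'] = [str(page)]
        yield urlunparse(parsed._replace(
            path='/'.join(path_parts) + '/',
            query=urlencode(query, doseq=True),
        ))


url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
for country, pages in zip(['Ireland', 'Germany', 'France'], [5, 7, 6]):
    for page_url in build_page_urls(url, country, pages):
        print(page_url)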
Your use case is well suited to Scrapy's CrawlSpider. You can write rules for how to extract the links to the properties and how to extract the links to the next pages. I have changed your code to use a CrawlSpider class and replaced your feed settings with the recommended FEEDS setting, since FEED_URI and FEED_FORMAT are deprecated in newer versions of Scrapy.
Read more about CrawlSpider in the docs.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader


class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()


class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),  # follow the country links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),  # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"),
             callback='parse_item', follow=True),  # follow the links to the actual property listings
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country", "//li[@class='ui-breadcrumbs__item'][3]/a/text()")
        yield loader.load_item()


if __name__ == '__main__':
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
I'm trying to make a crawler that will crawl an entire website and output a list of all the domains that the said website links to (without duplicates).
I have come up with the following code:
import scrapy
from crawler.items import CrawlerItem
from crawler.functions import urlToDomain
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class domainSpider(CrawlSpider):
    global allDomains
    allDomains = []

    name = "domainSpider"
    allowed_domains = ["example.com"]
    start_urls = [
        "https://example.com/"
    ]

    rules = (
        Rule(LinkExtractor(), callback='parse', follow=True),
    )

    def parse(self, response):
        urls = response.xpath("//a/@href").extract()

        # formatting all URL formats to the same one (https://url.com)
        urlsOk = []
        for elt in urls:
            if elt[:2] == "//":  # link is external, append http
                urlsOk.append(elt)
            elif elt[:4] == "http":
                urlsOk.append(elt)

        domaines = list(set([urlToDomain(x) for x in urlsOk]))

        item = CrawlerItem()
        item["domaines"] = []
        item["url"] = response.url

        for elt in domaines:
            if elt not in allDomains:
                item['domaines'].append(elt)
                allDomains.append(elt)
                yield({
                    'domaines': elt
                })
This works exactly as expected for retrieving the domains, but it stops crawling (finished) after the very first page.
I was overriding a built-in CrawlSpider method (parse), which caused the bug...
The solution was to change the callback method's name from parse to anything else.
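A minimal sketch of that fix (the callback name parse_page and the trivial item body are only illustrative; the extraction logic stays the same as above):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DomainSpider(CrawlSpider):
    name = "domainSpider"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com/"]

    rules = (
        # any callback name other than 'parse' works; CrawlSpider uses parse() internally
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # put the original domain-extraction logic from parse() here
        for href in response.xpath("//a/@href").extract():
            yield {"href": href}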
For an assignment I have to fetch data from a Kaercher webshop. The data I need to fetch is the product title, description, and price.
Additionally, I need to be able to fetch multiple product types (high pressure cleaners, vacuum cleaners, ...) with the same script, so I probably need to make a .csv keyword file or something to adjust the URL accordingly.
However, I can't seem to fetch the data with my current script.
Info: I will add my entire file structure and current code. I only adjusted the actual spider file (karcher_crawler.py); the other files are mostly default.
My folder structure:
scrapy_karcher/                  # Project root directory
    scrapy.cfg                   # Contains the configuration information to deploy the spider
    scrapy_karcher/              # Project's python module
        __init__.py
        items.py                 # Describes the definition of each item that we're scraping
        middlewares.py           # Project middlewares
        pipelines.py             # Project pipelines file
        settings.py              # Project settings file
        spiders/                 # All the spider code goes into this directory
            __init__.py
            karcher_crawler.py   # The spider
My "karcher_crawler.py" code:
import scrapy


class KarcherCrawlerSpider(scrapy.Spider):
    name = 'karcher_crawler'
    start_urls = [
        'https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html'
    ]

    def parse(self, response):
        products = response.xpath("//div[@class='col-sm-3 col-xs-6 fg-products-item']")
        # iterating over search results
        for product in products:
            # Defining the XPaths
            XPATH_PRODUCT_NAME = ".//div[@class='product-info']//h6[contains(@class,'product-label')]//a/text()"
            XPATH_PRODUCT_PRICE = ".//div[@class='product-info']//div[@class='product-price']//span/text()"
            XPATH_PRODUCT_DESCRIPTION = ".//div[@class='product-info']//div[@class='product-description']//a/text()"
            raw_product_name = product.xpath(XPATH_PRODUCT_NAME).extract()
            raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
            raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()

            # cleaning the data
            product_name = ''.join(raw_product_name).strip() if raw_product_name else None
            product_price = ''.join(raw_product_price).strip() if raw_product_price else None
            product_description = ''.join(raw_product_description).strip() if raw_product_description else None

            yield {
                'product_name': product_name,
                'product_price': product_price,
                'product_description': product_description,
            }
My "items.py" code:
import scrapy


class ScrapyKarcherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
My "pipelines.py" code:
class ScrapyKarcherPipeline(object):
    def process_item(self, item, spider):
        return item
My "scrapy.cfg" code:
[settings]
default = scrapy_karcher.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_karcher
I managed to request the required data using the following code:
Spider file (.py)
import scrapy
from krc.items import KrcItem
import json


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def parse(self, response):
        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
Items file (.py):
import scrapy


class KrcItem(scrapy.Item):
    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
Thanks to @gangabass I managed to locate the URLs which contain the data I needed to extract. (You can find them in the "Network" tab of the browser developer tools when inspecting a webpage: press F12, or right-click anywhere and choose Inspect.)
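To cover the multiple-product-types part of the question, one option is to keep the category IDs in a small keyword file and build the API start URLs from it. This is only a sketch under the assumption that every category exposes the same API URL pattern with its own numeric ID; the file categories.csv and its contents are made up:

import csv
import json

import scrapy


class KrcCategorySpider(scrapy.Spider):
    name = "krc_category_spider"
    allowed_domains = ["kaercher.com"]

    # same API pattern as above, with the category id filled in per row
    api_url = ("https://www.kaercher.com/api/v1/products/search/shoppableproducts/"
               "partial/{}?page=1&size=8&isocode=nl-NL")

    def start_requests(self):
        # categories.csv is a made-up file with rows like: high_pressure_cleaners,20035386
        with open("categories.csv", newline="") as f:
            for keyword, category_id in csv.reader(f):
                yield scrapy.Request(
                    self.api_url.format(category_id),
                    callback=self.parse,
                    cb_kwargs={"keyword": keyword},
                )

    def parse(self, response, keyword):
        data = json.loads(response.text)
        for product in data.get("products", []):
            yield {
                "keyword": keyword,
                "name": product["name"],
                "description": product["description"],
                "price": product["priceFormatted"],
            }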
I have written this short spider to extract titles from the Hacker News front page (http://news.ycombinator.com/).
import scrapy


class HackerItem(scrapy.Item):  # declaring the item
    hackertitle = scrapy.Field()


class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared above

        # xpath of the titles
        item['hackertitle'] = sel.xpath("//tr[@class='athing']/td[3]/a[@href]/text()").extract()

        # printing titles using print statement.
        print(item['hackertitle'])
However, when I run
scrapy crawl hackernewscrawler -o hntitles.json -t json
I get an empty .json file with no content in it.
You should change the print statement to yield:
import scrapy


class HackerItem(scrapy.Item):  # declaring the item
    hackertitle = scrapy.Field()


class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared above

        # xpath of the titles
        item['hackertitle'] = sel.xpath("//tr[@class='athing']/td[3]/a[@href]/text()").extract()

        # return the item
        yield item
Then run:
scrapy crawl hackernewscrawler -o hntitles.json -t json
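In recent Scrapy versions the output format is inferred from the file extension, so the -t json part is optional and the following is enough:

scrapy crawl hackernewscrawler -o hntitles.json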
I want to crawl the images from this link: "http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html", but the code only crawls one picture on my computer, while it crawls all the pictures on my friend's computer. Please help me.
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from imgur.items import ImgurItem


class ImgurSpider(CrawlSpider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']
    # rules = [Rule(LinkExtractor(allow=['/*']), 'parse123')]

    def parse(self, response):
        image = ImgurItem()
        # image['title'] = response.xpath(\
        #     "//img[data-notes-url=""]").extract()
        rel = response.xpath("//div[@id='article_content']//img/@src").extract()
        image['image_urls'] = [rel[0]]
        return image
rel = response.xpath("//div[@id='article_content']//img/@src").extract()
image['image_urls'] = [rel[0]]
You take only one link because you specify the [0] index.
Try
image['image_urls'] = rel
You can also split your code into a URL-parsing function and a callback for downloading the images.
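A minimal sketch of that split, reworking the spider above. The allow pattern and method names are illustrative, and it assumes the Images Pipeline is enabled in settings (ITEM_PIPELINES and IMAGES_STORE) so that image_urls are downloaded automatically:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from imgur.items import ImgurItem  # same item as above


class ImgurSpider(CrawlSpider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']

    # follow photo article links and hand each response to the parsing callback
    rules = [Rule(LinkExtractor(allow=['/photo/']), callback='parse_article', follow=True)]

    def parse_article(self, response):
        # keep URL extraction separate from building the item
        item = ImgurItem()
        item['image_urls'] = self.extract_image_urls(response)  # the Images Pipeline downloads these
        yield item

    def extract_image_urls(self, response):
        return response.xpath("//div[@id='article_content']//img/@src").extract()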