import scrapy
import csv
from series.items import SeriesItem
class EpisodeScraperSpider(scrapy.Spider):
name = "episode_scraper"
allowed_domains = ["imdb.com"]
start_urls = []
def __init__(self, id=None, series=None, *args, **kwargs):
super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
if id is not None:
self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
else:
with open('series_episode.csv') as f:
f_csv = csv.DictReader(f)
for row in f_csv:
self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))
def parse(self, response):
episodes = response.xpath('//div[contains(#class, "list_item")]')
title = response.xpath('//h3/a/text()').extract()[0]
for episode in episodes:
global title
item = SeriesItem()
item['series_episode'] = episode.xpath('div/a/div[contains(#data-const,"tt")]/div/text()').extract()
item['title'] = '{!s}: {!s}'.format(title, episode.xpath('div[#class="info"]/strong/a/text()').extract())
item['imdb_id'] = episode.xpath('div[#class="image"]/a/div/#data-const').extract()
item['airdate'] = [x.strip() for x in episode.xpath('div/div[#class="airdate"]/text()').extract()]
yield item
When I run this script through scrapyd I get no results, although it does produce results when run with scrapy directly. I think the problem is in this line:
with open('series_episode.csv') as f:
I don't know where to put my csv file.
Please help me!!
Thanks
One option would be to save it in /tmp:
with open('/tmp/series_episode.csv') as f:
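Another option, offered only as a sketch (the csv_path argument name below is made up, not something scrapyd defines): pass the file location in as a spider argument when you schedule the job, since scrapyd forwards extra -d key=value pairs from schedule.json to the spider's __init__ kwargs.

    # sketch: csv_path is a hypothetical spider argument with a default under /tmp
    def __init__(self, id=None, series=None, csv_path='/tmp/series_episode.csv', *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            with open(csv_path) as f:
                for row in csv.DictReader(f):
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

You would then schedule it with something like curl http://localhost:6800/schedule.json -d project=<yourproject> -d spider=episode_scraper -d csv_path=/tmp/series_episode.csv.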
I wanted to set up multiple spiders and run them, where URL_LIST contains links to the main URLs and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not represent this case, since I am using the quotes URLs, but that is the purpose of the set-up. I then wanted to crawl the spiders and store the results. However, I am unsure how to call the spiders to crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']

def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])

def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])
class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class="row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class="row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
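No answer is recorded for this question. One observation, offered as a sketch rather than a definitive fix: the script defines crawl() but never calls it or starts the reactor, and CrawlerRunner scripts are normally launched directly with python myscript.py rather than with scrapy crawl, since the top-level from twisted.internet import reactor already installs a reactor and scrapy crawl will then try to install its own, which is a common source of ReactorAlreadyInstalledError. The sequential-crawl pattern in the Scrapy docs ends roughly like this:

    crawl()         # schedules both crawls, one after the other
    reactor.run()   # blocks here until reactor.stop() is called inside crawl()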
… I have a Scrapy script that runs in the shell, but when I try to export the results to CSV it returns an empty file. It does export data when I do not go into a link and try to parse the description.
import scrapy
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmozs.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
import scrapy
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
You just have a typo:

allowed_domains = ["dmozs.org"]

Change it to:

allowed_domains = ["dmoz.org"]
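One more observation, not part of the original answer: feed export (scrapy crawl dmoz -o items.csv) only writes items that the spider yields, and this parse only saves HTML files to disk, so the CSV would stay empty even with the domain fixed. A minimal sketch of yielding something exportable (the field names are just an illustration, and yielding plain dicts needs a reasonably recent Scrapy):

    def parse(self, response):
        # save the page as before
        filename = response.url.split("/")[-2] + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        # ...and also yield a row so the CSV export has something to write
        yield {
            'url': response.url,
            'title': response.xpath('//title/text()').extract_first(),
        }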
Scrapy newbie here. I'm currently trying to extend my crawlspider so it can take in multiple arguments from a text document (instead of manually typing each argument into the command line, like scrapy crawl crawl5 -a start_url="argument"). Currently, I can input one argument and that generates a few items. But I'd like some guidance on two problems:
How can I create an item for each argument?
How can I use that item as a container for the items I generate from each argument?
My goal here is to sort of imitate running my crawlspider multiple times, while keeping the items returned from each argument separate.
EDIT: here's my code. As you can see, it's a scraper for thesaurus.com.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from thesaurus.items import ThesaurusItem
class MySpider(CrawlSpider):
    name = 'crawl5'

    def __init__(self, *args, **kwargs):
        self.start_urls = ["http://www.thesaurus.com/browse/%s" % kwargs.get('start_url')]
        self.allowed_domains = ["thesaurus.com"]
        self.rules = (
            Rule(LinkExtractor(restrict_xpaths=("//div[id='paginator']//a/@href"))),
            Rule(LinkExtractor(allow=('http://www.thesaurus.com/browse/%s/.$' % kwargs.get('start_url'), 'http://www.thesaurus.com/browse/%s/..$' % kwargs.get('start_url'))), callback='parse_item', follow=True)
        )
        super(MySpider, self).__init__(*args, **kwargs)

    def parse_start_url(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

    def parse_item(self, response):
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item
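No answer is recorded for this question, but as a rough sketch of the "arguments from a text document" part: instead of one -a start_url, the spider could read one term per line from a file and tag every item with the term that produced it, which keeps the results from each argument separate without running the spider multiple times. The words.txt filename and the source_word field are assumptions for illustration, not part of the original code, and the two per-term Rule(...) definitions from the original would need the same treatment (one allow pattern per term), which is omitted here.

import scrapy
from scrapy.spiders import CrawlSpider
from thesaurus.items import ThesaurusItem

class MySpider(CrawlSpider):
    name = 'crawl5'
    allowed_domains = ['thesaurus.com']

    def __init__(self, word_file='words.txt', *args, **kwargs):
        # one search term per line in the text file
        with open(word_file) as f:
            words = [line.strip() for line in f if line.strip()]
        self.start_urls = ['http://www.thesaurus.com/browse/%s' % w for w in words]
        super(MySpider, self).__init__(*args, **kwargs)

    def parse_start_url(self, response):
        term = response.url.rsplit('/', 1)[-1]   # which argument this page came from
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            item = ThesaurusItem()
            item['source_word'] = term           # hypothetical extra field on ThesaurusItem
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

This would be run with something like scrapy crawl crawl5 -a word_file=words.txt, since -a values arrive as keyword arguments to __init__.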
Here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request
class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv
class myExporter(object):

    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output CSV file, and I can't figure out why.
Any help would be much appreciated; I am out of ideas.
Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    # category comes from extract() and is still a list, so join it
                                    ', '.join(item['category']).encode('utf-8')])
    return item
You already converted the lists into individual items in vriskoSpider.parse_start_url.
zip iterates over your strings character by character:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2
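As a side note, not part of the original answer: for a flat CSV like this, Scrapy's built-in feed export can replace the hand-written pipeline, either from the command line with scrapy crawl vrisko -o brandCategoryTable.csv or via settings. A minimal sketch (FEED_EXPORT_FIELDS needs a newer Scrapy than the scrapy.contrib-era imports above):

# settings.py -- the built-in CSV exporter writes one row per yielded item
FEED_URI = 'brandCategoryTable.csv'
FEED_FORMAT = 'csv'
FEED_EXPORT_FIELDS = ['eponimia', 'address', 'category']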
Here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):
Alternatively, you can remove start_urls and start your spider with def start_requests(self):, issuing the first request with a callback to parse_vrisko.
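A minimal sketch of that second option (it needs from scrapy.http import Request; everything else in the spider stays as it is):

    def start_requests(self):
        # request the first page explicitly and send it to the same callback
        # the ?page=N rule uses, instead of relying on start_urls
        yield Request('http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF',
                      callback=self.parse_vrisko)

Note that when a response goes to an explicit callback like this, the CrawlSpider rules are not applied to it, so the ?page=N links are only followed from responses that pass through the spider's default parse entry point.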