How do I change shell output to save to a CSV file? - scrapy

I am trying to make this code output to a CSV file when calling the spider with -o output.csv:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetpagesfromsitemapSpider(SitemapSpider):
    name = "test"
    handle_httpstatus_list = [404]

    def parse(self, response):
        print response.url

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return
            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print sites

    def __init__(self, spider=None, *a, **kw):
        super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
        self.spider = spider
        l = []
        url = "https://channelstore.roku.com"
        resp = requests.head(url + "/sitemap.xml")
        if (resp.status_code != 404):
            l.append(resp.url)
        else:
            resp = requests.head(url + "/robots.txt")
            if (resp.status_code == 200):
                l.append(resp.url)
        self.sitemap_urls = l
        print self.sitemap_urls


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
I have tried changing the print response.url on line 18 to a few things, but I can't seem to make this script output to a CSV; all I can manage is seeing the exact information I want, but only on the terminal screen.
This code is from here, but I am struggling with what should be the easy part of finishing it.
Any help is greatly appreciated!

It's not clear from your example, but it looks like you are not passing the command-line arguments (-o) through to your SitemapSpider.
A simpler solution, instead of passing the -o argument, is to just redirect your output to a file:
my_script.py > output.csv
OR
my_script.py | tee output.csv <-- this writes to the file and also shows the output in your terminal
EDIT:
Not the most efficient way, but without seeing a full script:
def parse(self, response):
    with open('output.csv', 'a') as fh:
        fh.write('{}\n'.format(response.url))
This will append each response.url to a new line in the output.csv file
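If you would rather keep the -o flag working, a minimal sketch (just one possible approach, with 'url' as an illustrative column name) is to yield items instead of printing and let Scrapy's feed exporter write the CSV:

def parse(self, response):
    # Each yielded dict becomes one CSV row when the spider is run with
    #   scrapy crawl test -o output.csv
    yield {'url': response.url}

The same idea applies to the print sites line in _parse_sitemap: yielding {'url': loc} for each collected location turns those into exportable items as well.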

Related

Webcrawler - Scrapy Python

I need help with my webcrawler.
I get an invalid syntax error here:
"f.write("{},{},{}\n".format(word,url,count))"
Also, when I run scrapy crawl FirstSpider > wordlist.csv, a CSV file shows up, but it is either empty or not as structured as I want it to be.
I want to crawl 300 websites and need the data as structured as possible.
How can I get a CSV file with the URLs and the count of each keyword next to them?
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.item import Item
import requests


def find_all_substrings(string, sub):
    import re
    starts = [match.start() for match in re.finditer(re.escape(sub), string)]
    return starts


class FirstSpider(CrawlSpider):
    name = "FirstSpider"
    allowed_domains = ["www.example.com"]
    start_urls = ["https://www.example.com/"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    crawl_count = 0
    words_found = 0

    def check_buzzwords(self, response):
        self.__class__.crawl_count += 1
        wordlist = [
            "keyword1",
            "keyword2",
            "keyword3"
        ]
        url = response.url
        data = response.body.decode('utf-8')
        count = 0
        for word in wordlist:
            substrings = find_all_substrings(data, word)
            count = 0
        word_counts = {}
        links = []
        "f = open('wordlist.csv', 'w')"
        for pos in substrings:
            ok = False
            if not ok:
                count += 1
                word_counts[word] = {url: count}
        for link in links:
            page = requests.get(link)
            data = page.text
            for word in wordlist:
                substrings = find_all_substrings(data, word)
                count = 0
            for word in wordlist:
                substrings = find_all_substrings(data, word)
                for pos in substrings:
                    ok = False
                    if not ok:
                        "f.write("{},{},{}\n".format(word,url,count))"
                        self.__class__.words_found += 1
                        print(word + ";" + url + ";" + str(count) + ";")
        with open('wordlist.csv', 'w') as f:
            for word, data in word_counts.items():
                for url, count in data.items():
                    f.write("{},{},{}\n".format(word, url, count))
        f.close()
        return Item()

    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) != None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
I want to crawl websites for certain keywords (wordlist). My output should be a CSV file with the following information: URL, count of each keyword found on the website.
I get an invalid syntax error for the following: "f.write("{},{},{}\n".format(word,url,count))"
And the output CSV file is often empty or does not contain all the URLs.
You have unnecessary quotation marks around lines 41 and 61:
line 41 ---> "f = open('wordlist.csv', 'w')"
line 61 ---> "f.write("{},{},{}\n".format(word,url,count))"
Also, you usually don't need to save data to a file manually, because Scrapy has a built-in mechanism for this - Feed exports.
By using the FEED_EXPORT_FIELDS setting, you can specify which fields of the item should be exported and in which order.
Here is the command to run the spider and save data to a file:
scrapy crawl FirstSpider -O url.csv
-O (capital 'O') means "overwrite the file"
-o (lowercase 'o') means "append to an existing file".
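For illustration, a rough sketch of what the spider could look like if it yields items and lets the feed export write the file. The one-row-per-keyword-per-page layout and the field names are assumptions here, and find_all_substrings is the helper from the question:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class FirstSpider(CrawlSpider):
    name = "FirstSpider"
    allowed_domains = ["www.example.com"]
    start_urls = ["https://www.example.com/"]
    rules = [Rule(LinkExtractor(), follow=True, callback="check_buzzwords")]

    custom_settings = {
        # Fixes the column order of the exported CSV.
        "FEED_EXPORT_FIELDS": ["word", "url", "count"],
    }

    def check_buzzwords(self, response):
        data = response.body.decode("utf-8")
        for word in ["keyword1", "keyword2", "keyword3"]:
            # find_all_substrings is the helper defined in the question.
            count = len(find_all_substrings(data, word))
            if count:
                # One CSV row per (word, url) pair when run with
                #   scrapy crawl FirstSpider -O url.csv
                yield {"word": word, "url": response.url, "count": count}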

How to take a user argument and pass it to the Rule link extractor in Scrapy

I have a config file in which the details of many websites are present. I take a user input argument in Scrapy using the -a parameter and pull the matching allowed_domains and start_urls out of the config file. Since this is a generic spider, I am using the rule link extractor.
Below is my code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
import yaml
import re
import scrapy

with open("/scrapyConfig.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)


def cleanHtml(raw_html):
    CLEANR = re.compile('<.*?>')
    cleanText = str(re.sub(CLEANR,'', raw_html))
    return cleanText


def remove_tags(html):
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script']):
        data.decompose()
    noTagsData = str(' '.join(soup.stripped_strings))
    return noTagsData


class SpiderSpider(CrawlSpider):
    name = 'spider1'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        userInp = self.site
        self.allowed_domains = config[userInp]['allowed_domain']
        self.start_urls = config[userInp]['start_url']

    rules = [(Rule(LinkExtractor(unique=False, allow=(config[self.site]['regex1'], config[self.site]['regex2'])), callback='parse_item', follow=True))]

    def parse_item(self, response):
        uncleanText = response.xpath(config[self.site]['xpath1']).extract()
        cleanText = [x.replace("\n","") for x in uncleanText]
        cleanText = [x.replace("\t"," ") for x in cleanText]
        cleanText = [x.replace("\r","") for x in cleanText]
        cleanText = [x.replace("\xa0","") for x in cleanText]
        cleanText = [x.replace(":"," ") for x in cleanText]
        cleanText = remove_tags(str(cleanText))
        finalCleanJD = cleanHtml(str(cleanText))
        yield {"URL": response.url, "Job Description": finalCleanJD}
I am able to take the user input and fetch the corresponding allowed_domains and start_urls from the config file in the __init__ function, but when I pass the same argument to the rule's link extractor, it does not recognise self.site. If I put the rule definition inside __init__, the spider does not scrape the page; the terminal just shows the page as crawled and then the spider exits. My editor also greys out the rules variable when it is inside __init__ (suggesting it is not used anywhere), whereas outside __init__ it is highlighted, but then self.site is not recognised. How can I make this generic spider take a user input argument, pull the matching details from the config file, and start scraping?
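One thing worth trying (a sketch only, assuming the argument is passed as -a site=... and reusing config, Rule and LinkExtractor from the code above): build the rules inside __init__ before calling super().__init__(), because CrawlSpider compiles self.rules during its own __init__:

class SpiderSpider(CrawlSpider):
    name = 'spider1'

    def __init__(self, site=None, **kwargs):
        self.site = site
        self.allowed_domains = config[site]['allowed_domain']
        self.start_urls = config[site]['start_url']
        # The rules must exist before CrawlSpider.__init__ runs, because that
        # is where self.rules gets compiled into the internal _rules list.
        self.rules = (
            Rule(LinkExtractor(unique=False,
                               allow=(config[site]['regex1'],
                                      config[site]['regex2'])),
                 callback='parse_item', follow=True),
        )
        super().__init__(**kwargs)

    # parse_item stays the same as in the question

With the rules built per instance, nothing at class-definition time needs self.site any more, and the -a site=... value drives allowed_domains, start_urls and the link extractor patterns from the config file.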

CsvItemExporter for multiple files in custom item pipeline not exporting all items

I have created an item pipeline as an answer to this question.
It is supposed to create a new file for every page, according to the page_no value set in the item. This works mostly fine.
The problem is with the last CSV file generated by the pipeline/item exporter, page-10.csv.
The last 10 values are not exported, so the file stays empty.
What could be the reason for this behaviour?
pipelines.py
from scrapy.exporters import CsvItemExporter


class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field"""

    def open_spider(self, spider):
        self.filename_to_exporter = {}

    def spider_closed(self, spider):
        for exporter in self.filename_to_exporter.values():
            exporter.finish_exporting()

    def _exporter_for_item(self, item):
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            f = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(f, export_empty_fields=True)
            exporter.start_exporting()
            self.filename_to_exporter[filename] = exporter
        return self.filename_to_exporter[filename]

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item
spider
import scrapy
from ..pipelines import PerFilenameExportPipeline


class spidey(scrapy.Spider):
    name = "idk"
    custom_settings = {
        'ITEM_PIPELINES': {
            PerFilenameExportPipeline: 100
        }
    }

    def start_requests(self):
        yield scrapy.Request("http://quotes.toscrape.com/", cb_kwargs={'page_no': 1})

    def parse(self, response, page_no):
        for qts in response.xpath("//*[@class=\"quote\"]"):
            yield {
                'page_no': page_no,
                'author': qts.xpath("./span[2]/small/text()").get(),
                'quote': qts.xpath("./*[@class=\"text\"]/text()").get()
            }

        next_pg = response.xpath('//li[@class="next"]/a/@href').get()
        if next_pg is not None:
            yield response.follow(next_pg, cb_kwargs={'page_no': page_no + 1})
I know, 2 years later, but still - it might turn out helpful for someone.
It looks like you're never closing the file you're writing to (as you're using inline open). Please compare your code to the one in Scrapy's docs (the "Using Item Exporters" section): https://docs.scrapy.org/en/latest/topics/exporters.html
Besides, the method should now be called "close_spider", not "spider_closed"
Changing your code to the following should help:
from scrapy.exporters import CsvItemExporter


class PerFilenameExportPipeline:

    def open_spider(self, spider):
        self.filename_to_exporter = {}

    def close_spider(self, spider):
        # iterating over exporter-file tuples instead of only exporters
        for exporter, csv_file in self.filename_to_exporter.values():
            exporter.finish_exporting()
            # closing the file
            csv_file.close()

    def _exporter_for_item(self, item):
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            csv_file = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(csv_file, export_empty_fields=True)
            exporter.start_exporting()
            # adding both exporter & file to later be closed as the dict's value
            self.filename_to_exporter[filename] = (exporter, csv_file)
        # picking only the exporter via [0]
        return self.filename_to_exporter[filename][0]

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item

How does scrapy crawl work: which class is instantiated and which method is called?

Here is a simple Python file, test.py.
import math


class myClass():
    def myFun(self, x):
        return(math.sqrt(x))


if __name__ == "__main__":
    myInstance = myClass()
    print(myInstance.myFun(9))
It prints 3 when run with python test.py. Let's analyse the running process:
1. Instantiate myClass and assign the instance to myInstance.
2. Call the myFun method and print the result.
Now it is Scrapy's turn.
In the Scrapy 1.4 manual, quotes_spider.py is as below.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Running the spider with scrapy crawl quotes, I am puzzled:
1. Where is the main function or main body of the spider?
2. Which class is instantiated?
3. Which method is called?
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How does scrapy crawl work exactly?
So let's start. Assuming you use Linux/Mac, let's check where scrapy is:
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the content of this file
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
So this runs the execute method from cmdline.py, and here is your main method.
cmdline.py
from __future__ import print_function
....
....

def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


if __name__ == '__main__':
    execute()
Now, if you look at the execute method, it processes the arguments you passed, which are crawl quotes in your case. The execute method scans the project for spider classes and checks which one has name defined as quotes. It then creates the CrawlerProcess class, and that runs the whole show.
Scrapy is based on the Twisted Python framework, which is a scheduler-based framework.
Consider the below part of the code
for url in urls:
    yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function, the first yield is executed and the value is returned to the engine. The engine then looks at the other tasks that are pending and executes them (when they yield, some other pending task in the queue gets a chance). So yield is what allows a function's execution to be broken into parts, and that is what lets Scrapy/Twisted work.
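A toy example of that pausing behaviour, independent of Scrapy (the names here are made up for illustration):

def count_up_to(n):
    for i in range(1, n + 1):
        print("about to yield", i)
        yield i              # execution pauses here until the caller resumes it

gen = count_up_to(3)
print(next(gen))             # runs the body up to the first yield, then prints 1
print(next(gen))             # resumes right after the yield, then prints 2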
You can get a detailed overview on the link below
https://doc.scrapy.org/en/latest/topics/architecture.html

How to make rules of CrawlSpider context-sensitive?

I notice that the rules of a CrawlSpider extract URLs on every non-leaf page.
Can I enable a rule only when the current page meets some condition (for example, its URL matches a regex)?
I have two pages:
-------------------Page A-------------------
Page URL: http://www.site.com/pattern-match.html
--------------------------------------------
- [link](http://should-extract-this)
- [link](http://should-extract-this)
- [link](http://should-extract-this)
--------------------------------------------
--------------------Page B--------------------
Page URL: http://www.site.com/pattern-not-match.html
-----------------------------------------------
- [link](http://should-not-extract-this)
- [link](http://should-not-extract-this)
- [link](http://should-not-extract-this)
-----------------------------------------------
So the rule should only extract URLs from Page A. How can I do that? Thanks!
I just found a dirty way to inject the response into the rule.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.http import Request, HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
import inspect


class MyCrawlSpider(CrawlSpider):

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            seen = seen.union(links)
            for link in links:
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(rule=n, link_text=link.text)

                # ***>>> HACK <<<***
                # pass `response` as additional argument to `process_request`
                fun = rule.process_request
                if not hasattr(fun, 'nargs'):
                    fun.nargs = len(inspect.getargs(fun.func_code).args)
                if fun.nargs == 1:
                    yield fun(r)
                elif fun.nargs == 2:
                    yield fun(r, response)
                else:
                    raise Exception('too many arguments')
Try it out:
def process_request(request, response):
    if 'magick' in response.url:
        return request


class TestSpider(MyCrawlSpider):
    name = 'test'
    allowed_domains = ['test.com']
    start_urls = ['http://www.test.com']
    rules = [
        Rule(SgmlLinkExtractor(restrict_xpaths='//a'), callback='parse_item', process_request=process_request),
    ]

    def parse_item(self, response):
        print response.url
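As a side note, newer Scrapy versions (1.7 and later, as far as I know) already pass the originating response to Rule.process_request, so the subclassing hack above should not be needed there. A rough sketch of the same idea with the stock CrawlSpider and LinkExtractor, reusing the pattern-match URL from the question:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

def process_request(request, response):
    # Only follow links that were extracted from matching pages.
    if 'pattern-match' in response.url:
        return request
    return None  # returning None drops the request

class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['site.com']
    start_urls = ['http://www.site.com']
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//a'),
             callback='parse_item', process_request=process_request),
    ]

    def parse_item(self, response):
        self.logger.info(response.url)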