FileNotFoundError when I invoke AWS lambda function for integration test - testing

I want an integration test of an AWS Lambda function.
Therefore I used aws sdk v2 for java.
Please refer to the test code as shown below:
package integration;
import com.amazonaws.services.lambda.runtime.events.SQSEvent;
import com.squareup.moshi.JsonAdapter;
import com.squareup.moshi.JsonReader;
import com.squareup.moshi.JsonWriter;
import com.squareup.moshi.Moshi;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.lambda.LambdaClient;
import software.amazon.awssdk.services.lambda.model.InvokeRequest;
import software.amazon.awssdk.services.lambda.model.InvokeResponse;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
public class ConsumerRequestHandlerTest extends WsinDatabaseConnectionTest
{
private static final String FUNCTION_NAME = "ConsumerFunction";
private static final String LOCAL_ENDPOINT = "http://127.0.0.1:3001";
private LambdaClient lambdaClient;
private TestWsinJdbcTemplate testWsinJdbcTemplate = new TestWsinJdbcTemplate(getConnection());
private String makerId;
#BeforeEach
void setUp()
{
lambdaClient = LambdaClient.builder()
.region(Region.AP_NORTHEAST_2)
.endpointOverride(URI.create(LOCAL_ENDPOINT))
.build();
}
#Test
void integrationTest()
{
SQSEvent event = createEvent();
String jsonEvent = toJson(event);
SdkBytes payload = SdkBytes.fromUtf8String(jsonEvent);
InvokeRequest request = InvokeRequest.builder()
.functionName(FUNCTION_NAME)
.payload(payload)
.build();
InvokeResponse response = lambdaClient.invoke(request);
System.out.println(response);
}
#AfterEach
void tearDown()
{
if (Objects.nonNull(lambdaClient))
{
lambdaClient.close();
}
}
}
when the function invoked, the client gets an error as following
software.amazon.awssdk.services.lambda.model.LambdaException: ServiceException (Service: Lambda, Status Code: 500, Request ID: null, Extended Request ID: null)
and the server gets an error as following
Invoking com.manufacturer.ui.ConsumerRequestHandler::handleRequest (java8.al2)
WsinManufacturerLambdaShutdownExtension is a local Layer in the template
Image was not found.
Building image...Exception on /2015-03-31/functions/ConsumerFunction/invocations [POST]
Traceback (most recent call last):
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/app.py", line 2447, in wsgi_app
response = self.full_dispatch_request()
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/app.py", line 1952, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/app.py", line 1821, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/_compat.py", line 39, in reraise
raise value
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/app.py", line 1950, in full_dispatch_request
rv = self.dispatch_request()
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/flask/app.py", line 1936, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/lambda_service/local_lambda_invoke_service.py", line 165, in _invoke_request_handler
self.lambda_runner.invoke(function_name, request_data, stdout=stdout_stream_writer, stderr=self.stderr)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/commands/local/lib/local_lambda.py", line 137, in invoke
self.local_runtime.invoke(
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/lib/telemetry/metric.py", line 230, in wrapped_func
return_value = func(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/lambdafn/runtime.py", line 178, in invoke
container = self.create(function_config, debug_context, container_host, container_host_interface)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/lambdafn/runtime.py", line 73, in create
container = LambdaContainer(
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/docker/lambda_container.py", line 93, in __init__
image = LambdaContainer._get_image(
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/docker/lambda_container.py", line 236, in _get_image
return lambda_image.build(runtime, packagetype, image, layers, architecture, function_name=function_name)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/docker/lambda_image.py", line 163, in build
self._build_image(
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/local/docker/lambda_image.py", line 261, in _build_image
with create_tarball(tar_paths, tar_filter=tar_filter) as tarballfile:
File "/usr/local/Cellar/python#3.8/3.8.12_1/Frameworks/Python.framework/Versions/3.8/lib/python3.8/contextlib.py", line 113, in __enter__
return next(self.gen)
File "/usr/local/Cellar/aws-sam-cli/1.38.1/libexec/lib/python3.8/site-packages/samcli/lib/utils/tar.py", line 29, in create_tarball
archive.add(path_on_system, arcname=path_in_tarball, filter=tar_filter)
File "/usr/local/Cellar/python#3.8/3.8.12_1/Frameworks/Python.framework/Versions/3.8/lib/python3.8/tarfile.py", line 1955, in add
tarinfo = self.gettarinfo(name, arcname)
File "/usr/local/Cellar/python#3.8/3.8.12_1/Frameworks/Python.framework/Versions/3.8/lib/python3.8/tarfile.py", line 1834, in gettarinfo
statres = os.lstat(name)
FileNotFoundError: [Errno 2] No such file or directory: '/Users/we/IdeaProjects/wsin-manufacturer-queue-serverless/WsinManufacturerLambdaShutdownExtension.zip'
2022-04-22 10:41:20 127.0.0.1 - - [22/Apr/2022 10:41:20] "POST /2015-03-31/functions/ConsumerFunction/invocations HTTP/1.1" 500 -
Do I have to place the file on the path?
Why does the error occur?
Thank you in advance.

Related

telethon fails on python on GAE platform

I would appreciate it if somebody could shed some light on this. I am trying to set up a simple Telegram bot running on Python on Google App Engine with the Telethon library, but I failed. Is this approach workable?
after gcloud app deploy, I receive the Internal Server Error from browser.
the problem does not happen if I remarked the following line:
client = TelegramClient(phone, api_id, api_hash)
main.py
from flask import Flask, request
import datetime
import time
import asyncio
from telethon import TelegramClient, events, sync

app = Flask(__name__)

# Telegram API credentials (redacted placeholders from the original post).
api_id = xxxxxxxx
api_hash = 'xxxxxxxxxxxxxxxxxxxxxxxx'
phone = '+xxxxxxxxxxxx'

# Fixed: the decorator was garbled as `#app.route(...)` in the paste;
# Flask route registration uses `@`.
@app.route('/', methods=['GET'])
def hello():
    reqaction = request.args.get('action', 'connect')
    # NOTE(review): this view returns nothing, which Flask treats as an
    # error; presumably the real handler returns a response — confirm.
    client = TelegramClient(phone, api_id, api_hash)

if __name__ == '__main__':
    app.run(host='localhost', port=8080, debug=True)
below is the logging:
Traceback (most recent call last):
File "/env/lib/python3.7/site-packages/flask/app.py", line 2446, in wsgi_app
response = self.full_dispatch_request()
File "/env/lib/python3.7/site-packages/flask/app.py", line 1951, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/env/lib/python3.7/site-packages/flask/app.py", line 1820, in handle_user_exception
reraise(exc_type, exc_value, tb)
File "/env/lib/python3.7/site-packages/flask/_compat.py", line 39, in reraise
raise value
File "/env/lib/python3.7/site-packages/flask/app.py", line 1949, in full_dispatch_request
rv = self.dispatch_request()
File "/env/lib/python3.7/site-packages/flask/app.py", line 1935, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "/srv/main.py", line 95, in hello
client = TelegramClient(phone, api_id, api_hash) ;
File "lib/telethon/client/telegrambaseclient.py", line 230, in __init__
self._loop = loop or asyncio.get_event_loop()
File "/opt/python3.7/lib/python3.7/asyncio/events.py", line 644, in get_event_loop
% threading.current_thread().name)
RuntimeError: There is no current event loop in thread 'ThreadPoolExecutor-0_0'.
Many thanks in advance for your guidance and help :)

How to write numpy arrays directly to s3 in a deep learning application backed by spark

We are generating ~10k numpy arrays using keras, and then finally we have to save those arrays as .npy files to s3. But the problem is that, for saving to s3 inside the map function of spark, we have to create an intermediate file. What we want is to stream them directly to s3 instead of creating intermediate files. I used the "Cottoncandy" library, but it's not working inside the spark map function and throws this error:
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
Is there any possible way/library available which we can use inside a deep learning application inside spark map function to directly stream the numpy arrays to s3 ?
I have my rdd of numpy array as:
features_rdd
options I tried:-
def writePartition(xs):
    """Upload every (path, numpy-array) pair in this partition to S3 via cottoncandy.

    Runs on the executors (passed to foreachPartition below), so the
    cottoncandy interface is created once per partition, not per record.
    """
    cci = cc.get_interface('BUCKET_NAME', ACCESS_KEY=os.environ.get("AWS_ACCESS_KEY_ID"),
                           SECRET_KEY=os.environ.get("AWS_SECRET_ACCESS_KEY"), endpoint_url='https://s3.amazonaws.com')
    #output_path, format_name
    for k,v in xs:
        file_name_with_domain = get_file_with_parents(k, 1)
        file_name = ...
        file_name_without_ext = get_file_name_without_ext(file_name)
        bucket_name = OUTPUT.split('/', 1)[0]
        rest_of_path = OUTPUT.split('/', 1)[1]
        # Fixed: the original appended '.' + '.npy', yielding names like "x..npy".
        final_path = rest_of_path + '/' + file_name_without_ext + '.npy'
        LOGGER.info("Saving to S3....")
        response = cci.upload_npy_array(final_path, v)

# Fixed: the RDD method is foreachPartition (camelCase); the lowercase
# spelling raises AttributeError (the traceback below shows the correct name).
features_rdd.foreachPartition(writePartition)
option 2:-
def writePartition1(xs):
    """Persist each (key, numpy-array) pair via a temporary local .npy file,
    then upload it to S3 with boto3 and delete the temp file.

    The boto3 client is created inside the partition function on purpose:
    clients hold thread locks and cannot be pickled from the driver.
    """
    s3 = boto3.client('s3',region_name='us-east-1')
    for k,v in xs:
        ...
        ...
        np.save(local_dir_full_path, v)
        s3.upload_file(local_dir_full_path, 'BUCKET', s3_full_path)
        os.remove(local_dir_full_path)

# Fixed: correct Spark spelling is foreachPartition (camelCase).
features_rdd.foreachPartition(writePartition1)
Error:-
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 606, in save_list
self._batch_appends(iter(obj))
File "/usr/lib64/python2.7/pickle.py", line 642, in _batch_appends
save(tmp[0])
File "/usr/lib64/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 600, in save_reduce
save(state)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 600, in save_reduce
save(state)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
Traceback (most recent call last):
File "six_file_boto3_write1.py", line 248, in <module>
run()
File "six_file_boto3_write1.py", line 239, in run
features_rdd.foreachPartition(writePartitionWithBoto)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 799, in foreachPartition
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 1041, in count
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 1032, in sum
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 906, in fold
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 809, in collect
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2455, in _jrdd
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2388, in _wrap_function
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2374, in _prepare_for_python_RDD
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/serializers.py", line 464, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 704, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 162, in dump
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
imports:-
from pyspark.sql import SparkSession
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.models import Model
from io import BytesIO
from keras.applications.vgg16 import preprocess_input
import numpy as np
import logging
import os
import boto3
import cottoncandy as cc
So, basically, the application works perfectly fine up to features_rdd. I can even verify the count. But when I try to save these features, that part does not work. I have added the imports above.
updates:-
def extract_features(model,obj):
    """Run VGG16 over one raw image payload and return its feature vector.

    Returns the first prediction row on success, or an empty list when any
    step of the decode/predict pipeline fails (best-effort: errors are
    printed, never raised).
    """
    try:
        print('executing vgg16 feature extractor...')
        # Decode bytes -> PIL image at the fixed VGG16 input size.
        decoded = image.load_img(BytesIO(obj), target_size=(224, 224,3))
        # PIL -> array -> batch of one -> VGG16-specific preprocessing.
        batch = preprocess_input(np.expand_dims(image.img_to_array(decoded), axis=0))
        features = model.predict(batch)[0]
        print('++++++++++++++++++++++++++++',features.shape)
        return features
    except Exception as e:
        print('Error......{}'.format(e.args))
        return []
def extract_features_(xs):
    """Generator over a partition: build the VGG16 model once (lazily, on
    first consumption), then yield (key, feature_vector) per input pair."""
    vgg_model = initVGG16()
    for key, raw_bytes in xs:
        yield key, extract_features(vgg_model, raw_bytes)
# Build (or reuse) the Spark session for this application.
spark = (SparkSession.builder
         .appName('test-app')
         .getOrCreate())
sc = spark.sparkContext

# Load images as (path, bytes) pairs and keep them cached across actions.
s3_files_rdd = sc.binaryFiles(RESOLVED_IMAGE_PATH)
s3_files_rdd.persist()
# mapPartitions lets extract_features_ initialise one model per partition.
features_rdd = s3_files_rdd.mapPartitions(extract_features_)

python urllib error - AttributeError: 'module' object has no attribute 'request'

I am trying out a tutorial code which fetches the html code form a website and prints it. I'm using python 3.4.0 on ubuntu. The code:
import urllib.request

# Fetch the page and print its decoded HTML. The context manager ensures
# the underlying HTTP response is closed (the original leaked it).
with urllib.request.urlopen("http://www.brainjar.com/java/host/test.html") as page:
    text = page.read().decode("utf8")
print(text)
I saw previous solutions and tried them, I also tried importing only urllib but it still doesn't work. The error message displayed is as shown:
Traceback (most recent call last):
File "string.py", line 1, in <module>
import urllib.request
File "/usr/lib/python3.4/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.4/http/client.py", line 69, in <module>
import email.parser
File "/usr/lib/python3.4/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.4/email/feedparser.py", line 27, in <module>
from email import message
File "/usr/lib/python3.4/email/message.py", line 15, in <module>
from email import utils
File "/usr/lib/python3.4/email/utils.py", line 40, in <module>
from email.charset import Charset
File "/usr/lib/python3.4/email/charset.py", line 15, in <module>
import email.quoprimime
File "/usr/lib/python3.4/email/quoprimime.py", line 44, in <module>
from string import ascii_letters, digits, hexdigits
File "/media/saiwal/D89602199601F930/Documents/Copy/codes/python/headfirst/string.py", line 2, in <module>
page = urllib.request.urlopen("http://www.brainjar.com/java/host/test.html")
AttributeError: 'module' object has no attribute 'request'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.4/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.4/http/client.py", line 69, in <module>
import email.parser
File "/usr/lib/python3.4/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.4/email/feedparser.py", line 27, in <module>
from email import message
File "/usr/lib/python3.4/email/message.py", line 15, in <module>
from email import utils
File "/usr/lib/python3.4/email/utils.py", line 40, in <module>
from email.charset import Charset
File "/usr/lib/python3.4/email/charset.py", line 15, in <module>
import email.quoprimime
File "/usr/lib/python3.4/email/quoprimime.py", line 44, in <module>
from string import ascii_letters, digits, hexdigits
File "/media/saiwal/D89602199601F930/Documents/Copy/codes/python/headfirst/string.py", line 2, in <module>
page = urllib.request.urlopen("http://www.brainjar.com/java/host/test.html")
AttributeError: 'module' object has no attribute 'request'
Original exception was:
Traceback (most recent call last):
File "string.py", line 1, in <module>
import urllib.request
File "/usr/lib/python3.4/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.4/http/client.py", line 69, in <module>
import email.parser
File "/usr/lib/python3.4/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.4/email/feedparser.py", line 27, in <module>
from email import message
File "/usr/lib/python3.4/email/message.py", line 15, in <module>
from email import utils
File "/usr/lib/python3.4/email/utils.py", line 40, in <module>
from email.charset import Charset
File "/usr/lib/python3.4/email/charset.py", line 15, in <module>
import email.quoprimime
File "/usr/lib/python3.4/email/quoprimime.py", line 44, in <module>
from string import ascii_letters, digits, hexdigits
File "/media/saiwal/D89602199601F930/Documents/Copy/codes/python/headfirst/string.py", line 2, in <module>
page = urllib.request.urlopen("http://www.brainjar.com/java/host/test.html")
AttributeError: 'module' object has no attribute 'request'
This looks like a nasty coincidence.
TL;DR: Don’t name your script string.py.
So what’s happening here?
You’re trying to import urllib.request.
urllib.request tries to import http.client, which tries to import email.parser, which tries to import email.feedparser, which tries to import email.message, which tries to import email.utils, which tries to import email.charset, which tries to import email.quoprimime.
email.quoprimime tries to import string, expecting it to be the standard Python string module—but since the current working directory has priority over the standard Python library directories, it finds your string.py instead and tries to import that.
When importing your string.py, you try to import urllib.request. Since urllib.request is still being imported, you get back a skeleton urllib without a request attribute yet.
Because your imported string.py then fails because it can’t find the request attribute, the exception starts propagating back up.
But wait, there’s more! Since there was an error during an import, Ubuntu tries to be helpful by seeing if you’re missing a dpkg package. If so, it could say “hey, it looks like you’re missing this module; want to apt-get it?” So the mechanism for looking up the appropriate package is activated…
…but the module for looking up the appropriate package itself depends on urllib.request, so it tries to import it, and again fails…
In short, because you picked string.py as a file name, you overrode the standard string module, which broke a lot of other modules, and even broke the module that was supposed to be helpful when you were missing a module, causing a whole lot of havoc. Fortunately the solution is easy: rename your script.

xmlrpc server in jython

I've tested in jython the xmlrpc code that works in python:
server code:
import xmlrpclib
from SimpleXMLRPCServer import SimpleXMLRPCServer

# Predicate exposed over XML-RPC: true when n is divisible by 2.
def is_even(n):
    return n%2 == 0

# Bind on localhost:8000, register the function under the name "is_even",
# and serve requests forever (this call blocks).  Python 2 code.
server = SimpleXMLRPCServer(("localhost", 8000))
print "Listening on port 8000..."
server.register_function(is_even, "is_even")
server.serve_forever()
client code:
import xmlrpclib

# Proxy to the local XML-RPC server; method access becomes a remote call.
# Python 2 code.
proxy = xmlrpclib.ServerProxy("http://localhost:8000/")
print "3 is even: %s" % str(proxy.is_even(3))
print "100 is even: %s" % str(proxy.is_even(100))
I have the errors:
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/xmlrpclib.py", line 1224, in __call__
return self.__send(self.__name, args)
File "/usr/lib/python2.7/xmlrpclib.py", line 1578, in __request
verbose=self.__verbose
File "/usr/lib/python2.7/xmlrpclib.py", line 1264, in request
return self.single_request(host, handler, request_body, verbose)
File "/usr/lib/python2.7/xmlrpclib.py", line 1297, in single_request
return self.parse_response(response)
File "/usr/lib/python2.7/xmlrpclib.py", line 1473, in parse_response
return u.close()
File "/usr/lib/python2.7/xmlrpclib.py", line 793, in close
raise Fault(**self._stack[0])
xmlrpclib.Fault: <Fault 1: "<type 'org.xml.sax.SAXException'>:org.xml.sax.SAXException: SAX2 driver class org.apache.xerces.parsers.SAXParser not found\njava.lang.ClassNotFoundException: org.apache.xerces.parsers.SAXParser">
I suppose I need to include the xerces library to the server?
It turns out that I need to include the Xerces library in the CLASSPATH. But what I don't understand is why it's not included by default when I install Jython.

Why is Scrapy spider not loading?

I am a little new to the Scraping domain and was able to manage the following piece of code for my spider:
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'thesentientspider.settings')
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from urlparse import urljoin
from thesentientspider.items import RestaurantDetails, UserReview
import urllib
from scrapy.conf import settings
import pymongo
from pymongo import MongoClient
#MONGODB Settings
# Connection target read from the Scrapy settings module loaded above.
MongoDBServer=settings['MONGODB_SERVER']
MongoDBPort=settings['MONGODB_PORT']
class ZomatoSpider(BaseSpider):
    # Spider identity used by `scrapy crawl zomatoSpider`.
    name = 'zomatoSpider'
    allowed_domains = ['zomato.com']
    CITY=["hyderabad"]
    # One start URL per configured city.
    start_urls = [
        'http://www.zomato.com/%s/restaurants/' %cityName for cityName in CITY
    ]
    def parse(self, response):
        # NOTE(review): the snippet appears truncated here — parse() builds
        # its selector and base URL but no extraction logic is shown.
        hxs = HtmlXPathSelector(response)
        BASE_URL=get_base_url(response)
However, when I try to launch it through the scrapy crawl zomatoSpider command, it throws the following error:
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.6/scrapy/cmdline.py", line 131, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.6/scrapy/cmdline.py", line 76, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.6/scrapy/cmdline.py", line 138, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.6/scrapy/commands/crawl.py", line 43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/pymodules/python2.6/scrapy/command.py", line 33, in crawler
self._crawler.configure()
File "/usr/lib/pymodules/python2.6/scrapy/crawler.py", line 40, in configure
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.6/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.6/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.6/scrapy/spidermanager.py", line 23, in __init__
self._load_spiders(module)
File "/usr/lib/pymodules/python2.6/scrapy/spidermanager.py", line 26, in _load_spiders
for spcls in iter_spider_classes(module):
File "/usr/lib/pymodules/python2.6/scrapy/utils/spider.py", line 21, in iter_spider_classes
issubclass(obj, BaseSpider) and \
TypeError: issubclass() arg 1 must be a class
Could anyone please point out the root cause and suggest modification for the same via a code snippet?
def __init__(self):
    """Open the MongoDB connection configured in settings and cache the
    restaurant and review collections on the spider instance."""
    host = settings['MONGODB_SERVER']
    port = settings['MONGODB_PORT']
    db_name = settings['MONGODB_DB']
    restaurants_key = settings['RESTAURANTS_COLLECTION']
    reviews_key = settings['REVIEWS_COLLECTION']
    database = MongoClient(host, port)[db_name]
    self.restaurantsCollection = database[restaurants_key]
    self.reviewsCollection = database[reviews_key]
This is the code that i added in to make it work. Hope it helps.