Scrapy memory consumption keeps increasing - scrapy

I am using Scrapy along with scrapyrt to create APIs. My application has around 20 spiders. We are using an NFS server for load balancing in production. Unfortunately, the application is using 40% or more of the server's memory.
"stats": {
"downloader/request_bytes": 12033,
"downloader/request_count": 5,
"downloader/request_method_count/GET": 4,
"downloader/request_method_count/POST": 1,
"downloader/response_bytes": 20165,
"downloader/response_count": 5,
"downloader/response_status_count/200": 3,
"downloader/response_status_count/302": 1,
"downloader/response_status_count/404": 1,
"finish_reason": "finished",
"finish_time": "2019-05-23 06:05:04",
"item_scraped_count": 1,
"log_count/DEBUG": 35,
"log_count/INFO": 20,
"memusage/max": 3399057408,
"memusage/startup": 3399057408,
"request_depth_max": 2,
"response_received_count": 4,
"scheduler/dequeued": 4,
"scheduler/dequeued/memory": 4,
"scheduler/enqueued": 4,
"scheduler/enqueued/memory": 4,
"start_time": "2019-05-23 06:05:01"
}
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
14500 root 20 0 4999116 3.190g 7184 S 0.3 40.9 103:34.01 scrapyrt
I followed the Scrapy memory leak documentation and removed the meta attribute from the request, but memory is still increasing.
class GarudaRetrieveBookingSpider(Spider):
    """get the pax, flight and fare details"""
    name = "garuda_retrieve_booking"
    meta = dict()
    formdata = dict()
    start_url = ''
    booking_code = ''
    output_dict = {'schedule_detail': [], 'pax_details': [], 'reservation_name': '', 'fare_details': {}}
    pax_count = 0
    adult_count = 0
    child_count = 0
    infant_count = 0
    ticket_list_adt_child = []
    ticket_list_inf = []
    # this variable is created to save rt command response data to pass in next call if there are no tickets
    rt_response = ''

    def start_requests(self):
        """
        :return: Request object
        """
        post_data = self.data
        garuda_session_id = post_data['parameter']['jSessionId']
        post_data["command"] = "IG"
        file_path = os.path.join(GARUDA_SESSION_FILES_PATH, TODAY_DATE, garuda_session_id + "_session.txt")
        session_data = get_cookies(self, file_path)
        self.start_url = GARUDA_KEEP_ALIVE_URL.format(session_id=session_data["jSessionId"], site=SITE, lang=LANGUAGE)
        self.meta = {"session_data": session_data, "post_data": post_data}
        return [Request(self.start_url, self.parse, errback=self.errback_httpbin)]

    def parse(self, response):
        """
        :param response:
        :return: FormRequest
        description: submit IG command
        """
        self.log("\n\nparse response: {}\n\n".format(response.text))
        if response.status != 200:
            error_message = 'parse method failed.'
            # yield the error item and stop; a value returned from a generator callback would be discarded
            yield {"status": False, "error_message": error_message}
            return
        session_data = self.meta["session_data"]
        command = self.meta["post_data"]["command"]
        # override the command with current command
        session_data["tasks"][0]["command"]["command"] = command
        self.formdata = {
            "data": json.dumps(session_data)
        }
        yield scrapy.FormRequest(CRYTO_COMMAND_URL, formdata=self.formdata,
                                 callback=self.ig_command_response, errback=self.errback_httpbin)
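A hedged observation: in the stats above, memusage/startup equals memusage/max (about 3.39 GB), so the long-running scrapyrt process is already holding that memory before the crawl begins, and the spider keeps mutable containers (meta, formdata, output_dict, the ticket lists) as class attributes, which are shared by every spider instance created in that process. A minimal diagnostic sketch, using Scrapy's documented trackref utilities (the same ones behind prefs() in the telnet console), to see which objects pile up between scrapyrt calls:

# Minimal leak-diagnosis sketch based on Scrapy's "Debugging memory leaks" docs;
# call it from anywhere inside the running scrapyrt process (e.g. a spider_closed handler).
from scrapy.utils.trackref import print_live_refs, get_oldest, iter_all

def dump_live_objects():
    print_live_refs()                     # counts of live Request/Response/Item/Spider objects
    oldest = get_oldest('HtmlResponse')   # the response that has stayed alive the longest, if any
    if oldest is not None:
        print(oldest.url)
    for request in iter_all('Request'):   # every Request still referenced somewhere
        print(request.url)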

Related

Why am I getting a 'KeyError' in my file?

Receiving a key error:
KeyError at /
'entries'
Request Method: GET
Request URL: http://localhost:8000/
Django Version: 3.2.14
Exception Type: KeyError
Exception Value:
'entries'
Exception Location: /home/project/agfzb-CloudAppDevelopment_Capstone/server/djangoapp/restapis.py, line 50, in get_all_dealerships_from_cf
Python Executable: /usr/bin/python3
Python Version: 3.6.9
Python Path:
['/home/project/agfzb-CloudAppDevelopment_Capstone/server',
'/usr/lib/python36.zip',
'/usr/lib/python3.6',
'/usr/lib/python3.6/lib-dynload',
'/home/theia/.local/lib/python3.6/site-packages',
'/usr/local/lib/python3.6/dist-packages',
'/usr/lib/python3/dist-packages']
Error message from Theia Labs:
File "/home/project/agfzb-CloudAppDevelopment_Capstone/server/djangoapp/views.py", line 42, in get_all_dealerships
dealer = get_all_dealerships_from_cf(dealer_url, id=id)
File "/home/project/agfzb-CloudAppDevelopment_Capstone/server/djangoapp/restapis.py", line 50, in get_all_dealerships_from_cf
dealers = json_result["entries"]
KeyError: 'entries'
From my views.py file:
def get_all_dealerships(request):
    if request.method == "GET":
        context = {}
        dealer_url = "https://2d6871f8.us-south.apigw.appdomain.cloud/api/dealership/dealership"
        apikey = "xxxxxxxxxxxxxxx"
        dealer = get_all_dealerships_from_cf(dealer_url, id=id)
        context["dealer"] = dealer
        review_url = "https://2d6871f8.us-south.apigw.appdomain.cloud/api/get_review/review"
        reviews = get_allReviews_byID_from_cf(review_url, id=id)
        print(reviews)
        context["reviews"] = reviews
        return render(request, 'djangoapp/dealer_details.html', context)
From my restapis.py file:
def get_all_dealerships_from_cf(url, **kwargs):
    results = []
    # Call get_request with a URL parameter
    json_result = get_request(url)
    if json_result:
        # Get the row list in JSON as dealers
        dealers = json_result["entries"]
        # For each dealer object
        for dealer in dealers:
            # Get its content in `doc` object
            dealer_doc = dealer["doc"]
            # Create a CarDealer object with values in `doc` object
            dealer_obj = CarDealer(address=dealer_doc["address"], city=dealer_doc["city"], id=dealer_doc["id"], lat=dealer_doc["lat"], long=dealer_doc["long"],
                                   short_name=dealer_doc["short_name"],
                                   st=dealer_doc["st"], zip=dealer_doc["zip"])
            results.append(dealer_obj)
    return results
Can't figure it out...
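A hedged debugging sketch (not the course's reference solution): the KeyError means the JSON returned by get_request has no 'entries' key, which usually happens when the Cloud Functions endpoint answers with an error payload instead of the dealer list. Printing the payload and using .get() with a default makes the failure visible instead of crashing:

def get_all_dealerships_from_cf(url, **kwargs):
    results = []
    json_result = get_request(url)
    if json_result:
        # Inspect what actually came back before assuming its shape.
        if "entries" not in json_result:
            print("Unexpected payload from {}: {}".format(url, json_result))
        # .get() avoids the KeyError; an empty list simply yields no dealers.
        for dealer in json_result.get("entries", []):
            dealer_doc = dealer["doc"]
            results.append(CarDealer(address=dealer_doc["address"], city=dealer_doc["city"],
                                     id=dealer_doc["id"], lat=dealer_doc["lat"], long=dealer_doc["long"],
                                     short_name=dealer_doc["short_name"],
                                     st=dealer_doc["st"], zip=dealer_doc["zip"]))
    return results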

Pybullet on Colab: cannot connect to X server

I am using rl-baselines3-zoo to run DDPG with my custom env on Colab. After I used the show-video function in that zoo repo, it said it cannot connect to the X server. It works fine with other built-in envs, so I guess the problem is in my env. Please, I need some help.
I set everything up following the zoo's tutorials.
Traceback:
pybullet build time: Jul 12 2021 20:46:20
/usr/local/lib/python3.7/dist-packages/gym/logger.py:30: UserWarning:
WARN: Box bound precision lowered by casting to float32
startThreads creating 1 threads.
starting thread 0
started thread 0
argc=2
argv[0] = --unused
argv[1] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=VMware, Inc.
GL_RENDERER=llvmpipe (LLVM 10.0.0, 256 bits)
GL_VERSION=3.3 (Core Profile) Mesa 20.0.8
GL_SHADING_LANGUAGE_VERSION=3.30
pthread_getconcurrency()=0
Version = 3.3 (Core Profile) Mesa 20.0.8
Vendor = VMware, Inc.
Renderer = llvmpipe (LLVM 10.0.0, 256 bits)
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0
MotionThreadFunc thread started
ven = VMware, Inc.
ven = VMware, Inc.
Wrapping the env in a VecTransposeImage.
tcmalloc: large alloc 3276800000 bytes == 0x556b03bda000 # 0x7f7cad04a001 0x7f7caa3f554f 0x7f7caa445b58 0x7f7caa449b17 0x7f7caa4e8203 0x556a81194d54 0x556a81194a50 0x556a81209105 0x556a812037ad 0x556a81196c9f 0x556a811d7d79 0x556a811d4cc4 0x556a81196ea1 0x556a81205bb5 0x556a8119630a 0x556a812087f0 0x556a812037ad 0x556a811963ea 0x556a8120460e 0x556a812034ae 0x556a811963ea 0x556a8120532a 0x556a812034ae 0x556a812031b3 0x556a81201660 0x556a81194b59 0x556a81194a50 0x556a81208453 0x556a812034ae 0x556a811963ea 0x556a812043b5
tcmalloc: large alloc 3276800000 bytes == 0x556bc78da000 # 0x7f7cad04a001 0x7f7caa3f554f 0x7f7caa445b58 0x7f7caa449b17 0x7f7caa4e8203 0x556a81194d54 0x556a81194a50 0x556a81209105 0x556a812037ad 0x556a81196c9f 0x556a811d7d79 0x556a811d4cc4 0x556a81196ea1 0x556a81205bb5 0x556a8119630a 0x556a812087f0 0x556a812037ad 0x556a811963ea 0x556a8120460e 0x556a812034ae 0x556a811963ea 0x556a8120532a 0x556a812034ae 0x556a812031b3 0x556a81201660 0x556a81194b59 0x556a81194a50 0x556a81208453 0x556a812034ae 0x556a811963ea 0x556a812043b5
/content/gdrive/My Drive/hsr/rl-baselines3-zoo/logs/ddpg/FoodHuntingHSR-v0_3/videos/final-model-ddpg-FoodHuntingHSR-v0-step-0-to-step-200.mp4
/usr/local/lib/python3.7/dist-packages/gym/logger.py:30: UserWarning:
WARN: Tried to pass invalid video frame, marking as broken: Your frame has data type float32, but we require uint8 (i.e. RGB values from 0-255).
Saving video to /content/gdrive/My Drive/hsr/rl-baselines3-zoo/logs/ddpg/FoodHuntingHSR-v0_3/videos/final-model-ddpg-FoodHuntingHSR-v0-step-0-to-step-200.mp4
numActiveThreads = 0
stopping threads
destroy semaphore
semaphore destroyed
Thread with taskId 0 exiting
Thread TERMINATED
destroy main semaphore
main semaphore destroyed
finished
numActiveThreads = 0
btShutDownExampleBrowser stopping threads
Thread with taskId 0 exiting
Thread TERMINATED
destroy semaphore
semaphore destroyed
destroy main semaphore
main semaphore destroyed
Exception ignored in: <function VecVideoRecorder.__del__ at 0x7f7c2b5cc200>
Traceback (most recent call last):
File "/content/gdrive/My Drive/hsr/stable-baselines3/stable_baselines3/common/vec_env/vec_video_recorder.py", line 114, in __del__
File "/content/gdrive/My Drive/hsr/stable-baselines3/stable_baselines3/common/vec_env/vec_video_recorder.py", line 110, in close
File "/content/gdrive/My Drive/hsr/stable-baselines3/stable_baselines3/common/vec_env/base_vec_env.py", line 278, in close
File "/content/gdrive/My Drive/hsr/stable-baselines3/stable_baselines3/common/vec_env/dummy_vec_env.py", line 67, in close
File "/content/gdrive/My Drive/hsr/stable-baselines3/stable_baselines3/common/monitor.py", line 113, in close
File "/usr/local/lib/python3.7/dist-packages/gym/core.py", line 243, in close
File "/usr/local/lib/python3.7/dist-packages/gym/core.py", line 243, in close
File "/content/gdrive/My Drive/hsr/PyLIS/gym-foodhunting/gym_foodhunting/foodhunting/gym_foodhunting.py", line 538, in close
pybullet.error: Not connected to physics server
class FoodHuntingEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}
    GRAVITY = -10.0
    BULLET_STEPS = 120  # p.setTimeStep(1.0 / 240.0), so 1 gym step == 0.5 sec.

    def __init__(self, render=False, robot_model=R2D2, max_steps=500, num_foods=3, num_fakes=0, object_size=1.0, object_radius_scale=1.0, object_radius_offset=1.0, object_angle_scale=1.0):
        """Initialize environment.
        """
        ### gym variables
        self.observation_space = robot_model.getObservationSpace()  # classmethod
        self.action_space = robot_model.getActionSpace()  # classmethod
        self.reward_range = (-1.0, 1.0)
        self.seed()
        ### pybullet settings
        self.ifrender = render
        self.physicsClient = p.connect(p.GUI if self.ifrender else p.DIRECT)
        p.setAdditionalSearchPath(pybullet_data.getDataPath())
        ### env variables
        self.robot_model = robot_model
        self.max_steps = max_steps
        self.num_foods = num_foods
        self.num_fakes = num_fakes
        self.object_size = object_size
        self.object_radius_scale = object_radius_scale
        self.object_radius_offset = object_radius_offset
        self.object_angle_scale = object_angle_scale
        self.plane_id = None
        self.robot = None
        self.object_ids = []
        ### episode variables
        self.steps = 0
        self.episode_rewards = 0.0

    def close(self):
        """Close environment.
        """
        p.disconnect(self.physicsClient)

    def reset(self):
        """Reset environment.
        """
        self.steps = 0
        self.episode_rewards = 0
        p.resetSimulation()
        # p.setTimeStep(1.0 / 240.0)
        p.setGravity(0, 0, self.GRAVITY)
        self.plane_id = p.loadURDF('plane.urdf')
        self.robot = self.robot_model()
        self.object_ids = []
        for i, (pos, orn) in enumerate(self._generateObjectPositions(num=(self.num_foods+self.num_fakes), radius_scale=self.object_radius_scale, radius_offset=self.object_radius_offset, angle_scale=self.object_angle_scale)):
            if i < self.num_foods:
                urdfPath = 'food_sphere.urdf'
            else:
                urdfPath = 'food_cube.urdf'
            object_id = p.loadURDF(urdfPath, pos, orn, globalScaling=self.object_size)
            self.object_ids.append(object_id)
        for i in range(self.BULLET_STEPS):
            p.stepSimulation()
        obs = self._getObservation()
        #print('reset laile')
        #self.robot.printAllJointInfo()
        return obs

    def step(self, action):
        """Apply action to environment, then return observation and reward.
        """
        self.steps += 1
        self.robot.setAction(action)
        reward = -1.0 * float(self.num_foods) / float(self.max_steps)  # so agent needs to eat foods quickly
        for i in range(self.BULLET_STEPS):
            p.stepSimulation()
            reward += self._getReward()
        self.episode_rewards += reward
        obs = self._getObservation()
        done = self._isDone()
        pos, orn = self.robot.getPositionAndOrientation()
        info = {'steps': self.steps, 'pos': pos, 'orn': orn}
        if done:
            #print('Done laile')
            info['episode'] = {'r': self.episode_rewards, 'l': self.steps}
            # print(self.episode_rewards, self.steps)
            #print(self.robot.getBaseRollPosition(), self.robot.getTorsoLiftPosition(), self.robot.getHeadPosition(), self.robot.getArmPosition(), self.robot.getWristPosition(), self.robot.getGripperPosition()) # for HSR debug
            #print(self.robot.getHeadPosition(), self.robot.getGripperPosition()) # for R2D2 debug
        return obs, reward, done, info

    def render(self, mode='human', close=False):
        """This is a dummy function. This environment cannot control rendering timing.
        """
        if mode != 'rgb_array':
            return np.array([])
        return self._getObservation()

    def seed(self, seed=None):
        """Set random seed.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _getReward(self):
        """Detect contact points and return reward.
        """
        reward = 0
        contacted_object_ids = [object_id for object_id in self.object_ids if self.robot.isContact(object_id)]
        for object_id in contacted_object_ids:
            reward += 1 if self._isFood(object_id) else -1
            p.removeBody(object_id)
            self.object_ids.remove(object_id)
        return reward

    def _getObservation(self):
        """Get observation.
        """
        obs = self.robot.getObservation()
        return obs

    def _isFood(self, object_id):
        """Check if object_id is a food.
        """
        baseLink, urdfPath = p.getBodyInfo(object_id)
        return urdfPath == b'food_sphere.urdf'  # otherwise, fake

    def _isDone(self):
        """Check if episode is done.
        """
        #print(self.object_ids, 'self')
        available_object_ids = [object_id for object_id in self.object_ids if self._isFood(object_id)]
        #print(available_object_ids)
        return self.steps >= self.max_steps or len(available_object_ids) <= 0

    def _generateObjectPositions(self, num=1, retry=100, radius_scale=1.0, radius_offset=1.0, angle_scale=1.0, angle_offset=0.5*np.pi, z=1.5, near_distance=0.5):
        """Generate food positions randomly.
        """
        def genPos():
            r = radius_scale * self.np_random.rand() + radius_offset
            a = -np.pi * angle_scale + angle_offset
            b = np.pi * angle_scale + angle_offset
            ang = (b - a) * self.np_random.rand() + a
            return np.array([r * np.sin(ang), r * np.cos(ang), z])

        def isNear(pos, poss):
            for p, o in poss:
                if np.linalg.norm(p - pos) < near_distance:
                    return True
            return False

        def genPosRetry(poss):
            for i in range(retry):
                pos = genPos()
                if not isNear(pos, poss):
                    return pos
            return genPos()

        poss = []
        for i in range(num):
            pos = genPosRetry(poss)
            orn = p.getQuaternionFromEuler([0.0, 0.0, 2.0*np.pi*self.np_random.rand()])
            poss.append((pos, orn))
        return poss
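Two hedged tweaks that line up with the traceback rather than being the repo's actual fix: the final error comes from close() being called a second time by VecVideoRecorder.__del__ after the physics client is already disconnected, and the earlier warning says the recorder received float32 frames where gym expects uint8. A hypothetical subclass sketch, assuming the observation is a float image scaled to [0, 1]:

import numpy as np
import pybullet as p

class SafeCloseFoodHuntingEnv(FoodHuntingEnv):  # hypothetical subclass, not part of the repo
    def close(self):
        # disconnect only if this client is still connected, so a repeated close() is a no-op
        if self.physicsClient is not None and p.isConnected(self.physicsClient):
            p.disconnect(self.physicsClient)
        self.physicsClient = None

    def render(self, mode='human', close=False):
        if mode != 'rgb_array':
            return np.array([])
        frame = self._getObservation()
        # gym's video recorder wants uint8 RGB values in 0-255; assumption: the
        # observation is a float image in [0, 1]
        return (np.clip(frame, 0.0, 1.0) * 255).astype(np.uint8)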

Getting flow info from the switch and copying it to a CSV file (Ryu controller)

I hope you can help me. I want to get flow info from the switch by sending a request every 10 seconds; the switch replies with the info, but I get the following error when the controller receives the reply in the flow stats reply handler. The error occurs on the flow match field 'eth_type'.
CollectTrainingStatsApp: Exception occurred during handler processing. Backtrace from offending handler [_flow_stats_reply_handler] servicing event [EventOFPFlowStatsReply] follows.
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/ryu/base/app_manager.py", line 290, in _event_loop
handler(ev)
File "/home/guenfaf/Documents/Training ryu/data_to_csv/data_to_csv.py", line 59, in _flow_stats_reply_handler
for stat in sorted([flow for flow in body if (flow.priority == 1) ], key=lambda flow:
File "/home/guenfaf/Documents/Training ryu/data_to_csv/data_to_csv.py", line 60, in <lambda>
(flow.match['eth_type'],flow.match['ipv4_src'],flow.match['ipv4_dst'],flow.match['ip_proto'])):
File "/usr/local/lib/python2.7/dist-packages/ryu/ofproto/ofproto_v1_3_parser.py", line 904, in __getitem__
return dict(self._fields2)[key]
KeyError: 'eth_type'
Here is my code:
from ryu.app import simple_switch_13
from ryu.controller import ofp_event
from ryu.controller.handler import MAIN_DISPATCHER, DEAD_DISPATCHER
from ryu.controller.handler import set_ev_cls
from ryu.lib import hub
from time import time
# class CollectTrainingStatsApp(simple_switch_13.SimpleSwitch13):
class CollectTrainingStatsApp(simple_switch_13.SimpleSwitch13):
    def __init__(self, *args, **kwargs):
        super(CollectTrainingStatsApp, self).__init__(*args, **kwargs)
        self.datapaths = {}
        self.monitor_thread = hub.spawn(self.monitor)
        file0 = open("FlowStatsfile.csv", "w")
        file0.write('datapath_id,flow_id,ip_src,tp_src,ip_dst,tp_dst,ip_proto,flow_duration_sec,flow_duration_nsec,idle_timeout,hard_timeout,flags,packet_count,byte_count,packet_count_per_second,packet_count_per_nsecond,byte_count_per_second,byte_count_per_nsecond,label\n')
        file0.close()

    # Asynchronous message
    @set_ev_cls(ofp_event.EventOFPStateChange, [MAIN_DISPATCHER, DEAD_DISPATCHER])
    def state_change_handler(self, ev):
        datapath = ev.datapath
        if ev.state == MAIN_DISPATCHER:
            if datapath.id not in self.datapaths:
                self.logger.debug('register datapath: %016x', datapath.id)
                self.datapaths[datapath.id] = datapath
        elif ev.state == DEAD_DISPATCHER:
            if datapath.id in self.datapaths:
                self.logger.debug('unregister datapath: %016x', datapath.id)
                del self.datapaths[datapath.id]

    def monitor(self):
        while True:
            for dp in self.datapaths.values():
                self.request_stats(dp)
            hub.sleep(10)

    def request_stats(self, datapath):
        self.logger.debug('send stats request: %016x', datapath.id)
        parser = datapath.ofproto_parser
        req = parser.OFPFlowStatsRequest(datapath)
        datapath.send_msg(req)

    @set_ev_cls(ofp_event.EventOFPFlowStatsReply, MAIN_DISPATCHER)
    def _flow_stats_reply_handler(self, ev):
        #timestamp = time.time()
        tp_src = 0
        tp_dst = 0
        file0 = open("FlowStatsfile.csv", "a+")
        body = ev.msg.body
        for stat in sorted([flow for flow in body if (flow.priority == 1)], key=lambda flow:
                (flow.match['eth_type'], flow.match['ipv4_src'], flow.match['ipv4_dst'], flow.match['ip_proto'])):
            ip_src = stat.match['ipv4_src']
            ip_dst = stat.match['ipv4_dst']
            ip_proto = stat.match['ip_proto']
            if stat.match['ip_proto'] == 1:
                icmp_code = stat.match['icmpv4_code']
                icmp_type = stat.match['icmpv4_type']
            elif stat.match['ip_proto'] == 6:
                tp_src = stat.match['tcp_src']
                tp_dst = stat.match['tcp_dst']
            elif stat.match['ip_proto'] == 17:
                tp_src = stat.match['udp_src']
                tp_dst = stat.match['udp_dst']
            flow_id = str(ip_src) + str(tp_src) + str(ip_dst) + str(tp_dst) + str(ip_proto)
            try:
                packet_count_per_second = stat.packet_count / stat.duration_sec
                packet_count_per_nsecond = stat.packet_count / stat.duration_nsec
            except:
                packet_count_per_second = 0
                packet_count_per_nsecond = 0
            try:
                byte_count_per_second = stat.byte_count / stat.duration_sec
                byte_count_per_nsecond = stat.byte_count / stat.duration_nsec
            except:
                byte_count_per_second = 0
                byte_count_per_nsecond = 0
            file0.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n"
                        .format(ev.msg.datapath.id, flow_id, ip_src, tp_src, ip_dst, tp_dst,
                                stat.match['ip_proto'],
                                stat.duration_sec, stat.duration_nsec,
                                stat.idle_timeout, stat.hard_timeout,
                                stat.flags, stat.packet_count, stat.byte_count,
                                packet_count_per_second, packet_count_per_nsecond,
                                byte_count_per_second, byte_count_per_nsecond, 0))
        file0.close()
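A hedged workaround sketch: the KeyError happens because not every priority-1 flow carries the 'eth_type'/'ipv4_src'/'ipv4_dst'/'ip_proto' OXM fields in its match (the MAC-learning flows installed by simple_switch_13, for example, typically match only on in_port and Ethernet addresses), so indexing flow.match raises. Skipping flows that lack the needed fields before sorting avoids the crash; the helper below is hypothetical, not part of Ryu:

def iter_sorted_ip_flows(body):
    """Hypothetical helper: return priority-1 flows that carry all the fields the CSV needs."""
    fields = ('eth_type', 'ipv4_src', 'ipv4_dst', 'ip_proto')

    def has_fields(flow):
        try:
            for f in fields:
                flow.match[f]          # OFPMatch.__getitem__ raises KeyError if the field is absent
            return True
        except KeyError:
            return False

    flows = [flow for flow in body if flow.priority == 1 and has_fields(flow)]
    return sorted(flows, key=lambda flow: (flow.match['eth_type'], flow.match['ipv4_src'],
                                           flow.match['ipv4_dst'], flow.match['ip_proto']))

Inside _flow_stats_reply_handler the loop would then read "for stat in iter_sorted_ip_flows(ev.msg.body):" with the rest of the body unchanged.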

Why is scrapy suddenly giving me an *unpredictable* AttributeError, stating no attribute 'css'

For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime
landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb
#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
    name = 'LandingPage004Spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
        #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        #stats = spider.crawler.stats.get_stats()
        stats = spider.crawler.stats.get_value('item_scraped_count'),
        Session = sessionmaker(bind=engine)
        session = Session()
        logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
                                        is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
                                        is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
                                        is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
                                        is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
                                        is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
                                        scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
                                        scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
                                        )
        session.add(logitem)
        session.commit()
        session.close()

    mapping = landingpages_today
    handle_httpstatus_list = [200, 302, 404, 400, 500]
    start_urls = []

    def start_requests(self):
        for url, client_id in self.mapping:
            yield Request(url, callback=self.parse, meta={'client_id': client_id})

    def parse(self, response):
        ##DEBUG - return all scraped data
        #wholepage = response.body.lower()
        url = response.url
        if 'redirect_urls' in response.request.meta:
            redirecturl = response.request.meta['redirect_urls'][0]
            if 'utm.pag.ca' in redirecturl:
                url_shortener = response.request.meta['redirect_urls'][0]
            else:
                url_shortener = 'None'
        else:
            url_shortener = 'None'
        client_id = response.meta['client_id']
        url_title = response.css("title::text").extract_first()
        # pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
        pagesize = len(response.body)
        HTTP_code = response.status

        ####ERROR CHECK: Small page size
        if 'instapage' in response.body.lower():
            if pagesize <= 20000:
                err_small = 1
            else:
                err_small = 0
        else:
            if pagesize <= 35000:
                err_small = 1
            else:
                err_small = 0

        ####ERROR CHECK: Page contains the phrase 'not found'
        if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
            # their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
            if 'dealerinspire' in response.body.lower():
                err_has_not_found = 0
            else:
                err_has_not_found = 1
        else:
            err_has_not_found = 0

        ####ERROR CHECK: Page contains the phrase 'can't be found'
        if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
            err_has_cantbefound = 1
        else:
            err_has_cantbefound = 0

        ####ERROR CHECK: Page contains the phrase 'unable to locate'
        if 'unable to locate' in response.body.lower():
            err_has_unabletolocate = 1
        else:
            err_has_unabletolocate = 0

        ####ERROR CHECK: Page contains phrase 'no longer available'
        if 'no longer available' in response.body.lower():
            err_has_nolongeravailable = 1
        else:
            err_has_nolongeravailable = 0

        ####ERROR CHECK: Page contains phrase 'no service specials'
        if 'no service specials' in response.body.lower():
            err_has_noservicespecials = 1
        else:
            err_has_noservicespecials = 0

        ####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
        if 'sorry, no ' in response.body.lower():
            err_has_sorryno = 1
        else:
            err_has_sorryno = 0

        yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}

#E-mail settings
def sendmail(recipients, subject, body):
    fromaddr = "#######"
    toaddr = recipients
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['Subject'] = subject
    body = body
    msg.attach(MIMEText(body, 'html'))
    server = smtplib.SMTP('########')
    server.starttls()
    server.login(fromaddr, "##########")
    text = msg.as_string()
    server.sendmail(fromaddr, recipients, text)
    server.quit()
Expected result is a perfect scrape, with no errors.
Actual results are unpredictable AttributeErrors, claiming that the attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse the HTML because of markup errors; that's why you can't call response.css(). You can catch these cases in your code and analyze the broken HTML:
def parse(self, response):
    try:
        ....
        your code
        .....
    except:
        with open("Error.htm", "w") as f:
            f.write(response.body)
UPDATE: You can try to check for an empty response:
def parse(self, response):
    if not response.body:
        yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
    # your original code
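A further hedged check, since only text responses expose .css(): if a page occasionally comes back with a non-HTML Content-Type (or an empty body), Scrapy hands the callback a plain Response, which has no .css() or .xpath(). Testing the response type before parsing makes those cases explicit instead of crashing; this sketch is not the answer above, just an alternative guard:

import scrapy
from scrapy.http import TextResponse

def parse(self, response):
    # Plain Response objects (non-text Content-Type, some error pages) have no .css()
    if not isinstance(response, TextResponse):
        self.logger.warning("Non-text response from %s (status %s), retrying", response.url, response.status)
        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True,
                             meta={'client_id': response.meta['client_id']})
        return
    # ... original parsing code ...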

Django: object needs to have a value for field "..." before this many-to-many relationship can be used

I experience a strange error with Django 1.5:
I have defined a model like below:
class Company(models.Model):
    user = models.OneToOneField(User)
    agreed_to_terms = models.NullBooleanField(default=False)
    address = models.CharField(_('Complete Address'),
                               max_length=255, null=True, blank=True)
    winning_bid = models.ForeignKey('Bid',
                                    related_name='winning_bid',
                                    blank=True, null=True)
    bid_list = models.ManyToManyField('Bid',
                                      related_name='bids',
                                      blank=True, null=True)
    ...

class Bid(models.Model):
    user = models.ForeignKey(User, null=True, blank=True)
    description = models.TextField(_('Description'),
                                   blank=True, null=True,)
    volume = models.DecimalField(max_digits=7, decimal_places=3,
                                 null=True, blank=True,)
    ...
    # all other attributes are of the Boolean, CharField or DecimalField type. No ForeignKeys, nor ManyToManyFields.
When I try to submit the form with the initial data through the Django admin, I get the following error:
Exception Value:
"" needs to have a value for field "company" before this many-to-many relationship can be used.
Please see the traceback below.
The error message does not make very much sense to me. The only m2m relationship is bid_list, which has null=True and was null at the time of saving.
Is there something new in Django 1.5, which I have not discovered while reading the changelog (this is my first project in Django 1.5)?
Interestingly, when I save an object in the Django shell, I do not get an error message, but the object does not get saved either.
In [1]: user = User.objects.get(username='admin')
In [2]: new_company = Company()
In [3]: new_company.user = user
In [4]: new_company.save() Out[4]: <Company: Company object>
In [5]: foo = Company.objects.all()
Out[5]: []
When I try to trace the SQL statements with the debug toolbar, I can only see SQL SELECT statements, no INSERT requests.
What is the explanation for this strange behaviour?
Traceback:
Request Method: POST
Request URL: /admin/company/company/add/
Django Version: 1.5.1
Python Version: 2.7.1
Installed Applications:
('django.contrib.admin',
'django.contrib.admindocs',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.gis',
'django.contrib.humanize',
'django.contrib.sessions',
'django.contrib.sites',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.admin',
'django.contrib.admindocs',
'crispy_forms',
'django_extensions',
'easy_thumbnails',
'registration',
'south',
'company',
'bid',
'debug_toolbar')
Installed Middleware:
('django.middleware.common.CommonMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'debug_toolbar.middleware.DebugToolbarMiddleware')
Traceback:
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/core/handlers/base.py" in get_response
115. response = callback(request, *callback_args, **callback_kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/contrib/admin/options.py" in wrapper
372. return self.admin_site.admin_view(view)(*args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapped_view
91. response = view_func(request, *args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/views/decorators/cache.py" in _wrapped_view_func
89. response = view_func(request, *args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/contrib/admin/sites.py" in inner
202. return view(request, *args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapper
25. return bound_func(*args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapped_view
91. response = view_func(request, *args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/utils/decorators.py" in bound_func
21. return func(self, *args2, **kwargs2)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/db/transaction.py" in inner
223. return func(*args, **kwargs)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/contrib/admin/options.py" in add_view
1008. self.save_related(request, form, formsets, False)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/contrib/admin/options.py" in save_related
762. form.save_m2m()
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/forms/models.py" in save_m2m
84. f.save_form_data(instance, cleaned_data[f.name])
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/db/models/fields/related.py" in save_form_data
1336. setattr(instance, self.attname, data)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/db/models/fields/related.py" in __set__
910. manager = self.__get__(instance)
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/db/models/fields/related.py" in __get__
897. through=self.field.rel.through,
File "/Users/neurix/Development/virtual/lib/python2.7/site-packages/django/db/models/fields/related.py" in __init__
586. (instance, source_field_name))
Exception Type: ValueError at /admin/company/company/add/
Exception Value: "<Company: Company object>" needs to have a value for field "company" before this many-to-many relationship can be used.
settings.py
import os, os.path, sys

DEBUG = True
TEMPLATE_DEBUG = DEBUG

# Setting up folders
abspath = lambda *p: os.path.abspath(os.path.join(*p))
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
TASK2_MODULE_PATH = abspath(PROJECT_ROOT, 'apps/')
sys.path.insert(0, TASK2_MODULE_PATH)

# Loading passwords
try:
    from settings_pwd import *
except ImportError:
    pass

AUTH_PROFILE_MODULE = 'profile.UserProfile'
#ALLOWED_HOSTS = [''] # not needed for DEBUG = True
TIME_ZONE = 'Europe/London'
LANGUAGE_CODE = 'en-uk'
LANGUAGES = [
    ("en", u"English"),
]
SITE_ID = 1
USE_I18N = True
USE_L10N = True
USE_TZ = True

if DEBUG:
    MEDIA_ROOT = os.path.join(PROJECT_ROOT, "site_media", "media")
else:
    MEDIA_ROOT = "folder_to_upload_files"

if DEBUG:
    MEDIA_URL = "/media/"
else:
    MEDIA_URL = "/media/uploads/"

if DEBUG:
    STATIC_ROOT = os.path.join(PROJECT_ROOT, "site_media", "static")
else:
    STATIC_ROOT = "folder_to_static_files"

STATIC_URL = '/static/'
STATICFILES_DIRS = (
    os.path.join(PROJECT_ROOT, "assets"),
)
STATICFILES_FINDERS = (
    'django.contrib.staticfiles.finders.FileSystemFinder',
    'django.contrib.staticfiles.finders.AppDirectoriesFinder',
)
SECRET_KEY = '...'
TEMPLATE_LOADERS = (
    'django.template.loaders.filesystem.Loader',
    'django.template.loaders.app_directories.Loader',
)
MIDDLEWARE_CLASSES = (
    'django.middleware.common.CommonMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
)
ROOT_URLCONF = 'task2.urls'
WSGI_APPLICATION = 'task2.wsgi.application'
TEMPLATE_DIRS = (
    os.path.join(PROJECT_ROOT, "templates"),
    os.path.join(PROJECT_ROOT, "templates/pages"),
)
INSTALLED_APPS = (
    # Django apps
    'django.contrib.admin',
    'django.contrib.admindocs',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.gis',
    'django.contrib.humanize',
    'django.contrib.sessions',
    'django.contrib.sites',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django.contrib.admin',
    'django.contrib.admindocs',
    # third party apps
    'crispy_forms',
    'django_extensions',
    'easy_thumbnails',
    'registration',
    'south',
    # task2 apps
    'profile',
    'company',
)
AUTHENTICATION_BACKENDS = (
    'django.contrib.auth.backends.ModelBackend',
)

log = DEBUG
if log:
    LOGGING = {
        'version': 1,
        'disable_existing_loggers': True,
        'formatters': {
            'simple': {
                'format': '%(levelname)s %(message)s',
            },
        },
        'handlers': {
            'console': {
                'level': 'DEBUG',
                'class': 'logging.StreamHandler',
                'formatter': 'simple'
            },
        },
        'loggers': {
            'django': {
                'handlers': ['console'],
                'level': 'DEBUG',
            },
        }
    }

####################
# THIRD PARTY SETUPS

# For Crispy Forms
CRISPY_FAIL_SILENTLY = not DEBUG
CRISPY_TEMPLATE_PACK = 'bootstrap'

## For Django Registration
ACCOUNT_ACTIVATION_DAYS = 7

# for Django testing to avoid conflicts with South migrations
SOUTH_TESTS_MIGRATE = False

# Debug_toolbar settings
if DEBUG:
    INTERNAL_IPS = ('127.0.0.1',)
    MIDDLEWARE_CLASSES += (
        'debug_toolbar.middleware.DebugToolbarMiddleware',
    )
    INSTALLED_APPS += (
        'debug_toolbar',
    )
    DEBUG_TOOLBAR_PANELS = (
        'debug_toolbar.panels.version.VersionDebugPanel',
        'debug_toolbar.panels.timer.TimerDebugPanel',
        'debug_toolbar.panels.settings_vars.SettingsVarsDebugPanel',
        'debug_toolbar.panels.headers.HeaderDebugPanel',
        #'debug_toolbar.panels.profiling.ProfilingDebugPanel',
        'debug_toolbar.panels.request_vars.RequestVarsDebugPanel',
        'debug_toolbar.panels.sql.SQLDebugPanel',
        'debug_toolbar.panels.template.TemplateDebugPanel',
        'debug_toolbar.panels.cache.CacheDebugPanel',
        'debug_toolbar.panels.signals.SignalDebugPanel',
        'debug_toolbar.panels.logger.LoggingPanel',
    )
    DEBUG_TOOLBAR_CONFIG = {
        'INTERCEPT_REDIRECTS': False,
    }

# Easy_Thumbnail setup
THUMBNAIL_ALIASES = {
    '': {
        'thumbnail': {'size': (50, 50), 'crop': True},
    },
}
The problem has to do with how you use your views.
I think you are using:
instance.add(many_to_many_instance)
before you have an instance id.
So first save your model:
instance.save()
instance.add(many_to_many_instance)
You are using a custom user profile model (AUTH_PROFILE_MODULE = 'profile.UserProfile'), but in the code I suppose you use the native Django user.
I suppose your model should be like:
class Company(models.Model):
    user = models.OneToOneField('profile.UserProfile')
    ...
Read more: https://docs.djangoproject.com/en/1.5/ref/contrib/auth/#django.contrib.auth.models.User.get_profile
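For completeness, a minimal hedged sketch of the order of operations both answers point at, using the models from the question: the instance needs a primary key (i.e. it must already be saved) before its ManyToManyField can be touched.

# hedged sketch, assuming the Company/Bid models above
user = User.objects.get(username='admin')

company = Company(user=user)
company.save()                      # Company gets a primary key here

bid = Bid.objects.create(user=user, description='First offer')
company.bid_list.add(bid)           # only valid after company.save()
company.winning_bid = bid
company.save()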