Scrapy KeyError(f"{self.__class__.__name__} does not support field: {key}")

Calling on all Scrapy experts to look into what this newbie missed.
I am getting the following error:
KeyError(f"{self.__class__.__name__} does not support field: {key}")
My code is as follows:
In spiders/myCrawler.py
def parse_item(self, response):
    item = scrapy.Item()
    item['title'] = response.css('.p-card__info-title')
    return item
and in my settings.py
class MyItem(scrapy.Item):
    title = scrapy.Field()
I am unable to figure out what exactly I am doing wrong. Please help. Thank you.

You are using
def parse_item(self, response):
    item = scrapy.Item()
    item['title'] = response.css('.p-card__info-title')
    return item
Here you are instantiating the base scrapy.Item class instead of your own item class. scrapy.Item has no fields defined, which is why assigning 'title' raises the KeyError. Use MyItem instead:
def parse_item(self, response):
    item = MyItem()
    item['title'] = response.css('.p-card__info-title')
    return item
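A minimal sketch of the corrected spider, assuming MyItem is importable from wherever you defined it (items.py is the usual place; adjust the import if you kept it in settings.py). The ::text pseudo-selector and .get() call are additions to actually extract the title text, not part of the original code:
# spiders/myCrawler.py -- sketch; import path and selector details are assumptions
import scrapy
from myproject.items import MyItem  # adjust to wherever MyItem is actually defined


class MyCrawler(scrapy.Spider):
    name = 'my_crawler'

    def parse_item(self, response):
        item = MyItem()
        # .get() pulls the text out of the selector instead of storing the selector object itself
        item['title'] = response.css('.p-card__info-title::text').get()
        return item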

Related

How to write a Django REST APIView POST request with a condition?

I am trying to create a POST request for a game API. The idea of the game is that a user can label a picture. A label entered once is a tagging; a label entered twice for the same resource is a tag.
This is how I am trying to create a Tagging so far:
saved_tagging = Tagging.objects.create(user_id=current_user_id,
                                       gameround=gameround,
                                       resource=random_resource,
                                       tag='tag newwww',
                                       created=datetime.now(),
                                       score=score,
                                       origin=origin
                                       )
tagging_serializer = TaggingSerializer(saved_tagging)
At the moment I am getting the ValueError: Cannot assign "'tag newwww'": "Tagging.tag" must be a "Tag" instance.
Is there any way that I can avoid this?
Here are also my models and the relevant serializer.
models.py
class Tag(models.Model):
    name = models.CharField(max_length=256)
    language = models.CharField(max_length=256)
    objects = models.Manager()

    def create(self, validated_data):
        tag_data = validated_data.pop('tag')
        Tag.objects.create(**tag_data)
        return tag_data

    def __str__(self):
        return self.name or ''


class Tagging(models.Model):
    user = models.ForeignKey(CustomUser, on_delete=models.SET_NULL, null=True)
    gameround = models.ForeignKey(Gameround, on_delete=models.CASCADE, related_name='taggings')
    resource = models.ForeignKey(Resource, on_delete=models.CASCADE, related_name='taggings')
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE, related_name='tagging')
    created = models.DateTimeField(editable=False)
    score = models.PositiveIntegerField(default=0)
    origin = models.URLField(max_length=256, blank=True, default='')
    objects = models.Manager()

    def create(self, validated_data):
        tag_data = validated_data.pop('tag')
        tagging = Tagging.objects.create(**validated_data)
        Tag.objects.create(name=tagging, **tag_data)
        return tagging

    def __str__(self):
        return str(self.tag) or ''
serializers.py
class TaggingSerializer(serializers.ModelSerializer):
    tag = StringRelatedField()
    resource = ResourceSerializer(read_only=True)
    gameround = GameroundSerializer(read_only=True)

    class Meta:
        model = Tagging
        fields = ('id', 'tag', 'gameround', 'created', 'score', 'resource', 'origin')

    def create(self, validated_data):
        return Tagging.objects.create(**validated_data)

    def to_representation(self, data):
        data = super().to_representation(data)
        return data
tag must be a Tag instance!
So... you can do this in two ways (in my opinion).
First, you can create a Tag object in your view and then pass that object as the tag value in your Tagging create call (see the sketch at the end of this answer).
Or create a service layer in your app with a custom create method for your model. In my opinion this is the better option, because you centralize your rules in one method instead of in one view.
Ex.:
services/tag_service.py
def create(user_id, gameround, resource, tag, ...origin):
    if not isinstance(tag, Tag):
        # create your Tag instance based on the string passed
        your_new_tag_object = Tag.objects.create(name=tag, ...)
    else:
        your_new_tag_object = tag
    # add your other creation rules here
    # return your model's .create() result
    return Tagging.objects.create(user_id=user_id, ...tag=your_new_tag_object, ...)
And then use this new create method inside your POST serializer.
from services import tag_service


class TaggingSerializer(serializers.ModelSerializer):
    # your normal serializer here

    def create(self, validated_data):
        return tag_service.create(**validated_data)
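For the first option, a minimal sketch of what the view code from the question could look like; the use of get_or_create and the 'language' default are assumptions, not something from the original post:
# view code -- sketch of option 1; get_or_create and the language default are assumptions
from datetime import datetime

# resolve the string to a Tag instance first, then hand the instance to the ForeignKey
tag_obj, _ = Tag.objects.get_or_create(name='tag newwww', defaults={'language': 'en'})

saved_tagging = Tagging.objects.create(user_id=current_user_id,
                                       gameround=gameround,
                                       resource=random_resource,
                                       tag=tag_obj,           # a Tag instance, not a plain string
                                       created=datetime.now(),
                                       score=score,
                                       origin=origin)
tagging_serializer = TaggingSerializer(saved_tagging)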

How to do Django Rest Framework double nested serializer

I'm trying to build an API that has an endpoint which receives in the POST a JSON like the one below:
{
    "title": "Quiz 1",
    "questions": [
        {
            "description": "Question 1?",
            "answers": [
                {
                    "description": "Answer 1",
                    "true_or_false": true
                },
                {
                    "description": "Answer 2",
                    "true_or_false": false
                }
            ]
        },
        {
            "description": "Question 2?",
            "answers": [
                {
                    "description": "Answer 1",
                    "true_or_false": true
                },
                {
                    "description": "Answer 2",
                    "true_or_false": false
                }
            ]
        }
    ]
}
But I don't know how to build the serializers for double nested fields, like a list of answers within a question and a list of questions within a quiz. The code I made so far is below:
models.py
from django.db import models


class Quiz(models.Model):
    title = models.CharField(max_length=200, blank=False)

    def __str__(self):
        return self.title


class Question(models.Model):
    description = models.CharField(max_length=10000, blank=False)
    quiz = models.ForeignKey(Quiz, related_name='questions', on_delete=models.CASCADE, default=None)

    def __str__(self):
        return self.description


class Answer(models.Model):
    question = models.ForeignKey(Question, related_name='answers', on_delete=models.CASCADE)
    description = models.CharField(max_length=1000, blank=False)
    true_or_false = models.BooleanField(default=False, blank=False)

    def __str__(self):
        return self.description
views.py
class QuestionViewSet(viewsets.ModelViewSet):
    queryset = Question.objects.all()
    serializer_class = QuestionSerializer


class AnswerViewSet(viewsets.ModelViewSet):
    queryset = Answer.objects.all()
    serializer_class = AnswerSerializer


class QuizViewSet(viewsets.ModelViewSet):
    queryset = Quiz.objects.all()
    serializer_class = QuizSerializer


class OnlyQuizViewSet(generics.ListAPIView):
    serializer_class = QuizSerializer

    def get_queryset(self):
        queryset = Quiz.objects.filter(id=self.kwargs['pk'])
        return queryset
serializers.py
class AnswerSerializer(serializers.ModelSerializer):
    class Meta:
        model = Answer
        fields = ['id', 'description', 'true_or_false']


class QuestionSerializer(serializers.ModelSerializer):
    answers = AnswerSerializer(many=True)

    class Meta:
        model = Question
        fields = ['id', 'description', 'answers']
        read_only_fields = ('quiz',)

    def create(self, validated_data):
        answers_data = validated_data.pop('answers')
        question = Question.objects.create(**validated_data)
        for answer in answers_data:
            Answer.objects.create(question=question, **answer)
        return question


class QuizSerializer(serializers.ModelSerializer):
    questions = QuestionSerializer(many=True)

    class Meta:
        model = Quiz
        fields = ['id', 'title', 'questions']

    def create(self, validated_data):
        questions_data = validated_data.pop('questions')
        quiz = Quiz.objects.create(**validated_data)
        for questions in questions_data:
            Question.objects.create(quiz=quiz, **questions)
        return quiz
With the above serializers, I'm getting the error below:
Direct assignment to the reverse side of a related set is prohibited. Use answers.set() instead.
So what's the right way to build double-nested serializers? So far I haven't been able to find anything to help me.
I'm going to assume you get that error when you try posting to the Quiz API. You'll have a problem when you want to create a question with
Question.objects.create(quiz=quiz, **questions)
since each question also includes a list of answers that needs to be created first.
Try using the serializer you already have like this
class QuizSerializer(serializers.ModelSerializer):
    questions = QuestionSerializer(many=True)

    class Meta:
        model = Quiz
        fields = ['id', 'title', 'questions']

    def create(self, validated_data):
        questions_data = validated_data.pop('questions')
        quiz = Quiz.objects.create(**validated_data)
        for question in questions_data:
            question_serializer = QuestionSerializer(data=question)  # pass data=..., not a positional instance
            question_serializer.is_valid(raise_exception=True)
            question_serializer.save(quiz=quiz)  # quiz is read-only on QuestionSerializer, so pass it to save()
        return quiz
and update your QuestionSerializer to use AnswerSerializer the same way (or you can create Answers explicitly there), as in the sketch below.
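A minimal sketch of what that could look like in QuestionSerializer; the method body is an assumption built from the code already in the question, with quiz arriving via serializer.save(quiz=quiz) from the QuizSerializer above:
class QuestionSerializer(serializers.ModelSerializer):
    answers = AnswerSerializer(many=True)

    class Meta:
        model = Question
        fields = ['id', 'description', 'answers']
        read_only_fields = ('quiz',)

    def create(self, validated_data):
        # pop the nested answers so they are not passed to Question.objects.create()
        answers_data = validated_data.pop('answers')
        question = Question.objects.create(**validated_data)  # quiz arrives via save(quiz=quiz)
        for answer in answers_data:
            answer_serializer = AnswerSerializer(data=answer)
            answer_serializer.is_valid(raise_exception=True)
            answer_serializer.save(question=question)  # attach each answer to its question
        return question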

Scrapy following links, extracting new ones and following them

I'm trying to create a scraper that scrapes a website for its products. I decided to extract all the category links from the navigation menu, then follow them and extract all the product links, which I later parse in the parse_product function. But I don't actually know what's the best way to do that. I'm struggling with following the parse_menu links and further extracting product links. Please criticize my code.
class DiorSpider(CrawlSpider):
    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    rules = (
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us',)),
             callback='parse_menu'),
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us/products/.*',)),
             callback='parse_product'),
    )

    def parse_menu(self, response):
        menu = response.xpath('//a[@class="navigation-item-link"]').extract()
        for item in menu:
            link = re.compile(r'a class="navigation-item-link" href="([a-zA-Z0-9_/-]*)"').findall(item)
            if link:
                absolute_url = response.urljoin(link[0])
                yield absolute_url

    def parse_product(self, response):
class DiorSpider(Spider):  # CrawlSpider is used mostly when you use LinkExtractors.
    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    # if you're going through the navigation bar, no need to add Rules.
    def parse(self, response):
        links = response.xpath('//a[@class="navigation-item-link"]/@href').extract()  # here you can easily extract links
        for link in links:
            # link = re.compile(r'a class="navigation-item-link" href="([a-zA-Z0-9_/-]*)"').findall(item)
            # links are already extracted by the xpath above.
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, self.parse_product)

    def parse_product(self, response):
        pass  # fill in your product parsing here
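If it helps, a sketch of what parse_product might yield once you fill it in; the CSS selectors and field names below are placeholders I'm assuming, not taken from the actual page markup:
    def parse_product(self, response):
        # placeholder selectors -- adjust them to the real product page markup
        yield {
            'name': response.css('h1::text').get(),
            'price': response.css('.price::text').get(),
            'url': response.url,
        }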

How to get an object with a similar tag using django

I have an object (a blog post) which can have multiple tags in Django. I'm trying to get related objects that share one or more of these tags.
For example: you have a blog post with a few tags, like 'food', 'drinks' and 'restaurants'. When you open this blog post, some 'related' blog posts are displayed (meaning they share one or more tags). An example of such a related blog post would have the tags 'soda', 'lemonade' and 'drinks'.
Here is my view:
instance = get_object_or_404(Blog, id=id)
tags = instance.tags.values()
related = []
for x in tags:  # to put all the tags in an array
    related.append(x['name'])
for a in Blog.objects.raw('SELECT * FROM "blog_table" WHERE related in "blog_table"."tags"'):
    print(a.name)  # this should display the name of all the related blogposts (probably including itself)
Here are my models:
class Tag(models.Model):
    name = models.CharField(max_length=500)
    number = models.IntegerField(null=True, blank=True)

    def __str__(self):
        return str(self.number) + ' ' + self.name


class Blog(models.Model):
    name = models.CharField(null=False, max_length=500, verbose_name='title of blogpost', unique=True)
    body = models.TextField(null=False, verbose_name='body of the blogpost')
    tags = models.ManyToManyField(Tag, blank=True, null=True)

    def __str__(self):
        return self.name
To get the blogs that share a tag with the instance, you can do this (note the field is called tags on your model):
tags = instance.tags.all()
for tag in tags:
    print(Blog.objects.filter(tags=tag))
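If you'd rather avoid one query per tag, a minimal sketch of a single-query variant; excluding the current post and de-duplicating with distinct() are assumptions about the behaviour you want:
tags = instance.tags.all()
related_posts = (Blog.objects.filter(tags__in=tags)
                 .exclude(id=instance.id)   # drop the post itself
                 .distinct())               # avoid duplicates when several tags match
for post in related_posts:
    print(post.name)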

Change city when parsing website with Scrapy

As I understand it, Scrapy works asynchronously and requests are unordered. Right now I can parse a list of items on some website's page and go to the details to parse additional information. The problem is that after doing all of that work, I need to parse the same data for another city. The city is changed by making a request like http://www.example.com/city/set/1.
My spider looks like this :
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]

    def start_requests(self):
        for category in CATEGORIES:
            if 'subcategories' in category:
                subcategories = category['subcategories']
                for subcategory in subcategories:
                    url = subcategory['url']
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse,
                        meta={
                            'category': category,
                            'subcategory': subcategory
                        }
                    )

    def parse(self, response):
        pass
What is the best approach to do this?
You can simply chain requests:
def parse(self, response):
    item = dict()
    # fill up item with data
    city_url = ''  # make city url
    yield Request(city_url,
                  meta={'item': item},  # carry item to next callback in meta
                  callback=self.parse_city)

def parse_city(self, response):
    # get item from meta
    item = response.meta['item']
    # add more stuff to your item
    item['some_city_data'] = ''
    yield item
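If you are on Scrapy 1.7 or newer, cb_kwargs is the recommended way to carry data between callbacks instead of meta; a minimal sketch of the same chaining with cb_kwargs (the city URL is still a placeholder, and dont_filter=True is an assumption in case the city-set URL repeats across items):
def parse(self, response):
    item = dict()
    # fill up item with data
    city_url = ''  # make the city url, e.g. http://www.example.com/city/set/1
    yield scrapy.Request(city_url,
                         cb_kwargs={'item': item},  # passed as a keyword argument to the callback
                         callback=self.parse_city,
                         dont_filter=True)          # skip the duplicate filter if the set URL repeats

def parse_city(self, response, item):
    # item arrives as a regular argument thanks to cb_kwargs
    item['some_city_data'] = ''
    yield item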