Calling on all Scrapy experts to look into what this newbie missed.
I am getting the following error
KeyError(f"{self.__class__.__name__} does not support field: {key}")
My code is as follows:
In spiders/myCrawler.py
def parse_item(self, response):
    """Return an item whose ``title`` holds the matched title nodes.

    Kept as posted in the question: the ``item['title']`` assignment is the
    statement that raises the reported ``KeyError``.
    """
    item = scrapy.Item()
    title_nodes = response.css('.p-card__info-title')
    item['title'] = title_nodes
    return item
and in my settings.py
class MyItem(scrapy.Item):
    """Item type that declares the single ``title`` field."""

    title = scrapy.Field()
I am unable to figure out what exactly I am doing wrong. Please help. Thank you.
You are using
def parse_item(self, response):
    """Callback as written in the question, quoted to point out the problem.

    The item is built from the base ``scrapy.Item`` class, and the
    ``item['title']`` assignment is the statement that fails.
    """
    item = scrapy.Item()
    title_nodes = response.css('.p-card__info-title')
    item['title'] = title_nodes
    return item
Here you are instantiating the base scrapy.Item() class instead of your own item class, so the item has no 'title' field. Use your own item class instead:
def parse_item(self, response):
    """Return a ``MyItem`` whose ``title`` holds the matched title nodes."""
    # MyItem is the project's own item class, used here in place of the
    # base scrapy.Item.
    item = MyItem()
    # NOTE(review): this stores whatever ``response.css`` returns rather than
    # extracted text -- confirm whether ``.get()`` / ``.getall()`` is wanted.
    title_nodes = response.css('.p-card__info-title')
    item['title'] = title_nodes
    return item
Related
I am trying to create a post request for a game api. The game implies that a user can label a picture. A label entered once is a tagging, a label entered twice for the same resource is a tag.
This is how I am trying to create a Tagging so far:
# Kept as posted: ``tag`` is given as a plain string here, which is what the
# ValueError quoted below complains about.
tagging_fields = {
    'user_id': current_user_id,
    'gameround': gameround,
    'resource': random_resource,
    'tag': 'tag newwww',
    'created': datetime.now(),
    'score': score,
    'origin': origin,
}
saved_tagging = Tagging.objects.create(**tagging_fields)
tagging_serializer = TaggingSerializer(saved_tagging)
At the moment I am getting the ValueError: Cannot assign "'tag newwww'": "Tagging.tag" must be a "Tag" instance.
Is there any way that I can avoid this?
Here are also my models and the relevant serializer.
models.py
class Tag(models.Model):
    """A label text together with its language."""

    name = models.CharField(max_length=256)
    language = models.CharField(max_length=256)

    objects = models.Manager()

    def create(self, validated_data):
        """Create a ``Tag`` row from the nested ``'tag'`` entry.

        Removes ``'tag'`` from ``validated_data`` (the caller's mapping is
        mutated) and returns that removed mapping, not the created row.
        """
        tag_fields = validated_data.pop('tag')
        Tag.objects.create(**tag_fields)
        return tag_fields

    def __str__(self):
        """Return the tag name, or an empty string when it is falsy."""
        return self.name or ''
class Tagging(models.Model):
    """A tag applied by a user to a resource within a game round."""

    user = models.ForeignKey(CustomUser, on_delete=models.SET_NULL, null=True)
    gameround = models.ForeignKey(
        Gameround, on_delete=models.CASCADE, related_name='taggings'
    )
    resource = models.ForeignKey(
        Resource, on_delete=models.CASCADE, related_name='taggings'
    )
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE, related_name='tagging')
    created = models.DateTimeField(editable=False)
    score = models.PositiveIntegerField(default=0)
    origin = models.URLField(max_length=256, blank=True, default='')

    objects = models.Manager()

    def create(self, validated_data):
        """Create a ``Tagging`` row, then a ``Tag`` row named after it.

        ``'tag'`` is removed from ``validated_data`` before the tagging is
        created; the new tagging is returned.
        """
        tag_fields = validated_data.pop('tag')
        new_tagging = Tagging.objects.create(**validated_data)
        Tag.objects.create(name=new_tagging, **tag_fields)
        return new_tagging

    def __str__(self):
        """Return the string form of the related tag, or an empty string."""
        return str(self.tag) or ''
serializers.py
class TaggingSerializer(serializers.ModelSerializer):
    """Serializer for ``Tagging`` with nested resource and game round."""

    tag = StringRelatedField()
    resource = ResourceSerializer(read_only=True)
    gameround = GameroundSerializer(read_only=True)

    class Meta:
        model = Tagging
        fields = ('id', 'tag', 'gameround', 'created', 'score', 'resource', 'origin')

    def create(self, validated_data):
        """Insert a ``Tagging`` row built from ``validated_data``."""
        return Tagging.objects.create(**validated_data)

    def to_representation(self, data):
        """Delegate unchanged to the parent implementation."""
        return super().to_representation(data)
The tag must be a Tag instance!
So you can do this in two ways (in my opinion).
First, you can create a Tag object in your view and then pass that object as the tag value to your Tagging create method.
Or you can create a service layer in your app with a custom create method for your model. In my opinion this is superior, because you are centralizing your rules in one method instead of in a single view.
Ex.:
services/tag_service.py
def create(user_id, gameround, resource, tag, created, score=0, origin=''):
    """Create a ``Tagging``, accepting ``tag`` as a ``Tag`` or a plain string.

    Args:
        user_id: Primary key of the user who entered the label.
        gameround: Game round the tagging belongs to.
        resource: Resource being labelled.
        tag: Either a ``Tag`` instance or the label text.
        created: Creation timestamp stored on the tagging.
        score: Score stored on the tagging.
        origin: Origin URL stored on the tagging.

    Returns:
        The newly created ``Tagging`` instance.
    """
    if not isinstance(tag, Tag):
        # The foreign key needs a Tag row, so build one from the label text.
        # NOTE(review): this inserts a new Tag for every string; switch to
        # ``get_or_create`` if equal labels are meant to share one row.
        tag = Tag.objects.create(name=tag)
    # Other model-creation rules can be added here before the insert.
    return Tagging.objects.create(
        user_id=user_id,
        gameround=gameround,
        resource=resource,
        tag=tag,
        created=created,
        score=score,
        origin=origin,
    )
Then use this new create method inside your POST serializer:
from services import tag_service
class TaggingSerializer(serializers.ModelSerializer):
    """Serializer whose ``create`` is delegated to the service layer."""

    # your normal serializer here

    def create(self, validated_data):
        """Forward the validated fields to ``tag_service.create``."""
        return tag_service.create(**validated_data)
I'm trying to build an API that has an endpoint which receives JSON like the example below in a POST request:
{
"title":"Quiz 1",
"questions":[
{
"description":"Question 1?",
"answers":[
{
"description":"Answer 1",
"true_or_false":true
},
{
"description":"Answer 2",
"true_or_false":false
}
]
},
{
"description":"Question 2?",
"answers":[
{
"description":"Answer 1",
"true_or_false":true
},
{
"description":"Answer 2",
"true_or_false":false
}
]
}
]
}
But I don't know how to build the serializers for double nested fields, like a list of answers within a question and a list of questions within a quiz. The code I made so far is below:
models.py
from django.db import models
class Quiz(models.Model):
    """A quiz, identified by its title."""

    title = models.CharField(max_length=200, blank=False)

    def __str__(self):
        """Return the quiz title."""
        return self.title
class Question(models.Model):
    """A question belonging to one quiz (reverse accessor ``questions``)."""

    description = models.CharField(max_length=10000, blank=False)
    quiz = models.ForeignKey(
        Quiz,
        related_name='questions',
        on_delete=models.CASCADE,
        default=None,
    )

    def __str__(self):
        """Return the question text."""
        return self.description
class Answer(models.Model):
    """An answer belonging to one question (reverse accessor ``answers``)."""

    question = models.ForeignKey(
        Question,
        related_name='answers',
        on_delete=models.CASCADE,
    )
    description = models.CharField(max_length=1000, blank=False)
    true_or_false = models.BooleanField(default=False, blank=False)

    def __str__(self):
        """Return the answer text."""
        return self.description
views.py
class QuestionViewSet(viewsets.ModelViewSet):
    """Model viewset over all ``Question`` rows."""

    serializer_class = QuestionSerializer
    queryset = Question.objects.all()
class AnswerViewSet(viewsets.ModelViewSet):
    """Model viewset over all ``Answer`` rows."""

    serializer_class = AnswerSerializer
    queryset = Answer.objects.all()
class QuizViewSet(viewsets.ModelViewSet):
    """Model viewset over all ``Quiz`` rows."""

    serializer_class = QuizSerializer
    queryset = Quiz.objects.all()
class OnlyQuizViewSet(generics.ListAPIView):
    """List view restricted to the quiz whose id equals the URL ``pk``."""

    serializer_class = QuizSerializer

    def get_queryset(self):
        """Return the quizzes whose ``id`` matches ``self.kwargs['pk']``."""
        return Quiz.objects.filter(id=self.kwargs['pk'])
serializers.py
class AnswerSerializer(serializers.ModelSerializer):
    """Serializer exposing an answer's id, text and truth flag."""

    class Meta:
        model = Answer
        fields = ['id', 'description', 'true_or_false']
class QuestionSerializer(serializers.ModelSerializer):
    """Question serializer with a writable nested list of answers."""

    answers = AnswerSerializer(many=True)

    class Meta:
        model = Question
        fields = ['id', 'description', 'answers']
        read_only_fields = ('quiz',)

    def create(self, validated_data):
        """Create the question, then one ``Answer`` row per nested entry."""
        answer_rows = validated_data.pop('answers')
        new_question = Question.objects.create(**validated_data)
        for answer_fields in answer_rows:
            Answer.objects.create(question=new_question, **answer_fields)
        return new_question
class QuizSerializer(serializers.ModelSerializer):
    """Quiz serializer with a nested list of questions (as posted)."""

    questions = QuestionSerializer(many=True)

    class Meta:
        model = Quiz
        fields = ['id', 'title', 'questions']

    def create(self, validated_data):
        """Create the quiz, then one ``Question`` per nested entry.

        Kept as posted: each nested entry is passed to
        ``Question.objects.create`` unchanged, which is the call the
        question reports the error for.
        """
        question_rows = validated_data.pop('questions')
        new_quiz = Quiz.objects.create(**validated_data)
        for question_fields in question_rows:
            Question.objects.create(quiz=new_quiz, **question_fields)
        return new_quiz
With the above serializers, I'm getting the error below:
Direct assignment to the reverse side of a related set is prohibited. Use answers.set() instead.
So what's the right way to build double-nested serializers? So far I haven't been able to find anything to help me
I'm going to assume you get that error when you try posting to the Quiz API. The problem occurs when you create a question with
Question.objects.create(quiz=quiz, **questions)
since this question also includes a list of answers that needs to be created first.
Try using the serializer you already have like this
class QuizSerializer(serializers.ModelSerializer):
    """Quiz serializer that writes nested questions and their answers."""

    questions = QuestionSerializer(many=True)

    class Meta:
        model = Quiz
        fields = ['id', 'title', 'questions']

    def create(self, validated_data):
        """Create the quiz with all nested questions and answers.

        Args:
            validated_data: Validated payload; its ``'questions'`` entry is
                a list of mappings that may each carry an ``'answers'`` list.

        Returns:
            The newly created ``Quiz`` instance.
        """
        questions_data = validated_data.pop('questions')
        quiz = Quiz.objects.create(**validated_data)
        for question_data in questions_data:
            # The nested payload was validated together with the quiz, so the
            # rows are created directly. 'answers' must be removed first
            # because it is a reverse relation, not a Question field.
            answers_data = question_data.pop('answers', [])
            question = Question.objects.create(quiz=quiz, **question_data)
            for answer_data in answers_data:
                Answer.objects.create(question=question, **answer_data)
        return quiz
and update your QuestionSerializer to use AnswerSerializer the same way (or you can create Answers explicitly there)
I'm trying to create a scraper that scrapes a website for its products. I decided to extract all the category links from the navigation menu, then follow them and extract all the product links, which I later parse in the parse_product function. But I don't actually know what the best way to do that is. I'm struggling with following the parse_menu links and further extracting the product links. Please criticize my code.
class DiorSpider(CrawlSpider):
    """Spider from the question: menu links first, product pages second."""

    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']
    rules = (
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us',
                                  )), callback='parse_menu'),
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us/products/.*',
                                  )), callback='parse_product'),
    )

    def parse_menu(self, response):
        """Yield the absolute URL of every navigation link found."""
        # XPath addresses attributes with '@'.
        menu = response.xpath('//a[@class="navigation-item-link"]').extract()
        for item in menu:
            link = re.compile(r'a class="navigation-item-link" href="([a-zA-Z0-9_/-]*)"').findall(item)
            if link:
                absolute_url = response.urljoin(link[0])
                # Kept as posted: a bare URL string is yielded here.
                yield absolute_url

    def parse_product(self, response):
        """Parse a product page (the body is not part of the posted code)."""
class DiorSpider(Spider):
    """Follow every navigation-bar link and hand it to ``parse_product``.

    A plain ``Spider`` is used because ``CrawlSpider`` is mostly useful
    together with link extractors, and no rules are needed when the links
    are taken from the navigation bar.
    """

    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    def parse(self, response):
        """Yield one request per navigation link on the start page."""
        # Selecting the href attribute directly makes a regex over the
        # serialized HTML unnecessary.
        links = response.xpath('//a[@class="navigation-item-link"]/@href').extract()
        for link in links:
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, self.parse_product)

    def parse_product(self, response):
        """Parse a product page (the body is not part of the posted code)."""
I have an object (a blogpost) which can have multiple tags in django. I'm trying to get related objects with one or more of these same tags.
For example: You have a blogpost with a few tags, like 'food', 'drinks' and 'restaurants'. When you open this blogpost, there are displayed some 'related' blogposts (meaning they share one or more tags). An example of such a related blogpost would have the tags: 'soda', 'lemonade' and 'drinks'.
Here is my view:
instance = get_object_or_404(Blog, id=id)
tags = instance.tags.values()
# Put the name of every tag of this post into a list.
related = [tag_row['name'] for tag_row in tags]
# Kept as posted: this raw query is the attempt the question asks about.
related_posts = Blog.objects.raw('SELECT * FROM "blog_table" WHERE related in "blog_table"."tags"')
for post in related_posts:
    # This should display the name of all the related blogposts
    # (probably including itself).
    print(post.name)
Here are my models:
class Tag(models.Model):
    """A tag with a name and an optional number."""

    name = models.CharField(max_length=500)
    number = models.IntegerField(null=True, blank=True)

    def __str__(self):
        """Return the number and the name separated by one space."""
        return ' '.join((str(self.number), self.name))
class Blog(models.Model):
    """A blog post with a unique title, a body and any number of tags."""

    name = models.CharField(
        null=False,
        max_length=500,
        verbose_name='title of blogpost',
        unique=True,
    )
    body = models.TextField(null=False, verbose_name='body of the blogpost')
    # NOTE(review): ``null=True`` probably has no effect on a many-to-many
    # field -- confirm and drop it if so.
    tags = models.ManyToManyField(Tag, blank=True, null=True)

    def __str__(self):
        """Return the post title."""
        return self.name
To get the blogs that share at least one tag with the instance, you can do this:
# The many-to-many field on Blog is named ``tags``.
tags = instance.tags.all()
# One query for every post sharing at least one tag. ``distinct()`` removes
# posts matched through several tags, and the post itself is left out.
related_blogs = (
    Blog.objects.filter(tags__in=tags)
    .exclude(pk=instance.pk)
    .distinct()
)
for blog in related_blogs:
    print(blog.name)
As I understand it, Scrapy works asynchronously and requests are unordered. Now, I can parse a list of items on some website's page and go to the details to parse additional information. The problem is that after doing all of that work, I need to parse the same data for another city. The city is changed by making a request like http://www.example.com/city/set/1.
My spider looks like this :
class ExampleSpider(scrapy.Spider):
    """Spider that requests every subcategory URL listed in ``CATEGORIES``."""

    name = "example"
    allowed_domains = ["example.com"]

    def start_requests(self):
        """Yield one request per subcategory, carrying both entries in ``meta``."""
        for category in CATEGORIES:
            # Categories without a 'subcategories' entry produce no requests.
            if 'subcategories' not in category:
                continue
            for subcategory in category['subcategories']:
                request_meta = {
                    'category': category,
                    'subcategory': subcategory,
                }
                yield scrapy.Request(
                    url=subcategory['url'],
                    callback=self.parse,
                    meta=request_meta,
                )

    def parse(self, response):
        """Placeholder callback; does nothing."""
        pass
What is the best approach to do this?
You can simply chain requests:
def parse(self, response):
    """Start an item and request the city page that completes it."""
    item = {}
    # fill up item with data
    city_url = ''  # make city url
    # The partly filled item travels to the next callback inside ``meta``.
    carried = {'item': item}
    yield Request(city_url, meta=carried, callback=self.parse_city)
def parse_city(self, response):
    """Complete the item handed over through ``meta`` and emit it."""
    # The previous callback stored the item under the 'item' key.
    carried_item = response.meta['item']
    # add more stuff to your item
    carried_item['some_city_data'] = ''
    yield carried_item