Scrapy: following links, extracting new ones and following them

I'm trying to create a scraper that scrapes a website for its products. I decided to extract all the category links from the navigation menu, follow them, and extract all the product links, which I later parse in the parse_product function. But I don't actually know what the best way to do that is. I'm struggling with following the parse_menu links and further extracting product links. Please criticize my code.
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DiorSpider(CrawlSpider):
    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    rules = (
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us',)),
             callback='parse_menu'),
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us/products/.*',)),
             callback='parse_product'),
    )

    def parse_menu(self, response):
        menu = response.xpath('//a[@class="navigation-item-link"]').extract()
        for item in menu:
            link = re.compile(r'a class="navigation-item-link" href="([a-zA-Z0-9_/-]*)"').findall(item)
            if link:
                absolute_url = response.urljoin(link[0])
                yield absolute_url

    def parse_product(self, response):
        pass

from scrapy import Spider, Request


class DiorSpider(Spider):  # CrawlSpider is mostly used when you rely on LinkExtractors.
    name = 'newdior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    # If you're going through the navigation bar, there is no need to add Rules.
    def parse(self, response):
        # Here you can easily extract the links directly with XPath.
        links = response.xpath('//a[@class="navigation-item-link"]/@href').extract()
        for link in links:
            # No regex needed; the hrefs are already extracted by the XPath above.
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, self.parse_product)

    def parse_product(self, response):
        pass
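If you would rather keep the CrawlSpider/Rule approach from the question, a minimal sketch (reusing the www.dior.com URL patterns above; the product selector is a hypothetical placeholder) is to give the product rule a callback and let a second, broader rule just follow links:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DiorCrawlSpider(CrawlSpider):
    name = 'newdior_crawl'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    rules = (
        # Product pages: send them to the callback for extraction.
        Rule(LinkExtractor(allow=r'/en_us/products/'), callback='parse_product'),
        # Any other /en_us/ page (menu, categories): no callback, just keep following links.
        Rule(LinkExtractor(allow=r'/en_us/'), follow=True),
    )

    def parse_product(self, response):
        # Hypothetical fields; adjust the selectors to the real page structure.
        yield {'url': response.url, 'title': response.css('title::text').get()}
Rule order matters here: a link that matches the product pattern is handled by the first rule, while everything else falls through to the follow-only rule.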

Related

How to write a Django REST APIView POST request with a condition?

I am trying to create a POST request for a game API. In the game, a user can label a picture. A label entered once is a tagging; a label entered twice for the same resource is a tag.
This is how I am trying to create a Tagging so far:
saved_tagging = Tagging.objects.create(user_id=current_user_id,
                                       gameround=gameround,
                                       resource=random_resource,
                                       tag='tag newwww',
                                       created=datetime.now(),
                                       score=score,
                                       origin=origin
                                       )
tagging_serializer = TaggingSerializer(saved_tagging)
At the moment I am getting the ValueError: Cannot assign "'tag newwww'": "Tagging.tag" must be a "Tag" instance.
Is there any way that I can avoid this?
Here are also my models and the relevant serializer.
models.py
class Tag(models.Model):
    name = models.CharField(max_length=256)
    language = models.CharField(max_length=256)
    objects = models.Manager()

    def create(self, validated_data):
        tag_data = validated_data.pop('tag')
        Tag.objects.create(**tag_data)
        return tag_data

    def __str__(self):
        return self.name or ''


class Tagging(models.Model):
    user = models.ForeignKey(CustomUser, on_delete=models.SET_NULL, null=True)
    gameround = models.ForeignKey(Gameround, on_delete=models.CASCADE, related_name='taggings')
    resource = models.ForeignKey(Resource, on_delete=models.CASCADE, related_name='taggings')
    tag = models.ForeignKey(Tag, on_delete=models.CASCADE, related_name='tagging')
    created = models.DateTimeField(editable=False)
    score = models.PositiveIntegerField(default=0)
    origin = models.URLField(max_length=256, blank=True, default='')
    objects = models.Manager()

    def create(self, validated_data):
        tag_data = validated_data.pop('tag')
        tagging = Tagging.objects.create(**validated_data)
        Tag.objects.create(name=tagging, **tag_data)
        return tagging

    def __str__(self):
        return str(self.tag) or ''
serializers.py
class TaggingSerializer(serializers.ModelSerializer):
    tag = StringRelatedField()
    resource = ResourceSerializer(read_only=True)
    gameround = GameroundSerializer(read_only=True)

    class Meta:
        model = Tagging
        fields = ('id', 'tag', 'gameround', 'created', 'score', 'resource', 'origin')

    def create(self, validated_data):
        return Tagging.objects.create(**validated_data)

    def to_representation(self, data):
        data = super().to_representation(data)
        return data
tag must be a Tag instance!
So... you can do this in two ways (in my opinion).
First, you can create a Tag object in your view and then pass that object as the tag value in your Tagging create call.
Or create a service layer in your app with a custom create method for your model. In my opinion this is superior because you centralize your rules in one method instead of in one view.
Ex.:
services/tag_service.py
def create(user_id, gameround, resource, tag, ..., origin):
    if not isinstance(tag, Tag):
        # create your Tag instance based on the string that was passed
        your_new_tag_object = Tag.objects.create(name=tag, ...)
    # add your other rules for model creation here too
    # return your model's .create() result
    return Tagging.objects.create(user_id=user_id, ..., tag=your_new_tag_object, ...)
And then use this new create method inside your POST serializer:
from services import tag_service


class TaggingSerializer(serializers.ModelSerializer):
    # your normal serializer here

    def create(self, validated_data):
        return tag_service.create(**validated_data)
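For the first approach, a minimal sketch could look like the following (assumptions: the models shown above, the label string arriving as request.data['tag'], and a hard-coded language; get_or_create is used so a label entered a second time reuses the existing Tag row):
from datetime import datetime

# inside the APIView's post() method
tag_obj, _ = Tag.objects.get_or_create(name=request.data['tag'], language='en')  # 'en' is an assumed default
saved_tagging = Tagging.objects.create(user_id=current_user_id,
                                       gameround=gameround,
                                       resource=random_resource,
                                       tag=tag_obj,  # a Tag instance, not a string
                                       created=datetime.now(),
                                       score=score,
                                       origin=origin)
tagging_serializer = TaggingSerializer(saved_tagging)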

Scrapy KeyError(f"{self.__class__.__name__} does not support field: {key}"

Calling on all Scrapy experts to look into what this newbie missed.
I am getting the following error
KeyError(f"{self.__class__.__name__} does not support field: {key}"
My code is as follows:
In spiders/myCrawler.py
def parse_item(self, response):
    item = scrapy.Item()
    item['title'] = response.css('.p-card__info-title')
    return item
and in my settings.py
class MyItem(scrapy.Item):
    title = scrapy.Field()
I am unable to figure out what exactly I am doing wrong. Please help. Thank you.
You are using
def parse_item(self, response):
    item = scrapy.Item()
    item['title'] = response.css('.p-card__info-title')
    return item
Here you are instantiating the base scrapy.Item class, which has no fields declared, instead of your own item class. Instantiate MyItem instead:
def parse_item(self, response):
    item = MyItem()
    item['title'] = response.css('.p-card__info-title')
    return item
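Two smaller points worth checking as well, shown in a hedged sketch below: item classes conventionally live in items.py (not settings.py) and must be imported into the spider, and response.css('.p-card__info-title') returns a SelectorList rather than text, so you usually add ::text and .get() (the package name myproject is an assumption):
# items.py
import scrapy

class MyItem(scrapy.Item):
    title = scrapy.Field()

# spiders/myCrawler.py
import scrapy
from myproject.items import MyItem  # adjust the package name to your project

class MyCrawler(scrapy.Spider):
    name = 'mycrawler'

    def parse_item(self, response):
        item = MyItem()
        # ::text plus .get() returns the first matching text node instead of a list of selectors
        item['title'] = response.css('.p-card__info-title::text').get()
        return item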

How to get an object with a similar tag using Django

I have an object (a blog post) which can have multiple tags in Django. I'm trying to get related objects that share one or more of these tags.
For example: you have a blog post with a few tags, like 'food', 'drinks' and 'restaurants'. When you open this blog post, some 'related' blog posts are displayed (meaning they share one or more tags). An example of such a related blog post would have the tags 'soda', 'lemonade' and 'drinks'.
Here is my view:
instance = get_object_or_404(Blog, id=id)
tags = instance.tags.values()
related = []
for x in tags:  # put all the tag names in a list
    related.append(x['name'])
for a in Blog.objects.raw('SELECT * FROM "blog_table" WHERE related in "blog_table"."tags"'):
    print(a.name)  # this should display the names of all the related blogposts (probably including itself)
Here are my models:
class Tag(models.Model):
    name = models.CharField(max_length=500)
    number = models.IntegerField(null=True, blank=True)

    def __str__(self):
        return str(self.number) + ' ' + self.name


class Blog(models.Model):
    name = models.CharField(null=False, max_length=500, verbose_name='title of blogpost', unique=True)
    body = models.TextField(null=False, verbose_name='body of the blogpost')
    tags = models.ManyToManyField(Tag, blank=True, null=True)

    def __str__(self):
        return self.name
To get the blogs that share tags with a given instance, you can do this:
tags = instance.tags.all()
for tag in tags:
    print(Blog.objects.filter(tags=tag))
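If you want a single queryset of related posts instead of one query per tag, a sketch based on the Blog/Tag models above (excluding the post itself and de-duplicating posts that share several tags) could be:
related_posts = (
    Blog.objects
    .filter(tags__in=instance.tags.all())  # share at least one tag
    .exclude(id=instance.id)               # drop the current post
    .distinct()                            # avoid duplicates when several tags match
)
for post in related_posts:
    print(post.name)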

Change city when parsing website with Scrapy

As I understand it, Scrapy works asynchronously and requests are unordered. I can parse a list of items on some website's page and go to the detail pages to parse additional information. The problem is that after doing all of that work, I need to parse the same data for another city. The city is changed by making a request like http://www.example.com/city/set/1.
My spider looks like this:
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]

    def start_requests(self):
        for category in CATEGORIES:
            if 'subcategories' in category:
                subcategories = category['subcategories']
                for subcategory in subcategories:
                    url = subcategory['url']
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse,
                        meta={
                            'category': category,
                            'subcategory': subcategory
                        }
                    )

    def parse(self, response):
        pass
What is the best approach to do this?
You can simply chain requests:
def parse(self, response):
    item = dict()
    # fill up item with data
    city_url = ''  # build the city url here
    yield Request(city_url,
                  meta={'item': item},  # carry the item to the next callback in meta
                  callback=self.parse_city)

def parse_city(self, response):
    # get the item from meta
    item = response.meta['item']
    # add more stuff to your item
    item['some_city_data'] = ''
    yield item
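If the goal is to crawl the same categories once per city, one possible sketch (assuming the http://www.example.com/city/set/<id> endpoint from the question switches the city for the session, and that CITY_IDS is a list you define yourself) is to chain a city-set request first and only then yield the category requests, with dont_filter=True so the duplicate filter does not drop URLs that repeat across cities:
CITY_IDS = [1, 2, 3]  # hypothetical list of city ids

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]

    def start_requests(self):
        for city_id in CITY_IDS:
            # Switch the city first; the callback then yields the real category requests.
            yield scrapy.Request(
                f'http://www.example.com/city/set/{city_id}',
                callback=self.start_categories,
                meta={'city_id': city_id, 'cookiejar': city_id},  # separate cookie jar per city
                dont_filter=True,
            )

    def start_categories(self, response):
        city_id = response.meta['city_id']
        for category in CATEGORIES:
            for subcategory in category.get('subcategories', []):
                yield scrapy.Request(
                    subcategory['url'],
                    callback=self.parse,
                    meta={'category': category, 'subcategory': subcategory,
                          'city_id': city_id, 'cookiejar': city_id},
                    dont_filter=True,  # the same URL is fetched once per city
                )

    def parse(self, response):
        pass
The cookiejar meta key keeps each city's session cookies separate, which matters because requests for different cities run concurrently.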

Filter query with a variable list in Django

I have a three-table model like this:
I want to filter the results of a query with a variable list of items, for example:
listTags = ["landscape", "green"]
results = ListTag.objects.filter(tag__name__in=listTags).select_related()
But the result of that query is all the ListTag objects with landscape OR green, while what I want is a list of ListTag objects with landscape AND green.
I have seen a lot of answers about this problem, but most of them use a static list of tags; what I want is to filter with a variable listTags list.
Edit: the model
class Picture(models.Model):
    title = models.CharField(max_length=50, null=True, blank=False, verbose_name=('name'))

    def __str__(self):
        return self.title


class Tag(models.Model):
    name = models.CharField(max_length=50, null=True, blank=False, verbose_name=('name'))

    def __str__(self):
        return self.name


class ListTags(models.Model):
    picture = models.ForeignKey(Picture, blank=False, on_delete=models.CASCADE, related_name='picture')
    tag = models.ForeignKey(Tag, blank=False, on_delete=models.CASCADE, related_name='tag')
You can try to use a Django Q object.
In your case this could be:
from django.db.models import Q
...
listTags = ["landscape", "green"]
query = Q()
for tag in listTags:
    query &= Q(tag__name=tag)
results = ListTag.objects.filter(query).select_related()
Addition:
If you want just pictures with tags, then you could use a many-to-many relationship. But if you want to use tags for different types of models, then you need to use generic relations.
In the first case the model structure could be:
from django.db import models


class Tag(models.Model):
    name = models.CharField(max_length=50, null=True, blank=False, verbose_name=('name'))

    def __str__(self):
        return self.name


class Picture(models.Model):
    title = models.CharField(max_length=50, null=True, blank=False, verbose_name=('name'))
    tags = models.ManyToManyField(Tag)

    def __str__(self):
        return self.title

With an m2m relation the Q object approach will not work, so to get all pictures that have both the landscape and green tags you can chain filters:
listTags = ["landscape", "green"]
results = models.Picture.objects.all()
for tag in listTags:
    results = results.filter(tags__name=tag)
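An alternative that stays in a single query (a sketch, assuming the many-to-many Picture model above and unique tag names) is to annotate each picture with the number of matching tags and keep only the pictures that matched every tag in the list:
from django.db.models import Count

listTags = ["landscape", "green"]
results = (
    Picture.objects
    .filter(tags__name__in=listTags)   # pictures with at least one of the tags
    .annotate(matched=Count('tags'))   # counts only the tag rows kept by the filter above
    .filter(matched=len(listTags))     # keep pictures that matched all of them
)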
I believe the code below would make this an AND query rather than an OR query:
listTags = ["landscape","green"]
filters = {}
for value in listTags:
filters['tag__name__in'] = value
results = ListTag.objects.filter(**filters)