Remove duplicates based on a unique ID - scrapy

I want to remove duplicate data based on unique IDs. Each listing on the site has a unique ID, so I want to filter the data to remove any duplicates.
After looking at the Scrapy documentation (Filter duplicates), I have tried to implement a similar pipeline to remove the duplicates, but I am not sure how to get it to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
headers = {
'authority': 'www.theparking.eu',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': '*/*',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'origin': 'https://www.theparking.eu',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
#'referer': 'https://www.theparking.eu/used-cars/used-cars/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class DuplicatesPipeline:
    def __init__(self):
        # self.ids_seen = set()
        self.titles_seen = set()

    def process_item(self, unique_id, spider):
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        else:
            self.titles_seen.add(unique_id)
            return unique_id

class Countryitem(scrapy.Item):
    make = Field(output_processor=TakeFirst())
    unique_id = Field(output_processor=TakeFirst())
    page_number = Field(output_processor=TakeFirst())
class CountrySpider(scrapy.Spider):
    name = "country"
    test_dict = {'country_id': [4, 5, 109, 7, 6, 8, 87],
                 'country': ['australia', 'austria', 'bahamas', 'belarus', 'belgium', 'bosnia and herzegovina', 'brasil'],
                 'make': [20, 13, 131, 113, 32, 62, 104],
                 'model': [1108, 4655, 687, 492, 499, 702, 6143],
                 'engine': [5, 11, 10, 7, 14, 21, 170]}

    #for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):
    def start_requests(self):
        # iterate the ids directly (zip() over a single list would yield 1-tuples)
        for id_ in self.test_dict['country_id']:
            for id_marque in self.test_dict['make']:
                for models in self.test_dict['model']:
                    for engine in self.test_dict['engine']:
                        for page in range(1, 10000):
                            yield scrapy.FormRequest(
                                url=f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
                                method="POST",
                                callback=self.parse,
                                formdata={
                                    'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page,id_, id_marque, models, engine),
                                    'tabs': '["t0"]'
                                },
                                headers=headers,
                                cb_kwargs={
                                    'page_number': page
                                }
                            )
    def parse(self, response, page_number):
        container = json.loads(response.text)
        test = container['#lists']
        soup = BeautifulSoup(test, 'lxml')
        for i in soup:
            carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
            carUnique = i.select('li[tref]')
            for make, unique in zip_longest(carMake, carUnique):
                loader = ItemLoader(Countryitem())
                # loader.add_value('page_number', page_number)
                loader.add_value("unique_id", unique['tref'])
                loader.add_value("page_number", page_number)
                if make is not None:
                    loader.add_value('make', make.text)
                else:
                    loader.add_value('make', "None")
                yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'park.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(CountrySpider)
process.start()

The pipeline's process_item receives the whole item, so check the item's unique_id field:
class DuplicatesPipeline:
    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['unique_id'] in self.titles_seen:
            raise DropItem("Duplicate item found: %s" % item['unique_id'])
        else:
            self.titles_seen.add(item['unique_id'])
            return item
Also add the pipeline to the spider's custom_settings:
custom_settings = {
    'ITEM_PIPELINES': {
        'myproject.path_to_your_file.DuplicatesPipeline': 300
    }
}
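Since this spider is run from a single script with CrawlerProcess rather than from a generated Scrapy project, the pipeline can alternatively be registered directly in the process settings. A minimal sketch, assuming DuplicatesPipeline is defined in the same file as the spider:
process = CrawlerProcess(
    settings={
        'FEED_URI': 'park.jl',
        'FEED_FORMAT': 'jsonlines',
        'ITEM_PIPELINES': {
            # the class lives in this script, so it is referenced via __main__
            '__main__.DuplicatesPipeline': 300,
        },
    }
)
process.crawl(CountrySpider)
process.start()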

Spider grabs 1 item from each page and not all items

My spider seems to grab only about one job listing from each webpage. When I remove parse_jobs and call load_item() in parse, I can extract all the job listings on each page. So the issue likely arises when it goes to parse_jobs and loads the items, but I cannot figure out what it is.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest
headers = {
'authority': 'api2.branch.io',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'content-type': 'application/x-www-form-urlencoded',
'accept': '*/*',
'origin': 'https://www.reed.co.uk',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.reed.co.uk/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class ReedItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())
    region = Field(input_processor=MapCompose(str.strip),
                   output_processor=TakeFirst())
    items = Field(output_processor=TakeFirst())
    post = Field(input_processor=MapCompose(),
                 output_processor=Join(" "))
    page_no = Field(output_processor=TakeFirst())

class ReedSpider(scrapy.Spider):
    name = 'reed'
    degrees = {'upper': ['Finance','Accounting','Aeronautical-Engineering','Manufacturing-Engineering'],
               'degrees_entry': ['degree-finance-entry','degree-accounting-entry','degree-aeronautical-engineering-entry','degree-manufacturing-engineering-entry'],
               'degrees_graduate': ['degree-finance-graduate','degree-accounting-graduate','degree-aeronautical-engineering-graduate','degree-manufacturing-engineering-graduate'],
               'degrees': ['degree-finance','degree-accounting','degree-aeronautical-engineering','degree-manufacturing-engineering'],
               'graduate_entry': ['graduate-finance-entry','graduate-accounting-entry','graduate-aeronautical-engineering-entry','graduate-manufacturing-engineering-entry'],
               'graduate': ['graduate-finance','graduate-accounting','graduate-aeronautical-engineering','graduate-manufacturing-engineering'],
               'sector': ['Accountancy_Finance','Accountancy_Finance','Engineering_Manufacturing','Engineering_Manufacturing'],
               'degree_type': ['Accountancy_finance','Accountancy_finance','Aeronautical_Engineering','Manufacturing_Engineering']}
    degree = pd.DataFrame(degrees)
    start_urls = defaultdict(list)
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
        #'LOG_LEVEL':'INFO',
    }
    def start_requests(self):
        for degrees, degrees_entry, degrees_graduate, graduate_entry, graduate, sector in zip(self.degree.degrees, self.degree.degrees_entry, self.degree.degrees_graduate, self.degree.graduate_entry, self.degree.graduate, self.degree.sector):
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_graduate}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate}-jobs')
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    headers=headers,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items,
                        'page_no': 0
                    }
                )
    def parse(self, response, items, page_no):
        container = response.xpath("//div[@class='row search-results']")
        for lists in container:
            page_no += 1
            loader = ItemLoader(ReedItem(), selector=lists)
            loader.add_value('items', items)
            loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
            loader.add_value('page_no', page_no)
            loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
            loader.add_xpath('title', './/h3[@class="title"]/a/@title')
            loader.add_xpath('salary', './/li[@class="salary"]/text()')
            loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
            links = response.xpath('.//h3[@class="title"]/a/@href').get()
            yield response.follow(
                response.urljoin(links),
                callback=self.parse_jobs,
                cb_kwargs={
                    'loader': loader
                })
        next_page = response.xpath('//a[@id="nextPage"]/@href').get()
        if next_page:
            yield response.follow(
                url=next_page,
                callback=self.parse,
                headers=headers,
                cb_kwargs={
                    'items': items,
                    'page_no': page_no
                })

    def parse_jobs(self, response, loader):
        loader.add_value('post', response.xpath('(//span[@itemprop="description"]/p/text()) | (//span[@itemprop="description"]/p//text()) | (//span[@itemprop="description"]/ul//li/text())').getall())
        yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'reed_jobs_post.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ReedSpider)
process.start()
Below is the fixed parse method; I commented out the lines I changed. The original container XPath matched the single search-results div, so the loop ran only once per page and only one listing was loaded. Selecting the //article elements inside it gives one selector per job card, the relative XPaths and the link lookup now resolve per listing, and page_no is incremented once per page instead of once per listing.
def parse(self, response, items, page_no):
    # container = response.xpath("//div[@class='row search-results']")
    container = response.xpath("//div[@class='row search-results']//article")
    page_no += 1
    for lists in container:
        # page_no += 1
        loader = ItemLoader(ReedItem(), selector=lists)
        loader.add_value('items', items)
        loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
        loader.add_value('page_no', page_no)
        loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
        loader.add_xpath('title', './/h3[@class="title"]/a/@title')
        loader.add_xpath('salary', './/li[@class="salary"]/text()')
        loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
        # links = response.xpath('.//h3[@class="title"]/a/@href').get()
        links = lists.xpath('.//h3[@class="title"]/a/@href').get()
        yield response.follow(
            response.urljoin(links),
            callback=self.parse_jobs,
            cb_kwargs={
                'loader': loader
            }
        )
    next_page = response.xpath('//a[@id="nextPage"]/@href').get()
    if next_page:
        yield response.follow(
            url=next_page,
            callback=self.parse,
            headers=headers,
            cb_kwargs={
                'items': items,
                'page_no': page_no
            })

Pandas only saving first page of scraped data into CSV

I'm trying to scrape data from the first 5 pages of this site and save it to a CSV. Everything seems to be working fine but only the first page of the site is getting saved to the CSV. I think it may be an indentation issue but I haven't been able to figure it out. Any help would be appreciated, thanks!
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import random
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'pis=3; ut=hB0ZZQzmgNnOh9lBQEu8bTmaWMULGZQUahdMlzzGd1k; SESSION_TOKEN=j-_KSJ8wscb0aeKUf-Ndr5wDcTCFdY3E2qwROIDEt5U; zjs_user_id=null; zjs_anonymous_id=%22hB0ZZQzmgNnOh9lBQEu8bTmaWMULGZQUahdMlzzGd1k%22; _pxvid=ef32ffea-7620-11eb-9542-0242ac12000d; _ga=GA1.2.845026454.1614116940; _gid=GA1.2.991776272.1614116940; _gcl_au=1.1.1819279014.1614116941; OptanonConsent=isIABGlobal=false&datestamp=Tue+Feb+23+2021+16%3A21%3A01+GMT-0600+(Central+Standard+Time)&version=5.9.0&landingPath=NotLandingPage&groups=1%3A1%2C0_172180%3A1%2C0_248632%3A1%2C0_172218%3A1%2C0_172151%3A1%2C0_172362%3A1%2C3%3A1%2C0_172152%3A1%2C0_172351%3A1%2C4%3A1%2C0_172338%3A1%2C0_172360%3A1%2C0_172153%3A1%2C0_172154%3A1%2C0_172343%3A1%2C0_177347%3A1%2C0_172331%3A1%2C0_172155%3A1%2C0_172156%3A1%2C0_248627%3A1%2C0_172157%3A1%2C0_248631%3A1%2C0_172158%3A1%2C0_172357%3A1%2C0_248633%3A1%2C0_172348%3A1%2C0_172159%3A1%2C0_172160%3A1%2C0_172161%3A1%2C0_172162%3A1%2C0_172163%3A1%2C0_172164%3A1%2C0_172165%3A1%2C0_172166%3A1%2C0_172167%3A1%2C0_172168%3A1%2C0_172169%3A1%2C0_172170%3A1%2C0_172171%3A1%2C0_172172%3A1%2C0_172173%3A1%2C0_172174%3A1%2C0_172175%3A1%2C0_172176%3A1%2C0_172177%3A1%2C0_172178%3A1%2C0_172179%3A1%2C0_172181%3A1%2C0_172182%3A1%2C0_172183%3A1%2C0_172184%3A1%2C0_172185%3A1%2C0_172186%3A1%2C0_172187%3A1%2C0_172188%3A1%2C0_172189%3A1%2C0_172190%3A1%2C0_172191%3A1%2C0_172192%3A1%2C0_172193%3A1%2C0_172195%3A1%2C0_172197%3A1%2C0_172198%3A1%2C0_172199%3A1%2C0_172200%3A1%2C0_172201%3A1%2C0_172202%3A1%2C0_172203%3A1%2C0_172204%3A1%2C0_172205%3A1%2C0_172206%3A1%2C0_172207%3A1%2C0_172208%3A1%2C0_172209%3A1%2C0_172210%3A1%2C0_172211%3A1%2C0_172212%3A1%2C0_172213%3A1%2C0_172214%3A1%2C0_172215%3A1%2C0_172216%3A1%2C0_172217%3A1%2C0_172219%3A1%2C0_172220%3A1%2C0_172221%3A1%2C0_172222%3A1%2C0_172223%3A1%2C0_172330%3A1%2C0_172333%3A1%2C0_172334%3A1%2C0_172335%3A1%2C0_172336%3A1%2C0_172337%3A1%2C0_172339%3A1%2C0_172340%3A1%2C0_172341%3A1%2C0_172342%3A1%2C0_172344%3A1%2C0_248628%3A1%2C0_172345%3A1%2C0_172346%3A1%2C0_172349%3A1%2C0_172350%3A1%2C0_172352%3A1%2C0_172353%3A1%2C0_172354%3A1%2C0_172355%3A1%2C0_172356%3A1%2C0_172358%3A1%2C0_172359%3A1%2C0_172361%3A1%2C0_248629%3A1%2C0_248630%3A1%2C0_248634%3A1&AwaitingReconsent=false; _px3=d6e4661fe0f89390bd501cf6d96d7c4ce6da6b629f038f745c417aec166457da:jkvtO/Et7fQoQ9uQjR7cLnpUmnMnTHJjbtYEYxtF8Af3XMaosoyoSH29Qf+5aiOY4Z/BqkATEDsYMrO6hKGNOQ==:1000:v1Auy0PIGkZc2wIJIcWfwOV3SoBz2sZHwNv/67LxTEKseVa/NakBSB7e9s397Ol/RCx/TcpBu3ZoJilwD/sP/3PIkNcxZXjbK+aHVEpfKf37sDvp8iNYyLqZ6QjNsa/0NXHrpVIWto2qgiaU21O2v9R9EgDeaTBEt4MCmMT87V4=',
'Host': 'hotpads.com',
'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
params = {
'lat: 41.7985 lon: -87.7117 z: 11'
}
pages = np.arange(1, 6)
for page in pages:
    page = requests.get('https://hotpads.com/chicago-il/apartments-for-rent' + '?page=' + str(page), headers=headers)
    source = page.content
    soup = BeautifulSoup(source, 'lxml')
    postings = soup.find_all('div', class_="AreaListingsContainer")
    time.sleep(random.randint(2, 10))
    for p in postings:
        url = ["https://hotpads.com" + u['href'] for u in p.find_all('a', href=True, text=True)]
        address = [a.get_text() for a in p.find_all('h4', class_="styles__ListingCardName-y78yl0-8 jQmZHq")]
        price = [p.get_text() for p in p.find_all('div', class_="styles__ListingCardPrice-y78yl0-17 cguwHc")]
        beds = [b.get_text() for b in p.find_all('div', class_="styles__ListingCardBedDisplay-y78yl0-7 iPqMa")]
    homes = list(zip(url, address, price, beds))
    df = pd.DataFrame(homes, columns=['URL', 'Address', 'Price', 'Beds'])
    print(df)
    df.to_csv('Chicago_homes.csv')
It looks like you're overwriting the original df in every iteration. There are two solutions to this.
1- Initialize the dataframe before the loop and write the entire frame to the file after the loop has completed.
df = pd.DataFrame()
for page in pages:
    # do stuff...
    df = pd.concat([df, pd.DataFrame(homes, columns=['URL', 'Address', 'Price', 'Beds'])])
# put this outside of the loop
df.to_csv('Chicago_homes.csv')
2- Append the dataframe data to the file within the loop. This is beneficial when executing many loops, and you don't want to hold the large dataframe in memory.
import os  # needed for the existence check

for page in pages:
    # do stuff.. (build df for this page)
    if os.path.exists('Chicago_homes.csv'):
        df.to_csv('Chicago_homes.csv', mode='a', header=False)
    else:
        df.to_csv('Chicago_homes.csv')
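For completeness, here is a consolidated sketch of the page loop with the first approach applied. The URL, headers, selectors and the sleep are taken from the question; the frames list and the final concat are the only changes, and it assumes the site's class names are unchanged:
frames = []
for page in pages:
    response = requests.get('https://hotpads.com/chicago-il/apartments-for-rent' + '?page=' + str(page), headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    time.sleep(random.randint(2, 10))
    for p in soup.find_all('div', class_="AreaListingsContainer"):
        url = ["https://hotpads.com" + u['href'] for u in p.find_all('a', href=True, text=True)]
        address = [a.get_text() for a in p.find_all('h4', class_="styles__ListingCardName-y78yl0-8 jQmZHq")]
        price = [c.get_text() for c in p.find_all('div', class_="styles__ListingCardPrice-y78yl0-17 cguwHc")]
        beds = [b.get_text() for b in p.find_all('div', class_="styles__ListingCardBedDisplay-y78yl0-7 iPqMa")]
        # one DataFrame per listings container, accumulated instead of overwritten
        frames.append(pd.DataFrame(list(zip(url, address, price, beds)),
                                   columns=['URL', 'Address', 'Price', 'Beds']))
df = pd.concat(frames, ignore_index=True)
df.to_csv('Chicago_homes.csv', index=False)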

How to parse two different items in scrapy?

I am using scrapy 2.1 for parsing a category result page.
There are 2 different things I want to scrape from that site:
Category information like e.g. Title and URL
Product items within that category page
Number 2 works, but I am struggling with how to store the category info. My first attempt was to create another item class, CatItem:
class CatItem(scrapy.Item):
    title = scrapy.Field()  # char -
    url = scrapy.Field()    # char -
    level = scrapy.Field()  # int -

class ProductItem(scrapy.Item):
    title = scrapy.Field()  # char -
Let's parse the page:
def parse_item(self, response):
    # save category info
    category = CatItem()
    category['url'] = response.url
    category['title'] = response.url
    category['level'] = 1
    yield category

    # now let's parse all products within that category
    for selector in response.xpath("//article//ul/div[@data-qa-id='result-list-entry']"):
        product = ProductItem()
        product['title'] = selector.xpath(".//a/h2/text()").extract_first()
        yield product
My Pipeline:
class mysql_pipeline(object):
    def __init__(self):
        self.create_connection()

    def create_connection(self):
        settings = get_project_settings()

    def process_item(self, item, spider):
        self.store_db(item, spider)
        return item
Now I don't know how to proceed. There is only one "item" in the process_item definition.
How can I pass the category information to the store_db method as well?
You can check item type in your pipeline:
from your_project.items import CatItem, ProductItem

class YourPipeline(object):
    ...
    def process_item(self, item, spider):
        if isinstance(item, CatItem):
            save_category(item)
        return item
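Applied to the mysql_pipeline from the question, the type check might look like the sketch below; store_category is a hypothetical counterpart to the existing store_db method, writing categories to their own table:
from your_project.items import CatItem, ProductItem

class mysql_pipeline(object):
    ...
    def process_item(self, item, spider):
        if isinstance(item, CatItem):
            self.store_category(item, spider)  # hypothetical method for category rows
        elif isinstance(item, ProductItem):
            self.store_db(item, spider)        # existing method for product rows
        return item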
UPDATE: Simple PoC code:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess

class BooksPipeline(object):
    def process_item(self, item, spider):
        filename = None
        if isinstance(item, CategoryItem):
            filename = 'Categories.csv'
        elif isinstance(item, BookItem):
            filename = 'Books.csv'
        with open(filename, 'a', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Title', 'URL'], lineterminator="\n")
            writer.writerow(item)
        return item

class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for book_url in response.xpath('//ol/li//h3/a/@href').getall():
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_book,
            )

    def parse_book(self, response):
        category = CategoryItem()
        category['Title'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/text()').get()
        category['URL'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/@href').get()
        yield category

        book = BookItem()
        book['Title'] = response.xpath('//h1/text()').get()
        book['URL'] = response.url
        yield book

class BookItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()

class CategoryItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()

if __name__ == "__main__":
    process = CrawlerProcess(
        {
            'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
            'DOWNLOAD_TIMEOUT': 100,
            'ITEM_PIPELINES': {
                '__main__.BooksPipeline': 300,
            }
        }
    )
    process.crawl(BooksSpider)
    process.start()

Scrapy Spider not writing to Postgres in the correct format

I'm scraping the Science of Us website for articles related to mental health and trying to dump them into a Postgres database I'm running locally. The scrapy output is stored in a dictionary that looks like articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
On running my code, it dumps the entire list of values for each key into the column whose name matches the key. Instead, I would like each article to be one row in the database, e.g. Article 1 would have its own row with its title, teaser, link, date, author and source in the respective columns.
Here is the relevant code:
1) spider.py
from scrapy.spiders import Spider
from scrapy import Request
from mhnewsbot_app.items import SOUItem
import string
mh_search_terms = ["DEPRESS", "MENTAL HEALTH", "EMOTIONAL HEALTH", "MENTAL DISORDER", "DIGITAL MEDICINE", "ANXI", "PSYCH", "THERAPY", "THERAPIST"]
tbl = string.maketrans('-', ' ') #To protect against cases where the article has hyphens or other special characters
articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
def url_lister():
    url_list = []
    article_count = 0
    while article_count < 150:
        url = 'http://nymag.com/scienceofus/?start=%s' % article_count
        url_list.append(url)
        article_count += 50
    return url_list
class SOUSpider(Spider):
    name = 'scienceofus'
    start_urls = url_lister()

    def parse(self, response):
        for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
            title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
            for i in title:
                for search_term in mh_search_terms:
                    if search_term in i.upper().strip():
                        articles['title'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)])
                        articles['teaser'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)])
                        articles['link'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)])
                        articles['date'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)])
                        articles['author'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)])
                        articles['source'].append('Science Of Us')
        return articles
2) pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Articles, db_connect, create_articles_table

class ArticlesPipeline(object):
    def __init__(self):
        engine = db_connect()
        create_articles_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        article = Articles(**item)
        try:
            session.add(article)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item
You are outputting a single item with lists of values in its fields. It is better to output one item per article, because that is how your database expects to receive them:
def parse(self, response):
    for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
        title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
        for i in title:
            for search_term in mh_search_terms:
                if search_term in i.upper().strip():
                    article_item = {}
                    article_item['title'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)]
                    article_item['teaser'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)]
                    article_item['link'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)]
                    article_item['date'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)]
                    article_item['author'] = article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)]
                    article_item['source'] = 'Science Of Us'
                    yield article_item
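With one flat item yielded per article, the SQLAlchemy pipeline from the question should then insert one row per article without further changes, since each field now holds a single value rather than a list. Roughly, every call to process_item receives something like the sketch below (the values are made-up placeholders):
item = {
    'title': 'Some headline',
    'teaser': 'Some teaser text',
    'link': 'http://nymag.com/scienceofus/some-article',
    'date': 'Some date string',
    'author': 'Some Author',
    'source': 'Science Of Us',
}
# Articles(**item) then maps each key to one column of a single row.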

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project but I am having trouble gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get the wrong token.
Second problem:
I have read and tried to implement the Scrapy docs on Request to get the phone number, but in vain; I also tried this answer on
stackoverflow
Third problem:
How would I go about implementing the next page (the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers, but I am getting repeated requests to the same URL with different values [see results].
I would really appreciate it if anyone could point me in the right direction.
My main goal is to crawl posts that have phone numbers.
I have tried searching Stack Overflow but couldn't find the right post.
Many thanks
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector

def complete_link(string):
    return string

class MyItem(Item):
    token = Field()
    post_id = Field()
    post_url = Field()
    phone_num = Field()
    phone_url = Field()

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            request = Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
            yield request
        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            item['post_id'] = post_id.group(1)
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
        request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN': item['token']}, callback=self.parse_phone, meta={'item': item})
        yield request

    def parse_phone(self, response):
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = CrawlerItem()
        pid = []
        arr_url = []
        for href in response.css('a.listing-link::attr(href)').extract():
            if len(href) > 0:
                post_id = u''.join(href).encode('utf-8').strip()
                post_id = re.match('.*?([0-9]+)$', post_id)
                if post_id:
                    pid.append(post_id.group(1))
                    domain = gumtree + href
                    arr_url.append(domain)
        i = 0
        while i < len(arr_url):
            url = u''.join(arr_url[i]).encode('utf-8').strip()
            request = Request(url, callback=self.parse_post, meta={'url': url, 'item': item, 'pid': pid[i]}, headers={'Referer': gumtree})
            i += 1
            yield request
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
        ref = item['post_url']
        req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref}, meta={'url': response.meta['url'], 'item': item})
        return req

    def parse_phone(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
        return item