I'm scraping the Science of Us website for articles related to mental health and trying to dump it to a postgres database I'm running locally. The scrapy output is stored in a dictionary that looks like articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
On running my code, it dumps the entire list of values for each key into the column with name == key. Instead, I would like each article to be one row in the database e.g. Article 1 would have its own row with its title, teaser, link, date, author and source in each of the columns.
Here is the relevant code:
from scrapy.spiders import Spider
from scrapy import Request
from mhnewsbot_app.items import SOUItem
import string
# Translation table that maps '-' to ' '.
# NOTE(review): string.maketrans exists only on Python 2; Python 3 spells it str.maketrans.
tbl = string.maketrans('-', ' ') #To protect against cases where the article has hyphens or other special characters
# Module-level accumulator: one list per field, filled by the spider's parse().
articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
def url_lister(url_template='', page_size=50, limit=150):
    """Build the list of paginated feed URLs used as the spider's start URLs.

    Args:
        url_template: %-style template with a single placeholder that
            receives the article offset. NOTE(review): the default is the
            empty literal found in the original paste (the real URL appears
            to have been stripped); formatting it raises TypeError, so a
            real template must be supplied -- TODO confirm the URL.
        page_size: Offset increment between consecutive pages.
        limit: Generation stops once the offset reaches this value.

    Returns:
        A list with one URL per offset 0, page_size, 2 * page_size, ...
        below ``limit``. The original loop computed each URL but never
        appended it, so it always returned an empty list.
    """
    return [url_template % offset for offset in range(0, limit, page_size)]
class SOUSpider(Spider):
    """Spider that collects news-feed articles whose headline matches a search term.

    Matching articles are accumulated in the module-level ``articles`` dict
    (one list per field).
    """

    name = 'scienceofus'
    start_urls = url_lister()

    def parse(self, response):
        """Append every matching article's fields to ``articles`` and return it.

        NOTE(review): the return value is ONE dict whose values are whole
        lists, so a pipeline that maps an item to a row receives lists in
        every column. This is the behaviour the question asks about and is
        deliberately left unchanged here.
        """
        for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
            # Raw <h3> markup of every headline in this list; used for matching.
            title = article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]').extract()
            for i in title:
                for search_term in mh_search_terms:
                    # The headline is upper-cased before the substring test.
                    if search_term in i.upper().strip():
                        # title.index(i) is the headline's position; the same
                        # position is used to pick the sibling fields.
                        articles['title'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]/text()').extract()[title.index(i)])
                        articles['teaser'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/p[@class = "teaser"]/text()').extract()[title.index(i)])
                        articles['link'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/a[@class = "read-more"]/@href').extract()[title.index(i)])
                        articles['date'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()[title.index(i)])
                        articles['author'].append(article.xpath('.//li[contains(@class, "newsfeed-article")]/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()[title.index(i)])
                        articles['source'].append('Science Of Us')
        return articles
from sqlalchemy.orm import sessionmaker
from models import Articles, db_connect, create_articles_table
class ArticlesPipeline(object):
    """Item pipeline that stores each scraped item as an ``Articles`` row."""

    def __init__(self):
        """Connect to the database and prepare a session factory."""
        engine = db_connect()
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Insert ``item`` as a new ``Articles`` row and pass the item on.

        NOTE(review): the original paste contained only ``except :`` with no
        ``try`` body; the add/commit/rollback/close sequence below is the
        standard SQLAlchemy session pattern and is assumed to be what was
        lost -- TODO confirm.

        Args:
            item: Mapping whose keys are used as ``Articles`` keyword arguments.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            Exception: Whatever the insert/commit raised, after rolling back.
        """
        session = self.Session()
        article = Articles(**item)
        try:
            session.add(article)
            session.commit()
        except Exception:
            # Leave the session clean, then surface the failure to Scrapy.
            session.rollback()
            raise
        finally:
            session.close()
        return item

You are outputting a single item whose fields each hold multiple values. It is better to output one item per article, because that is what your database expects:
def parse(self, response):
    """Yield one dict per article whose headline contains a search term.

    Each yielded dict carries a single value per field, so a pipeline can
    map it directly to one database row.

    NOTE(review): fields are paired with headlines by position, which
    assumes every article in the list has each field -- verify against the
    page markup.
    """
    entry = './/li[contains(@class, "newsfeed-article")]'
    headline = entry + '/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]'
    for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
        # Raw <h3> markup, used only for the search-term match.
        title = article.xpath(headline).extract()
        titles = article.xpath(headline + '/text()').extract()
        teasers = article.xpath(entry + '/p[@class = "teaser"]/text()').extract()
        links = article.xpath(entry + '/a[@class = "read-more"]/@href').extract()
        dates = article.xpath(entry + '/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()
        authors = article.xpath(entry + '/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()
        # enumerate() gives the true position even when two headlines have
        # identical markup (list.index would return the first one twice).
        for idx, markup in enumerate(title):
            text = markup.upper().strip()
            # any() yields the article once even if several terms match.
            if not any(search_term in text for search_term in mh_search_terms):
                continue
            yield {
                'title': titles[idx],
                'teaser': teasers[idx],
                'link': links[idx],
                'date': dates[idx],
                'author': authors[idx],
                'source': 'Science Of Us',
            }


scrapy-playwright returning nothing but an error

I'm learning scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using a CrawlSpider with a rule whose process_request routes the request through Playwright. In my callback I can print a value found on the page, but I cannot return or yield anything. I've tried both storing the data in an item and returning/yielding a dict; both produce the error:
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from import PageCoroutine
from scrapy.http.response import Response
def set_playwright_true(request, response):
    """Mark a rule-extracted request so that it is downloaded through Playwright.

    Used as the ``process_request`` hook of the crawl rule.

    Args:
        request: The request to modify; its ``meta`` dict is updated in place.
        response: The response the request was extracted from (unused).

    Returns:
        The same ``request`` object.
    """
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    # NOTE(review): the selector string is empty in this paste (presumably
    # elided), and scrapy-playwright documents this key as holding
    # PageCoroutine objects rather than a bare tuple -- TODO confirm.
    request.meta["playwright_page_coroutines"] = ('wait_for_selector', '')
    return request
class StoreSpider(CrawlSpider):
    """Crawl store pages through Playwright and build one StoreItem per page.

    NOTE(review): the domain, start URL and most CSS selectors are empty
    strings in this paste -- presumably elided; fill them in before running.
    """

    name = "retailer"
    allowed_domains = ['']
    start_urls = ['']
    # The paste had lost the outer key and the closing brace; the nesting
    # below follows the scrapy-playwright installation instructions.
    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
    rules = (
        Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')), callback='parse_item', follow=False, process_request=set_playwright_true),
    )

    # NOTE(review): kept as ``async def`` because that is what the question
    # is about; the reported "got 'Deferred'" error went away once this was
    # made a plain ``def``.
    async def parse_item(self, response):
        """Build a StoreItem from the fields found on a store page."""
        item = StoreItem()
        # The paste fused this message onto the previous line with the call
        # itself missing; a spider log call is assumed -- TODO confirm.
        self.logger.info('***** Start processing ' + response.url + '. *****')
        Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('').get()
        item['Name'] = Name
        item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
        item['Address1'] = response.css('').get()
        item['City'] = response.css('').get()
        item['State'] = response.css('').get()
        item['Zip'] = response.css('').get()
        item['Phone'] = response.css('').get()
        item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
        item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
        # The paste ends before any return; the question states the item was
        # returned/yielded, so the populated item is handed back here.
        return item
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I have managed to extract information from static websites, and am now trying my hand at following URLs and extracting data (which of course involves some JavaScript). I have installed scrapy-splash for this, and it is running perfectly fine.
On the website I am trying to scrape, the button at the top right takes you to the next page (it is driven by JavaScript, hence Splash). I want to scrape some basic data (company name, sectors, etc.) from every page up to the last one. This is what I have done so far; I need help correcting it so that it runs successfully.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    """Scrape one company detail page via Splash and try to follow 'next'."""

    name = 'Portfolio'
    start_urls = ['']

    def start_requests(self):
        """Issue every start URL through Splash, waiting 3 s for rendering."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})

    def parse(self, response):
        """Yield the company record, then a request for the next page."""
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        # The info items are read by fixed position 0-4.
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team
        }
        # NOTE(review): extract() returns a list, so the ``is not None`` test
        # below is always true and urljoin() is handed a list instead of a
        # string. Left as posted because this is the code the question asks
        # about; extract_first() would return a single href or None.
        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is not None:
            yield SplashRequest(urlparse.urljoin('',next_page),callback=self.parse, args={"wait":3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update: the problem was the order in which I was scraping the pages. Below is the updated code, which works well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    """Scrape every company linked from the portfolio listing page.

    The listing page is rendered through Splash; each company link found
    on it is then followed and parsed by ``parse1``.
    """

    name = 'Portfolio'
    # NOTE(review): the list literal was cut off in the paste; the listing
    # page URL must be added here.
    start_urls = []

    def start_requests(self):
        """Issue every start URL through Splash, waiting 3 s for rendering."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})

    def parse(self, response):
        """Follow each company tile link on the listing page."""
        companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
        for i in companylink:
            yield response.follow('' + str(i), callback=self.parse1)

    def parse1(self, response):
        """Yield one record with the details shown on a company page."""
        # Select the info items once; they are read by fixed position 0-4,
        # so a page with fewer than five items raises IndexError.
        info = response.css('.item_detail-main-info-group-item::text')
        yield {
            'companyname': response.css('h1.item_detail-main-info-heading::text').extract_first(),
            'sectors': info[0].extract(),
            'investmentyear': info[1].extract(),
            'status': info[2].extract(),
            'location': info[3].extract(),
            'region': info[4].extract(),
            'team': response.css('div.item_detail-main-info-group a::text').extract(),
            'about_company': response.css('h2.item_detail-main-content-heading::text').extract(),
            'about_company_detail': response.css('div.markdown p::text').extract(),
        }

scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme

I collect links from several directories and then insert them into start_urls as a link variable
import scrapy
class SplashSpider(scrapy.Spider):
    """Spider whose start URLs come from text.txt (code as posted in the question)."""

    # NOTE(review): the right-hand side of ``links =`` was lost in the paste;
    # reading the whole file is assumed -- TODO confirm. The file is opened
    # with ``with`` so the handle is closed again.
    with open('text.txt') as f:
        links = f.read()
    name = 'spide'
    # NOTE(review): this wraps the entire file contents in ONE list element,
    # so Scrapy receives a single string that is not a valid URL. This is the
    # cause of the "Unsupported URL scheme" error the question reports and is
    # left as posted.
    start_urls = [str(links)]

    def parse(self, response):
        """Yield the stripped title and description texts of a page."""
        title = response.css('.title::text').extract()
        description = response.css("div#desc").extract()
        title = list(map(str.strip, title))
        description = list(map(str.strip, description))
        # The paste kept only the two entries; the enclosing ``yield {}`` is restored.
        yield {
            'Title': title,
            'Main Info': description,
        }
but I'm catching an error: scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme
my text.txt file:
'https://','https://', ... , 'https://', 'https://'
import scrapy
class SplashSpider(scrapy.Spider):
    """Spider whose start URLs are loaded from text.txt."""

    with open('text.txt') as f:
        raw = f.read()
    # The sample file lists the URLs on one line as 'url','url',..., which
    # readlines() alone would leave as a single string. Split on commas as
    # well as newlines and drop the surrounding quotes and any spaces.
    # NOTE(review): a URL that itself contains a comma would be split here.
    links = [
        part.strip().strip('\'"').replace(' ', '')
        for part in raw.replace('\n', ',').split(',')
    ]
    links = [link for link in links if link]
    name = 'spider'
    start_urls = links

    def parse(self, response):
        """Yield the stripped title and description texts of a page."""
        title = response.css('.title::text').extract()
        description = response.css("div#desc").extract()
        title = list(map(str.strip, title))
        description = list(map(str.strip, description))
        # The paste kept only the two entries; the enclosing ``yield {}`` is restored.
        yield {
            'Title': title,
            'Main Info': description,
        }

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project, but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get a wrong token.
Second problem:
I have read the Scrapy docs on requests and tried to implement one to get the phone number, but in vain,
or this answer
Third problem:
How would I go about implementing the next page (see the commented-out code inside parse)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl posts that have phone numbers.
I have tried to search stackoverflow but I couldn't find the proper post.
Many Thanks
# Scrapy project settings.
BOT_NAME = 'crawler'
# Packages in which Scrapy looks for spiders / creates new ones.
SPIDER_MODULES = ['crawler.spiders']
# The trailing "enter code here" editor placeholder on this line was a
# syntax error and has been removed.
NEWSPIDER_MODULE = 'crawler.spiders'
# Browser-like User-Agent sent with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
    """Return ``string`` unchanged.

    NOTE(review): the name suggests a base URL was once prepended here;
    as written the function is an identity -- TODO confirm intent.
    """
    return string
class MyItem(Item):
    """Container for the data collected about one post."""

    # Value sent in the X-GUMTREE-TOKEN header of the phone request.
    token = Field()
    # Trailing digits of the post URL.
    post_id = Field()
    post_url = Field()
    # The 'data' member of the phone endpoint's JSON response.
    phone_num = Field()
    phone_url = Field()
class GumtreeSpider(scrapy.Spider):
    """Collect post URL, id, token and phone number for each listing (as posted)."""

    name = "gumtree"
    allowed_domains = [""]
    # NOTE(review): the list literal was cut off in the paste; the listing
    # URL(s) must be added here.
    start_urls = []

    def parse(self, response):
        """Request every listing found on the page."""
        # NOTE(review): a single MyItem is created here and shared by every
        # request issued in the loop, so all callbacks write into the same
        # object. That matches the repeated/mixed-up results the question
        # reports; it is left as posted.
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = '' + href
            request = Request(domain, callback=self.parse_post, meta={'domain':domain,'item':item})
            yield request
        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        # next_page = response.urljoin(next_page)
        # yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Extract the post id and token, then request the phone number."""
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        # The post id is the run of digits at the end of the URL.
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            # The right-hand side was lost in the paste; the pattern has a
            # single capture group, so group(1) is assumed -- TODO confirm.
            item['post_id'] = post_id.group(1)
            token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
            # Every double-quoted string in the script; the token is taken as
            # the second-to-last of exactly 15 such strings.
            arr_token = re.findall(r'"([^"]*)"', str(token))
            if len(arr_token) == 15:
                item['token'] = arr_token[-2]
                request = Request('' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
                yield request

    def parse_phone(self, response):
        """Store the phone number from the JSON response and return the item."""
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
# Carry the item to the callback through the request's meta dict.
meta = { 'item': item }
# response.urljoin() resolves the (possibly relative) href against the current page URL.
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
# Base URL that listing hrefs are appended to.
# NOTE(review): the value is empty in this paste -- presumably elided, TODO confirm.
gumtree = ''
# URL prefix of the phone-number endpoint; the post id is appended to it.
# NOTE(review): also empty in this paste -- presumably elided, TODO confirm.
getphone = ''
class GumtreeSpider(scrapy.Spider):
    """Collect post URL, id, token and phone number for each listing."""

    name = "gumtree"
    allowed_domains = [""]
    # NOTE(review): the list literal was cut off in the paste; the listing
    # URL(s) must be added here.
    start_urls = []

    def parse(self, response):
        """Request every listing whose href ends in a numeric id, then paginate."""
        for href in response.css('a.listing-link::attr(href)').extract():
            # The post id is the run of digits at the end of the href.
            post_id = re.match('.*?([0-9]+)$', href.strip())
            if not post_id:
                continue
            url = (gumtree + href).strip()
            # The paste never filled the pid/arr_url lists, so no request was
            # ever issued; each listing is now requested directly. A fresh
            # item per request keeps concurrent callbacks from overwriting
            # one another's fields.
            yield Request(
                url,
                callback=self.parse_post,
                meta={'url': url, 'item': CrawlerItem(), 'pid': post_id.group(1)},
                headers={'Referer': gumtree},
            )
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Extract the token from a post page and request its phone number.

        Returns:
            The phone request, or None when no token is found on the page.
        """
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        # Every double-quoted string in the script; the token is taken as the
        # second-to-last of exactly 15 such strings.
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) != 15:
            # Without a token the request below would fail on item['token'].
            return None
        item['token'] = arr_token[-2]
        ref = item['post_url']
        return Request(
            getphone + item['post_id'],
            callback=self.parse_phone,
            headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref},
            meta={'url': response.meta['url'], 'item': item},
        )

    def parse_phone(self, response):
        """Store the phone number from the JSON response and return the item."""
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        # Kept as text: the former .encode('utf-8') yields bytes on Python 3,
        # which the JSON feed export cannot serialise.
        item['phone_num'] = u''.join(phone['data']).strip()
        return item

Scrapy doesn't return start_urls as an item

Using the following code, I can get Scrapy to crawl the pages of a site, parse those pages and return the results of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page. The start_url never gets passed to the parse_item function.
What am I missing?
class GenericSpider(CrawlSpider):
    """Crawl a whole site from one start URL, parsing every followed page.

    NOTE(review): as posted, responses for the start URL itself never reach
    ``parse_item``; that is the subject of the question and is left as is.
    """

    name = "Generic"
    allowed_domains = []
    start_urls = []
    # File extensions the link extractor must not follow.
    ignored_extensions = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
        # office suites
        'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar',
    ]
    rules = [
        Rule(LinkExtractor(deny_extensions=ignored_extensions), follow=True, callback='parse_item')
    ]

    def __init__(self, start_url, source, *args, **kwargs):
        """Configure the spider for a single site.

        Args:
            start_url: URL the crawl starts from; its hostname becomes the
                only allowed domain.
            source: Stored on the spider as ``self.source``.
        """
        super(GenericSpider, self).__init__(*args, **kwargs)
        #set common settings
        Bootstrap.init(self, kwargs)
        self.source = source
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse.urlparse(start_url).hostname]

    def parse_item(self, response):
        """Callback for every page reached through the crawl rule."""
        # process response and return item ....
You'll want to define parse_start_url; something like the following should do:
class GenericSpider(CrawlSpider):
    """Same spider, with start-URL responses routed to ``parse_item`` too."""

    name = "Generic"
    allowed_domains = []
    start_urls = []

    def parse_item(self, response):
        """Callback for every page reached through the crawl rule."""
        # process response and return item ....

    # CrawlSpider hands responses for the start URLs to parse_start_url;
    # aliasing it to parse_item makes those pages go through the same parsing.
    parse_start_url = parse_item