Scrapy Spider not writing to Postgres in the correct format - scrapy

I'm scraping the Science of Us website for articles related to mental health and trying to dump it to a postgres database I'm running locally. The scrapy output is stored in a dictionary that looks like articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
On running my code, it dumps the entire list of values for each key into the column with name == key. Instead, I would like each article to be one row in the database e.g. Article 1 would have its own row with its title, teaser, link, date, author and source in each of the columns.
Here is the relevant code:
1) spider.py
from scrapy.spiders import Spider
from scrapy import Request
from mhnewsbot_app.items import SOUItem
import string
# Upper-case keyword fragments; a headline is kept when any of them occurs in its upper-cased markup.
mh_search_terms = ["DEPRESS", "MENTAL HEALTH", "EMOTIONAL HEALTH", "MENTAL DISORDER", "DIGITAL MEDICINE", "ANXI", "PSYCH", "THERAPY", "THERAPIST"]
# Translation table mapping '-' to ' '. It is not referenced by the spider code shown in this snippet.
# NOTE(review): string.maketrans is the Python 2 spelling (Python 3 offers str.maketrans) - confirm the target version.
tbl = string.maketrans('-', ' ') #To protect against cases where the article has hyphens or other special characters
# Module-level accumulator of parallel lists: position k of every list belongs to the same matched article.
articles = {'title': [], 'teaser': [], 'link': [], 'date': [], 'author': [], 'source': []}
def url_lister(page_size=50, max_start=150,
               base_url='http://nymag.com/scienceofus/?start=%s'):
    """Build the paginated listing URLs the spider starts from.

    Args:
        page_size: Step between consecutive ``start`` offsets.
        max_start: Exclusive upper bound for the ``start`` offset.
        base_url: URL template containing one ``%s`` placeholder that
            receives the offset.

    Returns:
        A list of URLs for the offsets 0, page_size, 2 * page_size, ...
        that are strictly below ``max_start``. With the defaults this is
        the offsets 0, 50 and 100.

    Raises:
        ValueError: If ``page_size`` is not positive.
    """
    if page_size <= 0:
        # A non-positive step can never reach max_start.
        raise ValueError('page_size must be a positive integer')
    return [base_url % offset for offset in range(0, max_start, page_size)]
class SOUSpider(Spider):
    """Spider over the Science of Us news feed that keeps keyword-matching articles."""

    name = 'scienceofus'
    start_urls = url_lister()

    def parse(self, response):
        """Collect the fields of every article whose headline matches a search term.

        Matching articles are appended to the module-level ``articles`` dict
        (parallel lists per field), which is then returned.

        Fields are paired by list position, so the feed is assumed to carry
        exactly one teaser/link/date/author per entry; a missing field raises
        IndexError for a matching article.
        """
        # XPath addresses attributes with '@'; '#class' / '#href' is not valid XPath.
        entry = './/li[contains(@class, "newsfeed-article")]'
        headline = entry + '/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]'
        for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
            # Evaluate every field list once per feed instead of once per match.
            headlines = article.xpath(headline).extract()
            titles = article.xpath(headline + '/text()').extract()
            teasers = article.xpath(entry + '/p[@class = "teaser"]/text()').extract()
            links = article.xpath(entry + '/a[@class = "read-more"]/@href').extract()
            dates = article.xpath(entry + '/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()
            authors = article.xpath(entry + '/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()
            # enumerate() gives the true position; list.index() would return the
            # first occurrence and mis-pair articles that share a headline.
            for position, markup in enumerate(headlines):
                upper_markup = markup.upper()
                # any() records an article once even when several terms match.
                if not any(term in upper_markup for term in mh_search_terms):
                    continue
                articles['title'].append(titles[position])
                articles['teaser'].append(teasers[position])
                articles['link'].append(links[position])
                articles['date'].append(dates[position])
                articles['author'].append(authors[position])
                articles['source'].append('Science Of Us')
        # NOTE: the whole accumulated dict is returned as one item whose fields
        # are lists; emitting one dict per article would give one row each.
        return articles
2) pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Articles, db_connect, create_articles_table
class ArticlesPipeline(object):
    """Item pipeline that stores each scraped item as an ``Articles`` row."""

    def __init__(self):
        """Connect to the database and prepare a session factory.

        Calls ``db_connect()`` for the engine and passes it to
        ``create_articles_table`` (presumably creating the table when it is
        missing - verify in ``models``).
        """
        engine = db_connect()
        create_articles_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it unchanged.

        The item's keys are passed as keyword arguments to ``Articles``.
        Any failure rolls the transaction back and is re-raised; the session
        is closed in every case.
        """
        session = self.Session()
        try:
            # Build the row inside the try block so the session is also closed
            # when the item's keys do not fit the model.
            session.add(Articles(**item))
            session.commit()
        except:  # intentionally broad: always rolled back and re-raised
            session.rollback()
            raise
        finally:
            session.close()
        return item

You are outputting a single item whose fields each hold a list of values. Output one item per article instead, because that is the shape your database pipeline expects (one item becomes one row):
def parse(self, response):
    """Yield one dict per article whose headline matches a search term.

    Each yielded dict has the keys title, teaser, link, date, author and
    source, so every article becomes its own item.

    Fields are paired by list position, so the feed is assumed to carry
    exactly one teaser/link/date/author per entry; a missing field raises
    IndexError for a matching article.
    """
    # XPath addresses attributes with '@'; '#class' / '#href' is not valid XPath.
    entry = './/li[contains(@class, "newsfeed-article")]'
    headline = entry + '/div[@class="headline-wrapper"]/a[@class="headline-link"]/h3[@class="headline"]'
    for article in response.xpath('//ul[@class="newsfeed-article-list"]'):
        # Evaluate every field list once per feed instead of once per match.
        headlines = article.xpath(headline).extract()
        titles = article.xpath(headline + '/text()').extract()
        teasers = article.xpath(entry + '/p[@class = "teaser"]/text()').extract()
        links = article.xpath(entry + '/a[@class = "read-more"]/@href').extract()
        dates = article.xpath(entry + '/div[@class="headline-wrapper"]/div[@class="headline-above"]/time/text()').extract()
        authors = article.xpath(entry + '/span[@class="by-authors"]/span/span[@class="author"]/text()').extract()
        # enumerate() gives the true position; list.index() would return the
        # first occurrence and mis-pair articles that share a headline.
        for position, markup in enumerate(headlines):
            upper_markup = markup.upper()
            # any() emits an article once even when several terms match.
            if not any(term in upper_markup for term in mh_search_terms):
                continue
            yield {
                'title': titles[position],
                'teaser': teasers[position],
                'link': links[position],
                'date': dates[position],
                'author': authors[position],
                'source': 'Science Of Us',
            }

Related

scrapy-playwright returning nothing but an error

I'm learning Scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using the CrawlSpider with a rule including a process_request that triggers the request to run through playwright. In my callback def I can print a value found on the page, but not return or yield anything. I've attempted to cache the data into an item, and return/yield a dict, all of which produces the error.
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response
def set_playwright_true(request, response):
    """Mark a rule-extracted request so it is downloaded through Playwright.

    Used as a ``process_request`` hook of a crawl rule.

    Args:
        request: The request produced by the rule; its ``meta`` is updated
            in place.
        response: The response the request was extracted from (unused).

    Returns:
        The same request object.
    """
    request.meta["playwright"] = True
    # NOTE(review): this exposes the Playwright page to the callback; the
    # callback in this snippet neither uses nor closes it - confirm it is needed.
    request.meta["playwright_include_page"] = True
    # The handler only runs PageCoroutine instances; a bare tuple of strings
    # is skipped, so the selector would never be waited for.
    request.meta["playwright_page_coroutines"] = [
        PageCoroutine('wait_for_selector', 'span.store-name-city'),
    ]
    return request
class StoreSpider(CrawlSpider):
    """Crawl spider that follows directory links and parses store pages into StoreItem objects."""

    name = "retailer"
    allowed_domains = ['retailer.com']
    start_urls = ['https://www.retailer.com/store/0000-city-ak']

    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        #'DOWNLOAD_DELAY': .5 ,
        #'CONCURRENT_REQUESTS_PER_DOMAIN': 3 ,
        'DOWNLOAD_HANDLERS': {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }

    # First rule: follow directory links without a callback.
    # Second rule: parse store pages, routing each request through set_playwright_true.
    rules = (
        Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
        Rule(
            LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')),
            callback='parse_item',
            follow=False,
            process_request=set_playwright_true,
        ),
    )

    async def parse_item(self, response):
        """Build a single StoreItem from a store page and return it in a one-element list."""

        def meta_content(selector):
            # Value of the ``content`` attribute of the first matching element.
            return response.css(selector).attrib['content']

        def first_text(selector):
            # First text match, or None when nothing matches.
            return response.css(selector).get()

        store = StoreItem()
        self.logger.info('***** Start processing ' + response.url + '. *****')

        display_name = meta_content('meta[itemprop=alternateName]') + ' - ' + first_text('span.store-name-city::text')
        print(display_name)
        store['Name'] = display_name
        store['StoreID'] = meta_content('meta[itemprop=storeID]')

        text_fields = (
            ('Address1', 'span.store-address-line-1::text'),
            ('City', 'span.store-address-city::text'),
            ('State', 'span.store-address-state::text'),
            ('Zip', 'span.store-address-postal::text'),
            ('Phone', 'div.store-phone::text'),
        )
        for field, selector in text_fields:
            store[field] = first_text(selector)

        store['Latitude'] = meta_content('meta[itemprop=latitude]')
        store['Longitude'] = meta_content('meta[itemprop=longitude]')
        return [store]
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I have managed to extract information from static websites, but I am now trying my hand at following URLs and extracting data (which of course involves some JavaScript). I have installed scrapy-splash for this, and it is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button to the top right side takes you to the next page (which is javascript, hence splash). I want to scrape some basic data (like company name, sectors etc) on all the pages till the last one. This is what I have done so far and I need help to correct this to successfully execute.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    """Spider that renders a portfolio company page through Splash and scrapes its details."""

    name = 'Portfolio'
    start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']

    def start_requests(self):
        """Request every start URL through Splash with a 3 second wait."""
        for start_url in self.start_urls:
            yield SplashRequest(url=start_url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        """Yield the company record, then a Splash request built from the 'next' link."""
        # The first five text nodes of the info group are read by position.
        info = response.css('.item_detail-main-info-group-item::text')
        record = {
            'companyname': response.css('h1.item_detail-main-info-heading::text').extract_first(),
            'sectors': info[0].extract(),
            'investmentyear': info[1].extract(),
            'status': info[2].extract(),
            'location': info[3].extract(),
            'region': info[4].extract(),
            'team': response.css('div.item_detail-main-info-group a::text').extract(),
        }
        yield record

        # NOTE(review): extract() returns a list, so the None check never
        # fails and urljoin receives a list rather than a string.
        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is None:
            return
        yield SplashRequest(
            urlparse.urljoin('https://www.ta.com', next_page),
            callback=self.parse,
            args={"wait": 3},
        )
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update. The issue was in the order in which I had the scraping of websites. Below is the updated code which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    """Spider that walks the portfolio sector pages and scrapes every linked company page."""

    name = 'Portfolio'
    start_urls = [
        'https://www.ta.com/portfolio/business-services',
        'https://www.ta.com/portfolio/consumer',
        'https://www.ta.com/portfolio/financial-services',
        'https://www.ta.com/portfolio/healthcare',
        'https://www.ta.com/portfolio/technology',
    ]

    def start_requests(self):
        """Request every sector page through Splash with a 3 second wait."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        """Follow every company tile link on a sector page to ``parse1``."""
        for href in response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract():
            # urljoin keeps an already-absolute href intact, whereas plain
            # string concatenation would prepend the domain a second time.
            company_url = urlparse.urljoin('https://www.ta.com', href)
            yield response.follow(company_url, callback=self.parse1)

    def parse1(self, response):
        """Yield one record describing the company on a detail page.

        The first five info-group text nodes are taken, in page order, as
        sectors, investment year, status, location and region. Positions the
        page does not provide are reported as None.
        """
        # Query the info group once and pad it, so a page with fewer than five
        # entries still yields a record instead of raising IndexError.
        details = response.css('.item_detail-main-info-group-item::text').extract()
        details += [None] * (5 - len(details))
        sectors, investmentyear, status, location, region = details[:5]

        yield {
            'companyname': response.css('h1.item_detail-main-info-heading::text').extract_first(),
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': response.css('div.item_detail-main-info-group a::text').extract(),
            'about_company': response.css('h2.item_detail-main-content-heading::text').extract(),
            'about_company_detail': response.css('div.markdown p::text').extract(),
        }

scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme

I collect links from several directories and then insert them into start_urls as a link variable
import scrapy
class SplashSpider(scrapy.Spider):
    """Spider that scrapes the title and description block of the configured start URL."""

    name = 'spide'

    # Read text.txt once, while the class body is executed.
    f = open('text.txt')
    links = f.read()
    f.close()
    # NOTE(review): the whole file content becomes ONE list entry here,
    # not one entry per URL.
    start_urls = [str(links)]

    def parse(self, response):
        """Yield the stripped title texts and the stripped ``div#desc`` markup."""
        titles = [text.strip() for text in response.css('.title::text').extract()]
        descriptions = [markup.strip() for markup in response.css("div#desc").extract()]
        yield {
            'Title': titles,
            'Main Info': descriptions,
        }
but I'm catching an error: scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme
my text.txt file:
'https:// url1.com','https:// url2.com', ... , 'https:// url300000.com', 'https:// url300001.com'
import scrapy
def _read_start_urls(path):
    """Return the URLs stored in the text file at ``path``.

    Accepts one URL per line as well as comma-separated, optionally quoted
    URLs on a single line. Surrounding quotes and all spaces are removed, and
    empty entries (blank lines, trailing commas) are skipped so that no
    scheme-less ``''`` URL reaches the scheduler.

    NOTE: entries are split on commas, so a URL that itself contains a comma
    would be split apart.
    """
    urls = []
    with open(path) as handle:
        for line in handle:
            for chunk in line.split(','):
                url = chunk.strip().strip('\'"').replace(' ', '')
                if url:
                    urls.append(url)
    return urls


class SplashSpider(scrapy.Spider):
    """Spider that scrapes title and description from every URL listed in text.txt."""

    name = 'spider'
    # Loaded once, while the class body is executed.
    links = _read_start_urls('text.txt')
    start_urls = links

    def parse(self, response):
        """Yield the stripped title texts and the stripped ``div#desc`` markup of a page."""
        title = [text.strip() for text in response.css('.title::text').extract()]
        description = [markup.strip() for markup in response.css("div#desc").extract()]
        yield {
            'Title': title,
            'Main Info': description,
        }

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project, but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the post from the main page I get a wrong token.
Second problem:
I have read and I have tried to implement scrapy docs request to get the phone number but in vain,
or this answer
stackoverflow
Third problem:
How would I go about implementing the next page (the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl post that have phone numbers
I have tried to search stackoverflow but I couldn't find the proper post.
Many Thanks
setting.py
# Project name used by Scrapy for this bot.
BOT_NAME = 'crawler'
# Modules searched for spiders, and the module new spiders are generated into.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
# Browser-like user agent sent with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
    """Return ``string`` unchanged (identity placeholder; not called in the code shown)."""
    return string
class MyItem(Item):
    """Scraped data for one Gumtree post."""
    # Value sent in the X-GUMTREE-TOKEN header of the phone-number request.
    token = Field()
    # Trailing digits of the post URL.
    post_id = Field()
    # URL of the post page.
    post_url = Field()
    # 'data' value of the JSON returned by the phone-number request.
    phone_num = Field()
    # Not assigned by the spider code shown in this snippet.
    phone_url = Field()
class GumtreeSpider(scrapy.Spider):
    """Spider that opens every car listing and requests the seller's phone number."""

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Yield one post request per listing link on the search page."""
        # NOTE(review): this single item instance is attached to every
        # request issued for the page.
        shared_item = MyItem()
        listing_hrefs = response.css('a.listing-link::attr(href)').extract()
        for href in listing_hrefs:
            post_url = 'https://www.gumtree.com' + href
            yield Request(
                post_url,
                callback=self.parse_post,
                meta={'domain': post_url, 'item': shared_item},
            )
        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        # next_page = response.urljoin(next_page)
        # yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Record post URL, id and token on the item, then request the phone number."""
        meta = response.meta
        item = meta['item']
        item['post_url'] = meta['domain']

        # The post id is the run of digits that ends the URL.
        id_match = re.match('.*?([0-9]+)$', item['post_url'])
        if id_match:
            item['post_id'] = id_match.group(1)

        # The token is taken as the second-to-last quoted string of the
        # script, but only when exactly 15 quoted strings are found.
        scripts = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        quoted = re.findall(r'"([^"]*)"', str(scripts))
        if len(quoted) == 15:
            item['token'] = quoted[-2]

        reveal_url = 'https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id']
        yield Request(
            reveal_url,
            headers={'X-GUMTREE-TOKEN': item['token']},
            callback=self.parse_phone,
            meta={'item': item},
        )

    def parse_phone(self, response):
        """Store the 'data' value of the JSON response as the phone number and return the item."""
        item = response.meta['item']
        payload = json.loads(response.body_as_unicode())
        item['phone_num'] = payload['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] actually being passed to parse_token()?
I'd do the following:
# Build the meta dict explicitly, and let response.urljoin resolve the
# (possibly relative) href against the URL of the current response.
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'
class GumtreeSpider(scrapy.Spider):
    """Spider that walks the car listings and collects each seller's phone number."""

    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Yield one post request per listing, then a request for the next results page.

        Listings whose href does not end in a numeric id are skipped.
        """
        for href in response.css('a.listing-link::attr(href)').extract():
            href = href.strip()
            # Values stay text: a utf-8 encoded bytes value cannot be matched
            # by the str regex pattern on Python 3.
            id_match = re.match('.*?([0-9]+)$', href)
            if not id_match:
                continue
            url = response.urljoin(href)
            yield Request(
                url,
                callback=self.parse_post,
                # A fresh item per listing: one shared instance would be
                # overwritten by whichever response is processed last.
                meta={'url': url, 'item': CrawlerItem(), 'pid': id_match.group(1)},
                headers={'Referer': gumtree},
            )

        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            yield Request(response.urljoin(next_page), callback=self.parse)

    def parse_post(self, response):
        """Return the phone-number request for a post, or None when no token is found."""
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']

        # The token is taken as the second-to-last quoted string of the
        # script, but only when exactly 15 quoted strings are found.
        scripts = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        quoted = re.findall(r'"([^"]*)"', str(scripts))
        if len(quoted) != 15:
            # Without a token the reveal request cannot be authorised; skip
            # the post instead of failing on the missing 'token' field.
            self.logger.warning('No phone token found on %s', response.url)
            return None
        item['token'] = quoted[-2]

        return Request(
            getphone + item['post_id'],
            callback=self.parse_phone,
            headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': item['post_url']},
            meta={'url': item['post_url'], 'item': item},
        )

    def parse_phone(self, response):
        """Store the phone number from the JSON response and return the finished item."""
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.text)
        item['phone_num'] = u''.join(phone['data']).strip()
        return item

Scrapy doesn't return start_urls as an item

Using the following code, I can get Scrapy to crawl the pages of a site, parse those pages and return the results of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page. The start_url never gets passed to the parse_item function.
What am I missing?
class GenericSpider(CrawlSpider):
    """Crawl spider whose start URL and allowed domain are supplied at construction time."""
    name = "Generic"
    # Empty placeholders; __init__ sets both on the instance.
    allowed_domains = []
    start_urls = []
    # File extensions passed to the link extractor as deny_extensions.
    ignored_extensions = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
        # office suites
        'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar',
    ]
    # Single rule: follow every extracted link and pass each response to parse_item.
    rules = [
        Rule(LinkExtractor(deny_extensions=ignored_extensions), follow=True, callback='parse_item')
    ]
    def __init__(self, start_url, source, *args, **kwargs):
        """Configure the spider for one site.

        start_url: URL the crawl begins at; its hostname becomes the only allowed domain.
        source: stored on the instance as ``self.source``.
        """
        super(GenericSpider, self).__init__(*args, **kwargs)
        #set common settings
        # NOTE(review): Bootstrap is not defined in the visible code - confirm what it configures.
        Bootstrap.init(self, kwargs)
        self.source = source
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse.urlparse(start_url).hostname]
    def parse_item(self, response):
        process response and return item ....
You'll want to define parse_start_url, something like the following should do:
class GenericSpider(CrawlSpider):
    name = "Generic"
    allowed_domains = []
    start_urls = []
    ...
    def parse_item(self, response):
        process response and return item ....
    # Alias: CrawlSpider hands the responses of the start URLs to
    # parse_start_url, so they go through the same callback as followed links.
    parse_start_url = parse_item