How to set headless = False in scrapy-playwright? - scrapy

In scrapy-playwright, how do I set headless = False? I am trying something like this:

def start_requests(self):
    yield scrapy.Request(
        url=url,
        callback=self.parse,
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageCoroutine("click", selector="//input[@name='typeAheadInputField']"),
            ],
        ),
    )
    return super().start_requests()

def parse(self, response):
    pass

settings.py
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": False}
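For reference, a minimal settings.py sketch that should give a headed (visible) browser, assuming scrapy-playwright is installed and the download handlers are registered as described in its README; the launch options are passed straight to Playwright's browser launch call:

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# headless=False should open a visible browser window
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": False}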

Related

scrapy_selenium, how to pass driver to parse

I'm running this code with scrapy_selenium, but I'm not able to pass the driver to parse_page. Can anyone identify what I'm doing wrong?
import time

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.keys import Keys


class LSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.url.com',
            wait_time=3,
            screenshot=True,
            callback=self.login
        )

    def login(self, response):
        driver = response.request.meta['driver']
        search_input = driver.find_element_by_xpath("(//input[@class='input__input'])[1]")
        search_input.send_keys("user")
        search_input_password = driver.find_element_by_xpath("(//input[@class='input__input'])[2]")
        search_input_password.send_keys("password")
        search_input.send_keys(Keys.ENTER)
        time.sleep(3)
        yield SeleniumRequest(
            url='https://url.com/url',
            callback=self.parse_page,
            screenshot=True
        )

    def parse_page(self, response):
        driver = response.request.meta['driver']
        search = driver.find_element_by_xpath("//input[@class='search-box']")
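As a side note, the find_element_by_xpath helpers were removed in Selenium 4, so if the driver exposed by scrapy_selenium is a current Selenium driver, the lookups need the By API instead. A hedged sketch of the login step with that change (the XPaths and credentials are the question's own placeholders):

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def login(self, response):
    driver = response.request.meta['driver']
    # Selenium 4 style element lookups
    search_input = driver.find_element(By.XPATH, "(//input[@class='input__input'])[1]")
    search_input.send_keys("user")
    password_input = driver.find_element(By.XPATH, "(//input[@class='input__input'])[2]")
    password_input.send_keys("password")
    search_input.send_keys(Keys.ENTER)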

Scrapy pagination not working on TripAdvisor

I'm trying to scrape the restaurant pages on TripAdvisor (just to learn how it works).
However, I only get the first page.
What am I missing?
Here is the code, thanks!
import scrapy


class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract(),
                'link': stores.css('a.bHGqj').xpath("@href").extract()
            }
        next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        # next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        # next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
@djmystica, now it's working fine:
import scrapy


class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract_first(),
                'link': stores.css('a.bHGqj').xpath("@href").extract_first()}
        # next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
        # next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            abs_next_page = f'https://www.tripadvisor.com{next_page}'
            yield response.follow(abs_next_page, callback=self.parse)
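As a small simplification (not from the original post): response.follow also accepts relative URLs and resolves them against response.url, so the absolute URL does not need to be built by hand. A sketch:

next_page = response.xpath('//a[contains(text(), "Next")]/@href').get()
if next_page:
    # relative href is resolved against response.url
    yield response.follow(next_page, callback=self.parse)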

Scrapy: how to get another URL from a parsed response

I would like to take a value obtained in parse, connect to another URL with it, and parse that response as well. How do I fix this?
import requests

from scrapy import Spider
from scrapy.selector import Selector
from stack.items import StackItem


class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["*"]
    global n
    # n = 1997
    start_urls = ['https://www.melon.com/chart/age/list.htm?chartType=YE&chartGenre=KPOP&chartDate=2010',]

    def parse(self, response):
        url = 'https://www.melon.com/song/detail.htm?songId='
        questions = Selector(response).xpath('//*[@id="frm"]/table/tbody/tr')
        for question in questions:
            item = StackItem()
            item['musicid'] = question.xpath('td/div/input/@value').extract()[0]
            item['title'] = question.xpath('td[4]/div/div/div/span/strong/a/@title').extract()
            item['artlist'] = question.xpath(
                'td[4]/div/div/div[2]/div[1]/a/text()').extract()
            item['album'] = question.xpath(
                'td[4]/div/div/div[2]/div[2]/a/text()').extract()
            item['sunwhi'] = question.xpath(
                'td[2]/div/span/text()').extract()[0]
            response_url = requests.get(url + musicid)

    def parse(self, response):
        questions = Selector(response).xpath('//*[@id="downloadfrm"]/div/div/div[2]/div[2]/dl/dd')
        for question in questions:
            item = StackItem()
            item['album'] = question.xpath('a/text()').extract()[0]
            yield item
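The code above fetches the detail page with requests.get and defines parse twice, so the second definition silently replaces the first. A minimal sketch of the usual Scrapy approach instead, assuming import scrapy alongside the question's imports; the parse_detail name is my own, and the XPaths are taken from the question:

def parse(self, response):
    base = 'https://www.melon.com/song/detail.htm?songId='
    for row in response.xpath('//*[@id="frm"]/table/tbody/tr'):
        item = StackItem()
        item['musicid'] = row.xpath('td/div/input/@value').extract_first()
        item['sunwhi'] = row.xpath('td[2]/div/span/text()').extract_first()
        # let Scrapy schedule the detail page and carry the partial item along
        yield scrapy.Request(
            base + item['musicid'],
            callback=self.parse_detail,
            meta={'item': item},
        )

def parse_detail(self, response):
    item = response.meta['item']
    item['album'] = response.xpath(
        '//*[@id="downloadfrm"]/div/div/div[2]/div[2]/dl/dd/a/text()').extract_first()
    yield item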

Get next page using scrapy

I'm interested in getting contractor data for Atlanta from this page:
http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658
So far I can open the links for the categories
'Additions & Remodeling'
'Architects & Engineers'
'Fountains & Ponds'
......
.....
.....
But I can open only the first page:
http://www.1800contractor.com/d.Additions-Remodeling.Atlanta.GA.-12001.html?startingIndex=0&showDirectory=true
I'm trying to get the next page using the link on the 'Next' button:

next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
request = scrapy.Request(absolute_next_page_url)
yield request
But it makes no difference.
This is the code of my spider:
import scrapy


class Spider_1800(scrapy.Spider):
    name = '1800contractor'
    allowed_domains = ['1800contractor.com']
    start_urls = (
        'http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658',
    )

    def parse(self, response):
        urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
        for url in urls:
            absolute_url = response.urljoin(url)
            request = scrapy.Request(
                absolute_url, callback=self.parse_contractors)
            yield request
        # process next page
        next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        request = scrapy.Request(absolute_next_page_url)
        yield request

    def parse_contractors(self, response):
        name = response.xpath(
            '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
        contractor = {
            'name': name,
            'url': response.url}
        yield contractor
You are not paginating the right request. parse handles the requests generated from the URLs in start_urls, which means you first need to enter each category from http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658 and paginate from there.
def parse(self, response):
    urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
    for url in urls:
        absolute_url = response.urljoin(url)
        request = scrapy.Request(
            absolute_url, callback=self.parse_contractors)
        yield request

def parse_contractors(self, response):
    name = response.xpath(
        '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contractor = {
        'name': name,
        'url': response.url}
    yield contractor

    next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)
After hitting the start URL, your XPath for picking the contractor URLs is not working. The Next link is present on the contractor page, so it is followed after the contractor URL. This will work for you:
def parse(self, response):
    urls = response.xpath('//table//*[@class="hiCatNaked"]/@href').extract()
    for url in urls:
        absolute_url = response.urljoin(url)
        request = scrapy.Request(
            absolute_url, callback=self.parse_contractors)
        yield request

def parse_contractors(self, response):
    name = response.xpath('/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contractor = {
        'name': name,
        'url': response.url}
    yield contractor

    next_page_url = response.xpath('//a[b[contains(., "Next")]]/@href').extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)
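As an aside (my own simplification, not part of either answer): response.follow resolves relative URLs itself, so urljoin is not strictly needed, and both the category links and the Next link can be followed the same way. A sketch using the selectors from the answers above:

def parse(self, response):
    # follow every category link on the Atlanta landing page
    for href in response.xpath('//table//*[@class="hiCatNaked"]/@href').extract():
        yield response.follow(href, callback=self.parse_contractors)

def parse_contractors(self, response):
    yield {'url': response.url}
    next_page_url = response.xpath('//a[b[contains(., "Next")]]/@href').extract_first()
    if next_page_url:
        # relative URL is resolved against response.url
        yield response.follow(next_page_url, callback=self.parse_contractors)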

Scrapy doesn't return start_urls as an item

Using the following code, I can get Scrapy to crawl the pages of a site, parse those pages and return the results of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page. The start_url never gets passed to the parse_item function.
What am I missing?
import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class GenericSpider(CrawlSpider):
    name = "Generic"
    allowed_domains = []
    start_urls = []

    ignored_extensions = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
        # office suites
        'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar',
    ]

    rules = [
        Rule(LinkExtractor(deny_extensions=ignored_extensions), follow=True, callback='parse_item')
    ]

    def __init__(self, start_url, source, *args, **kwargs):
        super(GenericSpider, self).__init__(*args, **kwargs)
        # set common settings
        Bootstrap.init(self, kwargs)
        self.source = source
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse.urlparse(start_url).hostname]

    def parse_item(self, response):
        # process response and return item ...
        pass
You'll want to define parse_start_url; something like the following should do:
class GenericSpider(CrawlSpider):
    name = "Generic"
    allowed_domains = []
    start_urls = []

    ...

    def parse_item(self, response):
        # process response and return item ...
        pass

    parse_start_url = parse_item
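For completeness, an equivalent and slightly more explicit form of the same idea (my phrasing, not the answerer's): CrawlSpider hands responses for the start_urls to parse_start_url, which returns nothing by default, so delegating it to parse_item makes the start page go through the same item logic as every crawled page.

def parse_start_url(self, response):
    # CrawlSpider calls this for each response to a URL in start_urls;
    # delegate to the same callback the crawl rules use.
    return self.parse_item(response)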