Spider grabs 1 item from each page and not all items - scrapy

My spider seems to grab only about one job listing from each webpage. When I remove parse_jobs and call load_item() in parse instead, I can extract all the job listings for each page. So the issue likely arises when it goes to parse_jobs and loads the items; however, I cannot figure out what is wrong.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest
headers = {
'authority': 'api2.branch.io',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'content-type': 'application/x-www-form-urlencoded',
'accept': '*/*',
'origin': 'https://www.reed.co.uk',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.reed.co.uk/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class ReedItem(scrapy.Item):
category = Field(output_processor = TakeFirst())
salary = Field(output_processor = TakeFirst())
title = Field(output_processor = TakeFirst())
organisation = Field(output_processor = TakeFirst())
region = Field(input_processor = MapCompose(str.strip),
output_processor = TakeFirst())
items = Field(output_processor = TakeFirst())
post = Field(input_processor = MapCompose(),
output_processor = Join(" "))
page_no = Field(output_processor = TakeFirst())
class ReedSpider(scrapy.Spider):
name = 'reed'
degrees={'upper': ['Finance','Accounting','Aeronautical-Engineering','Manufacturing-Engineering'],
'degrees_entry': ['degree-finance-entry','degree-accounting-entry','degree-aeronautical-engineering-entry','degree-manufacturing-engineering-entry'],
'degrees_graduate': ['degree-finance-graduate','degree-accounting-graduate','degree-aeronautical-engineering-graduate','degree-manufacturing-engineering-graduate'],
'degrees': ['degree-finance','degree-accounting','degree-aeronautical-engineering','degree-manufacturing-engineering'],
'graduate_entry': ['graduate-finance-entry','graduate-accounting-entry','graduate-aeronautical-engineering-entry','graduate-manufacturing-engineering-entry'],
'graduate': ['graduate-finance','graduate-accounting','graduate-aeronautical-engineering','graduate-manufacturing-engineering'],
'sector': ['Accountancy_Finance','Accountancy_Finance','Engineering_Manufacturing','Engineering_Manufacturing'],
'degree_type': ['Accountancy_finance','Accountancy_finance','Aeronautical_Engineering','Manufacturing_Engineering']}
degree = pd.DataFrame(degrees)
start_urls = defaultdict(list)
custom_settings = {
'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'DOWNLOAD_DELAY':0.5,
#'LOG_LEVEL':'INFO',
}
def start_requests(self):
for degrees, degrees_entry,degrees_graduate, graduate_entry, graduate,sector in zip(self.degree.degrees,self.degree.degrees_entry,self.degree.degrees_graduate,self.degree.graduate_entry,self.degree.graduate, self.degree.sector):
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_entry}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate_entry}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_graduate}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate}-jobs')
for items, urls in self.start_urls.items():
for url in urls:
yield scrapy.Request(
url = url,
headers=headers,
callback = self.parse,
cb_kwargs = {
'items':items,
'page_no':0
}
)
def parse(self, response, items, page_no):
container = response.xpath("//div[#class='row search-results']")
for lists in container:
page_no += 1
loader = ItemLoader(ReedItem(), selector = lists)
loader.add_value('items', items)
loader.add_xpath('region', ".//div[#class='metadata']//ul//li[#class='location']//text()")
loader.add_value('page_no', page_no)
loader.add_xpath('category', "//div[#class='col-sm-11 col-xs-12 page-title']//h1/text()")
loader.add_xpath('title', './/h3[#class="title"]/a/#title')
loader.add_xpath('salary', './/li[#class="salary"]/text()')
loader.add_xpath('organisation', './/a[#class="gtmJobListingPostedBy"]/text()')
links = response.xpath('.//h3[#class="title"]/a/#href').get()
yield response.follow(
response.urljoin(links),
callback = self.parse_jobs,
cb_kwargs = {
'loader':loader
})
next_page = response.xpath('//a[#id="nextPage"]/#href').get()
if next_page:
yield response.follow(
url = next_page,
callback = self.parse,
headers=headers,
cb_kwargs = {
'items':items,
'page_no':page_no
})
def parse_jobs(self, response, loader):
loader.add_value('post',response.xpath('(//span[#itemprop="description"]/p/text()) | (//span[#itemprop="description"]/p//text()) | (//span[#itemprop="description"]/ul//li/text())').getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'reed_jobs_post.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ReedSpider)
process.start()

I commented out the lines I changed.
def parse(self, response, items, page_no):
    """Build one item loader per job listing on a results page, then paginate.

    Args:
        response: The search-results page response.
        items: Value stored in each item's ``items`` field; forwarded
            unchanged to the next results page.
        page_no: Counter that is incremented once per results page and
            stored in each item's ``page_no`` field.

    Yields:
        One request per listing (handled by ``parse_jobs``, which receives
        the partially filled loader through ``cb_kwargs``) and, when a
        "next page" link exists, one request for the following results page.
    """
    # container = response.xpath("//div[@class='row search-results']")
    # Iterate the <article> elements inside the results container instead of
    # the single container <div>, so the loop body runs once per article.
    container = response.xpath("//div[@class='row search-results']//article")
    page_no += 1
    for lists in container:
        # page_no += 1
        loader = ItemLoader(ReedItem(), selector=lists)
        loader.add_value('items', items)
        loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
        loader.add_value('page_no', page_no)
        # Absolute (//) path: evaluated against the whole document rather
        # than relative to the current article.
        loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
        loader.add_xpath('title', './/h3[@class="title"]/a/@title')
        loader.add_xpath('salary', './/li[@class="salary"]/text()')
        loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
        # links = response.xpath('.//h3[@class="title"]/a/@href').get()
        # Query the current article, not the whole response: on the response
        # .get() would return the first matching link on the page every time.
        links = lists.xpath('.//h3[@class="title"]/a/@href').get()
        yield response.follow(
            response.urljoin(links),
            callback=self.parse_jobs,
            cb_kwargs={
                'loader': loader
            }
        )
    next_page = response.xpath('//a[@id="nextPage"]/@href').get()
    if next_page:
        yield response.follow(
            url=next_page,
            callback=self.parse,
            headers=headers,
            cb_kwargs={
                'items': items,
                'page_no': page_no
            })

Related

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

Hope I am asking this the right way - just confused with what's going on: I have my working script (below). I'm trying to take the URLs from a spreadsheet, rather than copy and paste them in - basically, creating urlsA from column N on the sheet connected.
I've tested it out - I can print urlsA to the terminal with no problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script; I receive a "No connection adapters were found" error.
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
As you might be pulling a list of lists, hence, this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error as a list is not a valid URL.
Needed to flatten urlsA after seeing it was an array of arrays. Using this, then calling flatten fixed the issue:
def flatten(l):
    """Flatten one level of nesting into a single flat list.

    Args:
        l: An iterable of iterables (for example a list of rows, where each
            row is itself a list of cell values).

    Returns:
        A new list holding every inner element, in the original order.
    """
    return [item for sublist in l for item in sublist]

When downloading csv files from webpage getting missing url scheme

I am fairly new to scraping and have been trying to directly download .csv files from a website. I managed to fix my last issue with the edit; however, I get a new error when trying to download the files. The error is as follows:
raise ValueError(f'Missing scheme in request url: {self._url}')
ValueError: Missing scheme in request url: h
I am not sure what is triggering this error because the links follow properly to the next function.
For example, here is what I have tried:
import scrapy
from nhs.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
name = 'nhs_scape'
#allowed_domains = ['nh']
start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
custom_settings = {
'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url = url,
callback = self.parse
)
def parse(self, response):
side_panel = response.xpath("//aside[#class='subnav group minimal_nav desktop-only']//ul[#class='children']//li")
for years in side_panel:
year_links = years.xpath('.//a/#href').get()
yield response.follow(year_links, callback = self.download_files)
def download_files(self, response):
test_files = response.xpath("//article[#class='rich-text']//p")
month_files = response.xpath("//article[#class='rich-text']//h3")
for files, mn in zip(test_files, month_files):
all_files = files.xpath('.//a//#href').getall()
all_file_names = files.xpath('.//a//text()').getall()
month_year = mn.xpath('.//text()').get()
for ind_files,ind_text in zip(all_files, all_file_names):
item = DownfilesItem()
if '.xls' in ind_files and 'Monthly' in ind_text:
item['file_urls'] = ind_files
item['original_file_name'] = ind_text
yield item
elif '.xls' in ind_files and 'Week' in ind_text:
item['file_urls'] = ind_files
item['original_file_name'] = ind_text
yield item
Items.py:
import scrapy
class DownfilesItem(scrapy.Item):
# define the fields for your item here like:
file_urls = scrapy.Field()
original_file_name = scrapy.Field()
Pipelines.py:
from scrapy.pipelines.files import FilesPipeline
class DownfilesPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None):
file_name: str = request.url.split("/")[1]
return file_name
Settings.py:
ITEM_PIPELINES = {'nhs.pipelines.DownfilesPipeline': 150}
FILES_STORE = "Files"
Updated error after @supersuers' answer:
IsADirectoryError: [Errno 21] Is a directory: 'Files/'
It seems this is caused by FILES_STORE = "Files"; however, when I remove this setting I do not get an error, but no files are downloaded either.
item['file_urls'] should be a list:
if '.xls' in ind_files and 'Monthly' in ind_text:
item['file_urls'] = [ind_files]
item['original_file_name'] = ind_text
yield item
elif '.xls' in ind_files and 'Week' in ind_text:
item['file_urls'] = [ind_files]
item['original_file_name'] = ind_text
yield item
Edit:
The second error is because of the pipeline, file_name is an empty string, you can change it for example to:
file_name: str = request.url.split("/")[-1]
Edit 2:
I think that the problem is in the xpath selectors, try this and tweak it to your needs:
import scrapy
from tempbuffer.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
    """Collect the monthly and weekly .xls links from each yearly statistics page.

    ``parse`` follows every link in the side navigation; ``download_files``
    yields one ``DownfilesItem`` per matching spreadsheet link.
    """
    name = 'nhs_scape'
    #allowed_domains = ['nh']
    start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        """Request every start URL and hand the response to ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        """Follow each link found in the side navigation's child list."""
        side_panel = response.xpath("//aside[@class='subnav group minimal_nav desktop-only']//ul[@class='children']//li")
        for years in side_panel:
            year_links = years.xpath('.//a/@href').get()
            # An <li> without a link yields None, which response.follow rejects.
            if year_links:
                yield response.follow(year_links, callback=self.download_files)

    def download_files(self, response):
        """Yield a ``DownfilesItem`` for each monthly or weekly .xls link.

        ``file_urls`` is always a list, because the files pipeline iterates
        over it; a bare string would be iterated character by character.
        """
        # test_files = response.xpath("//article[@class='rich-text']//p")
        # Only paragraphs that directly contain a link to an .xls file.
        test_files = response.xpath("//article[@class='rich-text']//p[a[contains(@href, '.xls')]]")
        # month_files = response.xpath("//article[@class='rich-text']//h3")
        # couldn't make a prettier xpath selector
        month_files = response.xpath("//article[@class='rich-text']//h3[starts-with(text(), 'January') or starts-with(text(), 'February') or starts-with(text(), 'March') or starts-with(text(), 'April') or starts-with(text(), 'May') or starts-with(text(), 'June') or starts-with(text(), 'July') or starts-with(text(), 'August') or starts-with(text(), 'September') or starts-with(text(), 'October') or starts-with(text(), 'November') or starts-with(text(), 'December')]")
        # zip stops at the shorter sequence, so paragraphs beyond the last
        # month heading are not processed.
        for files, mn in zip(test_files, month_files):
            all_files = files.xpath('.//a//@href').getall()
            all_file_names = files.xpath('.//a//text()').getall()
            # NOTE(review): extracted but not yet stored on the item.
            month_year = mn.xpath('.//text()').get()
            for ind_files, ind_text in zip(all_files, all_file_names):
                if '.xls' not in ind_files:
                    continue
                if 'Monthly' in ind_text or 'Week' in ind_text:
                    item = DownfilesItem()
                    # urljoin leaves absolute URLs untouched and gives
                    # relative ones a scheme and host.
                    item['file_urls'] = [response.urljoin(ind_files)]
                    item['original_file_name'] = ind_text
                    yield item

Remove duplicates based on a unique ID

I want to remove duplicate data based on unique id's. Each listing from the site has a unique id and so I want to filter the data to remove any duplicates.
After looking at the documentation for scrapy: Filter duplicates
I have tried to implement a similar pipeline to remove the duplicates; however, I am not sure how to get it to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
headers = {
'authority': 'www.theparking.eu',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': '*/*',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'origin': 'https://www.theparking.eu',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
#'referer': 'https://www.theparking.eu/used-cars/used-cars/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class DuplicatesPipeline:
def __init__(self):
# self.ids_seen = set()
self.titles_seen = set()
def process_item(self, unique_id, spider):
if unique_id in self.titles_seen:
raise DropItem("Duplicate item title found: %s" % unique_id)
else:
self.titles_seen.add(unique_id)
return unique_id
class Countryitem(scrapy.Item):
make = Field(output_processor = TakeFirst())
unique_id = Field(output_processor = TakeFirst())
page_number = Field(output_processor = TakeFirst())
class CountrySpider(scrapy.Spider):
name = "country"
test_dict={'country_id': [4,5,109,7,6,8,87],
'country': ['australia','austria','bahamas','belarus','belgium','bosnia and herzegovina','brasil'],
'make': [20, 13, 131, 113, 32, 62, 104],
'model': [1108, 4655, 687, 492, 499, 702, 6143],
'engine': [5, 11, 10, 7, 14, 21, 170]}
#for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):
def start_requests(self):
for id_ in zip(self.test_dict['country_id']):
for id_marque in self.test_dict['make']:
for models in self.test_dict['model']:
for engine in self.test_dict['engine']:
for page in range(1, 10000):
yield scrapy.FormRequest(
url = f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
method="POST",
callback = self.parse,
formdata = {
'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page,id_, id_marque, models, engine),
'tabs': '["t0"]'
},
headers=headers,
cb_kwargs = {
'page_number':page
}
)
def parse(self, response,page_number):
container = json.loads(response.text)
test=container['#lists']
soup = BeautifulSoup(test, 'lxml')
for i in soup:
carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
carUnique = i.select('li[tref]')
for make, unique in zip_longest(
carMake, carUnique
):
loader = ItemLoader(Countryitem())
# loader.add_value('page_number', page_number)
loader.add_value("unique_id", unique['tref'])
loader.add_value("page_number",page_number)
if make != None:
loader.add_value('make', make.text)
else:
loader.add_value('make', "None")
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'park.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(CountrySpider)
process.start()
class DuplicatesPipeline:
    """Item pipeline that drops every item whose ``unique_id`` was already seen."""

    def __init__(self):
        # ``unique_id`` values of all items that have passed through so far.
        self.titles_seen = set()

    def process_item(self, item, spider):
        """Return ``item`` the first time its ``unique_id`` appears.

        Args:
            item: The scraped item; must provide a ``unique_id`` key.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item when its ``unique_id`` has not been seen before.

        Raises:
            DropItem: When an item with the same ``unique_id`` was already
                processed.
        """
        # Read the id from the item: there is no free ``unique_id`` variable
        # in this scope, so formatting one into the message raised NameError.
        unique_id = item['unique_id']
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        self.titles_seen.add(unique_id)
        return item
Also add to custom_settings:
custom_settings = {
'ITEM_PIPELINES': {
'myproject.path_to_your_file.DuplicatesPipeline': 300
}
}

Why not all of the coordinates are generated while running Geopy to scrape yellow pages?

The output is a CSV file with a list of businesses, including name, address, telephone and coordinates. For some reason only some of the coordinates are generated. The addresses that come back without coordinates can be geocoded successfully when I run geopy on them one at a time, so geopy is able to find coordinates for all of them, but for some reason it sometimes skips them. I thought it might need more time to call the API, so I added threading, but that didn't solve the issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ypscraper#gmail.com")
main_list = []
def extract(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup.find_all('div', class_ = 'listing__content__wrap--flexed jsGoToMp')
def transform(articles):
    """Convert listing tags into business records and append them to ``main_list``.

    Each listing is geocoded exactly once from its street and city. Listings
    that cannot be geocoded get empty-string coordinates.

    Args:
        articles: Iterable of BeautifulSoup tags, one per business listing.

    Returns:
        None. One dict per listing is appended to the module-level
        ``main_list``.
    """
    # Local import: geopy is already a dependency of this script.
    from geopy.extra.rate_limiter import RateLimiter

    # Space the calls at least one second apart and retry on errors.
    # Geocoding twice per listing inside worker threads that wrote a global
    # hid every failure and could leave a stale or undefined result behind.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

    def text_of(tag):
        """Return the tag's text, or '' when ``find`` matched nothing."""
        return tag.text if tag is not None else ''

    for item in articles:
        name = item.find('a', class_='listing__name--link listing__link jsListingName').text
        street = text_of(item.find('span', {'itemprop': 'streetAddress'}))
        city = text_of(item.find('span', {'itemprop': 'addressLocality'}))
        province = text_of(item.find('span', {'itemprop': 'addressRegion'}))
        postCode = text_of(item.find('span', {'itemprop': 'postalCode'}))
        phone = text_of(item.find('li', class_='mlr__submenu__item')).strip()

        query = street + ' ' + city
        print(query)
        # Skip the lookup when there is no address text to search for.
        location = geocode(query) if query.strip() else None
        if location is not None:
            # Both coordinates come from the same lookup result.
            slatitude = location.latitude
            slongitude = location.longitude
        else:
            slatitude = ''
            slongitude = ''

        business = {
            'name': name,
            'street': street,
            'city': city,
            'province': province,
            'postCode': postCode,
            'phone': phone,
            'slongitude': slongitude,
            'slatitude': slatitude
        }
        main_list.append(business)
    return
def load():
df = pd.DataFrame(main_list)
df.to_csv('repairshopsbc', index=False)
for x in range(1,2):
print(f'Getting page {x}')
articles = extract(f'https://www.yellowpages.ca/search/si/{x}/car+repair/British+Columbia+BC')
transform(articles)
time.sleep(5)
load()
print('Saved to CSV')

Getting repeated requests from same url with different values

I am trying to crawl some data as my side project but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the post from the main page I get a wrong token.
Second problem:
I have read and I have tried to implement scrapy docs request to get the phone number but in vain,
or this answer
stackoverflow
Third problem:
How would I go about implementing the next page (see the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl posts that have phone numbers.
I have tried to search stackoverflow but I couldn't find the proper post.
Many Thanks
setting.py
# Scrapy settings for the "crawler" project.
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
# Send a browser-like User-Agent instead of Scrapy's default one.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
return string
class MyItem(Item):
token = Field()
post_id = Field()
post_url = Field()
phone_num = Field()
phone_url = Field()
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = MyItem()
for href in response.css('a.listing-link::attr(href)').extract():
domain = 'https://www.gumtree.com' + href
request = Request(domain, callback=self.parse_post, meta={'domain':domain,'item':item})
yield request
# next_page = response.css('li.pagination-next a::attr("href")').extract_first()
# if next_page is not None:
# next_page = response.urljoin(next_page)
# yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_url'] = response.meta['domain']
post_id = re.match('.*?([0-9]+)$', item['post_url'])
if post_id:
item['post_id'] = post_id.group(1)
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
yield request
def parse_phone(self, response):
item = response.meta['item']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = phone['data']
return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'
class GumtreeSpider(scrapy.Spider):
    """Crawl Gumtree car listings and fetch each seller's phone number.

    Request chain: ``parse`` (search page) -> ``parse_post`` (listing page,
    extracts the reveal token) -> ``parse_phone`` (reveal endpoint).
    """
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        """Request every listing on a search page, then the next search page.

        Links whose href does not end in a numeric id are skipped, because
        the id is needed later to build the reveal URL.
        """
        for href in response.css('a.listing-link::attr(href)').extract():
            href = href.strip()
            post_id = re.match('.*?([0-9]+)$', href)
            if not post_id:
                continue
            url = gumtree + href
            yield Request(
                url,
                callback=self.parse_post,
                # A fresh item per listing: one item shared by all in-flight
                # requests gets overwritten by whichever callback runs last,
                # which produces repeated or mixed-up results.
                meta={'url': url, 'item': CrawlerItem(), 'pid': post_id.group(1)},
                headers={'Referer': gumtree},
            )
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        """Extract the reveal token from a listing and request the phone number.

        Returns:
            A request for the reveal endpoint, or ``None`` when no token
            could be extracted from the page.
        """
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        # NOTE(review): the token is taken as the second-to-last of exactly 15
        # quoted strings; this presumably mirrors the page's script layout at
        # the time of writing - confirm against a live page.
        if len(arr_token) != 15:
            return None
        item['token'] = arr_token[-2]
        return Request(
            getphone + item['post_id'],
            callback=self.parse_phone,
            headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': item['post_url']},
            meta={'url': response.meta['url'], 'item': item},
        )

    def parse_phone(self, response):
        """Store the phone number from the reveal endpoint's JSON and emit the item."""
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.text)
        item['phone_num'] = u''.join(phone['data']).strip()
        return item