Pandas only saving first page of scraped data into CSV

I'm trying to scrape data from the first 5 pages of this site and save it to a CSV. Everything seems to be working fine, but only the first page gets saved to the CSV. I think it may be an indentation issue, but I haven't been able to figure it out. Any help would be appreciated, thanks!
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import random
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'pis=3; ut=hB0ZZQzmgNnOh9lBQEu8bTmaWMULGZQUahdMlzzGd1k; SESSION_TOKEN=j-_KSJ8wscb0aeKUf-Ndr5wDcTCFdY3E2qwROIDEt5U; zjs_user_id=null; zjs_anonymous_id=%22hB0ZZQzmgNnOh9lBQEu8bTmaWMULGZQUahdMlzzGd1k%22; _pxvid=ef32ffea-7620-11eb-9542-0242ac12000d; _ga=GA1.2.845026454.1614116940; _gid=GA1.2.991776272.1614116940; _gcl_au=1.1.1819279014.1614116941; OptanonConsent=isIABGlobal=false&datestamp=Tue+Feb+23+2021+16%3A21%3A01+GMT-0600+(Central+Standard+Time)&version=5.9.0&landingPath=NotLandingPage&groups=1%3A1%2C0_172180%3A1%2C0_248632%3A1%2C0_172218%3A1%2C0_172151%3A1%2C0_172362%3A1%2C3%3A1%2C0_172152%3A1%2C0_172351%3A1%2C4%3A1%2C0_172338%3A1%2C0_172360%3A1%2C0_172153%3A1%2C0_172154%3A1%2C0_172343%3A1%2C0_177347%3A1%2C0_172331%3A1%2C0_172155%3A1%2C0_172156%3A1%2C0_248627%3A1%2C0_172157%3A1%2C0_248631%3A1%2C0_172158%3A1%2C0_172357%3A1%2C0_248633%3A1%2C0_172348%3A1%2C0_172159%3A1%2C0_172160%3A1%2C0_172161%3A1%2C0_172162%3A1%2C0_172163%3A1%2C0_172164%3A1%2C0_172165%3A1%2C0_172166%3A1%2C0_172167%3A1%2C0_172168%3A1%2C0_172169%3A1%2C0_172170%3A1%2C0_172171%3A1%2C0_172172%3A1%2C0_172173%3A1%2C0_172174%3A1%2C0_172175%3A1%2C0_172176%3A1%2C0_172177%3A1%2C0_172178%3A1%2C0_172179%3A1%2C0_172181%3A1%2C0_172182%3A1%2C0_172183%3A1%2C0_172184%3A1%2C0_172185%3A1%2C0_172186%3A1%2C0_172187%3A1%2C0_172188%3A1%2C0_172189%3A1%2C0_172190%3A1%2C0_172191%3A1%2C0_172192%3A1%2C0_172193%3A1%2C0_172195%3A1%2C0_172197%3A1%2C0_172198%3A1%2C0_172199%3A1%2C0_172200%3A1%2C0_172201%3A1%2C0_172202%3A1%2C0_172203%3A1%2C0_172204%3A1%2C0_172205%3A1%2C0_172206%3A1%2C0_172207%3A1%2C0_172208%3A1%2C0_172209%3A1%2C0_172210%3A1%2C0_172211%3A1%2C0_172212%3A1%2C0_172213%3A1%2C0_172214%3A1%2C0_172215%3A1%2C0_172216%3A1%2C0_172217%3A1%2C0_172219%3A1%2C0_172220%3A1%2C0_172221%3A1%2C0_172222%3A1%2C0_172223%3A1%2C0_172330%3A1%2C0_172333%3A1%2C0_172334%3A1%2C0_172335%3A1%2C0_172336%3A1%2C0_172337%3A1%2C0_172339%3A1%2C0_172340%3A1%2C0_172341%3A1%2C0_172342%3A1%2C0_172344%3A1%2C0_248628%3A1%2C0_172345%3A1%2C0_172346%3A1%2C0_172349%3A1%2C0_172350%3A1%2C0_172352%3A1%2C0_172353%3A1%2C0_172354%3A1%2C0_172355%3A1%2C0_172356%3A1%2C0_172358%3A1%2C0_172359%3A1%2C0_172361%3A1%2C0_248629%3A1%2C0_248630%3A1%2C0_248634%3A1&AwaitingReconsent=false; _px3=d6e4661fe0f89390bd501cf6d96d7c4ce6da6b629f038f745c417aec166457da:jkvtO/Et7fQoQ9uQjR7cLnpUmnMnTHJjbtYEYxtF8Af3XMaosoyoSH29Qf+5aiOY4Z/BqkATEDsYMrO6hKGNOQ==:1000:v1Auy0PIGkZc2wIJIcWfwOV3SoBz2sZHwNv/67LxTEKseVa/NakBSB7e9s397Ol/RCx/TcpBu3ZoJilwD/sP/3PIkNcxZXjbK+aHVEpfKf37sDvp8iNYyLqZ6QjNsa/0NXHrpVIWto2qgiaU21O2v9R9EgDeaTBEt4MCmMT87V4=',
'Host': 'hotpads.com',
'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
params = {
'lat: 41.7985 lon: -87.7117 z: 11'
}
pages = np.arange(1, 6)
for page in pages:
    page = requests.get('https://hotpads.com/chicago-il/apartments-for-rent' + '?page=' + str(page), headers=headers)
    source = page.content
    soup = BeautifulSoup(source, 'lxml')
    postings = soup.find_all('div', class_="AreaListingsContainer")
    time.sleep(random.randint(2, 10))
    for p in postings:
        url = ["https://hotpads.com" + u['href'] for u in p.find_all('a', href=True, text=True)]
        address = [a.get_text() for a in p.find_all('h4', class_="styles__ListingCardName-y78yl0-8 jQmZHq")]
        price = [p.get_text() for p in p.find_all('div', class_="styles__ListingCardPrice-y78yl0-17 cguwHc")]
        beds = [b.get_text() for b in p.find_all('div', class_="styles__ListingCardBedDisplay-y78yl0-7 iPqMa")]
    homes = list(zip(url, address, price, beds))
    df = pd.DataFrame(homes, columns=['URL', 'Address', 'Price', 'Beds'])
    print(df)
    df.to_csv('Chicago_homes.csv')

It looks like you're overwriting the original df in every iteration. There are two solutions to this.
1- Initialize the dataframe before the loop and write the entire frame to the file after the loop has completed.
df = pd.DataFrame()
for page in pages:
    # do stuff...
    df = pd.concat([df, pd.DataFrame(homes, columns=['URL', 'Address', 'Price', 'Beds'])])
# put this outside of the loop
df.to_csv('Chicago_homes.csv')
2- Append the dataframe data to the file within the loop. This is beneficial when the loop has many iterations and you don't want to hold a large dataframe in memory.
import os  # needed for os.path.exists

for page in pages:
    # do stuff...
    if os.path.exists('Chicago_homes.csv'):
        df.to_csv('Chicago_homes.csv', mode='a', header=False)
    else:
        df.to_csv('Chicago_homes.csv')
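For completeness, here is a minimal end-to-end sketch of option 1, assuming the headers dict from the question is already defined and that the page structure and class names are unchanged (all_homes is just an illustrative name):

from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import random

all_homes = []  # rows collected across every page

for page in range(1, 6):
    resp = requests.get('https://hotpads.com/chicago-il/apartments-for-rent?page=' + str(page),
                        headers=headers)  # reuse the headers dict from the question
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all('div', class_="AreaListingsContainer"):
        url = ["https://hotpads.com" + u['href'] for u in p.find_all('a', href=True, text=True)]
        address = [a.get_text() for a in p.find_all('h4', class_="styles__ListingCardName-y78yl0-8 jQmZHq")]
        price = [d.get_text() for d in p.find_all('div', class_="styles__ListingCardPrice-y78yl0-17 cguwHc")]
        beds = [b.get_text() for b in p.find_all('div', class_="styles__ListingCardBedDisplay-y78yl0-7 iPqMa")]
        all_homes.extend(zip(url, address, price, beds))
    time.sleep(random.randint(2, 10))

# build the frame once, after every page has been scraped
df = pd.DataFrame(all_homes, columns=['URL', 'Address', 'Price', 'Beds'])
df.to_csv('Chicago_homes.csv', index=False)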

Related

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

Hope I'm asking this the right way; I'm just confused about what's going on. I have my working script (below). I'm trying to take the URLs from a spreadsheet rather than copy and paste them in, basically creating urlsA from column N on the connected sheet.
I've tested it out: I can print urlsA to the terminal no problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script, and I'm receiving a "No connection adapters were found" error.
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
You are probably pulling a list of lists, which is why this line
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
breaks with the "No connection adapters were found" error: a list is not a valid URL.
I needed to flatten urlsA after seeing it was an array of arrays. Adding this helper and calling flatten fixed the issue:
def flatten(l):
    fl = []
    for sublist in l:
        for item in sublist:
            fl.append(item)
    return fl
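For reference, a quick sketch of how the helper slots into the script above (assuming batch_get keeps returning one single-cell list per row, as it did here); itertools.chain does the same flattening without a custom helper:

from itertools import chain

ShUrls = wk.batch_get(('N3:N',))[0]        # list of rows, each row itself a list of cell values
urlsA = flatten(ShUrls)                    # or, equivalently:
urlsA = list(chain.from_iterable(ShUrls))

for urlA in urlsA:                         # each urlA is now a plain string URL
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})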

ValueError: 4 columns passed, passed data had 1 columns

I'm a new learner. If the data in a column from the webpage is not an integer, I cannot append the row to my data frame.
[webpage data as seen in this image](https://i.stack.imgur.com/KBjRU.png)
Here is my code:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
ticker = input("Type your ticker symbol: ")
def get_balance_sheet_from_yfinance_web(ticker):
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    header = {'Connection': 'keep-alive',
              'Expires': '-1',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
              }
    r = requests.get(url, headers=header)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)
    col = []
    pd.set_option('max_colwidth', None)
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)
    pd.set_option('max_colwidth', None)
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        i = 0
        idx = ""
        val = []
        for h in div.find_all('span'):
            if i == 0:
                idx = h.get_text()
            else:
                num = int(h.get_text().replace(",", "")) * 1000
                val.append(num)
            i += 1
        row = pd.DataFrame([val], columns=col, index=[idx])
        df = df.append(row)
    df.to_csv(f'{ticker}.csv')
    return df
print(get_balance_sheet_from_yfinance_web(ticker))
I have tried replace('-', 0)
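No answer was recorded for this question, but the error itself means that for some row the val list ends up with fewer entries than col has headers (4 column headers, 1 value). One way to cope, regardless of why a given row comes up short, is to parse cells defensively and pad short rows before building the row frame. A minimal, hypothetical sketch (parse_cell is an illustrative name, not part of the original code):

def parse_cell(text):
    # turn a cell like '12,345' into an int; return None for '-' or blank cells
    text = text.strip().replace(",", "")
    if text in ("", "-"):
        return None
    try:
        return int(text) * 1000
    except ValueError:
        return None

# inside the 'fin-row' loop, instead of: num = int(h.get_text().replace(",", "")) * 1000
#     val.append(parse_cell(h.get_text()))
# and before row = pd.DataFrame([val], columns=col, index=[idx]), pad short rows:
#     val += [None] * (len(col) - len(val))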

Spider grabs 1 item from each page and not all items

My spider seems to grab only about one job listing from each webpage. When I remove parse_jobs and call load_item() directly in parse, I can extract all the job listings for each page. So the issue likely arises when it goes to parse_jobs and loads the items, but I cannot figure out what it is.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest
headers = {
'authority': 'api2.branch.io',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'content-type': 'application/x-www-form-urlencoded',
'accept': '*/*',
'origin': 'https://www.reed.co.uk',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.reed.co.uk/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class ReedItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())
    region = Field(input_processor=MapCompose(str.strip),
                   output_processor=TakeFirst())
    items = Field(output_processor=TakeFirst())
    post = Field(input_processor=MapCompose(),
                 output_processor=Join(" "))
    page_no = Field(output_processor=TakeFirst())
class ReedSpider(scrapy.Spider):
    name = 'reed'
    degrees = {'upper': ['Finance','Accounting','Aeronautical-Engineering','Manufacturing-Engineering'],
               'degrees_entry': ['degree-finance-entry','degree-accounting-entry','degree-aeronautical-engineering-entry','degree-manufacturing-engineering-entry'],
               'degrees_graduate': ['degree-finance-graduate','degree-accounting-graduate','degree-aeronautical-engineering-graduate','degree-manufacturing-engineering-graduate'],
               'degrees': ['degree-finance','degree-accounting','degree-aeronautical-engineering','degree-manufacturing-engineering'],
               'graduate_entry': ['graduate-finance-entry','graduate-accounting-entry','graduate-aeronautical-engineering-entry','graduate-manufacturing-engineering-entry'],
               'graduate': ['graduate-finance','graduate-accounting','graduate-aeronautical-engineering','graduate-manufacturing-engineering'],
               'sector': ['Accountancy_Finance','Accountancy_Finance','Engineering_Manufacturing','Engineering_Manufacturing'],
               'degree_type': ['Accountancy_finance','Accountancy_finance','Aeronautical_Engineering','Manufacturing_Engineering']}
    degree = pd.DataFrame(degrees)
    start_urls = defaultdict(list)
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
        #'LOG_LEVEL':'INFO',
    }
    def start_requests(self):
        for degrees, degrees_entry, degrees_graduate, graduate_entry, graduate, sector in zip(self.degree.degrees, self.degree.degrees_entry, self.degree.degrees_graduate, self.degree.graduate_entry, self.degree.graduate, self.degree.sector):
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_graduate}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate}-jobs')
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    headers=headers,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items,
                        'page_no': 0
                    }
                )
    def parse(self, response, items, page_no):
        container = response.xpath("//div[@class='row search-results']")
        for lists in container:
            page_no += 1
            loader = ItemLoader(ReedItem(), selector=lists)
            loader.add_value('items', items)
            loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
            loader.add_value('page_no', page_no)
            loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
            loader.add_xpath('title', './/h3[@class="title"]/a/@title')
            loader.add_xpath('salary', './/li[@class="salary"]/text()')
            loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
            links = response.xpath('.//h3[@class="title"]/a/@href').get()
            yield response.follow(
                response.urljoin(links),
                callback=self.parse_jobs,
                cb_kwargs={
                    'loader': loader
                })
        next_page = response.xpath('//a[@id="nextPage"]/@href').get()
        if next_page:
            yield response.follow(
                url=next_page,
                callback=self.parse,
                headers=headers,
                cb_kwargs={
                    'items': items,
                    'page_no': page_no
                })

    def parse_jobs(self, response, loader):
        loader.add_value('post', response.xpath('(//span[@itemprop="description"]/p/text()) | (//span[@itemprop="description"]/p//text()) | (//span[@itemprop="description"]/ul//li/text())').getall())
        yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'reed_jobs_post.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ReedSpider)
process.start()
The original container XPath matched the single results wrapper div, so only one loader was created per page, and the listing link was taken from the whole response rather than from each card. Selecting every //article inside the results div and reading the link relative to it yields all the listings. The lines I changed are commented out below.
def parse(self, response, items, page_no):
    # container = response.xpath("//div[@class='row search-results']")
    container = response.xpath("//div[@class='row search-results']//article")
    page_no += 1
    for lists in container:
        # page_no += 1
        loader = ItemLoader(ReedItem(), selector=lists)
        loader.add_value('items', items)
        loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
        loader.add_value('page_no', page_no)
        loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
        loader.add_xpath('title', './/h3[@class="title"]/a/@title')
        loader.add_xpath('salary', './/li[@class="salary"]/text()')
        loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
        # links = response.xpath('.//h3[@class="title"]/a/@href').get()
        links = lists.xpath('.//h3[@class="title"]/a/@href').get()
        yield response.follow(
            response.urljoin(links),
            callback=self.parse_jobs,
            cb_kwargs={
                'loader': loader
            }
        )
    next_page = response.xpath('//a[@id="nextPage"]/@href').get()
    if next_page:
        yield response.follow(
            url=next_page,
            callback=self.parse,
            headers=headers,
            cb_kwargs={
                'items': items,
                'page_no': page_no
            })
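To sanity-check the selector change before running the full crawl, a quick scrapy shell session against one of the start URLs built in start_requests (the site may require the same headers the spider sends) should show one node per listing rather than a single wrapper:

scrapy shell "https://www.reed.co.uk/jobs/degree-finance-jobs"
>>> len(response.xpath("//div[@class='row search-results']"))           # old selector: one wrapper node
>>> len(response.xpath("//div[@class='row search-results']//article"))  # new selector: one node per job card
>>> response.xpath("//div[@class='row search-results']//article//h3[@class='title']/a/@href").get()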

Remove duplicates based on a unique ID

I want to remove duplicate data based on unique IDs. Each listing from the site has a unique id, so I want to filter the data to remove any duplicates.
After looking at the documentation for scrapy: Filter duplicates
I have tried to implement a similar pipeline to remove the duplicates; however, I am unfamiliar with how to get this to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
headers = {
'authority': 'www.theparking.eu',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': '*/*',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'origin': 'https://www.theparking.eu',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
#'referer': 'https://www.theparking.eu/used-cars/used-cars/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class DuplicatesPipeline:
    def __init__(self):
        # self.ids_seen = set()
        self.titles_seen = set()

    def process_item(self, unique_id, spider):
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        else:
            self.titles_seen.add(unique_id)
            return unique_id

class Countryitem(scrapy.Item):
    make = Field(output_processor=TakeFirst())
    unique_id = Field(output_processor=TakeFirst())
    page_number = Field(output_processor=TakeFirst())
class CountrySpider(scrapy.Spider):
    name = "country"
    test_dict = {'country_id': [4,5,109,7,6,8,87],
                 'country': ['australia','austria','bahamas','belarus','belgium','bosnia and herzegovina','brasil'],
                 'make': [20, 13, 131, 113, 32, 62, 104],
                 'model': [1108, 4655, 687, 492, 499, 702, 6143],
                 'engine': [5, 11, 10, 7, 14, 21, 170]}

    #for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):
    def start_requests(self):
        for id_ in zip(self.test_dict['country_id']):
            for id_marque in self.test_dict['make']:
                for models in self.test_dict['model']:
                    for engine in self.test_dict['engine']:
                        for page in range(1, 10000):
                            yield scrapy.FormRequest(
url = f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
method="POST",
callback = self.parse,
formdata = {
'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page,id_, id_marque, models, engine),
'tabs': '["t0"]'
},
headers=headers,
cb_kwargs = {
'page_number':page
}
)
    def parse(self, response, page_number):
        container = json.loads(response.text)
        test = container['#lists']
        soup = BeautifulSoup(test, 'lxml')
        for i in soup:
            carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
            carUnique = i.select('li[tref]')
            for make, unique in zip_longest(
                carMake, carUnique
            ):
                loader = ItemLoader(Countryitem())
                # loader.add_value('page_number', page_number)
                loader.add_value("unique_id", unique['tref'])
                loader.add_value("page_number", page_number)
                if make != None:
                    loader.add_value('make', make.text)
                else:
                    loader.add_value('make', "None")
                yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'park.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(CountrySpider)
process.start()
The pipeline needs to read the unique_id off the item it receives:
class DuplicatesPipeline:
    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['unique_id'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item['unique_id'])
        else:
            self.titles_seen.add(item['unique_id'])
            return item
Also add to custom_settings:
custom_settings = {
    'ITEM_PIPELINES': {
        'myproject.path_to_your_file.DuplicatesPipeline': 300
    }
}
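One thing to watch: the spider in the question runs as a standalone script through CrawlerProcess rather than inside a Scrapy project, so the pipeline path in ITEM_PIPELINES has to point at wherever the class actually lives. A minimal sketch, assuming DuplicatesPipeline is defined in the same script (and is therefore importable via __main__):

class CountrySpider(scrapy.Spider):
    name = "country"
    custom_settings = {
        'ITEM_PIPELINES': {
            # the DuplicatesPipeline class defined earlier in this same file
            '__main__.DuplicatesPipeline': 300
        }
    }
    # ... rest of the spider unchanged ...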

Python selenium/soup not scrolling and printing all job containers on LinkedIn

Here's the problem statement: The base_site link below takes us to a job search URL.
There are small containers that show jobs on the left pane of the webpage.
The problem is that with this code I can only see 7 containers as output.
For example, it shows the first seven job result locations in the output, whereas I am expecting all of them to be shown. To address this I am using scrollIntoView, but that doesn't seem to help either.
What is it that I'm missing?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep
def get_driver():
    options = Options()
    options.add_argument("user-data-dir=C:\\Users\\abc\\AppData\\Local\\Google\\Chrome\\User Data")
    path = 'C:\\Program Files (x86)\\Google\\chromedriver.exe'
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(path, options=options)
    text_search = 'Product Development Engineer'
    location_search = 'california'
    # base_site = 'https://www.linkedin.com/jobs'
    base_site = 'https://www.linkedin.com/jobs/search/?currentJobId=2638809245&f_E=3%2C4&f_JT=F&f_SB2=3&f_TPR=r60' \
                '4800&geoId=102095887&keywords=product%20development%20engineer&location=California%2C%20United%20States&sortBy=R'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                             "70.0.3538.102 Safari/537.36 Edge/18.19582"}
    driver.get(base_site)
    parsing_job_data(driver, base_site, headers)

def parsing_job_data(driver, base_site, headers):
    try:
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        results = soup.find_all('div', class_="job-card-container relative job-card-list job-card-container--clickable "
                                              "job-card-list--underline-title-on-hover jobs-search-results-list__list-"
                                              "item--active jobs-search-two-pane__job-card-container--viewport-tracking"
                                              "-0")
        sleep(1)
        each_container = soup.select('[class*="occludable-update"]', limit=20)
        for container in each_container:
            element = driver.find_element_by_class_name("artdeco-entity-lockup__caption")
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            element.click()
            job_title = container.find('a', class_='disabled ember-view job-card-container__link job-card-list__title').text
            location = container.find('li', class_='job-card-container__metadata-item').text
            job_title = job_title.strip()
            location = location.strip()
            print(job_title, ', ', location)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    get_driver()
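A note on the Selenium attempt before the answer's code: on the logged-in jobs search page the result cards sit in their own scrollable list pane, so sending Keys.END to the html element generally does not load more cards. A rough, hypothetical sketch of scrolling that pane instead (the class name is a guess and LinkedIn changes these frequently):

# hypothetical: scroll the left-hand results pane rather than the page body
pane = driver.find_element_by_class_name("jobs-search-results-list")  # class name is an assumption
for _ in range(10):
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", pane)
    sleep(1)  # give the lazily loaded cards time to render

The answer below sidesteps the browser entirely and pages through LinkedIn's guest jobs endpoint with httpx and trio: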
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

allin = []

async def worker(channel):
    async with channel:
        async for num in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(headers)
                params = {
                    "currentJobId": "2638809245",
                    "f_E": "3,4",
                    "f_JT": "F",
                    "f_SB2": "3",
                    "f_TPR": "r604800",
                    "geoId": "102095887",
                    "keywords": "product development engineer",
                    "location": "California, United States",
                    "sortBy": "R",
                    "position": "1",
                    "pageNum": "0",
                    "start": num
                }
                r = await client.get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search', params=params)
                soup = await get_soup(r.text)
                goal = [(x.h3.get_text(strip=True), x.select_one('.job-search-card__location').get_text(strip=True))
                        for x in soup.select('.base-search-card__info')]
                allin.extend(goal)

async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(2):
                nurse.start_soon(worker, receiver.clone())

            async with sender:
                for num in range(0, 450, 25):
                    await sender.send(num)

    df = pd.DataFrame(allin, columns=["Title", "Location"])
    print(df)
    #df.to_csv('result.csv', index=False)

if __name__ == "__main__":
    trio.run(main)
Output:
Title Location
0 Packaging Process Engineer Fremont, CA
1 Project Engineer Oakland, CA
2 Process Engineer- Materials and Fibers Santa Clarita, CA
3 Senior Product Design Engineer Carson, CA
4 Design Engineer Sacramento, CA
.. ... ...
436 Software Development Engineer Irvine, CA
437 Software Development Engineer Sunnyvale, CA
438 Software Development Engineer San Luis Obispo, CA
439 Software Development Engineer - Luna Irvine, CA
440 Software Development Engineer Irvine, CA
[441 rows x 2 columns]