How to crawl start URLs only? - scrapy

I'm trying to make the spider crawl just the provided start URLs without following any extracted links. I've tried setting rules = (Rule (follow=False),) but it still follows links. Does anyone know how to download the start URLs only?
EDIT:
Here's some code
class Spider(CrawlSpider):
    name = 'spider'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    def __init__(self, mode, *args, **kwargs):
        if mode == 'scan':
            self.start_urls = ['https://www.example.com/']
            self.rules = (
                Rule(callback="parse_obj", follow=False),
            )
            self.custom_settings = {
                'COMPRESSION_ENABLED': True,
                'URLLENGTH_LIMIT': 100,
                'DOWNLOAD_DELAY': 1
            }
        elif mode == 'crawl':
            # something else
            pass
        super(Spider, self).__init__(*args, **kwargs)
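For what it's worth, a Rule with follow=False still schedules a request for every link its (default) LinkExtractor finds and hands those responses to the callback; follow only stops link extraction from the resulting pages. If the goal really is just the start URLs, a minimal sketch (illustrative names, not a drop-in replacement for the mode logic above) is to use a plain scrapy.Spider, which never extracts links on its own:
import scrapy
class ScanSpider(scrapy.Spider):
    name = 'scan_spider'
    start_urls = ['https://www.example.com/']
    custom_settings = {
        'COMPRESSION_ENABLED': True,
        'URLLENGTH_LIMIT': 100,
        'DOWNLOAD_DELAY': 1,
    }
    def parse(self, response):
        # Only the start URLs reach this callback; no links are extracted or followed.
        yield {'url': response.url, 'status': response.status}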

Related

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

Hope I am asking this the right way - just confused with what's going on: I have my working script (below). I'm trying to take the URLs from a spreadsheet rather than copy and paste them in - basically, creating urlsA from column N on the connected sheet.
I've tested it out - I can print urlsA to the terminal no problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script; I'm receiving a "No connection adapters were found" error (shown with the broken code below).
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
as you might be pulling a list of lists. In that case this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the "No connection adapters were found" error, because a list is not a valid URL.
I needed to flatten urlsA after seeing it was an array of arrays. Adding this helper and calling flatten fixed the issue:
def flatten(l):
    fl = []
    for sublist in l:
        for item in sublist:
            fl.append(item)
    return fl
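For reference, the same flattening can be done inline with a comprehension (a sketch assuming ShUrls is the list of single-cell rows that batch_get returns for the N3:N range):
ShUrls = wk.batch_get(('N3:N',))[0]  # list of rows; each row is a list of cell values
urlsA = [cell for row in ShUrls for cell in row]  # flatten into a plain list of URL strings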

python requests.request "GET" returns different result

I am sending a GET request to a website using requests.request in Python. When I set a User-Agent in the headers I only get script tags in the response. When I do not set a User-Agent I get all the tags except script. What is the problem? Any idea?
Code with only script tags:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
URL = 'https://www.gate-away.com/properties/lazio/rome/rome/id/567087'
response = requests.request('GET', URL, headers=headers).text
soup = BeautifulSoup(response, 'html.parser')
print(len(soup.findAll('script', {'type': "text/javascript"})))
print(len(soup.findAll('div')))
Output is:
15 (a non-zero number)
0
Code with all tags but script:
headers = {}
URL = 'https://www.gate-away.com/properties/lazio/rome/rome/id/567087'
response = requests.request('GET', URL, headers=headers).text
soup = BeautifulSoup(response, 'html.parser')
print(len(soup.findAll('script', {'type': "text/javascript"})))
print(len(soup.findAll('div')))
Output is:
0
100 (a non-zero number)
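One way around this, which the answer code below takes, is to keep the browser User-Agent and read the JSON the page embeds in its var preloadedData = ... script tag instead of scraping the rendered HTML: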
import re
import requests
from bs4 import BeautifulSoup
import json
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
url = "https://www.gate-away.com/properties/lazio/rome/rome/id/567087"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
data_script = soup.find('script', string=re.compile("var preloadedData = "))
actual_data = json.loads(data_script.contents[0].split('var preloadedData = ')[1].rsplit(';', 1)[0])
print(actual_data)
This will return a dict with pretty much all data in question.

Debugging Selenium crawling errors

I'm trying to crawl pictures using Selenium, but I keep getting errors. What should I do?
Here's the code.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import urllib.request
driver = webdriver.Chrome()
driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
elem = driver.find_element_by_name("q")
elem.send_keys("조코딩")
elem.send_keys(Keys.RETURN)
SCROLL_PAUSE_TIME = 1
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        try:
            driver.find_element_by_css_selector(".mye4qd").click()
        except:
            break
    last_height = new_height
images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
count = 1
for image in images:
    try:
        image.click()
        time.sleep(2)
        imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgUrl, str(count) + ".jpg")
        count = count + 1
    except:
        pass
driver.close()
Below is the error output.
google.py:24: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
driver.find_element_by_css_selector(".mye4qd").click()
google.py:29: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead
images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
google.py:35: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
[1936:1984:0514/142442.544:ERROR:ssl_client_socket_impl.cc(999)] handshake failed; returned -1, SSL error code 1, net_error -200
I keep changing the code, but I still get errors no matter what I try.
What should I do?
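The deprecation warnings point at Selenium 4's unified locator API: find_element(By...) and find_elements(By...) replace the find_element_by_* helpers. A minimal sketch of the equivalent calls (same selectors as above, assuming Selenium 4); the ssl_client_socket_impl handshake line is usually just ChromeDriver log noise and unrelated to the warnings:
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
# Selenium 4 replacements for the deprecated find_element_by_* calls above
elem = driver.find_element(By.NAME, "q")                        # was find_element_by_name("q")
images = driver.find_elements(By.CSS_SELECTOR, ".rg_i.Q4LuWd")  # was find_elements_by_css_selector(...)
# inside the scroll loop: driver.find_element(By.CSS_SELECTOR, ".mye4qd").click()
driver.quit()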

I can't get text from a span element in bs4

When I try to get the text, I get output like:
price = item.find('span').text
AttributeError: 'NoneType' object has no attribute 'text'
code:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url = 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')[0]
for item in table:
    price = item.find('span').text
    print(price)
Try:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url = 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')
for item in table:
    price = item.find('span', class_='float_lang_base_2')
    print(price.text)
1.1753
1.1752
- 0.4
Or if you require the field:
for item in table:
    field = item.find('span', class_='float_lang_base_1')
    price = item.find('span', class_='float_lang_base_2')
    print(field.text, ':', price.text)
Prev. Close : 1.1753
Open : 1.1752
1-Year Change : - 0.4
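A compact variant of the same idea (a sketch reusing the answer's selectors, assuming the page layout is unchanged) collects every field/value pair into a dict:
overview = {}
for item in soup.find_all('div', class_='first inlineblock'):
    field = item.find('span', class_='float_lang_base_1')
    price = item.find('span', class_='float_lang_base_2')
    if field and price:  # skip blocks missing either span
        overview[field.get_text(strip=True)] = price.get_text(strip=True)
print(overview)  # e.g. {'Prev. Close': '1.1753', 'Open': '1.1752', ...}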

Python Selenium/soup not scrolling and printing all job containers on LinkedIn

Here's the problem statement: the base_site link below takes us to a job search URL.
There are small containers that show jobs on the left pane of the webpage.
The problem is that with this code I only get 7 containers in the output.
That is, it prints the first seven job results and their locations, whereas I expect all of them to be shown. I am using scrollIntoView for this, but that doesn't seem to help either.
What am I missing?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep
def get_driver():
    options = Options()
    options.add_argument("user-data-dir=C:\\Users\\abc\\AppData\\Local\\Google\\Chrome\\User Data")
    path = 'C:\\Program Files (x86)\\Google\\chromedriver.exe'
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(path, options=options)
    text_search = 'Product Development Engineer'
    location_search = 'california'
    # base_site = 'https://www.linkedin.com/jobs'
    base_site = 'https://www.linkedin.com/jobs/search/?currentJobId=2638809245&f_E=3%2C4&f_JT=F&f_SB2=3&f_TPR=r60' \
                '4800&geoId=102095887&keywords=product%20development%20engineer&location=California%2C%20United%20States&sortBy=R'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                             "70.0.3538.102 Safari/537.36 Edge/18.19582"}
    driver.get(base_site)
    parsing_job_data(driver, base_site, headers)
def parsing_job_data(driver, base_site, headers):
    try:
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        results = soup.find_all('div', class_="job-card-container relative job-card-list job-card-container--clickable "
                                              "job-card-list--underline-title-on-hover jobs-search-results-list__list-"
                                              "item--active jobs-search-two-pane__job-card-container--viewport-tracking"
                                              "-0")
        sleep(1)
        each_container = soup.select('[class*="occludable-update"]', limit=20)
        for container in each_container:
            element = driver.find_element_by_class_name("artdeco-entity-lockup__caption")
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            element.click()
            job_title = container.find('a', class_='disabled ember-view job-card-container__link job-card-list__title').text
            location = container.find('li', class_='job-card-container__metadata-item').text
            job_title = job_title.strip()
            location = location.strip()
            print(job_title, ', ', location)
    except Exception as e:
        print(e)
if __name__ == "__main__":
    get_driver()
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
async def get_soup(content):
    return BeautifulSoup(content, 'lxml')
allin = []
async def worker(channel):
    async with channel:
        async for num in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(headers)
                params = {
                    "currentJobId": "2638809245",
                    "f_E": "3,4",
                    "f_JT": "F",
                    "f_SB2": "3",
                    "f_TPR": "r604800",
                    "geoId": "102095887",
                    "keywords": "product development engineer",
                    "location": "California, United States",
                    "sortBy": "R",
                    "position": "1",
                    "pageNum": "0",
                    "start": num
                }
                r = await client.get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search', params=params)
                soup = await get_soup(r.text)
                goal = [(x.h3.get_text(strip=True), x.select_one('.job-search-card__location').get_text(strip=True))
                        for x in soup.select('.base-search-card__info')]
                allin.extend(goal)
async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(2):
                nurse.start_soon(worker, receiver.clone())
            async with sender:
                for num in range(0, 450, 25):
                    await sender.send(num)
    df = pd.DataFrame(allin, columns=["Title", "Location"])
    print(df)
    #df.to_csv('result.csv', index=False)
if __name__ == "__main__":
    trio.run(main)
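In other words, this answer skips Selenium entirely: it pages through LinkedIn's guest job-search endpoint (seeMoreJobPostings) 25 results at a time via the start parameter, with two concurrent workers pulling offsets from a trio memory channel and collecting (title, location) pairs into allin.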
Output:
Title Location
0 Packaging Process Engineer Fremont, CA
1 Project Engineer Oakland, CA
2 Process Engineer- Materials and Fibers Santa Clarita, CA
3 Senior Product Design Engineer Carson, CA
4 Design Engineer Sacramento, CA
.. ... ...
436 Software Development Engineer Irvine, CA
437 Software Development Engineer Sunnyvale, CA
438 Software Development Engineer San Luis Obispo, CA
439 Software Development Engineer - Luna Irvine, CA
440 Software Development Engineer Irvine, CA
[441 rows x 2 columns]