Code returns NULL - BeautifulSoup

import requests  # request img from web
from bs4 import BeautifulSoup

imglinks = []
for i in range(1, 26):
    resp = requests.get('https://******/************.html')
    soup = BeautifulSoup(resp.content, "html.parser")
    links = soup.select('.container > .****** > .img')
    for link in links:
        imglinks.append(link['src'])
print(links)
My code returns NULL (actually it returns []). Maybe there is a problem with my soup.select?

Sometimes BeautifulSoup is unable to find the selected elements within a webpage. You might either need to render the page's JavaScript (for example with the requests-html library's HTMLSession), or you may have been blocked by the website for trying to access it with a bot.
If it isn't either of those reasons, I suggest scraping it manually by searching for the relevant indexes in resp.text.
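A small debugging sketch along those lines (the User-Agent value is only illustrative; the URL and selector are the masked ones from the question):

import requests
from bs4 import BeautifulSoup

# identify the request as a regular browser; some sites serve bots a stripped-down
# or blocked page (the User-Agent value here is only illustrative)
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get('https://******/************.html', headers=headers)

print(resp.status_code)   # 403 or 503 usually means the bot was blocked
print(len(resp.text))     # a very small document hints at a JavaScript-rendered page

soup = BeautifulSoup(resp.content, "html.parser")
print(len(soup.select('.container')))   # test the outermost part of the selector first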

Related

How can I keep scraping a dataset of LinkedIn job-description links with Selenium and a for loop after it stops working?

I made a dataset of LinkedIn links for data science jobs (1000 rows) and wrote a for loop to open each link with Selenium and extract the job description with BeautifulSoup.
It worked until 121 rows had been extracted, then it displayed an error and stopped. When I try to start again from row 122, it displays this error for every link:
the link (name of the dataset column), has problem
and does not open LinkedIn. I tested it, and Selenium can open Google, for example.
My loop is:
# keep the loop from 123
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(r"C:\Users\shahi\chromedriver.exe")
jobdescriptions2 = []
for link in links[121:1000]:
    try:
        # This instance will be used to log into LinkedIn
        driver.get(link)
        time.sleep(5)
        src = driver.page_source
    except:
        src = 'problem in scraping data'
        print(f'the link {link}, has problem')
    try:
        # Now using Beautiful Soup
        soup = BeautifulSoup(src, 'lxml')
        # Extracting the HTML of the complete job-description box
        job = soup.find("div", {'class': 'decorated-job-posting__details'})
        jobdescription = job.find('div', {'class': 'show-more-less-html__markup'})
    except:
        jobdescription = 'an error in data parsing'
    jobdescriptions2.append(jobdescription)
Could you please advise me how I can scrape all 1000 rows?
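One hedged approach, assuming the stop at row 121 comes from LinkedIn throttling repeated requests or from a broken browser session rather than from the loop itself: slow the requests down, restart the driver whenever a link fails, and keep appending a placeholder so no row is silently skipped. A rough sketch, reusing the chromedriver path, the selectors, and the links list from the question:

import time
import random
from selenium import webdriver
from bs4 import BeautifulSoup

def new_driver():
    # re-create the browser so one blocked or broken session does not poison later links
    return webdriver.Chrome(r"C:\Users\shahi\chromedriver.exe")

driver = new_driver()
jobdescriptions2 = []
for n, link in enumerate(links[121:1000], start=122):
    try:
        driver.get(link)
        time.sleep(random.uniform(5, 10))   # irregular pauses look less bot-like
        soup = BeautifulSoup(driver.page_source, 'lxml')
        job = soup.find("div", {'class': 'decorated-job-posting__details'})
        markup = job.find('div', {'class': 'show-more-less-html__markup'})
        jobdescriptions2.append(markup.get_text(strip=True) if markup else 'not found')
    except Exception:
        print(f'row {n}: the link {link} has a problem, restarting the driver')
        jobdescriptions2.append('problem in scraping data')
        driver.quit()
        driver = new_driver()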

How to use time.sleep to make Selenium output consistent

This might be the stupidest question I have asked yet, but this is driving me nuts...
Basically, I want to get all links from the profiles, but for some reason Selenium returns a different number of links most of the time (sometimes all of them, sometimes only a tenth).
I experimented with time.sleep and I know it's affecting the output somehow, but I don't understand where the problem is.
(But that's just my hypothesis; maybe it's wrong.)
I have no other explanation for why I get inconsistent output. Since I do get all the profile links from time to time, the program is able to find all relevant profiles.
Here's what the output should be (for different GUI input):
input: anlagenbau, output: 3070
input: Fahrzeugbau, output: 4065
input: laserschneiden, output: 1311
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from urllib.request import urlopen
from datetime import date
from datetime import datetime
import easygui
import re
from selenium.common.exceptions import NoSuchElementException
import time

# input window for the search term (Suchbegriff)
suchbegriff = easygui.enterbox("Suchbegriff eingeben | Hinweis: suchbegriff sollte kein '/' enthalten")

# get date and time
now = datetime.now()
current_time = now.strftime("%H-%M-%S")
today = date.today()
date = today.strftime("%Y-%m-%d")  # note: this reassigns the imported name date

def get_profile_url(label_element):
    # get the url from a result element
    onclick = label_element.get_attribute("onclick")
    # some regex magic
    return re.search(r"(?<=open\(\')(.*?)(?=\')", onclick).group()

def load_more_results():
    # load more results if needed // use only on the search page!
    button_wrapper = wd.find_element_by_class_name("loadNextBtn")
    button_wrapper.find_element_by_tag_name("span").click()

#### Script starts here ####

# Set some Selenium options
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Webdriver
wd = webdriver.Chrome(options=options)

# Load URL
wd.get("https://www.techpilot.de/zulieferer-suchen?" + str(suchbegriff))

# let's first wait for the iframe
iframe = WebDriverWait(wd, 5).until(
    EC.frame_to_be_available_and_switch_to_it("efficientSearchIframe")
)

# the result parent
result_pane = WebDriverWait(wd, 5).until(
    EC.presence_of_element_located((By.ID, "resultPane"))
)

# get all profile links as a list
time.sleep(5)
href_list = []
wait = WebDriverWait(wd, 15)
while True:
    try:
        #time.sleep(1)
        wd.execute_script("loadFollowing();")
        #time.sleep(1)
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
        except TimeoutException:
            break
        #time.sleep(1)  # somehow influences how the results are found
        result_elements = wd.find_elements_by_class_name("fancyCompLabel")
        #time.sleep(1)
        for element in result_elements:
            url = get_profile_url(element)
            href_list.append(url)
        #time.sleep(2)
        while True:
            try:
                element = wd.find_element_by_class_name('fancyNewProfile')
                wd.execute_script("""var element = arguments[0];element.parentNode.removeChild(element);""", element)
            except NoSuchElementException:
                break
    except NoSuchElementException:
        break

wd.close()  # does not work yet
print("####links secured: " + str(len(href_list)))
Since you say that the sleep is affecting the number of results, it sounds like they're loading asynchronously and populating as they're loaded, instead of all at once.
The first question is whether you can ask the web site developers to change this, to only show them when they're all loaded at once.
Assuming you don't work for the same company as them, consider:
Is there something else on the page that shows up when they're all loaded? It could be a button or a status message, for instance. Can you wait for that item to appear, and then get the list?
How frequently do new items appear? You could poll for the number of results relatively infrequently, such as only every 2 or 3 seconds, and then consider the results all present when you get the same number of results twice in a row.
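A minimal sketch of that polling idea, assuming wd is the WebDriver from the question and that each result element carries the .fancyCompLabel class used there:

import time

def wait_for_stable_count(wd, poll_interval=3, max_wait=120):
    # poll the result count and return it once it stops changing between polls
    last_count = -1
    waited = 0
    while waited < max_wait:
        count = len(wd.find_elements_by_class_name("fancyCompLabel"))
        if count == last_count:
            return count        # same number twice in a row -> assume everything has loaded
        last_count = count
        time.sleep(poll_interval)
        waited += poll_interval
    return last_count           # give up after max_wait seconds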
The issue is that the method presence_of_all_elements_located doesn't wait for all elements matching a passed locator. It waits for the presence of at least one element matching the passed locator and then returns the list of elements found on the page at that moment.
In Java we have
wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(element, expectedElementsAmount));
and
wait.until(ExpectedConditions.numberOfElementsToBe(element, expectedElementsAmount));
With these methods you can wait for a predefined number of elements to appear.
Selenium with Python doesn't support these methods.
The only thing you can do with Selenium in Python is to build a custom method for these checks.
So if you are expecting a certain number of elements/links to be present on the page, you can use such a method.
This will make your test stable and avoid hardcoded sleeps.
UPD
I have found a solution for the methods mentioned above.
This seems to be the Python equivalent of wait.until(ExpectedConditions.numberOfElementsToBeMoreThan(element, expectedElementsAmount)):
myLength = 9
WebDriverWait(browser, 20).until(lambda browser: len(browser.find_elements_by_xpath("//img[@data-blabla]")) > int(myLength))
And this:
myLength = 10
WebDriverWait(browser, 20).until(lambda browser: len(browser.find_elements_by_xpath("//img[@data-blabla]")) == int(myLength))
is the equivalent of the Java wait.until(ExpectedConditions.numberOfElementsToBe(element, expectedElementsAmount));
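Applied to the page from the question, a hedged version of the same pattern could wait on the class used there. This assumes all result elements stay in the DOM; in the question's code they are removed as they are collected, in which case you would count the collected links instead.

expected = 3070  # the count the question expects for the search term "anlagenbau"
WebDriverWait(wd, 120).until(
    lambda driver: len(driver.find_elements_by_class_name("fancyCompLabel")) >= expected
)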

Web Scraping app using Scrapy / Selenium generates Error: "ModuleNotFoundError 'selenium'"

Good morning!
I have recently started learning Python and moved on to applying the little I know to create the monstrosity seen below.
In brief:
I am attempting to scrape SEC Edgar (https://www.sec.gov/edgar/searchedgar/cik.htm) for CIK codes of companies which I want to study in more detail (for now just 1 company to see if it's the right approach).
To scrape the CIK code, I created a Scrapy spider, imported Selenium and wrote three functions: the first inserts the company name into the input field, the second activates the "submit" button, and the third scrapes the CIK code once the form is submitted and returns the item.
Apart from adding the item to items.py, I haven't changed the middlewares or settings.
For some reason, I am getting ModuleNotFoundError for 'selenium', although I have installed the packages and imported selenium & webdriver along with everything else.
I have tried messing around with the indentation and rephrasing the code, but achieved no improvement.
import selenium
from selenium import webdriver
import scrapy
from ..items import Sec1Item
from scrapy import Selector

class SecSpSpider(scrapy.Spider):
    name = 'SEC_sp'
    start_urls = ['https://www.sec.gov/edgar/searchedgar/cik.htm/']

    def parse(self, response):
        company_name = 'INOGEN INC'
        return scrapy.FormRequest.from_response(response, formdata={
            'company': company_name
        }, callback=self.start_requests())

    def start_requests(self):
        driver = webdriver.Chrome()
        driver.get(self.start_urls)
        while True:
            next_url = driver.find_element_by_css_selector('.search-button')
            try:
                self.parse(driver.page_source)
                next_url.click()
            except:
                break
        driver.close()

    def parse_page(self, response):
        items = Sec1Item()
        CIK_code = response.css('a::text').extract()
        items["CIK Code: "] = Sec1Item
        yield items
I can't seem to get past the import selenium error, so I am not sure how much of the rest of the spider needs adjusting.
Error message:
"File/Users/user1/PycharmProjects/Scraper/SEC_1/SEC_1/spiders/SEC_sp.py", line 1, in <module>
import selenium
ModuleNotFoundError: No module named 'selenium'
Thank you for any assistance and help!
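A likely cause (not confirmed in the thread) is that selenium was installed for a different Python interpreter than the one Scrapy is run with. A minimal check, run from the same environment that launches the spider:

import sys
print(sys.executable)   # the interpreter that is actually running this code

try:
    import selenium
    print("selenium", selenium.__version__, "found at", selenium.__file__)
except ModuleNotFoundError:
    # selenium needs to be installed for exactly this interpreter, e.g.
    #   <path printed above> -m pip install selenium
    print("selenium is not installed for this interpreter")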

Dividing urllib2/BeautifulSoup requests into smaller request packages

I wanted to assemble a set of patent IDs for the search term 'automobile'. I wrote this code:
import urllib2
from bs4 import BeautifulSoup
import sys
import StringIO
import re

search_term = 'automobile'
patent_list = []
for i in range(100):  # for the first 100 pages of results
    web_page = 'https://www.lens.org/lens/search?q=' + str(search_term) + '&sat=P&l=en&st=true&p=' + str(i) + '&n=100'
    page = urllib2.urlopen(web_page)
    soup = BeautifulSoup(page, 'html.parser')
    for aref in soup.findAll("a", href=True):
        if re.findall('/lens/patent', aref['href']):
            link = aref['href']
            split_link = link.split('/')
            if len(split_link) == 4:
                patent_list.append(split_link[-1])

print '\n'.join(set(patent_list))
However, I got a 503 error. I googled this and found:
"The server is currently unable to handle the request due to a temporary overloading or maintenance of the server."
Does this mean (1) do not use an algorithm and manually assemble the IDs instead, or (2) break the request down into smaller chunks?
If the answer is (2), how would I break this into smaller requests?
Does this mean (1) do not use an algorithm and manually assemble the IDs instead, or (2) break the request down into smaller chunks?
Neither.
I don't understand what algorithm you are speaking about, but no.
I'm not sure either what you mean by "smaller chunks", but again, no.
503 basically means the server is too busy or sometimes offline.
When you run your script (or if you browse the website with your browser) you will notice how long the server takes to handle a single request, so you can guess that if it struggles with a single request, 100 requests in a row is a little too much for your target.
But still, the first 16, 17 or 18 calls work great. Maybe the server just needs a little more time between queries to handle them?
Just add import time at the top of your file and time.sleep(10) at the end of your loop, and profit.
You surely want to add some logging here and there; here is my version of your code (I just added time.sleep() and some prints):
import urllib2
from bs4 import BeautifulSoup
import sys
import StringIO
import re
import time

search_term = 'automobile'
patent_list = []
for i in range(100):  # for the first 100 pages of results
    web_page = 'https://www.lens.org/lens/search?q=' + str(search_term) + '&sat=P&l=en&st=true&p=' + str(i) + '&n=100'
    print('fetching {} ({})'.format(i, web_page))
    page = urllib2.urlopen(web_page)
    print('webpage fetched')
    soup = BeautifulSoup(page, 'html.parser')
    for aref in soup.findAll("a", href=True):
        if re.findall('/lens/patent', aref['href']):
            link = aref['href']
            split_link = link.split('/')
            if len(split_link) == 4:
                patent_list.append(split_link[-1])
    print('sleeping ten seconds')
    time.sleep(10)

print '\n'.join(set(patent_list))
Now the pro tip: there are no more than 400 items in the database, so you can stop at page 4. Better yet, check in your loop whether you got any results and, if not, break out of the loop (a sketch follows).
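A hedged sketch of that check, meant to replace the inner link-collection part of the loop above. It assumes a page past the last one contains no /lens/patent links at all:

    # inside `for i in range(100)`, after `soup` has been built for page i
    new_ids = [aref['href'].split('/')[-1]
               for aref in soup.findAll("a", href=True)
               if re.findall('/lens/patent', aref['href'])
               and len(aref['href'].split('/')) == 4]
    if not new_ids:
        print('no results on page {}, stopping'.format(i))
        break  # an empty page means we are past the last page of results
    patent_list.extend(new_ids)
    time.sleep(10)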

How to poll Selenium Hub for the number of its registered nodes?

I searched for an answer in the Selenium Grid documentation but couldn't find anything.
Is it somehow possible to poll the Selenium Hub and get back the number of nodes that are registered to it?
If you check the Grid Console (http://selenium.hub.ip.address:4444/grid/console) you will find valuable information about the Grid's nodes, browsers, IPs, etc.
This is my grid: it has two nodes, one Linux and one Windows.
If you go through the links (Configuration, View Config,...) you will find information about each node and browser.
I finally put together this:
def grid_nodes_num(grid_console_url="http://my_super_company.com:8080/grid/console#"):
    import requests
    from bs4 import BeautifulSoup

    r = requests.get(grid_console_url)
    html_doc = r.text
    soup = BeautifulSoup(html_doc)
    # print soup.prettify()  # for debugging
    grid_nodes = soup.find_all("p", class_="proxyid")  # the console shows one proxyid entry per node
    if grid_nodes == []:
        print "-No Nodes detected. Grid is down!-"
        return 0
    else:
        nodes_num = len(grid_nodes)
        print "-Detected ", nodes_num, " node(s)-"
        return nodes_num
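A short usage sketch (the hub URL is just the placeholder from the function's default argument):

# poll the hub and act on the node count
nodes = grid_nodes_num("http://my_super_company.com:8080/grid/console#")
if nodes:
    print "Grid is up with", nodes, "node(s)"
else:
    print "No nodes registered - the hub has nothing to run tests on"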