I am trying to access an intranet site with HTTP Basic Authentication enabled.
Here's the code I'm using:
from bs4 import BeautifulSoup
import urllib.request, base64, urllib.error
request = urllib.request.Request(url)
string = '%s:%s' % ('username','password')
base64string = base64.standard_b64encode(string.encode('utf-8'))
request.add_header("Authorization", "Basic %s" % base64string)
try:
    u = urllib.request.urlopen(request)
except urllib.error.HTTPError as e:
    print(e)
    print(e.headers)
soup = BeautifulSoup(u.read(), 'html.parser')
print(soup.prettify())
But it fails with 401 Authorization required, and I can't figure out why.
The password-manager example from the Python urllib HOWTO works without any modifications:
from bs4 import BeautifulSoup
import urllib.request
# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "http://example.com/foo/"
password_mgr.add_password(None, top_level_url, username, password)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)
# use the opener to fetch a URL
u = opener.open(url)
soup = BeautifulSoup(u.read(), 'html.parser')
The code from the question works as well; you just have to decode the Base64 bytes back to a string, otherwise the header value contains a Python bytes literal (b'...') instead of the encoded credentials. Note that, unlike the handler above (which responds to the server's 401 challenge), this sends the credentials preemptively on the first request.
from bs4 import BeautifulSoup
import urllib.request, base64, urllib.error
request = urllib.request.Request(url)
string = '%s:%s' % ('username','password')
base64string = base64.standard_b64encode(string.encode('utf-8'))
request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8'))
try:
    u = urllib.request.urlopen(request)
except urllib.error.HTTPError as e:
    print(e)
    print(e.headers)
soup = BeautifulSoup(u.read(), 'html.parser')
print(soup.prettify())
UTF-8 encoding might not work with every server; you can try ASCII or ISO-8859-1 (Latin-1) encoding instead.
Also, try accessing the intranet site in a web browser and compare the Authorization header it sends with the one you are generating.
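A minimal sketch of that check, assuming placeholder credentials: print the header value each candidate encoding produces, then compare it against what the browser (or curl -v -u username:password) sends. For plain ASCII credentials all three encodings yield the same value, so a difference only shows up with non-ASCII characters.
import base64

# Build the Basic auth header value under each candidate encoding so it
# can be compared against a known-good client's Authorization header.
for enc in ("utf-8", "ascii", "iso-8859-1"):
    creds = "username:password".encode(enc)
    print(enc, "->", "Basic " + base64.standard_b64encode(creds).decode("ascii"))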
Encode using "ascii". This worked for me.
import base64
import urllib.request
url = "http://someurl/path"
username = "someuser"
token = "239487svksjdf08234"
request = urllib.request.Request(url)
base64string = base64.b64encode((username + ":" + token).encode("ascii"))
request.add_header("Authorization", "Basic {}".format(base64string.decode("ascii")))
response = urllib.request.urlopen(request)
response.read() # final response string
Related
I'm trying to get the URL src from the following HTML.
For some reason, when I try to print out the logo URL, I get [] as the response. My code is as follows:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
url = 'https://growjo.com/industry/Cannabis'
request = Request(
    url,
    headers={'User-Agent': 'Mozilla/5.0'}
)
page = urlopen(request)
page_content_bytes = page.read()
page_html = page_content_bytes.decode("utf-8")
soup = BeautifulSoup(page_html, "html.parser")
company_rows = soup.find_all("table",{"class":"jss31"})[0].find_all("tbody")[0].find_all("tr")
for company in company_rows:
    company_data = company.find_all("td")
    logo = company_data[1].find_all("div",{"class":"lazyload-wrapper"})[0].find_all("a")
    name = company_data[1].text
    print(logo)
    break
I tried printing out the 'a' tags... I tried the 'img' tags... they all come back as []. It's as if bs4 is not reading inside the div with class="lazyload-wrapper".
Any help would be greatly appreciated.
The URLs that contain the logos are generated dynamically, and bs4 can't render JavaScript; the underlying API is also restricted by authentication. Use a browser-automation tool such as Selenium.
Here I use Selenium 4 with bs4, with webdriver-manager installing ChromeDriver:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("--headless")
# uncomment to keep the Chrome window open after the script finishes
#options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
url= 'https://growjo.com/industry/Cannabis'
driver.get(url)
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser")
company_rows = soup.select('table.jss31 tbody tr')
for company in company_rows:
    log = company.select_one('td[class="jss38 jss40 jss46"] div + a')
    logo = 'https://growjo.com' + log.get('href') if log else None
    print(logo)
Output:
https://growjo.com/company/Dutchie
https://growjo.com/company/Ascend_Wellness_Holdings
https://growjo.com/company/Hiku_Brands
https://growjo.com/company/C3_Industries
https://growjo.com/company/Jane_Technologies
https://growjo.com/company/Headset
https://growjo.com/company/Jushi_Holdings
https://growjo.com/company/FLOWER_CO.
https://growjo.com/company/Columbia_Care
https://growjo.com/company/Cannabis_Control_Commission
https://growjo.com/company/FIGR
https://growjo.com/company/Leafly
https://growjo.com/company/Hound_Labs
https://growjo.com/company/Leaf_Trade
https://growjo.com/company/Wurk
https://growjo.com/company/Sundial_Cannabis
https://growjo.com/company/BEYOND_%2F_HELLO
https://growjo.com/company/PharmaCann
https://growjo.com/company/LeafLink
https://growjo.com/company/Connected_Cannabis_Co.
https://growjo.com/company/NATURE'S_MEDICINES
https://growjo.com/company/Althea_Group
https://growjo.com/company/CURE_Pharmaceutical
https://growjo.com/company/urban-gro
https://growjo.com/company/NABIS
None
https://growjo.com/company/Medisun
https://growjo.com/company/Mammoth_Distribution
https://growjo.com/company/Dosecann_Cannabis_Solutions
https://growjo.com/company/Vireo_Health
https://growjo.com/company/Dama_Financial
https://growjo.com/company/Caliber
https://growjo.com/company/springbig
https://growjo.com/company/Westleaf
https://growjo.com/company/INSA
https://growjo.com/company/Pure_Sunfarms
https://growjo.com/company/Sensi_Media_Group
https://growjo.com/company/Verano_Holdings
https://growjo.com/company/TILT_Holdings
https://growjo.com/company/Bloom_Medicinals
https://growjo.com/company/Planet_13_Holdings
https://growjo.com/company/Liberty_Health_Sciences
https://growjo.com/company/Calyx_Peak_Companies
https://growjo.com/company/Vangst
https://growjo.com/company/Fire_&_Flower
https://growjo.com/company/Revolution_Enterprises
https://growjo.com/company/4Front_Ventures
https://growjo.com/company/Calyx_Containers
https://growjo.com/company/GreenTech_Industries
https://growjo.com/company/BZAM_Cannabis
https://growjo.com/company/Cova_Software
None
https://growjo.com/company/Up_Cannabis
https://growjo.com/company/Cann_Group
https://growjo.com/company/Holistic_Industries
https://growjo.com/company/Treez
https://growjo.com/company/INDIVA
https://growjo.com/company/Kiva_Confections
https://growjo.com/company/MariMed
https://growjo.com/company/MCR_Labs
https://growjo.com/company/Vicente_Sederberg
https://growjo.com/company/Demetrix
https://growjo.com/company/365_Cannabis
https://growjo.com/company/LivWell_Enlightened_Health
https://growjo.com/company/High_Tide
https://growjo.com/company/The_Hawthorne_Gardening_Company
https://growjo.com/company/WYLD
https://growjo.com/company/VidaCann
https://growjo.com/company/Sira_Naturals
https://growjo.com/company/iAnthus
https://growjo.com/company/EastHORN_Clinical_Services
https://growjo.com/company/PharmaCielo
https://growjo.com/company/OCS_Ontario_Cannabis_Store
https://growjo.com/company/Hugh_Wood_Canada
https://growjo.com/company/Wana_Brands
https://growjo.com/company/Parallel
https://growjo.com/company/Weedmaps
None
https://growjo.com/company/Dark_Heart_Nursery
https://growjo.com/company/Stealth_Monitoring
https://growjo.com/company/dicentra
https://growjo.com/company/Sunday_Goods_&_The_Pharm
https://growjo.com/company/Phase_Zero_Design
https://growjo.com/company/Sava
https://growjo.com/company/Ceylon_Solutions
https://growjo.com/company/Green_Flower
https://growjo.com/company/Shryne_Group
https://growjo.com/company/MJ_Freeway
https://growjo.com/company/Theory_Wellness
https://growjo.com/company/HEXO_Corp
https://growjo.com/company/Lightshade
https://growjo.com/company/New_Frontier_Data
https://growjo.com/company/Mission_Dispensaries
https://growjo.com/company/FLUENT_Cannabis_Care
https://growjo.com/company/Superette
https://growjo.com/company/HdL_Companies
https://growjo.com/company/Helix_Technologies
https://growjo.com/company/Mary's_Medicinals
https://growjo.com/company/Indus_Holdings
https://growjo.com/company/Auxly
https://growjo.com/company/Good_Chemistry
https://growjo.com/company/Khiron_Life_Sciences_Corp
https://growjo.com/company/The_Apothecarium
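One caveat about the selector above: the jss* class names are generated by the site's CSS-in-JS styling and can change between deployments. A rough alternative sketch that keys off the table structure instead (assuming the company profile link stays in the second cell of each row):
for row in soup.select('table tbody tr'):
    cells = row.select('td')
    # assume the company profile link is the anchor in the second cell
    link = cells[1].select_one('a[href^="/company/"]') if len(cells) > 1 else None
    print('https://growjo.com' + link['href'] if link else None)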
I'm trying to get the names of the top 250 IMDb movies using BeautifulSoup. The code runs but prints nothing and shows no errors.
import requests
from bs4 import BeautifulSoup
url = "https://www.imdb.com/chart/top"
response = requests.get(url)
rc = response.content
soup = BeautifulSoup(rc,"html.parser")
for i in soup.find_all("td",{"class:":"titleColumn"}):
    print(i)
I'm expecting it to show me all of the td tags with the titleColumn class, but it prints nothing. Am I missing something? Thanks in advance!
Remove the stray : inside the attribute name, i.e. change
{"class:":"titleColumn"}
to
{"class":"titleColumn"}
Extended example:
import requests
from bs4 import BeautifulSoup
url = "https://www.imdb.com/chart/top"
response = requests.get(url)
rc = response.content
soup = BeautifulSoup(rc,"html.parser")
data = []
for i in soup.find_all("td",{"class":"titleColumn"}):
    data.append({
        'people':i.a['title'],
        'title':i.a.get_text(),
        'info':i.span.get_text()
    })
data
I have the following code using requests to fetch a CSRFToken:
import requests
class TokenGenerator:
    def GenerateToken(self, uid, pwd):
        TokenGenerator.session = requests.Session()
        TokenGenerator.resp = TokenGenerator.session.get("http://chp1766.neilsoft.in:8000/", cookies={'contact.language': 'en'}, auth=(uid, pwd))
        if TokenGenerator.resp.status_code == 200:
            token = TokenGenerator.session.cookies.get('CSRFToken')
            print("TOKEN")
            print(token)
obj = TokenGenerator()
obj.GenerateToken('caddok', '')
How can I replace the requests module with urllib2?
With urllib2 you can get the same result, but note that the token arrives in a Set-Cookie response header rather than in a header named CSRFToken, so attach a cookie jar to an opener and read the token from the jar:
import urllib2
import cookielib

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
response = opener.open("YOUR URL")
# the jar parses any Set-Cookie headers in the response
token = next((c.value for c in cookie_jar if c.name == 'CSRFToken'), None)
print(token)
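If you also need the Basic auth and the language cookie from the requests version, a rough sketch (reusing the host and credentials from the question; the header values are assumptions, not tested against that server):
import urllib2
import cookielib
import base64

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
req = urllib2.Request("http://chp1766.neilsoft.in:8000/")
req.add_header('Cookie', 'contact.language=en')  # mirrors cookies={'contact.language': 'en'}
req.add_header('Authorization', 'Basic ' + base64.b64encode('caddok:'))  # mirrors auth=('caddok', '')
response = opener.open(req)
token = next((c.value for c in cookie_jar if c.name == 'CSRFToken'), None)
print(token)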
I am using Google Custom Search to index content on my website.
When I use a REST client to make the get request at
https://www.googleapis.com/customsearch/v1?key=xxx&q=query&cx=xx
I get a response in under a second.
But when I try to make the same call from my code, it takes up to six seconds. What am I doing wrong?
__author__ = 'xxxx'
import urllib2
import logging
import gzip
from cfc.apikey.googleapi import get_api_key
from cfc.url.processor import set_query_parameter
from StringIO import StringIO
CX = 'xxx:xxx'
URL = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=sd&fields=kind,items(title)" % (get_api_key(), CX)
def get_results(query):
    url = set_query_parameter(URL, 'q', query)
    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')
    request.add_header('User-Agent', 'cfc xxxx (gzip)')
    response = urllib2.urlopen(request)
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(response.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
        return data
I have implemented the tips from the API's Performance Tips documentation. I would appreciate any help. Thanks.
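To narrow this down, a minimal timing sketch (reusing the request object built in get_results above) can separate the HTTP round-trip from the gzip decode:
import time

start = time.time()
response = urllib2.urlopen(request)
body = response.read()
print 'network + read: %.2fs' % (time.time() - start)

start = time.time()
if response.info().get('Content-Encoding') == 'gzip':
    body = gzip.GzipFile(fileobj=StringIO(body)).read()
print 'gzip decode: %.2fs' % (time.time() - start)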
I'm trying to replicate the following successful cURL operation with Grinder.
curl -X PUT -d "title=Here%27s+the+title&content=Here%27s+the+content&signature=myusername%3A3ad1117dab0ade17bdbd47cc8efd5b08" http://www.mysite.com/api
Here's my script:
from net.grinder.script import Test
from net.grinder.script.Grinder import grinder
from net.grinder.plugin.http import HTTPRequest
from HTTPClient import NVPair
import hashlib
test1 = Test(1, "Request resource")
request1 = HTTPRequest(url="http://www.mysite.com/api")
test1.record(request1)
log = grinder.logger.info
test1.record(log)
m = hashlib.md5()
class TestRunner:
    def __call__(self):
        params = [NVPair("title", "Here's the title"), NVPair("content", "Here's the content")]
        params.sort(key=lambda param: param.getName())
        ps = ""
        for param in params:
            ps = ps + param.getValue() + ":"
        ps = ps + "myapikey"
        m.update(ps)
        params.append(NVPair("signature", ("myusername:" + m.hexdigest())))
        request1.setFormData(tuple(params))
        result = request1.PUT()
The test runs okay, but it seems that my script doesn't actually send any of the params data to the API, and I can't work out why. No errors are generated, but I get a 401 Unauthorized response from the API, so the PUT request is reaching it; without a valid signature, though, the request is rejected.
This isn't exactly an answer, more of a workaround I came up with. I've decided to post it since this question hasn't yet received any responses, and it may help anyone else trying to achieve the same thing.
The workaround is basically to use the httplib and urllib modules to build and make the PUT request instead of the HTTPClient module.
import hashlib
import httplib, urllib
....
params = [("title", "Here's the title"),("content", "Here's the content")]
params.sort(key=lambda param: param[0])
ps = ""
for param in params:
    ps = ps + param[1] + ":"
ps = ps + "myapikey"
m = hashlib.md5()
m.update(ps)
params.append(("signature", "myusername:" + m.hexdigest()))
params = urllib.urlencode(params)
print params
headers = {"Content-type": "application/x-www-form-urlencoded"}
conn = httplib.HTTPConnection("www.mysite.com:80")
conn.request("PUT", "/api", params, headers)
response = conn.getresponse()
print response.status, response.reason
print response.read()
conn.close()
(Based on the example at the bottom of the httplib documentation page.)
Refer to the multipart form POST example in the Grinder script gallery, but change the POST to PUT. It works for me.
from HTTPClient import Codecs, NVPair
from jarray import zeros

files = ( NVPair("self", "form.py"), )
parameters = ( NVPair("run number", str(grinder.runNumber)), )
# This is the Jython way of creating an NVPair[] Java array
# with one element.
headers = zeros(1, NVPair)
# Create a multi-part form encoded byte array.
data = Codecs.mpFormDataEncode(parameters, files, headers)
grinder.logger.output("Content type set to %s" % headers[0].value)
# Call the version of PUT that takes a byte array.
result = request1.PUT("/upload", data, headers)