I have been trying to figure out a way to web scrape 500 or more analysts without getting blocked. Is there a way to fix the 403 Forbidden error?
Will I need to go incognito, or have multiple accounts?
I added the error it gives me below, but I notice the loop stops at analyst #18. I checked and nothing is wrong with that analyst, so it has to be the website blocking me for web scraping.
import time
import random

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException

ID1 = Predictions['analyst'].drop_duplicates()
len(ID1)
IDD = ID1.loc[0:500]
Analyst = []
Analyst

path = r'/Users/ashleyrabanales/chromedriver'  # path to chromedriver
driver_service = Service(executable_path=path)
driver = webdriver.Chrome(service=driver_service)
url = 'https://estimize.com'

for ID in IDD:
    time.sleep(random.uniform(2, 5))
    driver.get(f"{url}/users/{ID}")
    time.sleep(random.uniform(2, 3))
    # log in if the login link is present
    try:
        login = driver.find_element('xpath', '//*[@id="top-navigation"]/div[1]/ul/li[8]/a')
        login.click()
        time.sleep(random.uniform(3, 8))
        # username and password as variables
        username = 'cbrown180@student.gsu.edu'
        password = '123456'
        # find the email field in the HTML and send the username to it, then sleep
        email_field = driver.find_element('name', 'user[login]')
        email_field.send_keys(username)
        time.sleep(random.uniform(2, 9))
        # find the password field, input the password and submit
        password_field = driver.find_element('name', 'user[password]')
        password_field.send_keys(password)
        password_field.submit()
        time.sleep(random.uniform(3, 3))
    except NoSuchElementException:
        pass
    # row to append to the final list/dataframe
    row = {}
    # assign the analyst ID to the row
    row['AnalystID'] = ID
    # grab the analyst's profile fields, not just the name, to use in the final scraper
    name = driver.find_element('xpath', '//*[@id="users_show"]/div[4]/div[1]/div[1]/div[1]/div/h1/a').text
    row['Name'] = name
    role = driver.find_element('xpath', '//*[@id="users_show"]/div[4]/div[1]/div[1]/div[1]/div/ul').text
    row['Role'] = role
    join_date = driver.find_element('xpath', '//*[@id="users_show"]/div[4]/div[1]/div[1]/div[1]/div/div[2]/div[2]').text
    row['Join Date'] = join_date
    cs = driver.find_element('xpath', '//*[@id="confidence-wrap"]/div/div[2]').text
    row['Analyst Confidence Score'] = cs
    Err = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[1]/div[1]/div[3]').text
    row['Error rate'] = Err
    Accu = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[1]/div[2]/div[2]').text
    row['Accuracy Percentile'] = Accu
    Points = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[2]/div[1]/div[3]').text
    row['Points'] = Points
    PointsE = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[2]/div[2]/div[2]').text
    row['Points/Estimate'] = PointsE
    Stocks = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[3]/div[1]/div[3]').text
    row['Stocks'] = Stocks
    Pending = driver.find_element('xpath', '//*[@id="profile-tab-wrap"]/div[3]/div[2]/div[3]').text
    row['Pending'] = Pending
    if row not in Analyst:
        Analyst.append(row)
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py in find_element(self, by, value)
859 value = '[name="%s"]' % value
860
--> 861 return self.execute(Command.FIND_ELEMENT, {"using": by, "value": value})["value"]
862
863 def find_elements(self, by=By.ID, value: Optional[str] = None) -> List[WebElement]:
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
442 response = self.command_executor.execute(driver_command, params)
443 if response:
--> 444 self.error_handler.check_response(response)
445 response["value"] = self._unwrap_value(response.get("value", None))
446 return response
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
247 alert_text = value["alert"].get("text")
...
18 chromedriver 0x000000010849d81e chromedriver + 4782110
19 libsystem_pthread.dylib 0x00007fff765332eb _pthread_body + 126
20 libsystem_pthread.dylib 0x00007fff76536249 _pthread_start + 66
21 libsystem_pthread.dylib 0x00007fff7653240d thread_start + 13
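If the server starts returning 403 after a number of requests, that points to rate limiting or bot detection on the site's side rather than a problem with analyst #18, and incognito mode by itself usually won't help, since the server mostly sees your IP address and request pattern. One thing you could try, sketched below under the assumption that the block is triggered by request rate and the default automation fingerprint, is to slow the loop down, reuse a single logged-in session, and set a realistic user-agent via ChromeOptions. The user-agent string, back-off values, and the title check are illustrative assumptions, not values from the original code, and none of this is guaranteed to get around server-side blocking.
```
# Sketch: slower, more browser-like requests (assumed mitigation, not a guaranteed fix).
import time
import random

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
# Present a normal desktop user-agent (example string; update to match your Chrome version).
options.add_argument(
    "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
driver = webdriver.Chrome(service=Service(executable_path=path), options=options)

for ID in IDD:
    for attempt in range(3):                  # retry each profile a few times
        driver.get(f"{url}/users/{ID}")
        if "403" not in driver.title:         # crude check; adapt to whatever the block page shows
            break
        time.sleep(30 * (attempt + 1))        # back off before retrying
    time.sleep(random.uniform(5, 15))         # longer pause between profiles
    # ... scrape the profile fields as in the loop above ...
```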
I'm attempting to scrape Amazon for iPhone 11 names and prices, but when I run the code, I get the following error.
The error I get:
My code is the following:
```
# First project
import csv

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


class CrawledInfo:
    def __init__(self, product_name, product_price, cust_name=None, cust_location=None,
                 rating=None, review=None, review_date=None) -> None:
        self.cust_name = cust_name
        self.cust_location = cust_location
        self.product_name = product_name
        self.product_price = product_price
        self.rating = rating
        self.review = review
        self.review_date = review_date


class CrawlerBot:
    def item(self, name):
        count = 1
        page = 1
        pageIncrement = 1
        maxRetrieves = 100
        url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
        l = []

        # Declaring options
        options = Options()
        options.headless = False
        options.add_experimental_option('detach', True)

        browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        browser.maximize_window()
        browser.get(url)
        browser.set_page_load_timeout(10)

        while True:
            try:
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                # Capture item name
                xPathTitle = '//*[@id="search"]/div[1]/div[2]/div/span[3]/div[2]/div[' + str(count) + ']/div/span/div/div/div[2]/div[2]/div/div[1]/div/div/div[1]/h2/a/span'
                title = browser.find_element_by_xpath(xPathTitle)
                titleText = title.get_attribute('innerHTML').splitlines()[0]
                title.click()

                # Capture item price
                xPathPrice = '//*[#id="price_inside_buybox"]'.replace('#', '@')
                price = browser.find_element_by_xpath('//*[@id="price_inside_buybox"]')
                priceText = price.get_attribute('innerHTML').splitlines()

                # Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)

                # Send the results to class CrawledInfo
                info = CrawledInfo(titleText, priceText)
                l.append(info)
                count += 1
            except Exception as e:
                print('Exception: ', e)
                count += 1
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                # Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)

        browser.close()
        return l


# Creating the object
start_crawler = CrawlerBot()

with open('results', 'w', newline='', encoding='utf-8') as fileWriter:
    dataWriter = csv.writer(fileWriter, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for dat in start_crawler.item('iphone 11'):
        dataWriter.writerow([dat.product_name, dat.product_price])
```
Does anyone have an idea of what's wrong?
When my code is working right, I'm expecting it to create a CSV file with the names of the iPhone 11 models together with their prices.
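As a side note, indexing each result with a long absolute XPath tends to break whenever Amazon changes its layout. Below is a rough sketch of a more tolerant approach that collects all titles and prices on a results page with find_elements; the CSS selectors are assumptions about Amazon's current search-result markup (they are not taken from your page) and may need adjusting.
```
# Sketch only: selector strings are assumptions and may be outdated.
results = browser.find_elements_by_css_selector('div[data-component-type="s-search-result"]')
for result in results:
    try:
        title_text = result.find_element_by_css_selector('h2 a span').text
        # Amazon splits prices into whole/fraction spans; join them when both exist.
        whole = result.find_element_by_css_selector('span.a-price-whole').text
        fraction = result.find_element_by_css_selector('span.a-price-fraction').text
        price_text = whole + '.' + fraction
    except Exception:
        continue  # skip sponsored or price-less results instead of crashing
    l.append(CrawledInfo(title_text, price_text))
```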
I am new to Python. I am getting an error when running the code below. The issue seems to be with the date; can someone help me correct it, please? I have tried changing the date format in the Excel file, but it does not solve the issue. The Excel file has a list of several bonds, and I want to generate the coupon dates of the different bonds.
BondData = pd.read_excel(r'C:\Users\Avishen\Desktop\Python\BONDDATA.xlsx')
Data = pd.DataFrame(BondData)

def scheduledates():
    tenor = ql.Period(ql.Semiannual)
    day_count = ql.Thirty360
    calendar = ql.UnitedStates()
    businessConvention = ql.Unadjusted
    dateGeneration = ql.DateGeneration.Backward
    monthEnd = False
    # Dates in Bond Period
    return ql.Schedule(issueDate, maturityDate, tenor, calendar, businessConvention,
                       businessConvention, dateGeneration, monthEnd)

new_df["Dates"] = Data.apply(lambda x: scheduledates(), axis=1)
new_df["ISIN"] = Data.ISIN
new_df
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-877415e9cf83> in <module>
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
269
270 def apply_standard(self):
--> 271 results, res_index = self.apply_series_generator()
272
273 # wrap results
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
298 for i, v in enumerate(series_gen):
299 # ignore SettingWithCopy here in case the user mutates
--> 300 results[i] = self.f(v)
301 if isinstance(results[i], ABCSeries):
302 # If we have a view on v, we need to make a copy because
<ipython-input-4-877415e9cf83> in <lambda>(x)
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
<ipython-input-4-877415e9cf83> in scheduledates()
8
9 def scheduledates():
---> 10 issueDate = ql.Date(Data.issuedate)
11 maturityDate = ql.Date(Data.maturitydate)
12 tenor = ql.Period(ql.Semiannual)
~\anaconda3\lib\site-packages\QuantLib\QuantLib.py in __init__(self, *args)
425
426 def __init__(self, *args):
--> 427 _QuantLib.Date_swiginit(self, _QuantLib.new_Date(*args))
428
429 def weekdayNumber(self):
TypeError: Wrong number or type of arguments for overloaded function 'new_Date'.
Possible C/C++ prototypes are:
Date::Date()
Date::Date(Day,Month,Year)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond,Microsecond)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond)
Date::Date(Day,Month,Year,Hour,Minute,Second)
Date::Date(BigInteger)
Date::Date(std::string const &,std::string)
---------------------------------------------------------------------------
Data = pd.DataFrame(BondData)
Fields from Bond Data
ISIN
issuedate
maturitydate
coupon
Tradeyield
Bond_Price
MarketPrice
Nominal_Amount
From the traceback, the problem is the line:
issueDate = ql.Date(Data.issuedate)
(which for some reason is not in the code you pasted). Coming from Excel, issuedate should be an integer and thus compatible with the ql.Date constructor, but it's possible that pandas is reading it as a string or some other type. You should examine the data frame and check the type of the column. If it's not what you expect, you'll have to figure out whether there are data in that column that pandas can't interpret as integers, and either clean them up or force the conversion somehow before passing them to ql.Date.
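If the column turns out to have been parsed by pandas as timestamps (a common case for Excel date cells), one way to build the schedule, sketched here under that assumption, is to convert each row's value explicitly (rather than passing the whole column) and feed the day/month/year pieces to the constructor:
```
# Sketch: assumes Data['issuedate'] and Data['maturitydate'] are date-like per row.
import pandas as pd
import QuantLib as ql

def to_ql_date(value):
    ts = pd.to_datetime(value)
    return ql.Date(ts.day, ts.month, ts.year)   # Date(Day, Month, Year) overload

def scheduledates(row):
    issueDate = to_ql_date(row['issuedate'])
    maturityDate = to_ql_date(row['maturitydate'])
    tenor = ql.Period(ql.Semiannual)
    calendar = ql.UnitedStates()
    businessConvention = ql.Unadjusted
    dateGeneration = ql.DateGeneration.Backward
    monthEnd = False
    return ql.Schedule(issueDate, maturityDate, tenor, calendar, businessConvention,
                       businessConvention, dateGeneration, monthEnd)

new_df["Dates"] = Data.apply(scheduledates, axis=1)
```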
I need to concatenate tables created from a loop. They have repeats of the names in the columns, but the repeated columns describe different things. For some reason, when running this code I get an error:
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
Here is the code:
url = 'https://www.impactfees.com/publications%20pdf/2019survey.pdf'
tables = camelot.read_pdf(url, flavor='stream', edge_tol=500, pages='4-end')

i = 0
while i in range(0, tables.n):
    table_value = tables[i].df.loc[0, 4]
    header = 1
    header = tables[i].df.iloc[header]
    tables[i].df = tables[i].df.rename(columns=header)
    nan_v = float("NaN")
    tables[i].df.replace('', nan_v, inplace=True)
    tables[i].df.dropna(subset=['Jurisdiction'], inplace=True)
    tables[i].df.replace(['Jurisdiction'], nan_v, inplace=True)
    tables[i].df.dropna(subset=['Jurisdiction'], inplace=True)
    # Tot_col = tables[i].df.columns.get_loc('Total')
    # tables[i].df = tables[i].df.iloc[:,0:Tot_col+1]
    tables[i].df['report_name'] = table_value
    tables[i].df.loc[~tables[i].df.index.duplicated(keep='first')]
    i = i + 1

dfs = pd.concat([table.df for table in tables])
dfs
and here is the error I am getting:
InvalidIndexError Traceback (most recent call last)
<ipython-input-133-2617eb5ae448> in <module>
23 i = i + 1
24
---> 25 dfs = pd.concat([table.df for table in tables])
26
27
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
296 )
297
--> 298 return op.get_result()
299
300
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in get_result(self)
514 obj_labels = obj.axes[1 - ax]
515 if not new_labels.equals(obj_labels):
--> 516 indexers[ax] = obj_labels.get_indexer(new_labels)
517
518 mgrs_indexers.append((obj._mgr, indexers))
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_indexer(self, target, method, limit, tolerance)
3169
3170 if not self.is_unique:
-> 3171 raise InvalidIndexError(
3172 "Reindexing only valid with uniquely valued Index objects"
3173 )
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
camelot has an issue here; I had to patch utils.py to use a different user-agent.
The pages are not fully consistent, hence passing a list to rename(columns=) does not work; you need to pass a dict.
I have kept two data frames: one with the target rows, the other with the excluded rows.
There remain inconsistent columns, e.g. Drain Parks.
import pandas as pd
import camelot

url = 'https://www.impactfees.com/publications%20pdf/2019survey.pdf'
tables = camelot.read_pdf(url, flavor='stream', edge_tol=500, pages='4-end')

df = pd.DataFrame()
dfexc = pd.DataFrame()
for i in range(tables.n):
    # build a column-name dict from row 1 of the extracted table, skipping empty cells
    dft = tables[i].df.rename(columns={c: v.replace("\n", " ") for c, v in tables[i].df.iloc[1].items() if v != ""})
    # some pages merge the first two headers into one cell; split them back out
    if " " in dft.columns[0]:
        dft = dft.rename(columns={dft.columns[0]: dft.columns[0].split(" ")[0], 1: dft.columns[0].split(" ")[1]})
    # exclude header/comment rows: in data rows the State code is two characters
    m = (dft.State.str.len() != 2) | (dft.index < 2)
    dfexc = pd.concat([dfexc, tables[i].df.loc[m].assign(page=i)])
    df = pd.concat([df, dft.loc[~m].assign(page=i)])  # .reset_index(drop=True)
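As a quick usage check of the two frames (the output file name below is just a placeholder):
```
print(df.shape, dfexc.shape)                     # rows kept vs. rows excluded
print(df.columns.tolist())                       # inspect the (still partly inconsistent) headers
df.to_csv('impact_fees_2019.csv', index=False)   # hypothetical output file
```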
I am getting an error when directly passing a dataframe column as stop words.
How can I resolve this?
stop_words_corpus = pd.DataFrame(word_dictionary_corpus.Word.unique(), columns=feature_names)
cv = CountVectorizer(max_features=200, analyzer='word', stop_words=stop_words_corpus)
cv_txt = cv.fit_transform(data.pop('Clean_addr'))
Updated error:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
783 vocabulary.default_factory = vocabulary.__len__
784
--> 785 analyze = self.build_analyzer()
786 j_indices = []
787 indptr = _make_int_array()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in build_analyzer(self)
260
261 elif self.analyzer == 'word':
--> 262 stop_words = self.get_stop_words()
263 tokenize = self.build_tokenizer()
264
I fixed that error, but I am still having the issue.
Try this:
cv = CountVectorizer(max_features = 200,
analyzer='word',
stop_words=stop_words_corpus.stack().unique())
We need to convert the dataframe column into a NumPy array to pass the stop words to the CountVectorizer:
stop_word = stop_words_corpus['Word'].values
cv = CountVectorizer(max_features = 200,
analyzer='word',
stop_words=stop_word)
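For completeness, a quick usage sketch with the array-based stop words; the DataFrame and the Clean_addr column are assumed to exist as in the question:
```
# Sketch: pass the stop words as a plain list and fit on the text column.
stop_word = stop_words_corpus['Word'].values
cv = CountVectorizer(max_features=200, analyzer='word', stop_words=list(stop_word))
cv_txt = cv.fit_transform(data['Clean_addr'])
print(cv_txt.shape)                 # (n_documents, n_features)
print(sorted(cv.vocabulary_)[:10])  # first few terms kept in the vocabulary
```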
I would like to connect to BigQuery from Datalab and execute update commands.
I run the following code for the API & authentication:
from google.cloud import bigquery
# Get everything we possibly can from the service account JSON file
#set GOOGLE_APPLICATION_CREDENTIALS
cred = bigquery.Client.from_service_account_json('OrielResearch-da46e752c7ff.json')
# Instantiates a client
client = bigquery.Client(project='speedy-emissary-167213',credentials=cred)
# The name of the dataset
dataset_name = 'pgp_orielresearch'
# The name of the table
table_name = 'update_queries'
# Perform a synchronous query.
QUERY = (
'SELECT * FROM [speedy-emissary-167213:pgp_orielresearch.update_queries]')
query = client.run_sync_query(QUERY)
dataset = client.dataset(dataset_name)
tables, token = dataset.list_tables()
and get the following error:
AttributeError: 'Client' object has no attribute 'authorize'
any idea?
The full stack trace is:
AttributeErrorTraceback (most recent call last)
<ipython-input-2-616f54fa35ba> in <module>()
19 query = client.run_sync_query(QUERY)
20 dataset = client.dataset(dataset_name)
---> 21 t = dataset.list_tables()
22 #query.timeout_ms = TIMEOUT_MS
23 #query.run()
/usr/local/lib/python2.7/dist-packages/google/cloud/bigquery/dataset.py in list_tables(self, max_results, page_token)
568 connection = self._client.connection
569 resp = connection.api_request(method='GET', path=path,
--> 570 query_params=params)
571 tables = [Table.from_api_repr(resource, self)
572 for resource in resp.get('tables', ())]
/usr/local/lib/python2.7/dist-packages/google/cloud/connection.pyc in api_request(self, method, path, query_params, data, content_type, api_base_url, api_version, expect_json, _target_object)
344 response, content = self._make_request(
345 method=method, url=url, data=data, content_type=content_type,
--> 346 target_object=_target_object)
347
348 if not 200 <= response.status < 300:
/usr/local/lib/python2.7/dist-packages/google/cloud/connection.pyc in _make_request(self, method, url, data, content_type, headers, target_object)
242 headers['User-Agent'] = self.USER_AGENT
243
--> 244 return self._do_request(method, url, headers, data, target_object)
245
246 def _do_request(self, method, url, headers, data,
/usr/local/lib/python2.7/dist-packages/google/cloud/connection.pyc in _do_request(self, method, url, headers, data, target_object)
270 :returns: The HTTP response object and the content of the response.
271 """
--> 272 return self.http.request(uri=url, method=method, headers=headers,
273 body=data)
274
/usr/local/lib/python2.7/dist-packages/google/cloud/connection.pyc in http(self)
101 self._http = httplib2.Http()
102 if self._credentials:
--> 103 self._http = self._credentials.authorize(self._http)
104 return self._http
105
AttributeError: 'Client' object has no attribute 'authorize'
Try setting the credentials like so:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'OrielResearch-da46e752c7ff.json'
from google.cloud.bigquery.client import Client
client = Client()
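Note also that in newer releases of google-cloud-bigquery, run_sync_query and dataset.list_tables() no longer exist; a rough sketch of the equivalent calls on a recent client, using the dataset and table names from the question, would be:
```
# Sketch for a recent google-cloud-bigquery client (API differs from the old Datalab version).
from google.cloud import bigquery

client = bigquery.Client(project='speedy-emissary-167213')

# Standard-SQL equivalent of the legacy-SQL query above
query_job = client.query(
    'SELECT * FROM `speedy-emissary-167213.pgp_orielresearch.update_queries`'
)
for row in query_job.result():
    print(row)

# list_tables now lives on the client rather than on the dataset object
for table in client.list_tables('pgp_orielresearch'):
    print(table.table_id)
```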