URL Redirect Python - Selenium

I want to redirect the page to another URL, but it's not happening. I would appreciate your help.
headers = {
    'Accept': 'application/json',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
}
response = requests.get(ad_link, headers=headers, allow_redirects=True)
ic(response.status_code)
#ic(response.history)
if response.history:
    ic("Request was redirected")
    for resp in response.history:
        ic(resp.status_code, resp.url)
        ic('For Control')
    ic("Final destination:")
    ic(response.status_code, response.url)  # Response has no .all_links attribute; .url is the final URL
    ic(ad_link)
else:
    ic("Request was not redirected")
    ic(response)
time.sleep(sleep_time)
I wrote a loop like this; the status code is 200, but it doesn't open the URL on the screen, and it never enters the for loop here at all.
I go to a URL and log in, and then I want it to take me to a link from the column named 'ad_link' in the database, because I don't want to have to open that session every time. I want to browse the links in my database with an open session.
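Conceptually, what I am after is something like this minimal sketch: one driver created and logged in once, then reused for every ad_link (the login URL here is just a placeholder). My actual loop is below:
```
# Minimal sketch (the login URL is a placeholder; the table/column names are from my database).
from selenium import webdriver
import mysql.connector

driver = webdriver.Chrome()                  # create the browser once
driver.get("https://example.com/login")      # placeholder login URL
# ... perform the login steps here, a single time ...

mydb = mysql.connector.connect(host="localhost", user="root", password="", database="m")
cursor = mydb.cursor()
cursor.execute("SELECT ad_link FROM test")

for (ad_link,) in cursor.fetchall():
    driver.get(ad_link)                      # reuse the already logged-in session
    print("requested:", ad_link)
    print("landed on:", driver.current_url)  # differs from ad_link if the page redirected
```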
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="",
database="m"
)
mycursor = mydb.cursor()
urllib3.disable_warnings()
sql = "SELECT ad_link FROM test"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 0
y = 5
#def fonksiyon(i):
#global x
#global y
number = np.arange(x,y)
for i in tqdm(number):
ad_link = dataframe.links[i] #ad_link = dataframe["links"][i]
print(ad_link)
Display = []
prefs = {"profile.managed_default_content_settings.images": 2} # this is to not load images
sx = random.randint(1000, 1500)
sn = random.randint(3000, 4500)
options = webdriver.ChromeOptions()
time.sleep(5)
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("detach", True)
capabilities = options.to_capabilities()
os.environ['WDM_SSL_VERIFY'] = '0'
options.add_experimental_option("prefs", prefs)
wsize = "--window-size=" + str(sx - 10) + ',' + str(sn - 10)
options.add_argument(wsize)
# add_argument takes one flag per call, not a list
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
service = Service(executable_path = r'C:\Users\Wiveda\chromedriver.exe')
test = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
sleep_time = 5
test.get(ad_link)
time.sleep(sleep_time)
ad_source = test.page_source
ad_soup = BeautifulSoup(ad_source, 'lxml')
mainresults = ad_soup.find_all('div', {'class': 'cBox u-inherit '})
try:
WebDriverWait(test, timeout=10).until(
lambda d: d.find_element(By.XPATH, "//button[@class='sc-bczRLJ-accept-btn']")).click()
WebDriverWait(test, timeout=10).until(
lambda d: d.find_element(By.XPATH, "//p[@class='phone-container']")).click()
tel_number = test.find_element(By.XPATH, "//p[@class='phone-container']").text
ic(tel_number)
except:
tel_number = 'Not Found Tel Number'
ic(tel_number)
time.sleep(1)
search_words = ""
try:
web_text = test.find_element(By.XPATH, "/html/body/div[6]/div/div[2]/div[3]/div[1]").text  # .text so re.findall receives a string, not a WebElement
words = ["Import", "x", "y", "z"]
search_words = [word for word in words if re.findall(word, web_text)]
text_words = ''
if search_words:
for i, word in enumerate(search_words):
if i < len(search_words) - 1:
text_words += f"{word}, "
else:
text_words += f"{word}."
ic(f"\nCannot send mail because it contains the word.Index : {text_words}")
ic(re.findall)
print("İf tamamlandı")
print("Try tamamlandı")
except Exception:
text_words = "Not Found Words"
ic(text_words)
time.sleep(1)
#mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
try:
brand_and_model = ad_soup.find("h1", {"class": ('h u-word')}).get_text()
except:
brand_and_model = ' '
try:
model_version = ad_soup.find("div", {"class": ('list-title')}).get_text()
except:
model_version = ' '
try:
location = ad_soup.find("p", {"class": ('seller-address')}).get_text()
except:
location = ' '
try:
url_id = ad_soup.find(" ", {"class": ('')}).get_text()
except:
url_id = ''
cars_data = pd.DataFrame({
'brand_and_model': brand_and_model,
'model_version': model_version,
'location': location,
'tel_number': tel_number,
'url_id': url_id,
},
index=[0])
try:
table_pre = ad_soup.find("div", {"class": "cBox cBox--content cBox-body"}) # 1 (6 in one)
all_div = table_pre.findAll("div", {"class": ('key-feature__content')}) # 6 (2 in one)
all_title = table_pre.findAll("div", {"class": ('key-feature__label')}) # 6
all_results = table_pre.findAll("div", {"class": ('key-feature__value')}) # 6
except:
pass
description_list = []
value_list = []
try:
div_length = len(all_div)
except:
div_length = 6
for i in range(div_length):
try:
description_list.append(all_title[i].text)
description_list = list(map(lambda x: x.replace(" ", "_"), description_list))
value_list.append(all_results[i].text)
except:
description_list.append('')
value_list.append('')
all_key = []
all_value = []
try:
pdiv = ad_soup.find_all('div', {'class': 'bullet-list'})
except:
pass
equipment_key = []
try:
equipment_key_length = len(pdiv)
except:
equipment_key_length = 1
equipment_value = []
try:
dd_ul_li_length = len(pdiv)
except:
dd_ul_li_length = 1
df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
df1 = pd.DataFrame(list(zip(description_list, value_list)), columns=['description_list', 'value_list'])
df1 = df1.set_index('description_list').T.reset_index(drop=True)
df1 = df1.rename_axis(None, axis=1)
df1['link'] = ad_link
df1.insert(0, "brand_and_model", brand_and_model)
df1.insert(1, "model_version", model_version)
df1.insert(2, "location", location)
df1.insert(5, "tel_number", tel_number)
df2_3 = pd.concat([df2, df3])
df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
df2_3 = df2_3.rename_axis(None, axis=1)
df_last = pd.concat([df1, df2_3], axis=1)
df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
lambda x: x.apply(','.join, 1))
now = datetime.now()
datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
df_last['ad_link'] = ad_link
df_last['download_date_time'] = datetime_string
config = configparser.RawConfigParser()
config.read(filenames='my.properties')
scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m',
charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
cursor = scrap_db.cursor()
sql = """CREATE TABLE CARS(
brand_and_model VARCHAR(32),
model_version VARCHAR(64),
location VARCHAR(64),
tel_number VARCHAR(32),
mileage VARCHAR(32),
first_registration DATE(7),
ad_link VARCHAR(256),
download_date_time DATE(32),
search words VARCHAR(64)
url_id int(9)
)"""
#cursor.execute(sql) #Save data to the table
for row_count in range(0, df_last.shape[0]):
chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
brand_and_model = ""
model_version = ""
location = ""
tel_number = ""
mileage = ""
first_registration = ""
ad_link = ""
download_date_time = ""
url_id = ""
lenght_of_chunk = len(chunk[0])
if "brand_and_model" in cars_data:
try:
brand_and_model = chunk[0][0]
except:
brand_and_model = ""
if "model_version" in cars_data:
try:
model_version = chunk[0][1]
except:
model_version = ""
if "location" in cars_data:
try:
location = chunk[0][2]
except:
location = ""
if "tel_number" in cars_data:
try:
tel_number = chunk[0][5]
except:
tel_number = ""
if "Kilometerstand" in description_list:
index_no = description_list.index("Kilometerstand")
try:
mileage = value_list[index_no]
except:
mileage = ""
if "Erstzulassung" in description_list:
index_no = description_list.index("Erstzulassung")
try:
first_registration = value_list[index_no]
except:
first_registration = ""
if chunk[0][lenght_of_chunk - 2] != "":
ad_link = chunk[0][lenght_of_chunk - 2] # ad_link
if chunk[0][lenght_of_chunk - 1] != "":
download_date_time = chunk[0][lenght_of_chunk - 1] # datetime_string
if (brand_and_model == ' '):
control = "false"
else:
control = "true"
if control == "true":
mySql_insert_query = "INSERT INTO CARS(brand_and_model,model_version,location,tel_number,mileage,first_registration,ad_link,download_date_time,url_id) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
val = (brand_and_model, model_version, location, tel_number, mileage, first_registration, ad_link, download_date_time, url_id)
time.sleep(5)
cursor = scrap_db.cursor()
cursor.execute(mySql_insert_query, val)
scrap_db.commit()
ic(cursor.rowcount, "Record inserted successfully into *CARS* table")
if (tel_number == 'Not Found Tel Number') and (text_words == 'Not Found Words'):
control = "true"
else:
control = "pass"
time.sleep(10)
if control == "true":
ic("Mail Sending")
test.find_element(By.XPATH, "/div[2]/div[2]/div/div[4]/div/span").click()
test.implicitly_wait(5)
eMl = test.find_element(By.XPATH, "/html/div[1]/form/div[1]/div/input")
test.implicitly_wait(3)
eMl.click()
time.sleep(10)
eMl.send_keys("v#gmail.com")
time.sleep(8)
passw = WebDriverWait(test, 20).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/form/div[2]/div/input")))
test.implicitly_wait(3)
passw.click()
time.sleep(1)
passw.send_keys('W' + Keys.ENTER)
time.sleep(5)
test.find_element(By.XPATH,'/html/div/div[2]/a').click()
time.sleep(3)
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
}
response = requests.get(ad_link, headers=headers, allow_redirects=True, timeout=2.50)
ic(response.status_code)
if response.history:
ic("Request was redirected")
for resp in response.history:
ic(resp.status_code, resp.url)
ic('For Control')
ic("Final destination:")
ic(response.status_code, response.url)  # Response has no .all_links attribute; .url is the final URL
ic(ad_link)
else:
ic("Request was not redirected")
ic(response)
time.sleep(sleep_time)
ic('destination_status')
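A hedged note on the snippet above: requests.get() opens a separate HTTP session that knows nothing about the cookies of the logged-in Selenium session, so a link that redirects in the browser may not redirect for requests at all. Within the browser itself, driver.current_url after test.get(ad_link) already shows the final URL. If the check really has to happen over HTTP, one option is to copy the browser cookies into a requests.Session, roughly like this sketch:
```
# Sketch only: reuse the Selenium login cookies in a requests.Session.
# `test` is the webdriver and `ad_link`/`headers` are the variables from the code above.
import requests

session = requests.Session()
for c in test.get_cookies():
    session.cookies.set(c['name'], c['value'], domain=c.get('domain'))

response = session.get(ad_link, headers=headers, allow_redirects=True, timeout=10)
if response.history:
    for resp in response.history:
        print("redirected:", resp.status_code, resp.url)
print("final destination:", response.status_code, response.url)
```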

Related

WebDriverException when trying to scrape Amazon for product title and price using Selenium

I'm attempting to scrape Amazon for iPhone 11 names and prices, but when I run the code, I get the following error:
The Error I get:
My code is as follows:
```
#First project
class CrawledInfo:
def __init__(self, product_name, product_price, cust_name = None, cust_location = None, rating = None, review = None, review_date = None)-> None:
self.cust_name = cust_name
self.cust_location = cust_location
self.product_name = product_name
self.product_price = product_price
self.rating = rating
self.review = review
self.review_date = review_date
class CrawlerBot:
def item(self, name):
count = 1
page = 1
pageIncrement = 1
maxRetrieves = 100
url = 'https://www.amazon.co.uk/s?k='+ name + '&page=' + str(page)
l = []
#Declaring options
options = Options()
options.headless = False
options.add_experimental_option('detach', True)
browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
browser.maximize_window()
browser.get(url)
browser.set_page_load_timeout(10)
while True:
try:
if pageIncrement * page > maxRetrieves:
break
if count > pageIncrement:
page +=1
count = 1
#Capture item name
xPathTitle = '//*[#id="search"]/div[1]/div[2]/div/span[3]/div[2]/div[' + str(count) + ']/div/span/div/div/div[2]/div[2]/div/div[1]/div/div/div[1]/h2/a/span'
title = browser.find_element_by_xpath(xPathTitle)
titleText = title.get_attribute('innerHTML').splitLines()[0]
title.click()
#Capture item price
xPathPrice = '//*[#id="price_inside_buybox"]'
price = browser.find_element_by_xpath(xPathPrice)
priceText = price.get_attribute('innerHTML').splitLines()
#Return to the search page
url = 'https://www.amazon.co.uk/s?k='+ name + '&page=' + str(page)
browser.get(url)
browser.set_page_load_timeout(10)
#Send the results to class CrawledInfo
info = CrawledInfo(titleText, priceText)
l.append(info)
count +=1
except Exception as e:
print('Exception: ', e)
count +=1
if pageIncrement * page > maxRetrieves:
break
if count > pageIncrement:
page +=1
count = 1
#Return to the search page
url = 'https://www.amazon.co.uk/s?k='+ name + '&page=' + str(page)
browser.get(url)
browser.set_page_load_timeout(10)
browser.close()
return l
#Creating the object
start_crawler = CrawlerBot()
with open('results', 'w', newline='', encoding='utf-8') as fileWriter:
dataWriter = csv.writer(fileWriter, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for dat in start_crawler.item('iphone 11'):
dataWriter.writerow([dat.product_name, dat.product_price])
```
Does anyone have an idea of what's wrong?
When my code is working right, I expect it to create a CSV file with the iPhone 11 names together with their prices.
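One hedged suggestion, not a confirmed fix for the WebDriverException: instead of clicking through each result with a long absolute XPath, the titles and prices can be read directly from the result cards on the search page. The CSS selectors below are assumptions about Amazon's current search markup and may need adjusting:
```
# Sketch: collect (title, price) pairs from the search result cards in one pass.
# The data-component-type / class selectors are assumptions about Amazon's markup.
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.amazon.co.uk/s?k=iphone+11&page=1')

rows = []
for card in browser.find_elements(By.CSS_SELECTOR, 'div[data-component-type="s-search-result"]'):
    try:
        title = card.find_element(By.CSS_SELECTOR, 'h2 a span').text
        price = card.find_element(By.CSS_SELECTOR, 'span.a-price span.a-offscreen').get_attribute('textContent')
        rows.append((title, price))
    except Exception:
        continue  # card without a price (sponsored/unavailable), skip it

with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=';').writerows(rows)
browser.quit()
```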

ValueError: 4 columns passed, passed data had 1 columns

I'm a new learner. If the data in a column from the webpage is not an integer, I cannot append the row to my data frame.
[Webpage data as seen in this image](https://i.stack.imgur.com/KBjRU.png)
Here is my code:
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
ticker = input("Type your ticker symbol: ")
def get_balance_sheet_from_yfinance_web(ticker):
url = f"https://finance.yahoo.com/quote/%7Bticker%7D/balance-sheet?p=%7Bticker%7D"
header = {'Connection': 'keep-alive',
'Expires': '-1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
r = requests.get(url, headers=header)
html = r.text
soup = BeautifulSoup(html, "html.parser")
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
col = []
pd.set_option('max_colwidth', None)
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
df = pd.DataFrame(columns=col)
pd.set_option('max_colwidth', None)
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
i = 0
idx = ""
val = []
for h in div.find_all('span'):
if i == 0:
idx = h.get_text()
else:
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
i += 1
row = pd.DataFrame([val], columns=col, index=[idx])
df = df.append(row)
df.to_csv(f'{ticker}.csv')
return df
print(get_balance_sheet_from_yfinance_web(ticker))
I have tried replace('-', 0)
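One hedged way around this, sketched below: convert a cell only when it really is numeric, treat Yahoo's '-' placeholders as missing, and pad the value list to the header length before building the row (DataFrame.append is also removed in pandas 2.x, so pd.concat is used here). The sample headers and row label are placeholders:
```
import pandas as pd

def to_number(text):
    """Return the cell as thousands of units, or None when it is '-' or empty."""
    cleaned = text.replace(",", "").strip()
    if cleaned in ("", "-"):
        return None
    try:
        return int(cleaned) * 1000
    except ValueError:
        return None

# Example: pad the parsed values to the header length before building the row.
col = ["09/30/2022", "09/30/2021"]            # placeholder column headers
val = [to_number("123,456"), to_number("-")]  # second cell is missing on the page
val = (val + [None] * len(col))[:len(col)]
df = pd.DataFrame(columns=col)
df = pd.concat([df, pd.DataFrame([val], columns=col, index=["Total Assets"])])
print(df)
```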

Oanda API - Issue Price - Instruments

I'm using the Oanda API to automate trading strategies. I have a 'price' error that only occurs when selecting some instruments, such as XAG (silver); my guess is that there is a classification difference, but Oanda has yet to answer on the matter.
The error does not occur when selecting Forex pairs.
If anyone has had such issues in the past and managed to solve them, I'll be happy to hear from them.
PS: I'm UK based and have access to most products, including CFDs.
class SMABollTrader(tpqoa.tpqoa):
def __init__(self, conf_file, instrument, bar_length, SMA, dev, SMA_S, SMA_L, units):
super().__init__(conf_file)
self.instrument = instrument
self.bar_length = pd.to_timedelta(bar_length)
self.tick_data = pd.DataFrame()
self.raw_data = None
self.data = None
self.last_bar = None
self.units = units
self.position = 0
self.profits = []
self.price = []
#*****************add strategy-specific attributes here******************
self.SMA = SMA
self.dev = dev
self.SMA_S = SMA_S
self.SMA_L = SMA_L
#************************************************************************
def get_most_recent(self, days = 5):
while True:
time.sleep(2)
now = datetime.utcnow()
now = now - timedelta(microseconds = now.microsecond)
past = now - timedelta(days = days)
df = self.get_history(instrument = self.instrument, start = past, end = now,
granularity = "S5", price = "M", localize = False).c.dropna().to_frame()
df.rename(columns = {"c":self.instrument}, inplace = True)
df = df.resample(self.bar_length, label = "right").last().dropna().iloc[:-1]
self.raw_data = df.copy()
self.last_bar = self.raw_data.index[-1]
if pd.to_datetime(datetime.utcnow()).tz_localize("UTC") - self.last_bar < self.bar_length:
break
def on_success(self, time, bid, ask):
print(self.ticks, end = " ")
recent_tick = pd.to_datetime(time)
df = pd.DataFrame({self.instrument:(ask + bid)/2},
index = [recent_tick])
self.tick_data = self.tick_data.append(df)
if recent_tick - self.last_bar > self.bar_length:
self.resample_and_join()
self.define_strategy()
self.execute_trades()
def resample_and_join(self):
self.raw_data = self.raw_data.append(self.tick_data.resample(self.bar_length,
label="right").last().ffill().iloc[:-1])
self.tick_data = self.tick_data.iloc[-1:]
self.last_bar = self.raw_data.index[-1]
def define_strategy(self): # "strategy-specific"
df = self.raw_data.copy()
#******************** define your strategy here ************************
df["SMA"] = df[self.instrument].rolling(self.SMA).mean()
df["Lower"] = df["SMA"] - df[self.instrument].rolling(self.SMA).std() * self.dev
df["Upper"] = df["SMA"] + df[self.instrument].rolling(self.SMA).std() * self.dev
df["distance"] = df[self.instrument] - df.SMA
df["SMA_S"] = df[self.instrument].rolling(self.SMA_S).mean()
df["SMA_L"] = df[self.instrument].rolling(self.SMA_L).mean()
df["position"] = np.where(df[self.instrument] < df.Lower) and np.where(df["SMA_S"] > df["SMA_L"] ,1,np.nan)
df["position"] = np.where(df[self.instrument] > df.Upper) and np.where(df["SMA_S"] < df["SMA_L"], -1, df["position"])
df["position"] = np.where(df.distance * df.distance.shift(1) < 0, 0, df["position"])
df["position"] = df.position.ffill().fillna(0)
self.data = df.copy()
#***********************************************************************
def execute_trades(self):
if self.data["position"].iloc[-1] == 1:
if self.position == 0 or None:
order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
self.report_trade(order, "GOING LONG")
elif self.position == -1:
order = self.create_order(self.instrument, self.units * 2, suppress = True, ret = True)
self.report_trade(order, "GOING LONG")
self.position = 1
elif self.data["position"].iloc[-1] == -1:
if self.position == 0:
order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
self.report_trade(order, "GOING SHORT")
elif self.position == 1:
order = self.create_order(self.instrument, -self.units * 2, suppress = True, ret = True)
self.report_trade(order, "GOING SHORT")
self.position = -1
elif self.data["position"].iloc[-1] == 0:
if self.position == -1:
order = self.create_order(self.instrument, self.units, suppress = True, ret = True)
self.report_trade(order, "GOING NEUTRAL")
elif self.position == 1:
order = self.create_order(self.instrument, -self.units, suppress = True, ret = True)
self.report_trade(order, "GOING NEUTRAL")
self.position = 0
def report_trade(self, order, going):
time = order["time"]
units = order["units"]
price = order["price"]
pl = float(order["pl"])
self.profits.append(pl)
cumpl = sum(self.profits)
print("\n" + 100* "-")
print("{} | {}".format(time, going))
print("{} | units = {} | price = {} | P&L = {} | Cum P&L = {}".format(time, units, price, pl, cumpl))
print(100 * "-" + "\n")
trader = SMABollTrader("oanda.cfg", "EUR_GBP", "15m", SMA = 82, dev = 4, SMA_S = 38, SMA_L = 135, units = 100000)
trader.get_most_recent()
trader.stream_data(trader.instrument, stop = None )
if trader.position != 0: # if we have a final open position
close_order = trader.create_order(trader.instrument, units = -trader.position * trader.units,
suppress = True, ret = True)
trader.report_trade(close_order, "GOING NEUTRAL")
trader.signal = 0
I have done the Hagmann course as well, and I recognised your code immediately.
Firstly, the way you define your positions is not the best. Look at the section on combining two strategies; there are two ways to do it.
Now, regarding your price problem: I had a similar situation with BTC. You can download its historical data, but when I plugged it into the strategy code and started to stream, I got exactly the same error, indicating that tick data was never streamed.
I am guessing that simply not all instruments are tradeable via the API, or in your case maybe you tried to stream outside trading hours?
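For example, a hedged sketch of the nested np.where style from that section; the column names are the ones your define_strategy already builds, and the exact entry rules remain an assumption:
```
# Sketch: combine the Bollinger condition with the SMA crossover without chained `and`.
import numpy as np
import pandas as pd

def define_position(df: pd.DataFrame, instrument: str) -> pd.Series:
    cond_long = (df[instrument] < df["Lower"]) & (df["SMA_S"] > df["SMA_L"])
    cond_short = (df[instrument] > df["Upper"]) & (df["SMA_S"] < df["SMA_L"])
    position = np.where(cond_long, 1, np.nan)
    position = np.where(cond_short, -1, position)
    position = np.where(df["distance"] * df["distance"].shift(1) < 0, 0, position)
    return pd.Series(position, index=df.index).ffill().fillna(0)
```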

Why aren't all of the coordinates generated when running Geopy to scrape Yellow Pages?

The output is a CSV file with a list of businesses including name, address, telephone and coordinates. For some reason only partial coordinates are generated. The ones that aren't generated, when run individually with geopy, do return coordinates, so geopy can potentially find the coordinates for all of them, but for some reason it sometimes skips them. I thought it might need some time to call the API, so I added threading, but that didn't solve the issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ypscraper#gmail.com")
main_list = []
def extract(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup.find_all('div', class_ = 'listing__content__wrap--flexed jsGoToMp')
def transform(articles):
for item in articles:
name = item.find('a', class_ ='listing__name--link listing__link jsListingName').text
try:
street = item.find('span', {'itemprop':'streetAddress'}).text
except:
street = ''
try:
city = item.find('span', {'itemprop':'addressLocality'}).text
except:
city = ''
try:
province = item.find('span', {'itemprop':'addressRegion'}).text
except:
province = ''
try:
postCode = item.find('span', {'itemprop':'postalCode'}).text
except:
postCode = ''
try:
phone = item.find('li', class_ = 'mlr__submenu__item').text.strip()
except:
phone = ''
try:
def search_geo():
global location
location = geolocator.geocode(street + ' ' + city)
print(street + ' ' + city)
thread = threading.Thread(target=search_geo)
thread.start()
thread.join()
slatitude = location.latitude
except:
slatitude = ''
try:
thread = threading.Thread(target=search_geo)
thread.start()
thread.join()
slongitude = location.longitude
except:
slongitude = ''
business = {
'name': name,
'street': street,
'city': city,
'province': province,
'postCode': postCode,
'phone': phone,
'slongitude': slongitude,
'slatitude': slatitude
}
main_list.append(business)
return
def load():
df = pd.DataFrame(main_list)
df.to_csv('repairshopsbc', index=False)
for x in range(1,2):
print(f'Getting page {x}')
articles = extract(f'https://www.yellowpages.ca/search/si/{x}/car+repair/British+Columbia+BC')
transform(articles)
time.sleep(5)
load()
print('Saved to CSV')
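One hedged explanation: Nominatim's public endpoint allows roughly one request per second, so back-to-back geocode calls can come back as None and those coordinates get skipped. geopy ships a RateLimiter wrapper for exactly this case; a minimal sketch (the query is the same street + city string the code already builds, and the sample address is hypothetical):
```
# Sketch: throttle and retry geocoding instead of wrapping it in threads.
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="ypscraper@gmail.com")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=2)

def coordinates(street, city):
    location = geocode(f"{street} {city}")
    if location is None:      # address not found: keep the row, just without coordinates
        return "", ""
    return location.latitude, location.longitude

slatitude, slongitude = coordinates("800 Griffiths Way", "Vancouver")  # hypothetical address
```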

Unable to parse data correctly in BeautifulSoup

Below is a snippet of the code I am using in order to parse data off a webpage
link1 = "https://www.codechef.com/status/" + sys.argv[1] + "?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link1)
s = response.read()
soup = BeautifulSoup(s)
l = soup.findAll('tr',{'class' : 'kol'})
Here is the URL of an example page that gets stored in the variable link1
https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO
Now, the problem is that the variable l always gets an empty list, even though there are entries in the table generated by the HTML tags I am trying to find.
Please help me out with this.
EDIT
Complete Code
from BeautifulSoup import BeautifulSoup
import urllib2
import os
import sys
import subprocess
import time
import HTMLParser
import requests
html_parser = HTMLParser.HTMLParser()
link = "https://www.codechef.com/status/"+sys.argv[1]+"?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open(link)
s = response.read()
soup = BeautifulSoup(s)
try:
l = soup.findAll('div',{'class' : 'pageinfo'})
for x in l:
str_val = str(x.contents)
pos = str_val.find('of')
i = pos+3
x = 0
while i < len(str_val):
if str_val[i] >= str(0) and str_val[i] <= str(9):
x = x*10 + int(str_val[i])
i += 1
except:
x = 1
print x
global lis
lis = list()
break_loop = 0
for i in range(0,x):
print i
if break_loop == 1:
break
if i == 0:
link1 = link
else:
link1 = "https://www.codechef.com/status/"+sys.argv[1]+"?page="+str(i)+"&sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
# opener = urllib2.build_opener()
# opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# response = opener.open(link1)
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
req = requests.get(link1, headers={'User-Agent': useragent})
# s = response.read()
soup = BeautifulSoup(req.content)
l = soup.findAll('tr',{'class' : r'\"kol\"'})
print l
for val in l:
lang_val = val.find('td',{'width' : '70'})
lang = lang_val.renderContents().strip()
print lang
try:
data = val.find('td',{'width' : '51'})
data_val = data.span.contents
except:
break
if lang != 'PHP':
break_loop = 1
break
if len(data_val) > 1 and html_parser.unescape(data_val[2]) != '100':
continue
str_val = str(val.td.contents)
p = 0
j = 0
while p < len(str_val):
if str_val[p] >= str(0) and str_val[p] <= str(9):
j = j*10 + int(str_val[p])
p += 1
lis.insert(0,str(j))
if len(lis) > 0:
try:
os.mkdir(sys.argv[1]+"_php")
except:
pass
count = 1
for data in lis:
cmd = "python parse_data_final.py "+data+" > "+sys.argv[1]+"_php/"+sys.argv[1]+"_"+str(count)+".php"
subprocess.call(cmd, shell=True)
count += 1
Your code doesn't work because your class is wrong; try it with:
l = soup.findAll('tr',{'class' : r'\"kol\"'})
You can also get the tags like this:
l = soup.find('table', {'class': 'dataTable'}).tbody
Also, you should probably be using requests, depending on which version of Python you're using. Here's an example:
import requests
from bs4 import BeautifulSoup
url = "https://www.codechef.com/status/CIELAB?sort_by=All&sorting_order=asc&language=29&status=15&handle=&Submit=GO"
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
req = requests.get(url, headers={'User-Agent': useragent})
soup = BeautifulSoup(req.content, "html.parser")
#l = soup.findAll('tr',{'class' : r'\"kol\"'})
l = soup.find('table', {'class': 'dataTable'}).tbody
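As a usage note on that last snippet, the rows inside the returned tbody can then be walked cell by cell; a short hedged sketch continuing from l above:
```
# Continuing from the snippet above: walk the submission rows and print the cell text.
# Assumes `l` is the <tbody> Tag returned by soup.find('table', {'class': 'dataTable'}).tbody.
for row in l.find_all('tr'):
    cells = [td.get_text(strip=True) for td in row.find_all('td')]
    if cells:                  # skip header/empty rows
        print(cells)
```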