URL Redirect Python - Selenium
I want to redirect the page to another URL, but it isn't happening. I would appreciate your help.
headers = {
    'Accept': 'application/json',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
}
response = requests.get(ad_link, headers=headers, allow_redirects=True)
ic(response.status_code)
# ic(response.history)
if response.history:
    ic("Request was redirected")
    for resp in response.history:
        ic(resp.status_code, resp.url)
        ic('For Control')
    ic("Final destination:")
    ic(response.status_code, response.url)  # Response has no all_links attribute; response.url is the final URL
    ic(ad_link)
else:
    ic("Request was not redirected")
    ic(response)
time.sleep(sleep_time)
I wrote a loop like this; the response status is 200, but the page never opens on the screen, and the for loop over response.history never runs at all.
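As far as I understand it, requests follows redirects on its own, so an empty response.history just means the server answered 200 directly without redirecting, and requests never shows anything on screen in any case; it only returns data. Here is a minimal sketch of how the final URL could be checked and then displayed in the already-open browser (assuming ad_link, headers, and the driver object test from the code further down already exist):

# Minimal sketch: inspect redirects with requests, then show the final page in the open Selenium driver.
# Assumes ad_link, headers, and the driver object `test` from the code below.
import requests

response = requests.get(ad_link, headers=headers, allow_redirects=True, timeout=10)
if response.history:  # non-empty only if the server actually redirected
    for resp in response.history:
        print("redirect hop:", resp.status_code, resp.url)
print("final destination:", response.url)  # requests only returns data; it never opens a window
test.get(response.url)  # to see the page on screen, load the final URL in the browser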
I go to a URL and log in, and then I want it to take me to a link from the column named 'ad_link' in the database, because I don't want to have to open that session every time. I want to browse the links in my database with one open session.
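To keep a single logged-in session instead of launching a new Chrome for every row, the driver could be created once before the loop and only driver.get() called per link. A rough sketch of that structure (the host, database, table, and column names are the same assumptions used in the full code below):

# Rough sketch: one Selenium session reused for every ad_link pulled from the database.
import mysql.connector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

mydb = mysql.connector.connect(host="localhost", user="root", password="", database="m")
cursor = mydb.cursor()
cursor.execute("SELECT ad_link FROM test")
links = [row[0] for row in cursor.fetchall()]

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# ... log in once here, before the loop ...
for link in links:
    driver.get(link)  # the same browser window and session are reused for every link
    # ... scrape the page here ...
driver.quit()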
# Imports used by the code below
import os
import re
import time
import random
import configparser
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import urllib3
import mysql.connector
import pymysql
from bs4 import BeautifulSoup
from icecream import ic
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
    database="m"
)
mycursor = mydb.cursor()
urllib3.disable_warnings()
sql = "SELECT ad_link FROM test"
mycursor.execute(sql)
myresult = mycursor.fetchall()
all_links = myresult[0:]
len_all_links = len(all_links)
dataframe = pd.DataFrame(all_links, columns=['links'])
x = 0
y = 5
# def fonksiyon(i):
#     global x
#     global y
number = np.arange(x, y)
for i in tqdm(number):
    ad_link = dataframe.links[i]  # ad_link = dataframe["links"][i]
    print(ad_link)
    Display = []
    prefs = {"profile.managed_default_content_settings.images": 2}  # do not load images
    sx = random.randint(1000, 1500)
    sn = random.randint(3000, 4500)
    options = Options()
    options = webdriver.ChromeOptions()  # immediately replaces the Options() object above; one of the two is enough
    time.sleep(5)
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("detach", True)
    capabilities = options.to_capabilities()
    os.environ['WDM_SSL_VERIFY'] = '0'
    options.add_experimental_option("prefs", prefs)
    wsize = "--window-size=" + str(sx - 10) + ',' + str(sn - 10)
    options.add_argument(str(wsize))
    # options.add_argument("prefs") removed: add_argument() only takes flag strings; prefs are set via add_experimental_option above
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)  # duplicate of the prefs set above; harmless
    # add_argument() takes one flag at a time, not a list
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    service = Service(executable_path=r'C:\Users\Wiveda\chromedriver.exe')  # unused; the next line builds its own Service
    test = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)  # pass options, otherwise the settings above are ignored
    sleep_time = 5
    test.get(ad_link)
    time.sleep(sleep_time)
    ad_source = test.page_source
    ad_soup = BeautifulSoup(ad_source, 'lxml')
    mainresults = ad_soup.find_all('div', {'class': 'cBox u-inherit '})
    try:
        WebDriverWait(test, timeout=10).until(
            lambda d: d.find_element(By.XPATH, "//button[@class='sc-bczRLJ-accept-btn']")).click()
        WebDriverWait(test, timeout=10).until(
            lambda d: d.find_element(By.XPATH, "//p[@class='phone-container']")).click()
        tel_number = test.find_element(By.XPATH, "//p[@class='phone-container']").text
        ic(tel_number)
    except:
        tel_number = 'Not Found Tel Number'
        ic(tel_number)
    time.sleep(1)
    search_words = ""
    try:
        web_text = test.find_element(By.XPATH, "/html/body/div[6]/div/div[2]/div[3]/div[1]").text  # .text: re.findall needs a string, not a WebElement
        words = ["Import", "x", "y", "z"]
        search_words = [word for word in words if re.findall(word, web_text)]
        text_words = ''
        if search_words:
            for i, word in enumerate(search_words):
                if i < len(search_words) - 1:
                    text_words += f"{word}, "
                else:
                    text_words += f"{word}."
            ic(f"\nCannot send mail because it contains the word. Index: {text_words}")
            ic(re.findall)
            print("if block completed")
        print("try block completed")
    except Exception:
        text_words = "Not Found Words"
        ic(text_words)
    time.sleep(1)
    # mainresults = ad_soup.find_all('div', {'class': 'cBox cBox--content u-overflow-inherit '})
    try:
        brand_and_model = ad_soup.find("h1", {"class": 'h u-word'}).get_text()
    except:
        brand_and_model = ' '
    try:
        model_version = ad_soup.find("div", {"class": 'list-title'}).get_text()
    except:
        model_version = ' '
    try:
        location = ad_soup.find("p", {"class": 'seller-address'}).get_text()
    except:
        location = ' '
    try:
        url_id = ad_soup.find(" ", {"class": ''}).get_text()
    except:
        url_id = ''
    cars_data = pd.DataFrame({
        'brand_and_model': brand_and_model,
        'model_version': model_version,
        'location': location,
        'tel_number': tel_number,
        'url_id': url_id,
    }, index=[0])
    try:
        table_pre = ad_soup.find("div", {"class": "cBox cBox--content cBox-body"})  # 1 (6 in one)
        all_div = table_pre.findAll("div", {"class": 'key-feature__content'})  # 6 (2 in one)
        all_title = table_pre.findAll("div", {"class": 'key-feature__label'})  # 6
        all_results = table_pre.findAll("div", {"class": 'key-feature__value'})  # 6
    except:
        pass
    description_list = []
    value_list = []
    try:
        div_length = len(all_div)
    except:
        div_length = 6
    for i in range(div_length):
        try:
            description_list.append(all_title[i].text)
            description_list = list(map(lambda x: x.replace(" ", "_"), description_list))
            value_list.append(all_results[i].text)
        except:
            description_list.append('')
            value_list.append('')
    all_key = []
    all_value = []
    try:
        pdiv = ad_soup.find_all('div', {'class': 'bullet-list'})
    except:
        pass
    equipment_key = []
    try:
        equipment_key_length = len(pdiv)
    except:
        equipment_key_length = 1
    equipment_value = []
    try:
        dd_ul_li_length = len(pdiv)
    except:
        dd_ul_li_length = 1
    df3 = pd.DataFrame(list(zip(equipment_key, equipment_value)), columns=['all_key', 'all_value'])
    df2 = pd.DataFrame(list(zip(all_key, all_value)), columns=['all_key', 'all_value'])
    df1 = pd.DataFrame(list(zip(description_list, value_list)), columns=['description_list', 'value_list'])
    df1 = df1.set_index('description_list').T.reset_index(drop=True)
    df1 = df1.rename_axis(None, axis=1)
    df1['link'] = ad_link
    df1.insert(0, "brand_and_model", brand_and_model)
    df1.insert(1, "model_version", model_version)
    df1.insert(2, "location", location)
    df1.insert(5, "tel_number", tel_number)
    df2_3 = pd.concat([df2, df3])
    df2_3 = df2_3.set_index('all_key').T.reset_index(drop=True)
    df2_3 = df2_3.rename_axis(None, axis=1)
    df_last = pd.concat([df1, df2_3], axis=1)
    df_last = df_last.astype(str).groupby(df_last.columns, sort=False, axis=1).agg(
        lambda x: x.apply(','.join, 1))
    now = datetime.now()
    datetime_string = str(now.strftime("%Y%m%d_%H%M%S"))
    df_last['ad_link'] = ad_link
    df_last['download_date_time'] = datetime_string
    config = configparser.RawConfigParser()
    config.read(filenames='my.properties')
    scrap_db = pymysql.connect(host='localhost', user='root', password='', database='m',
                               charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    cursor = scrap_db.cursor()
    sql = """CREATE TABLE CARS(
        brand_and_model VARCHAR(32),
        model_version VARCHAR(64),
        location VARCHAR(64),
        tel_number VARCHAR(32),
        mileage VARCHAR(32),
        first_registration VARCHAR(32),
        ad_link VARCHAR(256),
        download_date_time VARCHAR(32),
        search_words VARCHAR(64),
        url_id INT(9)
    )"""  # DATE(n) is not valid MySQL and the values are inserted as strings, so VARCHAR is used here
    # cursor.execute(sql)  # create the CARS table (run once)
    for row_count in range(0, df_last.shape[0]):
        chunk = df_last.iloc[row_count:row_count + 1, :].values.tolist()
        brand_and_model = ""
        model_version = ""
        location = ""
        tel_number = ""
        mileage = ""
        first_registration = ""
        ad_link = ""
        download_date_time = ""
        url_id = ""
        lenght_of_chunk = len(chunk[0])
        if "brand_and_model" in cars_data:
            try:
                brand_and_model = chunk[0][0]
            except:
                brand_and_model = ""
        if "model_version" in cars_data:
            try:
                model_version = chunk[0][1]
            except:
                model_version = ""
        if "location" in cars_data:
            try:
                location = chunk[0][2]
            except:
                location = ""
        if "tel_number" in cars_data:
            try:
                tel_number = chunk[0][5]
            except:
                tel_number = ""
        if "Kilometerstand" in description_list:
            index_no = description_list.index("Kilometerstand")
            try:
                mileage = value_list[index_no]
            except:
                mileage = ""
        if "Erstzulassung" in description_list:
            index_no = description_list.index("Erstzulassung")
            try:
                first_registration = value_list[index_no]
            except:
                first_registration = ""
        if chunk[0][lenght_of_chunk - 2] != "":
            ad_link = chunk[0][lenght_of_chunk - 2]  # ad_link
        if chunk[0][lenght_of_chunk - 1] != "":
            download_date_time = chunk[0][lenght_of_chunk - 1]  # datetime_string
        if brand_and_model == ' ':
            control = "false"
        else:
            control = "true"
        if control == "true":
            mySql_insert_query = "INSERT INTO CARS(brand_and_model,model_version,location,tel_number,mileage,first_registration,ad_link,download_date_time,url_id) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            val = (brand_and_model, model_version, location, tel_number, mileage, first_registration, ad_link, download_date_time, url_id)
            time.sleep(5)
            cursor = scrap_db.cursor()
            cursor.execute(mySql_insert_query, val)
            scrap_db.commit()
            ic(cursor.rowcount, "Record inserted successfully into *CARS* table")
    if (tel_number == 'Not Found Tel Number') and (text_words == 'Not Found Words'):
        control = "true"
    else:
        control = "pass"
    time.sleep(10)
    if control == "true":
        ic("Mail Sending")
        test.find_element(By.XPATH, "/div[2]/div[2]/div/div[4]/div/span").click()
        test.implicitly_wait(5)
        eMl = test.find_element(By.XPATH, "/html/div[1]/form/div[1]/div/input")
        test.implicitly_wait(3)
        eMl.click()
        time.sleep(10)
        eMl.send_keys("v@gmail.com")
        time.sleep(8)
        passw = WebDriverWait(test, 20).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[2]/div[1]/form/div[2]/div/input")))
        test.implicitly_wait(3)
        passw.click()
        time.sleep(1)
        passw.send_keys('W' + Keys.ENTER)
        time.sleep(5)
        test.find_element(By.XPATH, '/html/div/div[2]/a').click()
        time.sleep(3)
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64',
    }
    response = requests.get(ad_link, headers=headers, allow_redirects=True, timeout=2.50)
    ic(response.status_code)
    if response.history:
        ic("Request was redirected")
        for resp in response.history:
            ic(resp.status_code, resp.url)
            ic('For Control')
        ic("Final destination:")
        ic(response.status_code, response.url)  # Response has no all_links attribute; response.url is the final URL
        ic(ad_link)
    else:
        ic("Request was not redirected")
        ic(response)
    time.sleep(sleep_time)
    ic('destination_status')