Remove duplicate url's python beautifulsoup - beautifulsoup

I want to remove the duplicate url's from the file having list of url's. it has "http://www.naver.com/나눔글꼴.jpg" and they are repeating.. here is my code:
from bs4 import BeautifulSoup
import lxml
import re
import urllib.request
p = re.compile('나눔글꼴')
html = 'http://www.naver.com'
data = urllib.request.urlopen("http://www.naver.com").read()
soup = BeautifulSoup(data, 'lxml')
links = p.findall(str(soup))
i = set()
for i in links:
link = 'http://www.naver.com/' + str(i) + '.jpg'
print(link)

You forgot to give input to the set() method:
soup = BeautifulSoup(data, 'lxml')
links = p.findall(str(soup))
i = set(links)
for x in i:
link = 'http://www.naver.com/' + str(x) + '.jpg'
print(link)

Related

Extract Title Tags BeautifulSoup

I need help because I wanted to write code for finding out the title tags on a website. Although I used the code from another question and applied it to this scenario, there are no title tags whenever I print 'Beschreibung'.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse
webseite = 'https://www.entega.de/sitemap/'
response = requests.get(webseite)
response.status_code
soup = BeautifulSoup (response.content, 'html.parser')
result_container = soup.find_all('div', {'class':'clearfix'})
url_part_1 = 'https://www.entega.de/sitemap/'
url_part_2 = []
for item in result_container:
for link in item.find_all ('a', {'class':'modSitemap__lvl1Link ui-link' }):
url_part_2.append (link.get ('href'))
url_joined = []
for i in url_part_2:
url_joined.append (urllib.parse.urljoin(url_part_1,i))
Überschrift = []
Beschreibung = []
Verlinkungen = []
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', title=True, class_='modSitemap__lvl1Link ui-link'))
You are getting nothing because these links don't have an <a class="modSitemap__lvl1Link ui-link"> tag. They do have classes that start with that string though. You could expand to that. Or you can simply just get the <a> tags that have a title attribute.
So change your loop to either:
import re
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', {'class':re.compile("^modSitemap__lvl1Link ui-link")}, title=True, ))
or
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', title=True))

Scrape url link in table by BS4

I tried to scrape the hyperlinks in the tag (a herf) of the table. However, it doesn't work. Can you help to improve this code?
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
dfs = pd.DataFrame()
for i in range(1,11):
driver = webdriver.Chrome()
driver.get('https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo='+str(i)+'')
res = driver.execute_script('return document.documentElement.outerHTML')
time.sleep(3)
driver.quit()
soup = BeautifulSoup(res, 'lxml')
h_table = soup.find('table', {'class':'table_bd f_tac f_fs13'})
def tableDataText(h_table):
rows = []
trs = h_table.find_all('tr')
headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
if headerow: # if there is a header row include first
rows.append(headerow)
trs = trs[1:]
for tr in trs: # for every table row
rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
return rows
result_table = tableDataText(h_table)
df = pd.DataFrame(result_table[1:], columns=result_table[0])
dfs = pd.concat([dfs, df], ignore_index=True)
Your question and the expected result is not that clear and should be improved - If just wanna grab all the urls from the href you can go with:
from bs4 import BeautifulSoup
from selenium import webdriver
linkList = []
for i in range(1,11):
driver = webdriver.Chrome()
driver.get('https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo='+str(i)+'')
time.sleep(6)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
for a in soup.select('table#racecardlist table a'):
linkList.append('https://racing.hkjc.com'+a['href'])
linkList

Append href links into a dataframe list, getting all required info, but only links from last page appears

Using Beautiful Soup and pandas, I am trying to append all the links on a site into a list with the following code. I am able to scrape all pages with relevant information in the table. The code seems work to me somehow. But the small problem occurs is that just only links in the last page appears. The output is not what I expected. In the end, I'd like to append a list containing all 40 links (next to the required info) in 2 pages. I try scraping 2 pages first although there are 618 pages in total. Do you have any advice how to adjust the code so that each link is appended into the table? Many thanks in advance.
import pandas as pd
import requests
from bs4 import BeautifulSoup
hdr={'User-Agent':'Chrome/84.0.4147.135'}
dfs=[]
for page_number in range(2):
http= "http://example.com/&Page={}".format(page_number+1)
print('Downloading page %s...' % http)
url= requests.get(http,headers=hdr)
soup = BeautifulSoup(url.text, 'html.parser')
table = soup.find('table')
df_list= pd.read_html(url.text)
df = pd.concat(df_list)
dfs.append(df)
links = []
for tr in table.findAll("tr"):
trs = tr.findAll("td")
for each in trs:
try:
link = each.find('a')['href']
links.append(link)
except:
pass
df['Link'] = links
final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv',index=False,encoding='utf-8-sig')
It's with your logic. You only add the links column to the last df since it's outside your loop. Get the links within the page loop, then add that to df, then you can append the df to your dfs list:
import pandas as pd
import requests
from bs4 import BeautifulSoup
hdr={'User-Agent':'Chrome/84.0.4147.135'}
dfs=[]
for page_number in range(2):
http= "http://example.com/&Page={}".format(page_number+1)
print('Downloading page %s...' % http)
url= requests.get(http,headers=hdr)
soup = BeautifulSoup(url.text, 'html.parser')
table = soup.find('table')
df_list= pd.read_html(url.text)
df = pd.concat(df_list)
links = []
for tr in table.findAll("tr"):
trs = tr.findAll("td")
for each in trs:
try:
link = each.find('a')['href']
links.append(link)
except:
pass
df['Link'] = links
dfs.append(df)
final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv',index=False,encoding='utf-8-sig')

Unable to parse an element from eBay via BeautifulSoup

I am trying to scrape "number of items sold" on eBay but for some reason I cannot. I already have the title, price, and all I need is total_sold_price which I am unable to attain. Every time I run my code, I just get a blank for total_sold_price.
try:
title_selenium = driver.find_element_by_xpath('//*[#id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[#id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
My entire code: https://pastebin.com/bu8HgCDZ
Thank you so much.
Fixed it for you. You need to make the soup call inside your loop.
Note: I am using this path '../chromedriver', please change it to your path before running the code.
Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Chrome('../chromedriver')
driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=1')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.maximize_window()
tempList = []
for link in soup.find_all('a', href=True):
if 'itm' in link['href']:
print(link['href'])
tempList.append(link['href'])
array_length = len(tempList)
for i in range(array_length):
driver.get(tempList[i])
timeout = 5
try:
element_present = EC.presence_of_element_located((By.XPATH, '//*[#id="itemTitle"]'))
WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
print("Timed out waiting for page to load")
try:
title_selenium = driver.find_element_by_xpath('//*[#id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[#id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
#you need to call soup here due to your loop structure
soup = BeautifulSoup(driver.page_source, 'lxml')
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
print("title: ", title_selenium)
print("price: ", price_selenium)
print("total_sold_price: ", total_sold_price_BeautifulSoup)
print("\n")
i+=1
driver.close()

Parse bs4 table from Baseball reference

url = 'https://www.baseball-reference.com/boxes/NYN/NYN201704030.shtml'
def make_soup(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
return soup
soup = make_soup(url)
I'm trying to locate the play by play table on that page and I've exhausted every option. Any thoughts on how to locate?
This is the tbody located under div.table_outer_container.mobile_table
You can use Selenium in combination with BeautifulSoup to scrape that table content as follows:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("https://www.baseball-reference.com/boxes/NYN/NYN201704030.shtml")
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
pbp_table = soup.find_all("table", {"id":"play_by_play"})
for tag in pbp_table:
print (tag.text)
If you want to use this code be sure to look at the Selenium guide on drivers and download the latest geckodriver if you're using Firefox as in that code above.
It is commented out in the source:
Look for the something to identify the comment, i.e the play_by_play id
from requests import get
from bs4 import BeautifulSoup, Comment
cont = get("https://www.baseball-reference.com/boxes/NYN/NYN201704030.shtml").content
soup = BeautifulSoup(cont, "lxml")
# Search Comments
comment = soup.find(text=lambda n: isinstance(n, Comment) and 'id="play_by_play"' in n)
soup2 = BeautifulSoup(comment)
table = soup2.select("#play_by_play")[0]
Which gets what you want:
In [3]: from requests import get
...: from bs4 import BeautifulSoup, Comment
...: cont = get("https://www.baseball-reference.com/boxes/NYN/NYN201704030.sh
...: tml").content
...: soup = BeautifulSoup(cont, "lxml")
...: comment = soup.find(text=lambda n: isinstance(n, Comment) and 'id="pla
...: y_by_play"' in n)
...: soup2 = BeautifulSoup(comment, "lxml")
...: table = soup2.select("#play_by_play")[0]
...: print(table.select_one(".pbp_summary_top").text)
...:
Top of the 1st, Braves Batting, Tied 0-0, Mets' Noah Syndergaard facing 1-2-3
In [4]:
You can also use a regex with text=...:
cont = get("https://www.baseball-reference.com/boxes/NYN/NYN201704030.shtml").content
soup = BeautifulSoup(cont, "lxml")
comment = soup.find(text=compile('id="play_by_play"'))
soup2 = BeautifulSoup(comment, "lxml")
table = soup2.select("#play_by_play")[0]