Why is BeautifulSoup not parsing a simple Wikipedia table?

To help fight COVID-19 here in the Philippines, I'm trying to do some data analysis. My data source is the table of cases in Wikipedia. See https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines
I tried to get the table in Python with BeautifulSoup, but I cannot seem to get the content of the columns [Facility of admission or consultation, Had recent travel history abroad].
What am I doing wrong?
Here's my code (it can also be found here: https://github.com/gio888/covid19_ph2/blob/master/covid_import_from_wikipedia.ipynb):
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')

n_columns = 0
n_rows = 0
column_names = []

for row in table.find_all('tr'):
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        n_rows += 1
        if n_columns == 0:
            n_columns = len(td_tags)
    th_tags = row.find_all('th')
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())

columns = column_names if len(column_names) > 0 else range(0, n_columns)
df = pd.DataFrame(columns=columns, index=range(0, n_rows))

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        df.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
    if len(columns) > 0:
        row_marker += 1

for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass

df

import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get(
    "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines")

# The text of these cells comes from their CSS class, so rewrite the cells with
# readable values before handing the rendered page to pandas.
items = [["yes", "Yes"], ["no", "No"], ["TBA", "TBA"],
         ["status-d", "Died"], ["status-r", "Recovered"], ["status-a", "Admitted"]]
for item in items:
    script = (
        "document.querySelectorAll('.{}').forEach((element) => element.innerHTML = '{}')".format(*item))
    driver.execute_script(script)

df = pd.read_html(driver.page_source)[2]
df.to_csv("data.csv", index=False)
driver.quit()
Output: the table is saved to data.csv.
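The same idea also works without a browser: rewrite the cells in the parsed HTML with BeautifulSoup and then hand the table to pd.read_html. This is only a minimal sketch; the class names, the wikitable selector, and the assumption that the first parsed table is the right one are taken from the answers here and may have changed on the live page.
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Class-to-text mapping taken from the answers above; treat these names as assumptions.
CLASS_TEXT = {"yes": "Yes", "no": "No", "tba": "TBA",
              "status-d": "Died", "status-r": "Recovered", "status-a": "Admitted"}

url = "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# Write readable text into every cell whose class matches the mapping, then let
# pandas parse the modified markup instead of building the DataFrame cell by cell.
for css_class, text in CLASS_TEXT.items():
    for cell in soup.find_all("td", class_=css_class):
        cell.string = text

df = pd.read_html(str(soup.find("table", class_="wikitable")))[0]
print(df.head())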

import pandas as pd
import requests
from bs4 import BeautifulSoup

# url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"

# The visible text of these cells comes from their CSS class rather than from the
# cell contents, so map each class name to the text we want in the DataFrame.
css_content = {
    'status-a': 'Admitted',
    'status-r': 'Recovered',
    'status-d': 'Died',
    'yes': 'Yes',
    'no': 'No',
    'tba': 'TBA',
    'covid-sticky': 'skip_header'
}

def Check_att(source, value, col_value='', attribute='class'):
    # <tag attribute='value'>, e.g. <td class='status-a'>
    if col_value:
        return col_value  # a value was already resolved for this cell
    if value in source.attrs.get(attribute, []):
        return css_content.get(value, '')
    return ''

url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')

column_names = [col_name.text.rstrip('\n').strip() for col_name in table.select('tr.covid-sticky > th')]
n_rows = len(table.find_all('tr')) - 1  # one DataFrame row per <tr> after the header row
df = pd.DataFrame(columns=column_names, index=range(0, n_rows))

for row_index, row in enumerate(table.find_all('tr')[1:], 0):
    # if Check_att(row, 'covid-sticky'): continue
    columns = row.find_all('td')
    for col_index, column in enumerate(columns, 0):
        col_value = ''
        for css_class in ('status-a', 'status-r', 'status-d', 'yes', 'no', 'tba'):
            col_value = Check_att(column, css_class, col_value)
        if not col_value:
            col_value = column.get_text().rstrip('\n').strip()
        df.iat[row_index, col_index] = col_value

for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass

print(df)

Related

How can I fix 'list' object has no attribute 'to_csv' issue?

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys  # lets us send Enter via Keys
browser = webdriver.Chrome("C:/Users/EMRE/Desktop/SCRAPE/chromedriver_win32/chromedriver.exe")
import pandas as pd
browser.get("http://event.ybu.edu.tr/kulupler/")
import csv

#browser.fullscreen_window()
#time.sleep(2)
#for i in range(6):
#    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')  # scroll the page down
#    time.sleep(1)

Kulup_button = browser.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/a/div/div[1]/div")  # first club for now, change later
Kulup_button.click()
time.sleep(1)

for i in range(1):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')  # scroll the page down
    time.sleep(1)

kulupnames = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList = []
for kulupname in kulupnames:
    kulupList.append(kulupname.text)

mails = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-orange.btn-social")
MailList = []
for mail in mails:
    MailList.append(mail.text)

FacebookAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-blue.btn-social")
FacebookList = []
for FacebookAdress in FacebookAdresses:
    FacebookList.append(FacebookAdress.text)

TwitterAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-aqua")
TwitterList = []
for TwitterAdress in TwitterAdresses:
    TwitterList.append(TwitterAdress.text)

InstagramAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-light-blue")
InstagramList = []
for InstagramAdress in InstagramAdresses:
    InstagramList.append(InstagramAdress.text)

AkademikDanismanlar = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[2]/td[2]")
DanismanList = []
for AkademikDanisman in AkademikDanismanlar:
    DanismanList.append(AkademikDanisman.text)

KulupBaskanlari = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[3]/td[2]")
BaskanList = []
for KulupBaskani in KulupBaskanlari:
    BaskanList.append(KulupBaskani.text)

ToplamUyeler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[4]/td[2]")
UyeList = []
for Uye in ToplamUyeler:
    UyeList.append(Uye.text)

Etkinlikler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[5]/td[2]")
EtkinlikList = []
for Etkinlik in Etkinlikler:
    EtkinlikList.append(Etkinlik.text)

time.sleep(5)
browser.quit()

DataFile = csv.writer(open('AYBU.csv','w'))
DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
DataFile.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
liste = ['kulupList','MailList','FacebookList','TwitterList','InstagramList','DanismanList','BaskanList','UyeList','EtkinlikList']
df = pd.DataFrame(data = liste)
liste.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
I am trying to save my variable lists as a DataFrame to CSV.
You have a couple of flaws in your code that I can see.
I took your code, made it work, and I'll explain how:
import csv
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # lets us send Enter via Keys

browser = webdriver.Chrome()
time.sleep(5)
browser.get("http://event.ybu.edu.tr/kulupler/")

Kulup_button = browser.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/a/div/div[1]/div")  # first club for now, change later
Kulup_button.click()
time.sleep(1)

for _ in range(1):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')  # scroll the page down
    time.sleep(1)

kulupnames = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList = [kulupname.text for kulupname in kulupnames]

mails = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-orange.btn-social")
MailList = [mail.text for mail in mails]

FacebookAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-blue.btn-social")
FacebookList = [FacebookAdress.text for FacebookAdress in FacebookAdresses]

TwitterAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-aqua")
TwitterList = [TwitterAdress.text for TwitterAdress in TwitterAdresses]

InstagramAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-light-blue")
InstagramList = [InstagramAdress.text for InstagramAdress in InstagramAdresses]

AkademikDanismanlar = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[2]/td[2]")
DanismanList = [AkademikDanisman.text for AkademikDanisman in AkademikDanismanlar]

KulupBaskanlari = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[3]/td[2]")
BaskanList = [KulupBaskani.text for KulupBaskani in KulupBaskanlari]

ToplamUyeler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[4]/td[2]")
UyeList = [Uye.text for Uye in ToplamUyeler]

Etkinlikler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[5]/td[2]")
EtkinlikList = [Etkinlik.text for Etkinlik in Etkinlikler]

time.sleep(5)
browser.quit()

with open('AYBU.csv', 'w') as datafile:
    DataFile = csv.writer(datafile)
    DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])

liste = [kulupList, MailList, FacebookList, TwitterList, InstagramList, DanismanList, BaskanList, UyeList, EtkinlikList]
df = pd.DataFrame(data=liste)
df.to_csv("AYBU.csv", index=False, encoding='utf-8-sig')
The key changes are at the bottom (don't mind the cleanup of the loops into list comprehensions).
DataFile = csv.writer(open('AYBU.csv','w'))
DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
DataFile.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
liste = ['kulupList','MailList','FacebookList','TwitterList','InstagramList','DanismanList','BaskanList','UyeList','EtkinlikList']
df = pd.DataFrame(data = liste)
liste.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
This code doesn't work.
with open('AYBU.csv', 'w') as datafile:
    DataFile = csv.writer(datafile)
    DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])

liste = [kulupList, MailList, FacebookList, TwitterList, InstagramList, DanismanList, BaskanList, UyeList, EtkinlikList]
df = pd.DataFrame(data=liste)
df.to_csv("AYBU.csv", index=False, encoding='utf-8-sig')
You had the list names as strings, so liste was a list of nine strings rather than a list of your lists.
Also, a pandas DataFrame has .to_csv, but a plain list and a csv.writer object do not.
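A minimal illustration of that difference, with placeholder data:
import pandas as pd

rows = [["Club A", "a@example.com"], ["Club B", "b@example.com"]]  # placeholder values

# A plain Python list has no to_csv method:
# rows.to_csv("out.csv")  # AttributeError: 'list' object has no attribute 'to_csv'

# A DataFrame built from the same data does:
pd.DataFrame(rows, columns=["KulupAdi", "MailAdresi"]).to_csv("out.csv", index=False, encoding="utf-8-sig")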

Web Scraping Table with class from CoinMarketCap

I'm trying to scrape an entire table of data from:
https://coinmarketcap.com/currencies/ethereum/historical-data/
I'm trying to extract the table:
<table class="h7vnx2-2 jNaLNi cmc-table ">
with
table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
and it returns None.
Here's the full code:
import requests
from bs4 import BeautifulSoup
def main():
    URL = "https://coinmarketcap.com/currencies/ethereum/historical-data/"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find('table', {"class": "h7nvx2-2 jNaLNi cmc-table"})
    print(table)

if __name__ == "__main__":
    main()
import pandas as pd
from urllib.parse import urlencode
def main(url):
    params = {
        'id': '1027',
        'convertId': '2781',
        'timeStart': '1623283200',
        'timeEnd': '1628553600'
    }
    df = pd.DataFrame(pd.read_json(
        url + urlencode(params))['data']['quotes'])
    df = pd.DataFrame.from_records(df['quote'])
    print(df)

main('https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?')
As stated, the data is dynamically rendered. Go to the API to get the data directly:
import requests
import pandas as pd
url ='https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
payload = {
    'id': '1027',
    'convertId': '2781',
    'timeStart': '1623283200',
    'timeEnd': '1628553600'
}

jsonData = requests.get(url, params=payload).json()
df = pd.json_normalize(jsonData, record_path=['data', 'quotes'])

#rows = [i['quote'] for i in jsonData['data']['quotes']]
#df = pd.DataFrame(rows)
Output:
print(df)
open high ... marketCap timestamp
0 2611.142652 2619.957793 ... 2.872870e+11 2021-06-10T23:59:59.999Z
1 2472.858836 2495.414705 ... 2.736314e+11 2021-06-11T23:59:59.999Z
2 2354.752218 2447.227868 ... 2.758389e+11 2021-06-12T23:59:59.999Z
3 2372.690096 2547.367910 ... 2.916739e+11 2021-06-13T23:59:59.999Z
4 2508.770462 2606.432929 ... 2.951126e+11 2021-06-14T23:59:59.999Z
.. ... ... ... ... ...
56 2725.669632 2840.430748 ... 3.307572e+11 2021-08-05T23:59:59.999Z
57 2827.503486 2944.903352 ... 3.382378e+11 2021-08-06T23:59:59.999Z
58 2891.707469 3170.229727 ... 3.694372e+11 2021-08-07T23:59:59.999Z
59 3161.232779 3184.603971 ... 3.526859e+11 2021-08-08T23:59:59.999Z
60 3012.885711 3185.701187 ... 3.707654e+11 2021-08-09T23:59:59.999Z
[61 rows x 7 columns]
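If you also want the dates as a proper datetime index, here is a small follow-up sketch; the column names are taken from the printed output above:
# Parse the ISO timestamps shown in the output and index the frame by date.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.set_index("timestamp").sort_index()
print(df[["open", "high"]].head())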

Only crawler the first page and save detailed contents as dataframe in Python

I'm trying to loop through the pages, crawl them, and save the detailed contents from this link:
Based on the code from here, I've modified the code to:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

updated_df = pd.DataFrame()

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        # print(f"Fetching data for {key}...")
        dfs = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4")
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        for df in dfs:
            # df = dfs[0].T
            df = dfs[0].T.iloc[1:, :].copy()
            updated_df = updated_df.append(df)
            print(updated_df)

cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)',
        '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']
updated_df.columns = cols
updated_df.to_excel('./data.xlsx', index=False)
But it only successfully crawls the first page. How can I crawl all the pages and also add a URL column? Thanks.
Is this what you're looking for? This processes all the URLs and dumps a list of DataFrames into a single Excel file.
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
COLUMNS = [
    '项目编号', '转让/出租标的名称', '转让方/出租方名称',
    '转让标的评估价/年租金评估价(元)', '转让底价/年租金底价(元)',
    '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期', 'URL'
]

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls:
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame():
    temp_df = list_of_dataframes[0]
    temp_df = temp_df.append(
        pd.Series(["URL", source_url], index=temp_df.columns),
        ignore_index=True,
    )
    return temp_df.T.iloc[1:, :].copy()

def dump_to_excel(post_processed_dfs: list):
    df = pd.concat(post_processed_dfs)
    df.columns = COLUMNS
    df.to_excel("scraped_data.xlsx", index=False)

processed_dfs = []

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        df_list = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        processed_dfs.append(post_process(df_list, follow_url))

dump_to_excel(processed_dfs)
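To make the reshaping step in post_process easier to follow, here is a small standalone sketch with made-up field values and a placeholder URL; it uses pd.concat where the answer above uses the older DataFrame.append, but the transpose trick is the same:
import pandas as pd

# Each detail page yields a two-column table: column 0 holds the field names
# (项目编号, 成交日期, ...) and column 1 holds the values for that listing.
detail = pd.DataFrame({0: ["项目编号", "成交日期"], 1: ["ABC-001", "2021-01-01"]})  # made-up values

# Append the source URL as one more (name, value) pair.
url_row = pd.DataFrame([["URL", "http://example.com/page.html"]], columns=detail.columns)
detail = pd.concat([detail, url_row], ignore_index=True)

# Transposing turns the field names into the first row and the values into the
# second; dropping the first row leaves a single record ready for pd.concat.
record = detail.T.iloc[1:, :]
print(record)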

Pandas DataFrame - How to extract string patterns with hidden characters

I am scraping names, prices and images from this website. There are 8 items in total, but in the DF I would like to filter only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DF I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_ = 'product details product-item-details')
names_xd = []
prices_xd = []
picts_xd = []
for container in con_xd:
    name = container.find("a", class_="product-item-link").text
    names_xd.append(name)

for container in con_xd:
    price = container.find("span", class_="price").text
    prices_xd.append(price)

for container in con_xd:
    pict = container.find("a").get("href")
    picts_xd.append(pict)

bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                      'Item_Price_EUR': prices_xd,
                      'Link_to_Pict': picts_xd})

bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€','').str.replace(',','.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()

bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here is the fixed, working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_ = 'product details product-item-details')
names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]
df = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                   'Item_Price_EUR': prices_xd,
                   'Link_to_Pict': picts_xd})

df['Item_Price_EUR'] = df['Item_Price_EUR'].str.replace('€','').str.replace(',','.').astype(float)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()
df = df.loc[df['(XD-Design) Item_Name'].apply(lambda x: 1 if 'Original Zaino Antifurto' in x else 0) == 1]
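For reference, the same filter can usually be written with the string accessor as well; this is only a sketch assuming the columns built above, with regex disabled so the phrase is matched literally:
# Equivalent filter via str.contains; regex=False matches the literal phrase,
# na=False treats missing names as non-matches instead of raising.
mask = df['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto', regex=False, na=False)
df = df.loc[mask]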

'int' object has no attribute 'replace' error in python3.x

I don't get why this error occurs. From my point of view the three columns 'WWBO', 'IBO', and 'DBO' have exactly the same structure, but when I apply 'replace' only WWBO works. Does it have something to do with fillna?
I need your help!
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text

#parse url
soup = bs(data, "html.parser")

#find the tables you want
table = soup.findAll("table")[1:]

#read it into pandas
df = pd.read_html(str(table))

#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank': 'Rank',
                        'Movie': 'Title',
                        'Worldwide Box Office': 'WWBO',
                        'Domestic Box Office': 'DBO',
                        'International Box Office': 'IBO',
                        'DomesticShare': 'Share'})

#drop columns
market = df.drop(columns=['Rank', 'Share'])
market = market.fillna(0)

#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$',''))
market['IBO'] = market['IBO'].map(lambda s: s.replace('$',''))
market['DBO'] = market['DBO'].map(lambda s: s.replace('$',''))
market
The error is:
AttributeError: 'int' object has no attribute 'replace'
This happens because fillna(0) fills the missing cells with the integer 0, so those columns end up holding a mix of strings and ints, and .replace then fails on the ints. To solve this, either avoid filling with an integer 0 or cast the columns to string first, as below:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text

#parse url
soup = bs(data, "html.parser")

#find the tables you want
table = soup.findAll("table")[1:]

#read it into pandas
df = pd.read_html(str(table))

#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank': 'Rank',
                        'Movie': 'Title',
                        'Worldwide Box Office': 'WWBO',
                        'Domestic Box Office': 'DBO',
                        'International Box Office': 'IBO',
                        'DomesticShare': 'Share'})

#drop columns
market = df.drop(columns=['Rank', 'Share'])
market = market.fillna(0)

#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$',''))
market['IBO'] = market['IBO'].astype(str)
market['IBO'] = market['IBO'].map(lambda s: s.replace('$',''))
market['DBO'] = market['DBO'].astype(str)
market['DBO'] = market['DBO'].map(lambda s: s.replace('$',''))
>> market[['WWBO','IBO','DBO']]
WWBO IBO DBO
0 2,622,240,021 1,842,814,023 779,425,998
1 1,121,905,659 696,535,598 425,370,061
2 692,163,684 692,163,684 0
3 518,883,574 358,491,094 160,392,480
4 402,976,036 317,265,826 85,710,210
5 358,234,705 220,034,625 138,200,080
6 342,904,508 231,276,537 111,627,971
7 326,150,303 326,150,303 0
8 293,766,097 192,548,368 101,217,729
9 255,832,826 255,832,826 0
10 253,940,650 79,203,380 174,737,270
11 245,303,505 134,268,500 111,035,005
12 190,454,964 84,648,456 105,806,508
13 155,313,390 98,312,634 57,000,756
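As for why only WWBO worked: that column presumably had no missing values, so fillna(0) never put an integer into it. Here is a minimal, self-contained repro of the failure and the astype(str) fix, using toy data rather than the scraped table:
import pandas as pd

s = pd.Series(["$1,000", None, "$2,500"])   # toy column with one missing value
s = s.fillna(0)                              # the missing cell becomes the integer 0

# s.map(lambda v: v.replace('$', ''))       # AttributeError: 'int' object has no attribute 'replace'
print(s.astype(str).map(lambda v: v.replace('$', ''))) # cast first, then replace works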
Clearly, one or more of these fields (market['WWBO'], market['IBO'], market['DBO']) contain integer values, and you are trying to perform a string operation, i.e. replace, on them; that is why it is throwing the error
AttributeError: 'int' object has no attribute 'replace'
Could you first print those values and see what they are? If there are many, it is better to perform a type check first, like:
if market['WWBO'].dtype == object:
    market['WWBO'].map(lambda s: s.replace('$',''))
else:
    pass
let me know if this works for you or not
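As a compact variant of both answers, here is a hedged sketch that cleans all three columns in one loop; the column names come from the code above and the snippet has not been tested against the live table:
for col in ['WWBO', 'IBO', 'DBO']:
    # Cast to string so the integer zeros from fillna(0) don't break the replacement,
    # then strip the dollar sign as a literal character (regex disabled).
    market[col] = market[col].astype(str).str.replace('$', '', regex=False)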