I don't understand why this error occurs. From my point of view, the three columns 'WWBO', 'IBO' and 'DBO' have exactly the same structure, but when I apply 'replace', only WWBO works. Does it have something to do with fillna?
Need your help!
import requests
from bs4 import BeautifulSoup as bs
# Reproduction script: download the 2019 worldwide box-office page, load its
# tables into pandas and strip the "$" sign from the three money columns.
# NOTE(review): `pd` is used below but no `import pandas as pd` is visible in this snippet.
#Read url
# NOTE(review): this URL contains a space after "all-" — confirm it is intended.
URL = "https://www.the-numbers.com/box-office-records/worldwide/all- movies/cumulative/released-in-2019"
data = requests.get(URL).text
#parse url
soup = bs(data, "html.parser")
#find the tables you want (every table element except the first one)
table = soup.findAll("table")[1:]
#read it into pandas (read_html returns a list of DataFrames)
df = pd.read_html(str(table))
#concat both the tables
df = pd.concat([df[0],df[1]])
# Shorten the column names; 'Rank' is mapped to itself, so it is unchanged.
df = df.rename(columns={'Rank':'Rank',
'Movie':'Title',
'Worldwide Box Office':'WWBO',
'Domestic Box Office':'DBO',
'International Box Office':'IBO',
'DomesticShare':'Share'})
#drop columns
market = df.drop(columns=['Rank','Share'])
# fillna(0) writes the integer 0 into every missing cell, so a column that had
# missing values now holds a mix of str and int objects.
market = market.fillna(0)
#replace $ -> ''
# Each lambda calls str.replace on every cell; a cell holding the int 0 has no
# .replace method, which presumably is what triggers the reported AttributeError.
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$',''))
market['IBO'] = market['IBO'].map(lambda s: s.replace('$',''))
market['DBO'] = market['DBO'].map(lambda s: s.replace('$',''))
market
Error is:::
AttributeError: 'int' object has no attribute 'replace'
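This happens because `market.fillna(0)` fills the missing cells with the integer 0 (pandas does not turn it into the string '0'), so those cells are ints and have no `replace` method. There are two solutions: either avoid introducing the integer 0 values, or cast the columns to string before calling `replace`, as below: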
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Download the 2019 worldwide box-office page, load its tables into pandas
# and strip the "$" sign from the three money columns.

# Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text

# Parse the page
soup = bs(data, "html.parser")

# Find the tables you want (every table element except the first one)
table = soup.find_all("table")[1:]

# Read them into pandas; read_html returns a list of DataFrames
df = pd.read_html(str(table))

# Concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={
    'Movie': 'Title',
    'Worldwide Box Office': 'WWBO',
    'Domestic Box Office': 'DBO',
    'International Box Office': 'IBO',
    'DomesticShare': 'Share',
})

# Drop the columns that are not needed
market = df.drop(columns=['Rank', 'Share'])

# fillna(0) writes the *integer* 0 into the missing cells, so a column that had
# gaps holds a mix of str and int objects afterwards.
market = market.fillna(0)

# Replace $ -> '' in every money column.  Casting to str first makes the
# replacement safe for the int 0 cells; all three columns are treated the same
# way so that a missing value in WWBO cannot raise the AttributeError either.
# regex=False makes "$" a literal character instead of the end-of-string anchor.
for column in ('WWBO', 'IBO', 'DBO'):
    market[column] = market[column].astype(str).str.replace('$', '', regex=False)
>> market[['WWBO','IBO','DBO']]
WWBO IBO DBO
0 2,622,240,021 1,842,814,023 779,425,998
1 1,121,905,659 696,535,598 425,370,061
2 692,163,684 692,163,684 0
3 518,883,574 358,491,094 160,392,480
4 402,976,036 317,265,826 85,710,210
5 358,234,705 220,034,625 138,200,080
6 342,904,508 231,276,537 111,627,971
7 326,150,303 326,150,303 0
8 293,766,097 192,548,368 101,217,729
9 255,832,826 255,832,826 0
10 253,940,650 79,203,380 174,737,270
11 245,303,505 134,268,500 111,035,005
12 190,454,964 84,648,456 105,806,508
13 155,313,390 98,312,634 57,000,756
Clearly, one or more of these fields (market['WWBO'], market['IBO'], market['DBO']) contain integer values, and you are trying to perform a string operation (i.e. replace) on them; that is why it throws the error:
AttributeError: 'int' object has no attribute 'replace'
Could you first print those values and see what they are? If you have many of them, it is better to perform a type check first, like:
# Strip "$" from WWBO only where it is safe to do so.
# An object-dtype column can still hold non-string cells (for example the int 0
# written by fillna(0)), so each cell is checked individually; non-strings are
# passed through unchanged.  The result is assigned back, because map() returns
# a new Series and does not modify the column in place.
if market['WWBO'].dtype == object:
    market['WWBO'] = market['WWBO'].map(
        lambda s: s.replace('$', '') if isinstance(s, str) else s
    )
let me know if this works for you or not
Related
I need some help appending several web-scraping results to a pandas DataFrame.
Currently I'm only getting the output from one of the URLs in the DataFrame.
I left out the URLs, if you need them i will supply them to you.
##libs
import bs4
import requests
import re
from time import sleep
import pandas as pd
from bs4 import BeautifulSoup as bs
# Reproduction script: collect the <dt> labels from the first URL and the <dd>
# values from every URL, then combine them into one DataFrame.
##webscraping targets
URLs = ["URL1","URL2","URL3"]
## Get columns
# The column labels are read from the first URL only.
column_list = []
r1 = requests.get(URLs[0])
soup1 = bs(r1.content)
# data1 is assigned here but not used again in the visible code.
data1 = soup1.find_all('dl', attrs= {"class": "border XSText rightAlignText noMarginTop highlightOnHover thickBorderBottom noTopBorder"})
columns = soup1.find_all('dt')
for col in columns:
column_list.append(col.text.strip()) # strip() removes extra space from the text
##Get values
# value_list is one flat list that receives the <dd> texts of every URL in turn.
value_list = []
for url in URLs:
r1 = requests.get(url)
soup1 = bs(r1.content)
data1 = soup1.find_all('dl', attrs= {"class": "border XSText rightAlignText noMarginTop highlightOnHover thickBorderBottom noTopBorder"})
values = soup1.find_all('dd')
for val in values:
value_list.append(val.text.strip())
# zip() stops at the end of the shorter list, so values beyond
# len(column_list) never reach the DataFrame.
df=pd.DataFrame(list(zip(column_list,value_list)))
df.transpose()
Current output, showing only the results of one URL:
Expected output:
The problem here is with your zip function. It only zips the values up to the length of the shortest list, in this case column_list, leaving all the other values unused.
If you want to append the other values to the dataframe as well, you will have to iterate over them. So change the last two lines of your code to this and it should work:
# Build one list per column label, then deal the scraped values out to those
# lists in round-robin order so that every URL's values land in the right list.
column_count = len(column_list)
result = [[label] for label in column_list]
for position, value in enumerate(value_list):
    target = result[position % column_count]
    target.append(value)
df = pd.DataFrame(result)
# Transposed view: one column per label, one row per scraped URL.
df.transpose()
I am scraping names, prices and images from this website. There are 8 items in total, but in the DF I would like to filter only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DF I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Reproduction script: scrape name, price and link of every product on the
# search-results page and try to keep only the "Original Zaino Antifurto" items.
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
# One container per product.
con_xd = pars_xd.find_all('div', class_ = 'product details product-item-details')
names_xd = []
prices_xd = []
picts_xd = []
# Three passes over the same containers, one per extracted field.
for container in con_xd:
name = container.find("a", class_="product-item-link").text
names_xd.append(name)
for container in con_xd:
price = container.find("span", class_="price").text
prices_xd.append(price)
for container in con_xd:
# Takes the href of the first <a> inside the container.
pict = container.find("a").get("href")
picts_xd.append(pict)
bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
'Item_Price_EUR': prices_xd,
'Link_to_Pict': picts_xd })
# Turn the price text into a float: drop the euro sign, use "." as decimal separator.
bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€','').str.replace(',','.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()
# bp_filter is the Series of matching *names* (the name column indexed by the
# boolean mask), not the boolean mask itself.
# NOTE(review): indexing bp_xd with it would presumably look the names up as
# column labels, which may be the source of the reported error — confirm.
bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here you have the fixed working code
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape name, price and link of every product on the search-results page and
# keep only the rows whose name contains "Original Zaino Antifurto".
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')

# One container per product; each field is pulled out of the same containers.
con_xd = pars_xd.find_all('div', class_='product details product-item-details')
names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]

df = pd.DataFrame({
    '(XD-Design) Item_Name': names_xd,
    'Item_Price_EUR': prices_xd,
    'Link_to_Pict': picts_xd,
})

# Turn the price text into a float: drop the euro sign and use "." as the
# decimal separator.  regex=False treats both patterns as literal text.
df['Item_Price_EUR'] = (
    df['Item_Price_EUR']
    .str.replace('€', '', regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()

# Filter with a boolean mask.  str.contains builds the mask directly;
# regex=False matches the text literally and na=False treats missing names as
# "no match" so the mask is always strictly boolean.
name_matches = df['(XD-Design) Item_Name'].str.contains(
    'Original Zaino Antifurto', regex=False, na=False
)
df = df.loc[name_matches]
I need to know what is happening in my code. It should put the data in separate columns, but it is giving me the same data in both columns.
I tried changing the value of the row variable, but I couldn't find the reason.
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import time
# Reproduction script: walk 50 listing pages, pull a price and an address out
# of each listing and write them to DATA.csv.
arrayofRequest= []
prices=[]
location=[]
columns=['Price', 'Location']
df = pd.DataFrame(columns=columns)
# Outer loop: one request per result page (page numbers 1..50).
for i in range(0,50):
arrayofRequest.append("https://www.zameen.com/Homes/Karachi-2-"+str(i+1)+".html?gclid=Cj0KCQjw3JXtBRC8ARIsAEBHg4mj4jX1zZUt3WzGScjH6nfwzrEqkuILarcmg372imSneelSXPj0fGIaArNeEALw_wcB")
request = requests.get(arrayofRequest[i])
soupobj= BeautifulSoup(request.content,"lxml")
# print(soupobj.prettify())
links =soupobj.find_all('span',{'class':'f343d9ce'})
addresses =soupobj.find_all('div',{'class':'_162e6469'})
price = ""
# NOTE(review): this loop reuses the name `i` of the page loop above; the
# original nesting was lost with the indentation — confirm.
for i in range(0,len(links)):
# Extract the text between the last ">" ... "<" pair of the tag's string form.
price = str(links[i]).split(">")
price = price[len(price)-2].split("<")[0]
prices.append(price)
address = str(addresses[i]).split(">")
address = address[len(address)-2].split("<")[0]
location.append(address)
# `row` is ONE string ("address,price"), not two separate values.
row=location[i]+","+prices[i]
# A scalar given to pd.Series together with an index is repeated for every
# index label, so both 'Price' and 'Location' receive the same joined string.
df = df.append(pd.Series(row, index=columns), ignore_index=False)
# filewriter = csv.writer(csvfile, delimiter=',',filewriter.writerow(['Price', 'Location']),filewriter.writerow([prices[0],location[0]])
df.to_csv('DATA.csv', index=False)
because of this:
pd.Series(row, index=columns)
Try something like:
# One-row frame for the current listing: values follow the order of `columns`
# (['Price', 'Location']) and are passed as columns=, not index=.
pd.DataFrame([[prices[i], location[i]]], columns=columns)
However, it is simpler to build the whole DataFrame only once, outside of your for loop:
# Whole frame in one call: pair each price with its address, in the order of
# `columns` (['Price', 'Location']), and pass the labels as columns=, not index=.
pd.DataFrame(list(zip(prices, location)), columns=columns)
I am trying to fit an ARIMA model to some data; for this, I used 'autocorrelation_plot()' with my time series. However, it generates the error in the title.
I have an attribute table composed, among others, of Date and Time fields.
I extracted them (after transforming the attribute table into a numpy table), put them in a 'datetime' variable and appended them all in a list:
# Excerpt: parse one "YYYY/MM/DD HH:MM" string (dt1) and collect the resulting
# datetime in the list A.
O,A = [],[]
dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
A.append(dt)
I tried then to create time series and printed them to be sure of the results:
# pd.Series(data, index): the datetimes in A are passed as the data and O as
# the index.
data2 = pd.Series(A, O)
print data2
The results were satisfying, until I decided to auto-correlate :
Auto-correlation command :
autocorrelation_plot(data2)
After this command, it returns:
TypeError: ufunc add cannot use operands with types dtype('M8[ns]') and dtype('M8[ns]')
I guess it's due to the conversion of the datetime.strptime to a numpy ?
I tried to follow some suggestions from previous questions
index.to_pydatetime() , dtype, M8[ns] error ..., in vain.
Minimal reproducible example:
from pandas import datetime
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
# Minimal reproducible example (Python 2 syntax): for each zone code 1..55,
# gather that zone's MEAN values and timestamps and plot their autocorrelation.
# NOTE(review): `arcpy` and `inTable` are not defined in the visible snippet.
arr = arcpy.da.TableToNumPyArray(inTable ,("PROVINCE","ZONE_CODE","MEAN", "Datetime","Time"))
arr_length = len(arr)
j = 1
# O collects field 2 (MEAN) of the matching rows, A the parsed datetimes.
O,A = [],[]
while j<=55: #I have 55 provinces
i = 0
while i<arr_length:
# Field 1 is ZONE_CODE: keep only the rows of the current zone j.
if arr[i][1]== j:
O.append(arr[i][2])
# Rebuild the date part of field 3 (Datetime) as "YYYY/MM/DD".
c = str(arr[i][3])
d = str(c[0:4]+"/"+c[5:7]+"/"+c[8:10])
# Field 4 (Time): "10" maps to 10:00, any other value to 14:00.
t = str(arr[i][4])
if t=="10":
dt1 = str(d+" 10:00")
else:
dt1 = str(d+" 14:00")
dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
A.append(dt)
i = i+1
# pd.Series(data, index): the datetimes in A are the data and the MEAN values
# in O are the index here.
data2 = pd.Series(A, O)
print data2
autocorrelation_plot(data2)
# Empty both lists in place before the next zone.
del A[:]
del O[:]
j += 1
Screenshot of the results:
results
I used this to solve my issue:
import matplotlib.dates as mpl_dates

# Move the index back into a regular column (in place).
df.reset_index(inplace=True)

# Convert every entry of the 'Date' column to matplotlib's numeric date
# representation, then make the whole frame float.
numeric_dates = df['Date'].apply(mpl_dates.date2num)
df['Date'] = numeric_dates
df = df.astype(float)
I found a solution, it can look barbaric, but it works!
I've just "recreated" pd.Series() with the pd.Series I had:
# Values are the measurements in O; the datetimes in A become the index.
data2 = pd.Series(data=O, index=A)
# Re-create a Series from the Series before handing it to the plot function.
rebuilt = pd.Series(data2)
autocorrelation_plot(rebuilt)
plt.show()
I have the following IPython Notebook, I am trying to access data base of movies from rotten tomatoes website.
But Rotten Tomatoes limits to 10,000 API requests a day
So I don't want to re-run this function every time when I restart the notebook, I am trying to save and reload this data as a CSV file. When I convert the data to a csv file I am getting this processing symbol[*] inside IPython notebook. After some time I am getting the following error
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook cells (Python 2 syntax): fetch one page of reviews for a single
# movie, then load the movies table that the functions below iterate over.
# NOTE(review): the API key is hard-coded; it is sent as the `apikey` parameter.
api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122' # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id
#these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
# Python 2 print statement: pretty-print the first review.
print json.dumps(data['reviews'][0], indent=2) # dump an object into a json string
from io import StringIO
# Download the tab-separated movies file and parse it from memory.
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file,delimiter='\t')
movies
# Show only the id, title, imdbID and year columns (all rows).
movies[['id', 'title', 'imdbID', 'year']]
def base_url():
    """Return the root URL that every Rotten Tomatoes API endpoint is appended to."""
    api_root = 'http://api.rottentomatoes.com/api/public/v1.0/'
    return api_root
def rt_id_by_imdb(imdb):
    """
    Look up the Rotten Tomatoes id of a movie through the movie_alias API.

    `imdb` is the numeric IMDB id; it is zero-padded to seven digits before
    being sent.  Returns the value stored under 'id' in the JSON response and
    raises a KeyError when the response has no 'id' entry (no match found).
    """
    endpoint = base_url() + 'movie_alias.json'
    query = {
        'id': "%7.7i" % imdb,  # zero-padded, at least seven digits
        'type': 'imdb',
        'apikey': api_key,
    }
    response = requests.get(endpoint, params=query)
    payload = json.loads(response.text)
    return payload['id']
def _imdb_review(imdb):
    """
    Fetch the first page of top-critic reviews for the movie with IMDB id `imdb`.

    The IMDB id is first translated to a Rotten Tomatoes id, which is then used
    to query the reviews API.  Returns a list of dicts, one per review, each
    carrying the review fields plus the `imdb` and `rtid` identifiers.
    """
    rtid = rt_id_by_imdb(imdb)
    endpoint = base_url() + 'movies/{0}/reviews.json'.format(rtid)
    query = {
        'review_type': 'top_critic',
        'page_limit': 20,
        'page': 1,
        'country': 'us',
        'apikey': api_key,
    }
    response = requests.get(endpoint, params=query)
    reviews = json.loads(response.text)['reviews']

    records = []
    for review in reviews:
        # Rename 'freshness' -> 'fresh' and 'date' -> 'review_date'.
        records.append({
            'fresh': review['freshness'],
            'quote': review['quote'],
            'critic': review['critic'],
            'publication': review['publication'],
            'review_date': review['date'],
            'imdb': imdb,
            'rtid': rtid,
        })
    return records
def fetch_reviews(movies, row):
    """
    Return the reviews of the movie at position `row` of `movies` as a DataFrame.

    `movies` is a DataFrame with at least the columns 'imdbID' and 'title';
    `row` is an integer position.  The result has one row per review plus a
    'title' column.  Returns None when a KeyError is raised while fetching,
    e.g. when the movie has no Rotten Tomatoes match.
    """
    # .iloc is the positional row accessor; DataFrame.irow was removed from pandas.
    m = movies.iloc[row]
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result
def build_table(movies, rows):
    """
    Collect the reviews of the first `rows` movies into a single DataFrame.

    Movies for which fetch_reviews returns None are skipped.  The frames of
    the remaining movies are concatenated with a fresh 0..n-1 index.  When no
    movie produced any reviews (including rows == 0) an empty DataFrame is
    returned, because pd.concat raises a ValueError on an empty list.
    """
    fetched = (fetch_reviews(movies, r) for r in range(rows))
    frames = [frame for frame in fetched if frame is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
# Fetch reviews for the first 3000 rows of `movies`.  Every row goes through
# fetch_reviews, which issues HTTP requests to the API (one alias lookup and,
# when that succeeds, one reviews query).
critics = build_table(movies, 3000)
# Save the result to disk, then read it back from the CSV file.
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')