i've been trying to scrape this site
import pandas as pd
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.nbcsports.com/edge/basketball/nba/injury-report")
soup = BeautifulSoup(r.content,"lxml")
st1 = soup.find("div", attrs={"class":"page-wrapper--sidebar page-wrapper--sidebar-initial container clearfix page-wrapper"})
st2 = st1.find("div",attrs={"class":"content content--main cols-8"})
st3 = st2.find("div", attrs={"class":"block__content"})
st4 = st3.find("div",attrs={"id":"injury-report-page-wrapper"})
st4.find("div",attrs={"class":"injury-report-wall"})
Nothing returns.
I am trying to get the injury data however it doesn't work at all. i've tried bs,pandas couldn't make it. it looks like this data comes from an api but kinda stuckt. Open for advices.
import requests
import pandas as pd
def main(url):
params = {
"sort": "-start_date",
"filter[player.team.meta.drupal_internal__id]": 176,
"filter[player.status.active]": 1,
"filter[active]": 1,
"include": "injury_type,player,player.status,player.position"
}
r = requests.get(url, params=params)
data = []
for item in r.json()['included']:
data.append(item['attributes'])
df = pd.DataFrame().from_dict(data)
print(df)
# df.to_csv('data.csv', index=False)
main('https://www.nbcsports.com/edge/api/injury')
Related
I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1,len(str1)):
if len(str1[ii])>0:
df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each string into a dataframe
if not df1.empty:
df2 = df1.iloc[:,0:3] #Get the first five columns
if df2.iloc[0,-1] != 'M': #Don't append the ones with missing data
dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as request:
response = request.get(url, timeout=30)
if response.status_code != 200:
print(response.raise_for_status())
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)
I am trying to create a choropleth map of the uk using plotly, but every time I try, it outputs an empty page, or the json doesn't match with the dataframe.this is where i obtained the url for the dataframe Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
geojson = json.load(response)
url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new=df.replace("areaName", "NAME_2")
from plotly import graph_objects as go
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaCode"],
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True,
)
)
fig.show()
a few things to fix this up:
uk-counties.json is in topojson format, plotly needs a geojson. can fix with the topojson module, for example (or geopandas)
no need to replace "areaName", you want this: locations=df["areaName"]
you need to specify a marker_style. centering and zooming help as well
for good result you need to use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23']
the covid data and the topojson don't match up well by districts, so there are gaps in the map
code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
urlretrieve(URL_JSON, TOPO_DATA)
with open(TOPO_DATA, 'r') as data:
topoJSON = json.load(StringIO(data.read()))
topo = tp.Topology(topoJSON, object_name='GBR_adm2')
# convert to geojson, store in GEO_DATA
topo.to_geojson(GEO_DATA)
df = pd.read_csv(URL_DATA)
df.to_csv(CSV_DATA)
def make_map():
df = pd.read_csv(CSV_DATA)
with open(GEO_DATA, 'r') as data:
geojson = json.load(StringIO(data.read()))
# one day at a time
df = df[df['date'] == '2022-11-23']
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaName"], # <=== not areaCode
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True
)
)
# need a mapbox_style
fig.update_layout(mapbox_style='carto-positron',
mapbox_zoom=5,
mapbox_center_lon=-2.057852,
mapbox_center_lat=53.404854,
height=700,
width=700)
fig.show()
if 0: # only needed once
download()
make_map()
I am trying to import xml to pandas using bs4.
The bs4 import works, but getting pandas to recognise the xml is problematic.
import requests
import bs4
import pandas as pd
url = 'https://www.federalreserve.gov/data.xml'
geturl = requests.get(url).text
data = bs4.BeautifulSoup(geturl, 'lxml')
df = pd.DataFrame(data)
print(df.head())
I am expecting the df to show the first 5 rows of data, but instead i get the following error:
KeyError: 0
Why is pandas producing this KeyError: 0?
Many thanks!
There are five different charts in the xml file. Which one do you want? This is an example using the first chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# list comprehension to get the observation index and value of the first chart (i.e, charts[0])
data = [[ob['index'], ob['value']] for ob in charts[0].findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
df.head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00
Update
You can iterate through all the charts and append to a dict. You will then call each DataFrame by the title of the chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# empty dict
df_list = {}
for chart in charts:
# list comprehension to get the observation index and value
data = [[ob['index'], ob['value']] for ob in chart.findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
# create key from the the chart title and append df
df_list[chart['title']] = []
df_list[chart['title']].append(df)
# calling the second chart
df_list['Selected Assets of the Federal Reserve'][0].head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00
I made it to the point where all tr data data has been scraped and I am able to get a nice printout. But when I go to implement the pd.DataFrame as in df= pd.DataFrame({"A": a}) etc, I get a syntax error
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source,'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
print(row)
texas_info = pd.DataFrame({
"title": Texas
"Zip Code" : [Zip Code],
"City" :[City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being the 'Zip Code' and the other the 'Cities'
If you want to create manually, with bs4 4.7.1 you can use :not, :contains and :nth-of-type pseudo classes to isolate the two columns of interest, then construct a dict then convert to df
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips,'Cities': cities}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
You could combine selectors into one line:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': items[0::2],'Cities': items[1::2]}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
print(df)
I note you want to create manually but worth knowing for future readers that you could just use pandas read_html
import pandas as pd
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
table.columns = table.iloc[1]
table = table[2:]
table = table.drop(['Zip Code Map', 'County'], axis=1).reset_index(drop=True)
print(table)
Try creating the DataFrame and perform the for loop to append each row in the table into the DataFrame.
df = pd.DataFrame()
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
print(row)
zipCode = row[0] # assuming first column
city = row[1] # assuming second column
df = df.append({"Zip Code": zipCode, "City" : city}, ignore_index=True)
If you only need these two columns, you should not include title in the DataFrame (that will create another column); that line also happened to be where the syntax error occurred because of the missing comma.
I'm making a text scraper for youtube in which I want to enter data and search videos and collect data of it. I'm facing problems in entering data in the text field. Can anyone suggest me a method to do that?
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
soup = BeautifulSoup(driver.page_source, 'lxml') #Use the page as source
page = driver.get('https://freight.rivigo.com/dashboard/home')
import sys
from importlib import reload
reload
elem = driver.find_element_by_tag_name("body")
no_of_pagedowns = 120
while no_of_pagedowns:
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.5)
no_of_pagedowns-=1
soup = BeautifulSoup(driver.page_source, 'lxml')
In between this code I want to add a custom text in input field, lets say "comedy" and want to get data on that. I'm stuck on how to input data and I'm quite new to this so any sort of help will be helpful.
That page is NOT pointing to YouTube. Check out the working code sample below for an idea of what you can do with the YouTube API.
# https://medium.com/greyatom/youtube-data-in-python-6147160c5833
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from youtube_data import youtube_search
test = youtube_search("Nine Inch Nails")
test.keys()
test['commentCount'][:5]
df = pd.DataFrame(data=test)
df.head()
df1 = df[['title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']]
df1.columns = ['Title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']
df1.head()
#import numpy as np
#numeric_dtype = ['viewCount','commentCount','likeCount','dislikeCount','favoriteCount']
#for i in numeric_dtype:
# df1[i] = df[i].astype(int)
NIN = df1[df1['channelTitle']=='Nine Inch Nails']
NIN.head()
NIN = NIN.sort_values(ascending=False,by='viewCount')
plt.bar(range(NIN.shape[0]),NIN['viewCount'])
plt.xticks(range(NIN.shape[0]),NIN['Title'],rotation=90)
plt.ylabel('viewCount in 100 millions')
plt.show()