from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.maximize_window()
driver.get("https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/20432063_zpid/51617_rid/12m_days/globalrelevanceex_sort/34.048605,-118.340178,33.963223,-118.47785_rect/12_zm/")
time.sleep(3)
driver.find_element_by_class_name("collapsible-header").click()
soup = BeautifulSoup(driver.page_source, "lxml")
region = soup.find("div",{"id":"hdp-price-history"})
table = region.find('table',{'class':'zsg-content-component'})
print(table)
I need to crawl the price history table, but the result is always None.
Here is a script that gets you the result you are trying to get.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/20432063_zpid/51617_rid/12m_days/globalrelevanceex_sort/34.048605,-118.340178,33.963223,-118.47785_rect/12_zm/")
driver.find_element_by_class_name("collapsible-header").click()
time.sleep(5)
tree = BeautifulSoup(driver.page_source,"lxml")
driver.quit()
table_tag = tree.select("table.zsg-table")[0]
tab_data = [[item.text for item in row_data.select("th,td")]
            for row_data in table_tag.select("tr")]

for data in tab_data:
    print(' '.join(data))
Partial result:
Date Event Price Agents
06/16/17 Sold $940,000-0.9% K. Miller, A. Masket
06/14/17 Price change $949,000-1.0%
05/08/17 Pending sale $959,000
04/17/17 Price change $959,000+1.1%
02/27/17 Pending sale $949,000
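If you would rather have those rows in a pandas DataFrame than as printed text, here is a small follow-up sketch (assuming, as in the output above, that the first row of tab_data holds the column headers; rows with fewer cells are padded with NaN):
import pandas as pd

# First scraped row is the header (Date, Event, Price, Agents);
# the remaining rows are the data.
df = pd.DataFrame(tab_data)
df.columns = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)
print(df)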
If it serves the purpose, don't forget to mark this as your selected answer.
Related
I want to scrape table values in the link below from 2022-06-20 to 2022-06-24:
"https://markets.businessinsider.com/earnings-calendar"
My code block below returns all the links except the ones in the table:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# chrome_options is set up earlier in the script (not shown here)
wd = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
wd.maximize_window()
wd.get("https://markets.businessinsider.com/earnings-calendar")
wd.implicitly_wait(10)
wd.find_element(By.XPATH, "//div[@class='calendar__advanced-search ajax-search__advanced-toggle']").click()
elem = WebDriverWait(wd, 1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#calendar-input-from')))
elem.send_keys("2022-06-20")
elem = WebDriverWait(wd, 1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#calendar-input-to')))
elem.send_keys("2022-06-24")
elems = wd.find_elements(By.XPATH, "//a[@href]")
for elem in elems:
    print(elem.get_attribute("href"))
Today is Sunday and the table is empty so I think that's why it finds no links in the table. But my aim is to find the links in the table for the two dates I specified above.
Any help is appreciated!
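Not a complete answer, but the usual pattern here is: after filling in both dates, wait until the results table has actually rendered some rows, and only then collect links scoped to the table element instead of the whole page. A minimal sketch follows; the generic "table a[href]" selector is an assumption about the page structure, so swap in the real class or id of the earnings table:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wd = webdriver.Chrome()
wd.get("https://markets.businessinsider.com/earnings-calendar")

# ... open the advanced search and send the two dates, as in the question ...

# Wait for at least one link inside a table to be present before reading anything.
WebDriverWait(wd, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "table a[href]"))
)

# Restrict the search to links inside tables, instead of every <a> on the page.
for a in wd.find_elements(By.CSS_SELECTOR, "table a[href]"):
    print(a.get_attribute("href"))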
Hi, I have a CSV file with items like this:
product_id,url
100,https://url/p/Cimory-Yogurt-Squeeze-Original-120-g-745133
"1000,""https://url/p/OREO-Biskuit-Dark-&-White-Chocolate-123,5-g-559227"""
1002,https:/url/p/GARNIER-Micellar-Cleansing-Water-Sensitive-Skin-Pink-125-ml-371378
I tried using
import pandas as pd
productUrl = pd.read_csv('productUrl.csv',sep=","quotechar='"')
It returns back as
product_id  url
100         https://url/p/Cimory-Yogurt-Squeeze-Original-120-g-745133
1000,"https://url/p/OREO-Biskuit-Dark-&-White-Chocolate-123,5-g-559227"
1002        https:/url/p/GARNIER-Micellar-Cleansing-Water-Sensitive-Skin-Pink-125-ml-371378
How do I read the CSV correctly, given that the URL can contain commas too?
You do not need the quotechar='"', simply read it as is:
pd.read_csv('productUrl.csv')
Be aware that your pandas.read_csv() example won't work as written, because it is missing a , between the parameters.
Example
import pandas as pd
from io import StringIO
csvString = """product_id,url
100,https://url/p/Cimory-Yogurt-Squeeze-Original-120-g-745133
1000,"https://url/p/OREO-Biskuit-Dark-&-White-Chocolate-123,5-g-559227"
1002,https:/url/p/GARNIER-Micellar-Cleansing-Water-Sensitive-Skin-Pink-125-ml-371378"""
pd.read_csv(StringIO(csvString))
Output
   product_id                                                                url
0         100          https://url/p/Cimory-Yogurt-Squeeze-Original-120-g-745133
1        1000   https://url/p/OREO-Biskuit-Dark-&-White-Chocolate-123,5-g-559227
2        1002  https:/url/p/GARNIER-Micellar-Cleansing-Water-Sensitive-Skin-Pink-125-ml-371378
I have been struggling with this problem for some time now and need some help. I go to the website "https://www.finanzen.net/boersenkurse" and want to extract the table in the "Meistgesuchte Aktien" section. As there are several tables in the document, I am also getting the other tables, which I am not interested in.
I want to create a DataFrame out of the data, so each row should look the same as on the website, meaning Name = SAP, Kurs = 96,33, etc.
from bs4 import BeautifulSoup
import requests
URL = "https://www.finanzen.net/boersenkurse"
html = requests.get(URL).text
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all("tr")
tables
I do not get how to choose only the relevant tr. If someone has any idea, please let me know. Thanks in advance!
from bs4 import BeautifulSoup
import requests
import pandas as pd

URL = "https://www.finanzen.net/boersenkurse"
soup = BeautifulSoup(requests.get(URL).text, 'html.parser')
table = soup.find_all("table", class_="table-overflow-hidden")[1]

# Extracting the headings of the columns of the table.
rows = table.find_all('tr')
columns = []
headings = rows[0].find_all('th')
for col in headings:
    columns.append(col.text.strip())
print(columns)

# Extracting all data of the table row-wise.
all_data = []
for row in rows[1:]:
    data = row.find_all('td')
    lst = []
    for d in data:
        lst.append(d.text.strip())
    all_data.append(lst)

# Creating the dataframe out of the extracted data.
ds = pd.DataFrame(all_data, columns=columns)
ds
It's easier to use pandas and index in for the table:
import pandas as pd
pd.read_html('https://www.finanzen.net/boersenkurse')[2]
Or with BeautifulSoup, still using pandas at the end:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

r = requests.get('https://www.finanzen.net/boersenkurse')
soup = bs(r.content, 'lxml')
t = soup.select_one('div:nth-child(4) table')
pd.DataFrame([[i.text.strip() for i in row.select('th,td')] for row in t.select('tr')])
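One thing to note about that one-liner: pd.DataFrame() here leaves the header cells in row 0 instead of using them as column labels. If you want proper headers, a small follow-up sketch (assuming the first tr of that table holds the th cells):
df = pd.DataFrame([[i.text.strip() for i in row.select('th,td')] for row in t.select('tr')])

# Promote the first scraped row (the <th> cells) to column labels.
df.columns = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)
df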
This is so similar to other posts on SO, e.g. here, that I just can't see what I'm doing wrong.
I want to scrape the box labelled 'activity' on this page, and I want the output to look like this:
So you can see the two main features of interest compared to the original webpage: (1) combining the multiple tables into one table, creating a new column whenever a column hasn't been seen before, and (2) extracting the actual href for that column rather than just the name (e.g. 'Jacobsen et al'), because I want to eventually extract the PMID value (an integer) from the href.
These are my two goals. I wrote this code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
for i in range(23,24):
    # try:
    res = requests.get("http://www.conoserver.org/index.php?page=card&table=protein&id=" + str(i))
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table',{'class':'activitytable'})

    for each_table in table:
        # this can print references
        print(each_table.a)
        # this can print the data frames
        df = pd.read_html(str(each_table))
        print(df)
        # how to combine the two?
Can someone tell me the correct way to print the href individually for each row of each table (essentially adding an extra column to each table with the actual href, so it should print out three tables, each with an extra href column)?
Then I can focus on how to combine the tables. I've only mentioned the ultimate goal here in case someone can think of a more Pythonic way of killing two birds with one stone, but I think they're different issues.
You can initialise a final dataframe. Then, as you iterate, store the href as a string and add it as a column to the sub-table dataframe. Then you just keep appending those dataframes to the final dataframe:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Initialise an empty "final" dataframe
final_df = pd.DataFrame()

for i in range(20,24):
    # try:
    res = requests.get("http://www.conoserver.org/index.php?page=card&table=protein&id=" + str(i))
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table',{'class':'activitytable'})

    for each_table in table:
        # Store the href
        href = each_table.a['href']

        # Get the table
        df = pd.read_html(str(each_table))[0]

        # Put that href in the column 'ref'
        df['ref'] = href

        # Append that dataframe to the final dataframe, and repeat
        # (on pandas >= 2.0, DataFrame.append is gone; use pd.concat instead)
        final_df = final_df.append(df, sort=True).reset_index(drop=True)
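For the ultimate goal of getting the PMID out of each reference link, a rough sketch: it assumes the PMID shows up as a plain run of digits somewhere in the href, which you should verify against the actual ConoServer link format.
# Pull the first run of digits out of each stored href as a candidate PMID.
# (Assumption: the PMID appears as a plain integer inside the link.)
final_df['pmid'] = final_df['ref'].str.extract(r'(\d+)', expand=False)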
How do I call the dates and values for any share?
Example: I want to call the stock price and date for Apple shares only, for December 2016 and December 2017.
Here is what I've tried:
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
import numpy as np
from matplotlib import style
import matplotlib.pyplot as plt
import datetime as dt
start = dt.datetime(2013,10,1)
end= dt.datetime(2018,4,30)
AAPL_data=[]
AAPL= web.DataReader('AAPL','iex', start, end)
AAPL_data.append(AAPL)
AAPL.loc['2016-12-01':'2016-12-31']
AAPL.loc['2017-12-01':'2017-12-31']
Both work for me, for the December 2016 and December 2017 data respectively. Pandas datetime indexing is very flexible and intuitive; see more examples here.
Incidentally, if you're using the new release of pandas-datareader (0.7.0), you no longer need to insert pd.core.common.is_list_like = pd.api.types.is_list_like.
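If you want to see what that .loc slicing is doing without downloading anything, here's a tiny self-contained sketch on a synthetic DatetimeIndex:
import numpy as np
import pandas as pd

# Dummy daily "price" series covering 2016-2018.
idx = pd.date_range('2016-01-01', '2018-12-31', freq='D')
prices = pd.DataFrame({'close': np.arange(len(idx), dtype=float)}, index=idx)

dec_2016 = prices.loc['2016-12-01':'2016-12-31']  # explicit date range
dec_2017 = prices.loc['2017-12']                  # whole-month shorthand
print(len(dec_2016), len(dec_2017))               # 31 31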