Convert geojson to geopandas dataframe

I am using the OpenRouteService API and trying to convert the GeoJSON result of its Directions service to a GeoPandas dataframe and, in the end, store it as a spatial PostGIS table. My code so far is:
import pandas as pd
import geopandas as gpd
import sqlalchemy as sa
import openrouteservice

def getroute(lon1, lat1, lon2, lat2):
    coords = ((lon1, lat1), (lon2, lat2))
    params_route = {'profile': 'foot-walking',
                    'coordinates': coords,
                    'format_out': 'geojson',
                    'geometry': 'true',
                    'geometry_simplify': 'true',
                    'geometry_format': 'geojson',
                    'instructions': 'false',
                    }
    geometry = client.directions(**params_route)['features']
    print(geometry)
    return geometry
# Creating SQLAlchemy's engine to use
client = openrouteservice.Client(key='myapikey')
lon1 = 8.34234
lat1 = 48.23424
lon2 = 8.34423
lat2 = 48.26424
myroutes = getroute(lon1, lat1, lon2, lat2)
print(myroutes)
print(type(myroutes))
myroutes = gpd.GeoDataFrame(myroutes)
print(myroutes)
engine = sa.create_engine('postgresql+psycopg2://username:password@host/database', encoding='utf-8')
with engine.connect() as conn, conn.begin():
    # Note use of regular Pandas `to_sql()` method.
    myroutes['geometry'].to_sql('contents', con=conn, schema='schema', if_exists='replace', index=False)
However, I can't seem to get past the GeoJSON structure and store it. Can anyone help me? The resulting error is:
sqlalchemy.exc.ProgrammingError: (psycopg2.ProgrammingError) can't adapt type 'dict' [SQL: 'INSERT INTO paa.contents (geometry) VALUES (%(geometry)s)'] [parameters: {'geometry': {u'type': u'LineString', u'coordinates': [[8.344268, 48.233826], [8.344147, 48.233507], [8.344098, 48.233435], [8.343945, 48.233136], [8.343853, 48.233047], [8.34332, 48.232736], [8.343098, 48.232473], [8.342861, 48.232307], [8.342711, 48.23224], [8.342328, 48.232159], [8.342045, 48.23209], [8.341843, 48.232035], [8.341711, 48.231946], [8.341092, 48.232163], [8.340386, 48.232388], [8.34028, 48.23245], [8.339983, 48.23274], [8.339451, 48.23315], [8.3393, 48.233316], [8.339219, 48.233457], [8.339185, 48.233646], [8.339372, 48.234329], [8.339367, 48.234539], [8.339262, 48.234685], [8.338886, 48.234971], [8.338431, 48.235181], [8.338327, 48.23528], [8.338234, 48.235495], [8.338176, 48.235798], [8.338105, 48.235955], [8.337919, 48.236102], [8.33725, 48.236483], [8.336922, 48.236771], [8.336726, 48.237039], [8.336421, 48.237391], [8.33621, 48.237641], [8.336115, 48.237759], [8.335913, 48.237947], [8.335782, 48.23804], [8.335572, 48.238146], [8.335367, 48.238292], [8.335175, 48.238458], [8.335038, 48.238638], [8.335097, 48.238674], [8.335049, 48.238932], [8.335044, 48.239155], [8.334709, 48.239726], [8.334583, 48.239904], [8.33455, 48.240095], [8.334344, 48.240506], [8.334089, 48.240776], [8.334175, 48.240817], [8.334326, 48.240799], [8.334562, 48.240779], [8.335146, 48.240961], [8.335056, 48.241105], [8.334592, 48.241447], [8.334338, 48.241616], [8.333982, 48.241818], [8.333449, 48.242185], [8.333166, 48.242623], [8.333047, 48.242774], [8.33289, 48.242884], [8.332437, 48.243097], [8.332313, 48.243212], [8.332203, 48.2434], [8.332093, 48.243811], [8.331966, 48.244102], [8.331775, 48.244413], [8.331649, 48.244575], [8.331717, 48.24471], [8.331836, 48.244822], [8.332961, 48.245226], [8.33325, 48.245292], [8.333439, 48.245365], [8.333781, 48.245519], [8.334241, 48.245794], [8.334417, 48.245979], [8.333901, 48.246311], [8.33362, 48.246637], [8.33304, 48.246836], [8.332729, 48.247071], [8.332437, 48.247353], [8.332278, 48.247583], [8.332271, 48.247685], [8.332345, 48.247923], [8.332441, 48.248093], [8.332291, 48.248137], [8.331258, 48.248526], [8.330556, 48.248909], [8.329865, 48.249228], [8.329128, 48.249545], [8.328832, 48.249737], [8.328606, 48.249949], [8.328412, 48.250198], [8.328342, 48.250322], [8.328084, 48.250757], [8.327975, 48.25103], [8.32782, 48.251499], [8.327715, 48.251941], [8.327707, 48.252051], [8.327735, 48.252168], [8.327871, 48.252433], [8.328022, 48.252827], [8.328051, 48.252982], [8.328067, 48.253367], [8.328094, 48.253482], [8.328188, 48.253678], [8.328516, 48.253748], [8.329388, 48.253956], [8.329619, 48.25405], [8.32993, 48.254114], [8.330179, 48.254184], [8.330565, 48.254448], [8.33078, 48.254627], [8.330909, 48.254812], [8.331049, 48.255072], [8.331165, 48.255189], [8.331417, 48.25535], [8.331592, 48.255536], [8.331745, 48.255884], [8.331778, 48.256163], [8.331733, 48.256781], [8.331604, 48.257332], [8.332141, 48.257903], [8.332452, 48.258317], [8.332688, 48.258781], [8.332668, 48.259148], [8.332765, 48.259448], [8.33286, 48.259582], [8.333589, 48.259789], [8.333881, 48.259898], [8.334074, 48.259932], [8.334615, 48.260141], [8.334832, 48.260261], [8.335546, 48.260712], [8.335655, 48.260829], [8.335753, 48.260994], [8.335783, 48.261319], [8.33623, 48.261624], [8.337095, 48.261891], [8.337525, 48.262004], [8.33783, 48.262411], [8.337898, 48.262441], [8.337994, 48.262433], [8.338356, 48.26232], 
[8.338735, 48.262012], [8.339091, 48.261771], [8.339439, 48.261581], [8.339604, 48.261778], [8.339748, 48.261829], [8.339754, 48.26183], [8.33996, 48.262052], [8.340984, 48.262661], [8.341287, 48.262828], [8.341604, 48.262945], [8.342296, 48.263073], [8.343026, 48.263176], [8.343188, 48.263176], [8.343387, 48.263132], [8.3438, 48.262989], [8.343999, 48.26297], [8.344228, 48.263014], [8.344626, 48.263142], [8.344987, 48.263166], [8.345244, 48.263242], [8.344865, 48.263233], [8.344067, 48.263207], [8.343897, 48.263233], [8.343478, 48.263529], [8.343433, 48.263552]]}}]

Finally got it:
from shapely.geometry import Point, LineString

geometry = client.directions(**params_route)['routes'][0]
geometry = pd.DataFrame({k: pd.Series(v) for k, v in geometry.items()})
geometry = geometry[:-2]
geometry['coordinates'] = geometry['geometry'].apply(Point)
geometry['myline'] = 1
geometry = gpd.GeoDataFrame(geometry, geometry='coordinates')
geometry = geometry.groupby('myline')['geometry'].apply(lambda x: LineString(x.tolist()))
geometry = gpd.GeoDataFrame(geometry, geometry='geometry')
myroute = LineString(geometry['geometry'].iloc[0]).wkb_hex
# Update table:
insert_query = """UPDATE schema.contents SET geom = ST_GeomFromWKB(%(geometry)s::geometry, 4326) WHERE id='1'"""
engine.execute(insert_query, geometry=myroute)
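For anyone hitting this today, newer GeoPandas releases make the round trip much shorter: GeoDataFrame.from_features() builds the frame straight from the GeoJSON features, and to_postgis() (GeoPandas 0.8+, with GeoAlchemy2 installed) writes the geometry column natively. A minimal sketch under those assumptions; note that recent openrouteservice clients take format='geojson' in place of the older format_out parameter, and the key and connection string below are the placeholders from the question:

import geopandas as gpd
import sqlalchemy as sa
import openrouteservice

client = openrouteservice.Client(key='myapikey')  # placeholder key
route = client.directions(
    coordinates=((8.34234, 48.23424), (8.34423, 48.26424)),
    profile='foot-walking',
    format='geojson',  # newer clients; older releases used format_out
)

# Build a GeoDataFrame straight from the GeoJSON FeatureCollection
gdf = gpd.GeoDataFrame.from_features(route['features'], crs='EPSG:4326')

# Keep only the geometry column: the nested ORS properties are dicts/lists
# and would trigger the same "can't adapt type 'dict'" error on insert
gdf = gdf[['geometry']]

# Write to PostGIS natively (GeoPandas >= 0.8, GeoAlchemy2 required)
engine = sa.create_engine('postgresql+psycopg2://username:password@host/database')
gdf.to_postgis('contents', engine, schema='schema', if_exists='replace')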

Related

ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')

I'm running this flask app
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd
# Create the app object
app = Flask(__name__)
cors = CORS(app, resources= {r"/*": {'origins' : "*"}})
# importing function for calculations
from Record_Matching import Matching
#app.route("/query", methods = ['get'])
#cross_origin()
def query():
# service_account_creds = request.json
query1 = request.args.get('query1', type = str)
query2 = request.args.get('query2', type = str)
querycolumns = request.args.get('querycolumns')
project_id = request.args.get('project_id', type = str)
service_account_creds = request.args.get('service_account')
SS = request.args.get('SS', type = float)
TT = request.args.get('TT', type = float)
result = Matching(query1,query2, SS,TT, service_account_creds, project_id, querycolumns)
return result
if __name__ == "__main__":
app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast

# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()

# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
#     client = bigquery.Client()
#     job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
#     return client.load_table_from_dataframe(df, table, job_config=job_config)

def pair(df1, df2, TT, querycolumns):
    # function to take pairs from the list and compare:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in English,
    # so we replace non-English words with random English strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): two pointers walk consecutive column pairs in the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features
def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished! Your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises this ValueError when I call the function through the Flask app, yet when I run it in a Jupyter notebook with the same dataframe pulled from BigQuery it works just fine. Why does it not work in the Flask app?
I used to_dict('records') to convert the dataframe to a dictionary. Looking online, many resources suggest the error occurs because the data contains missing values, but that shouldn't be the problem here, because converting the very same dataframe to a dictionary in the Jupyter notebook works.
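One difference between the two environments is worth checking: Flask serialises the returned dict to JSON, and its encoder formats datetime values by calling timetuple() on them; pandas' NaT raises exactly this ValueError there, whereas a bare to_dict('records') in a notebook never touches timetuple(). If missing timestamps are indeed the cause, converting them to None before returning should fix it. A minimal sketch with a hypothetical helper:

import pandas as pd

def df_to_records(df):
    # Cast to object so missing values (NaT/NaN) can hold a plain None,
    # which the JSON encoder serialises without calling timetuple()
    clean = df.astype(object).where(df.notnull(), None)
    return clean.to_dict('records')

# e.g. at the end of Matching():
# return {'message': message,
#         'final': df_to_records(final),
#         'df1': df_to_records(df1)}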

ProgrammingError when trying to skip duplicate data in PostgreSQL

PostgreSQL will not accept data that violates the primary key. To ignore the duplicate data, I have this code:
import pandas as pd
import psycopg2
import os
import matplotlib
from sqlalchemy import create_engine
from tqdm import tqdm_notebook
from pandas_datareader import data as web
import datetime
from dateutil.relativedelta import relativedelta

db_database = os.environ.get('123')
engine = create_engine('postgresql://postgres:{}@localhost:5433/stockdata'.format(123))

def import_data(Symbol):
    df = web.DataReader(Symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=3), end=datetime.datetime.now())
    insert_init = """INSERT INTO stockprices
                     (Symbol, Date, Volume, Open, Close, High, Low)
                     VALUES
                  """
    vals = ",".join(["""('{}','{}','{}','{}','{}','{}','{}')""".format(
        Symbol,
        Date,
        row.High,
        row.Low,
        row.Open,
        row.Close,
        row.Volume,
    ) for Date, row in df.iterrows()])
    insert_end = """ON CONFLICT (Symbol, Date) DO UPDATE
                    SET
                    Volume = EXCLUDED.Volume,
                    Open = EXCLUDED.Open,
                    Close = EXCLUDED.Close,
                    Low = EXCLUDED.Low,
                    High = EXCLUDED.High
                 """
    query = insert_init + vals + insert_end
    engine.execute(query)

import_data('aapl')
I am getting this error:
ProgrammingError: (psycopg2.errors.UndefinedColumn) column "symbol" of relation "stockprices" does not exist
LINE 2: (Symbol,Date, Volume, Open, Close, H...
^
[SQL: INSERT INTO stockprices
Could you please advise what this error means? I got rid of all the double quotes, as advised in the comment.
I had used this code to create the table:
def create_price_table(symbol):
    print(symbol)
    df = web.DataReader(symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=7), end=datetime.datetime.now())
    df['Symbol'] = symbol
    df.to_sql(name="stockprices", con=engine, if_exists='append', index=True)
    return 'daily prices table created'

create_price_table('amzn')
Also, as was mentioned in the comment, I used this to check the table name:
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_type='BASE TABLE';
Edit 1:
I changed the code as suggested in the comment; the column names are now lowercase. Below is the code:
import pandas as pd
import psycopg2
import os
import matplotlib
from sqlalchemy import create_engine
from tqdm import tqdm_notebook
from pandas_datareader import data as web
import datetime
from dateutil.relativedelta import relativedelta

db_database = os.environ.get('123')
engine = create_engine('postgresql://postgres:{}@localhost:5433/stockdata'.format(123))

def create_price_table(symbol):
    print(symbol)
    df = web.DataReader(symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=7), end=datetime.datetime.now())
    df['symbol'] = symbol
    df = df.rename(columns={'Open': 'open'})
    df = df.rename(columns={'Close': 'close'})
    df = df.rename(columns={'High': 'high'})
    df = df.rename(columns={'Low': 'low'})
    df = df.rename(columns={'Volume': 'volume'})
    df = df.rename(columns={'Adj Close': 'adj_close'})
    df.index.name = 'date'
    df.to_sql(name="stockprices", con=engine, if_exists='append', index=True)
    return 'daily prices table created'

# create_price_table('amzn')

def import_data(Symbol):
    df = web.DataReader(Symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=3), end=datetime.datetime.now())
    insert_init = """INSERT INTO stockprices
                     (symbol, date, volume, open, close, high, low)
                     VALUES
                  """
    vals = ",".join(["""('{}','{}','{}','{}','{}','{}','{}')""".format(
        Symbol,
        Date,
        row.High,
        row.Low,
        row.Open,
        row.Close,
        row.Volume,
    ) for Date, row in df.iterrows()])
    insert_end = """ON CONFLICT (Symbol, Date) DO UPDATE
                    SET
                    Volume = EXCLUDED.Volume,
                    Open = EXCLUDED.Open,
                    Close = EXCLUDED.Close,
                    Low = EXCLUDED.Low,
                    High = EXCLUDED.High
                 """
    query = insert_init + vals + insert_end
    engine.execute(query)

import_data('aapl')
This code, however, produces a new error:
DataError: (psycopg2.errors.InvalidTextRepresentation) invalid input syntax for type bigint: "166.14999389648438"
LINE 4: ('aapl','2022-02-23 00:00:00','166.14999...
^
Per my comment you have two issues:
You are trying to INSERT a float value (166.14999389648438) into an integer field. The first thing to figure out is why the mismatch: do you really want the database field to be an integer? The second thing is that forcing a float into an integer will work if the value is entered as a float/numeric:
select 166.14999389648438::bigint;
166
Though, as you see, it gets truncated.
It will not work if entered as a string:
ERROR: invalid input syntax for type bigint: "166.14999389648438"
Which is what you are doing. This leads to the second issue below.
You are not using proper parameter passing, as shown in the link, where among other things there is this warning:
Warning
Never, never, NEVER use Python string concatenation (+) or string parameters interpolation (%) to pass variables to a SQL query string. Not even at gunpoint.
For the purposes of this question, the important part is that using parameter passing results in proper type adaptation.
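To make that concrete: note that in the original query the column list (symbol, date, volume, open, close, high, low) and the VALUES tuple (Symbol, Date, High, Low, Open, Close, Volume) are in different orders, so the bigint volume column was receiving High's float 166.14999..., which is exactly the reported error. Below is a hedged sketch of a parameterised import_data, assuming the lowercase schema from Edit 1 and SQLAlchemy 1.4+; the connection string is the placeholder from the question:

import datetime
from dateutil.relativedelta import relativedelta
from pandas_datareader import data as web
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://postgres:123@localhost:5433/stockdata')

insert_stmt = text("""
    INSERT INTO stockprices (symbol, date, volume, open, close, high, low)
    VALUES (:symbol, :date, :volume, :open, :close, :high, :low)
    ON CONFLICT (symbol, date) DO UPDATE
    SET volume = EXCLUDED.volume,
        open = EXCLUDED.open,
        close = EXCLUDED.close,
        low = EXCLUDED.low,
        high = EXCLUDED.high
""")

def import_data(symbol):
    df = web.DataReader(symbol, 'yahoo',
                        start=datetime.datetime.now() - relativedelta(days=3),
                        end=datetime.datetime.now())
    # One dict per row, with each column paired to its matching value;
    # the driver adapts the Python types, so no quoting or casting issues
    rows = [{'symbol': symbol, 'date': date, 'volume': int(row.Volume),
             'open': row.Open, 'close': row.Close,
             'high': row.High, 'low': row.Low}
            for date, row in df.iterrows()]
    with engine.begin() as conn:
        conn.execute(insert_stmt, rows)  # executemany-style parameter passing

import_data('aapl')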

input must be an array, list, tuple or scalar pyproj

I have a DataFrame in which I am trying to convert the eastings/northings to longs/lats. My df looks like this:
import pandas as pd
import numpy as np
import pyproj
Postcode Eastings Northings
0 AB101AB 394235 806529
1 AB101AF 394181 806429
2 AB101AG 394230 806469
3 AB101AH 394371 806359
4 AB101AL 394296 806581
I am using a well-known code block to convert the eastings and northings to longs/lats and add those longs/lats as new columns to the df:
def proj_transform(df):
    bng = pyproj.Proj("+init=EPSG:27700")
    wgs84 = pyproj.Proj("+init=EPSG:4326")
    lats = pd.Series()
    lons = pd.Series()
    for idx, val in enumerate(df['Eastings']):
        lon, lat = pyproj.transform(bng, wgs84, df['Eastings'][idx], df['Northings'][idx])
        lats.set_value(idx, lat)
        lons.set_value(idx, lon)
    df['lat'] = lats
    df['lon'] = lons
    return df

df_transform = proj_transform(my_df)
However, I keep getting the following error: "input must be an array, list, tuple or scalar". Does anyone have any insight into where I am going wrong here?
This is the fastest method:
https://gis.stackexchange.com/a/334307/144357
from pyproj import Transformer

trans = Transformer.from_crs(
    "EPSG:27700",
    "EPSG:4326",
    always_xy=True,
)
xx, yy = trans.transform(my_df["Eastings"].values, my_df["Northings"].values)
my_df["X"] = xx
my_df["Y"] = yy
Also helpful for reference:
https://pyproj4.github.io/pyproj/stable/gotchas.html#upgrading-to-pyproj-2-from-pyproj-1
https://pyproj4.github.io/pyproj/stable/gotchas.html#init-auth-auth-code-should-be-replaced-with-auth-auth-code
https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
You can use DataFrame.apply with axis=1 and change the function like this (note the assignment targets are ordered lon, lat to match the Series the function returns):
def proj_transform(x):
    e = x['Eastings']
    n = x['Northings']
    bng = pyproj.Proj("+init=EPSG:27700")
    wgs84 = pyproj.Proj("+init=EPSG:4326")
    lon, lat = pyproj.transform(bng, wgs84, e, n)
    return pd.Series([lon, lat])

my_df[['lon', 'lat']] = my_df.apply(proj_transform, axis=1)
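Of these two answers, the Transformer-based one is worth preferring where possible: it pushes whole NumPy arrays through a single transform() call, while the apply version re-creates the two Proj objects and transforms one point per row, which is far slower on large frames. pyproj.transform() and the "+init=EPSG:..." syntax are also deprecated in pyproj 2+, which the gotchas links above cover.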

Writing Data from pandas dataframe to PostgreSQL gives error of 'DataFrame' objects are mutable, thus they cannot be hashed

I am trying to save a dataframe that was first imported into pandas from PostgreSQL as dfraw, then manipulated to create another dataframe df, and save df back into the same PostgreSQL database using SQLAlchemy. But when I try to save it back, it gives the error 'DataFrame' objects are mutable, thus they cannot be hashed.
Please find the code below:
import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine

# connect the database to python
# Update connection string information
host = "something.something.azure.com"
dbname = "abcd"
user = "abcd"
password = "abcd"
sslmode = "require"
schema = 'xyz'

# Construct connection string
conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)
conn = psycopg2.connect(conn_string)
print("Connection established")
cursor = conn.cursor()

# Fetch all rows from table
cursor.execute("SELECT * FROM xyz.abc;")
rows = cursor.fetchall()

# Convert the tuples into a dataframe
dfraw = pd.DataFrame(rows, columns=["ID","Timestamp","K","S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"])
dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]] = dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]].apply(pd.to_numeric)
dfraw[["Timestamp","K"]] = dfraw[["Timestamp","K"]].apply(pd.to_datetime)

# Creating temp files
temp1 = dfraw
dfraw = temp1

# creating some functions for data manipulation and imputations
def remZero(df, dropCol):
    for k in df.drop(dropCol, axis=1):
        if all(df[k] == 0):
            continue
        if any(df[k] == 0):
            print(k)
            df[k] = df[k].replace(to_replace=0, method='ffill')
    return df

# Drop Columns function
dropCol = ['Timestamp','K','ID','H','C','S']
dropCol2 = ['Timestamp','K','ID','Shift']
df = remZero(dfraw, dropCol)

from sqlalchemy import create_engine
engine = create_engine('postgresql://abcd:abcd@something.something.azure.com:5432/abcd')
df.to_sql(name=df,
          con=engine,
          index=False,
          if_exists='replace'
          )
Found the basic error in the code: I just missed putting quotation marks around the dataframe name to be published, so to_sql received the DataFrame object itself as the table name, and pandas tried to hash it, hence the error. Basic hygiene was missed:
df.to_sql(name = "df",
con=engine,
index = False,
if_exists= 'replace'
)

Concatenate Data From URLS Recursively Inside one DataFrame

I'm trying to create one dataframe with data from multiple URLs I'm scraping. The code works, however I'm unable to accumulate the data in one DataFrame across iterations. The DataFrame (called frame) is replaced with each new URL's data rather than having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!
import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

i = 0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'

pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)

while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                       'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                       'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})
print frame
Use pd.concat:
import pandas as pd

frames = []
i = 0
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                             'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                             'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))
frame = pd.concat(frames)
You are overwriting your frame with each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames that you append to with each pass through the loop. I simplified your code differently, but I think this gives you what you want.
import urllib
import re
import pandas as pd

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

regex = {'pattern': re.compile('<p class="commentText">(.+?)</p>'),
         'easiness': re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
         'helpfulness': re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
         'clarity': re.compile('Clarity</strong><span>(.+?)</span></p>'),
         'interest': re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
         'date': re.compile('<div class="date">(.+?)</div>'),
         'mathclass': re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
         'prof_name': re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
    d[k] = []

# Now fill those lists
for url in urls:
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    for k, v in regex.iteritems():
        d[k].append(re.findall(v, htmltext))

frame = pd.DataFrame(d)  # Dump the dict into a DataFrame
print frame
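For readers on Python 3, where urllib.urlopen and dict.iteritems no longer exist, a minimal sketch of the same accumulate-then-concat pattern is below. It assumes the urls list and regex dict defined above; the site's markup has likely changed since, so the regexes are illustrative only:

from urllib.request import urlopen
import re
import pandas as pd

frames = []
for url in urls:
    html = urlopen(url).read().decode('utf-8', errors='ignore')
    data = {name: rx.findall(html) for name, rx in regex.items()}
    # findall lists can differ in length, so trim to the shortest
    # to keep the DataFrame columns aligned
    n = min(len(v) for v in data.values())
    frames.append(pd.DataFrame({k: v[:n] for k, v in data.items()}))

frame = pd.concat(frames, ignore_index=True)
print(frame)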