Convert GeoJSON to GeoPandas dataframe
I am using the OpenRouteService API and am trying to convert the GeoJSON result of the GET Directions service to a GeoPandas dataframe and, in the end, store it as a spatial PostGIS table. My code so far is:
import pandas as pd
import geopandas as gpd
import sqlalchemy as sa
import openrouteservice

def getroute(lon1, lat1, lon2, lat2):
    coords = ((lon1, lat1), (lon2, lat2))
    params_route = {'profile': 'foot-walking',
                    'coordinates': coords,
                    'format_out': 'geojson',
                    'geometry': 'true',
                    'geometry_simplify': 'true',
                    'geometry_format': 'geojson',
                    'instructions': 'false',
                    }
    geometry = client.directions(**params_route)['features']
    print geometry
    return geometry
client = openrouteservice.Client(key='myapikey')
lon1 = 8.34234
lat1 = 48.23424
lon2 = 8.34423
lat2 = 48.26424
myroutes = getroute(lon1, lat1, lon2, lat2)
print myroutes
print type(myroutes)
myroutes = gpd.GeoDataFrame(myroutes)
print myroutes

# Creating SQLAlchemy's engine to use
engine = sa.create_engine('postgresql+psycopg2://username:password@host/database', encoding='utf-8')
with engine.connect() as conn, conn.begin():
    # Note use of regular Pandas `to_sql()` method.
    myroutes['geometry'].to_sql('contents', con=conn, schema='schema', if_exists='replace', index=False)
However, I can't get past the GeoJSON structure to store it. Can anyone help me? The resulting error is:
sqlalchemy.exc.ProgrammingError: (psycopg2.ProgrammingError) can't adapt type 'dict'
[SQL: 'INSERT INTO paa.contents (geometry) VALUES (%(geometry)s)']
[parameters: {'geometry': {u'type': u'LineString', u'coordinates': [[8.344268, 48.233826], [8.344147, 48.233507], [8.344098, 48.233435], ... [8.343478, 48.263529], [8.343433, 48.263552]]}}]
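What the error is saying is that psycopg2 has no adapter for a plain Python dict, which is exactly what each GeoJSON geometry still is at this point; the dict has to become a real geometry object (or WKB/WKT) before it can be stored. A minimal sketch of that missing conversion, assuming shapely is available (feature is a hypothetical single entry of myroutes):

from shapely.geometry import shape

feature = myroutes[0]              # hypothetical: one GeoJSON feature from the route
geom = shape(feature['geometry'])  # shapely LineString built from the GeoJSON dict
print(geom.wkb_hex[:20])           # WKB/WKT is something the database can ingest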
Finally got it:
from shapely.geometry import Point, LineString

geometry = client.directions(**params_route)['routes'][0]
# Spread the response dict over columns; keys with shorter values are padded with NaN
geometry = pd.DataFrame({k: pd.Series(v) for k, v in geometry.iteritems()})
geometry = geometry[:-2]
# Turn each coordinate pair into a shapely Point
geometry['coordinates'] = geometry['geometry'].apply(Point)
geometry['myline'] = 1
geometry = gpd.GeoDataFrame(geometry, geometry='coordinates')
# Build a single LineString from all coordinate pairs of the route
geometry = geometry.groupby('myline')['geometry'].apply(lambda x: LineString(x.tolist()))
geometry = gpd.GeoDataFrame(geometry, geometry='geometry')
myroute = LineString(geometry['geometry'].iloc[0]).wkb_hex

# Update table:
insert_query = """UPDATE schema.contents SET geom = ST_GeomFromWKB(%(geometry)s::geometry, 4326) WHERE id='1'"""
engine.execute(insert_query, geometry=myroute)
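For anyone landing here with a newer stack: recent GeoPandas versions can do both steps directly. A minimal sketch, assuming geopandas >= 0.8 with the geoalchemy2 package installed (connection string and table names reuse the ones from the question):

import geopandas as gpd
import sqlalchemy as sa

engine = sa.create_engine('postgresql+psycopg2://username:password@host/database')

# One row per GeoJSON feature; properties become columns, 'geometry' becomes shapely geometry
gdf = gpd.GeoDataFrame.from_features(myroutes, crs='EPSG:4326')

# Writes a real PostGIS geometry column (requires geoalchemy2)
gdf.to_postgis('contents', engine, schema='schema', if_exists='replace')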
Related
ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')
I'm running this Flask app:

from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd

# Create the app object
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {'origins': "*"}})

# importing function for calculations
from Record_Matching import Matching

@app.route("/query", methods=['get'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)

and I'm importing the Matching function from this Python script:

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast

# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()

# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
#     client = bigquery.Client()
#     job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
#     return client.load_table_from_dataframe(df, table, job_config=job_config)

def pair(df1, df2, TT, querycolumns):
    # function to take pair from list and compare:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in english,
    # thus we'll replace non-english words by random english strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): uses two pointers to track consecutive pairs in the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features

def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'], service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}

I'm not sure why returning df1 as a dictionary raises the ValueError when I call the function through the Flask app, but when I run it in a Jupyter notebook using the same dataframe that I'm taking from BigQuery, it works just fine. Why does it not work in the Flask app? I used to_dict('records') to convert the dataframe to a dictionary; looking online, many resources suggest the error exists because the data contains missing values, but that shouldn't be the problem, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.
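No answer is recorded here, but the explanation those resources hint at fits the symptom: to_dict('records') itself happily keeps NaT values (which is why the notebook run succeeds), and it is likely Flask's JSON serialization of the returned dict that trips over NaT later, producing the timetuple error. A minimal sketch of the usual workaround, assuming pandas (df1 being the dataframe in the returned dict):

import pandas as pd

# Replace NaT/NaN with None so the JSON encoder never sees a NaT;
# plain to_dict keeps NaT, and serialization is where it fails.
df1 = df1.astype(object).where(df1.notnull(), None)
payload = df1.to_dict('records')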
ProgrammingError when trying to skip duplicate data in PostgreSQL
PostgreSQL will not accept data that violates the primary key. To ignore the duplicate data, I have this code:

import pandas as pd
import psycopg2
import os
import matplotlib
from sqlalchemy import create_engine
from tqdm import tqdm_notebook
from pandas_datareader import data as web
import datetime
from dateutil.relativedelta import relativedelta

db_database = os.environ.get('123')
engine = create_engine('postgresql://postgres:{}@localhost:5433/stockdata'.format(123))

def import_data(Symbol):
    df = web.DataReader(Symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=3),
                        end=datetime.datetime.now())
    insert_init = """INSERT INTO stockprices
                     (Symbol, Date, Volume, Open, Close, High, Low)
                     VALUES """
    vals = ",".join(["""('{}','{}','{}','{}','{}','{}','{}')""".format(
        Symbol,
        Date,
        row.High,
        row.Low,
        row.Open,
        row.Close,
        row.Volume,
    ) for Date, row in df.iterrows()])
    insert_end = """ ON CONFLICT (Symbol, Date) DO UPDATE SET
                     Volume = EXCLUDED.Volume,
                     Open = EXCLUDED.Open,
                     Close = EXCLUDED.Close,
                     Low = EXCLUDED.Low,
                     High = EXCLUDED.High
                  """
    query = insert_init + vals + insert_end
    engine.execute(query)

import_data('aapl')

I am getting this error:

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "symbol" of relation "stockprices" does not exist
LINE 2: (Symbol, Date, Volume, Open, Close, H...
        ^
[SQL: INSERT INTO stockprices ...]

Could you please advise what this error means? I got rid of all the double quotes as advised in the comment. I had used this code to create the table:

def create_price_table(symbol):
    print(symbol)
    df = web.DataReader(symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=7),
                        end=datetime.datetime.now())
    df['Symbol'] = symbol
    df.to_sql(name="stockprices", con=engine, if_exists='append', index=True)
    return 'daily prices table created'

create_price_table('amzn')

Also, as was mentioned in the comment, I used this to check the table name:

SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
  AND table_type = 'BASE TABLE';

Edit 1: I changed the code as suggested in the comment; now the column names are lowercase. Below is the code:

import pandas as pd
import psycopg2
import os
import matplotlib
from sqlalchemy import create_engine
from tqdm import tqdm_notebook
from pandas_datareader import data as web
import datetime
from dateutil.relativedelta import relativedelta

db_database = os.environ.get('123')
engine = create_engine('postgresql://postgres:{}@localhost:5433/stockdata'.format(123))

def create_price_table(symbol):
    print(symbol)
    df = web.DataReader(symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=7),
                        end=datetime.datetime.now())
    df['symbol'] = symbol
    df = df.rename(columns={'Open': 'open', 'Close': 'close', 'High': 'high',
                            'Low': 'low', 'Volume': 'volume', 'Adj Close': 'adj_close'})
    df.index.name = 'date'
    df.to_sql(name="stockprices", con=engine, if_exists='append', index=True)
    return 'daily prices table created'

# create_price_table('amzn')

def import_data(Symbol):
    df = web.DataReader(Symbol, 'yahoo', start=datetime.datetime.now()-relativedelta(days=3),
                        end=datetime.datetime.now())
    insert_init = """INSERT INTO stockprices
                     (symbol, date, volume, open, close, high, low)
                     VALUES """
    vals = ",".join(["""('{}','{}','{}','{}','{}','{}','{}')""".format(
        Symbol,
        Date,
        row.High,
        row.Low,
        row.Open,
        row.Close,
        row.Volume,
    ) for Date, row in df.iterrows()])
    insert_end = """ ON CONFLICT (Symbol, Date) DO UPDATE SET
                     Volume = EXCLUDED.Volume,
                     Open = EXCLUDED.Open,
                     Close = EXCLUDED.Close,
                     Low = EXCLUDED.Low,
                     High = EXCLUDED.High
                  """
    query = insert_init + vals + insert_end
    engine.execute(query)

import_data('aapl')

This code however produces a new error:

DataError: (psycopg2.errors.InvalidTextRepresentation) invalid input syntax for type bigint: "166.14999389648438"
LINE 4: ('aapl','2022-02-23 00:00:00','166.14999...
        ^
Per my comment, you have two issues.

First, you are trying to INSERT a float value (166.14999389648438) into an integer field. The first thing to figure out is why the mismatch: do you really want the database field to be an integer? Forcing a float into an integer will work if the value is entered as a float/numeric:

select 166.14999389648438::bigint;
166

though, as you see, it gets truncated. It will not work if entered as a string:

ERROR: invalid input syntax for type bigint: "166.14999389648438"

which is what you are doing. This leads to the second issue.

Second, you are not using proper parameter passing as shown in the link, where among other things there is the warning:

Warning: Never, never, NEVER use Python string concatenation (+) or string parameters interpolation (%) to pass variables to a SQL query string. Not even at gunpoint.

For the purposes of this question, the important part is that using parameter passing will result in proper type adaptation.
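A sketch of what that parameter passing could look like here, assuming psycopg2's execute_values and the lowercase column names from the edit. Note also that the original vals tuples were built in the order High, Low, Open, Close, Volume against a column list that says Volume, Open, Close, High, Low; that mismatch is how a price float ended up aimed at the bigint volume column, and the sketch aligns them (Symbol and df as in import_data; connection parameters assumed from the question's create_engine call):

import psycopg2
from psycopg2.extras import execute_values

insert_sql = """
    INSERT INTO stockprices (symbol, date, volume, open, close, high, low)
    VALUES %s
    ON CONFLICT (symbol, date) DO UPDATE SET
        volume = EXCLUDED.volume,
        open = EXCLUDED.open,
        close = EXCLUDED.close,
        high = EXCLUDED.high,
        low = EXCLUDED.low
"""

# Tuples aligned with the column list above
rows = [(Symbol, Date, row.Volume, row.Open, row.Close, row.High, row.Low)
        for Date, row in df.iterrows()]

conn = psycopg2.connect(host="localhost", port=5433, dbname="stockdata",
                        user="postgres", password="123")
with conn:                      # commits on success, rolls back on error
    with conn.cursor() as cur:
        # execute_values adapts each Python value to its proper Postgres type,
        # so floats arrive as numerics rather than quoted strings
        execute_values(cur, insert_sql, rows)
conn.close()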
pyproj: input must be an array, list, tuple or scalar
I have a df in which I am trying to convert the eastings/northings to long/lats:

import pandas as pd
import numpy as np
import pyproj

  Postcode  Eastings  Northings
0  AB101AB    394235     806529
1  AB101AF    394181     806429
2  AB101AG    394230     806469
3  AB101AH    394371     806359
4  AB101AL    394296     806581

I am using a well-known code block to convert the eastings and northings to long/lats and add those long/lats as new columns to the df:

def proj_transform(df):
    bng = pyproj.Proj("+init=EPSG:27700")
    wgs84 = pyproj.Proj("+init=EPSG:4326")
    lats = pd.Series()
    lons = pd.Series()
    for idx, val in enumerate(df['Eastings']):
        lon, lat = pyproj.transform(bng, wgs84, df['Eastings'][idx], df['Northings'][idx])
        lats.set_value(idx, lat)
        lons.set_value(idx, lon)
    df['lat'] = lats
    df['lon'] = lons
    return df

df_transform = proj_transform(my_df)

However, I keep getting the following error: "input must be an array, list, tuple or scalar". Does anyone have any insight into where I am going wrong here?
This is the fastest method: https://gis.stackexchange.com/a/334307/144357

from pyproj import Transformer

trans = Transformer.from_crs(
    "EPSG:27700",
    "EPSG:4326",
    always_xy=True,
)
xx, yy = trans.transform(my_df["Eastings"].values, my_df["Northings"].values)
my_df["X"] = xx
my_df["Y"] = yy

Also helpful for reference:

https://pyproj4.github.io/pyproj/stable/gotchas.html#upgrading-to-pyproj-2-from-pyproj-1
https://pyproj4.github.io/pyproj/stable/gotchas.html#init-auth-auth-code-should-be-replaced-with-auth-auth-code
https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
You can use DataFrame.apply with axis=1 and change the function like:

def proj_transform(x):
    e = x['Eastings']
    n = x['Northings']
    bng = pyproj.Proj("+init=EPSG:27700")
    wgs84 = pyproj.Proj("+init=EPSG:4326")
    lon, lat = pyproj.transform(bng, wgs84, e, n)
    return pd.Series([lon, lat])

# note the column order: the returned Series holds (lon, lat)
my_df[['lon', 'lat']] = my_df.apply(proj_transform, axis=1)
Writing data from a pandas dataframe to PostgreSQL gives the error: 'DataFrame' objects are mutable, thus they cannot be hashed
I am trying to save a dataframe which was first imported into pandas from PostgreSQL as dfraw, do some manipulation, create another dataframe df, and save it back into the same PostgreSQL database using SQLAlchemy. But when I try to save it back, it gives the error 'DataFrame' objects are mutable, thus they cannot be hashed. PFB the code below:

import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine

# connect the database to python
# Update connection string information
host = "something.something.azure.com"
dbname = "abcd"
user = "abcd"
password = "abcd"
sslmode = "require"
schema = 'xyz'

# Construct connection string
conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)
conn = psycopg2.connect(conn_string)
print("Connection established")
cursor = conn.cursor()

# Fetch all rows from table
cursor.execute("SELECT * FROM xyz.abc;")
rows = cursor.fetchall()

# Convert the tuples into a dataframe
dfraw = pd.DataFrame(rows, columns=["ID","Timestamp","K","S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"])
dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]] = dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]].apply(pd.to_numeric)
dfraw[["Timestamp","K"]] = dfraw[["Timestamp","K"]].apply(pd.to_datetime)

# Creating temp files
temp1 = dfraw
dfraw = temp1

# creating some functions for data manipulation and imputations
def remZero(df, dropCol):
    for k in df.drop(dropCol, axis=1):
        if all(df[k] == 0):
            continue
        if any(df[k] == 0):
            print(k)
            df[k] = df[k].replace(to_replace=0, method='ffill')
    return df

# Drop Columns function
dropCol = ['Timestamp','K','ID','H','C','S']
dropCol2 = ['Timestamp','K','ID','Shift']
df = remZero(dfraw, dropCol)

from sqlalchemy import create_engine
engine = create_engine('postgresql://abcd:abcd@something.something.azure.com:5432/abcd')
df.to_sql(name=df, con=engine, index=False, if_exists='replace')

Error message: 'DataFrame' objects are mutable, thus they cannot be hashed
Found the basic error in the code: I just missed putting quotes around the dataframe name to be published. Basic hygiene was missed.

df.to_sql(name="df", con=engine, index=False, if_exists='replace')
Concatenate Data From URLs Into One DataFrame
I'm trying to create one dataframe with data from multiple URLs I'm scraping. The code works; however, I'm unable to accumulate the data in one DataFrame. The DataFrame (called frame) is replaced with a new URL's data each time rather than having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!

import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

i = 0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'

pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)

while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                       'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                       'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})

print frame
Use pd.concat:

frames = []
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                             'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                             'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))

# keep the concatenated result
frame = pd.concat(frames)
You are overwriting your frame with each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames and append to it with each loop. I simplified your code differently, but I think this gives you what you want.

import urllib
import re
import pandas as pd

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

regex = {'pattern': re.compile('<p class="commentText">(.+?)</p>'),
         'easiness': re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
         'helpfulness': re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
         'clarity': re.compile('Clarity</strong><span>(.+?)</span></p>'),
         'interest': re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
         'date': re.compile('<div class="date">(.+?)</div>'),
         'mathclass': re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
         'prof_name': re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
    d[k] = []

# Now fill those lists
for url in urls:
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    for k, v in regex.iteritems():
        d[k].append(re.findall(v, htmltext))

# Dump the dict into a DataFrame
frame = pd.DataFrame(d)
print frame