Why my code is giving me data in 1 column it should give me in two different column - pandas

i need to know what is happening in my code? it should give data in separate columns it is giving me same data in a oath columns.
i tried to change the value of row variable but it didn't found the reason
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import time
arrayofRequest= []
prices=[]
location=[]
columns=['Price', 'Location']
df = pd.DataFrame(columns=columns)
for i in range(0,50):
arrayofRequest.append("https://www.zameen.com/Homes/Karachi-2-"+str(i+1)+".html?gclid=Cj0KCQjw3JXtBRC8ARIsAEBHg4mj4jX1zZUt3WzGScjH6nfwzrEqkuILarcmg372imSneelSXPj0fGIaArNeEALw_wcB")
request = requests.get(arrayofRequest[i])
soupobj= BeautifulSoup(request.content,"lxml")
# print(soupobj.prettify())
links =soupobj.find_all('span',{'class':'f343d9ce'})
addresses =soupobj.find_all('div',{'class':'_162e6469'})
price = ""
for i in range(0,len(links)):
price = str(links[i]).split(">")
price = price[len(price)-2].split("<")[0]
prices.append(price)
address = str(addresses[i]).split(">")
address = address[len(address)-2].split("<")[0]
location.append(address)
row=location[i]+","+prices[i]
df = df.append(pd.Series(row, index=columns), ignore_index=False)
# filewriter = csv.writer(csvfile, delimiter=',',filewriter.writerow(['Price', 'Location']),filewriter.writerow([prices[0],location[0]])
df.to_csv('DATA.csv', index=False)

because of this:
pd.Series(row, index=columns)
try smthg like
pd.DataFrame([[locations[i], prices[i]]], index=columns))
However this could be done only once outside of your for loop
pd.DataFrame(list(zip(locations, prices)), index=columns))

Related

'poorly' organized csv file

I have a CSV file that I have to do some data processing and it's a bit of a mess. It's about 20 columns long, but there are multiple datasets that are concatenated in each column. see dummy file below
I'm trying to import each sub file into a separate pandas dataframe, but I'm not sure the best way to parse the csv other than manually hardcoding importing a certain length. any suggestions? I guess if there is some way to find where the spaces are (I could loop through the entire file and find them, and then read each block, but that doesn't seem very efficient). I have lots of csv files like this to read.
import pandas as pd
nrows = 20
skiprows = 0 #but this only reads in the first block
df = pd.read_csv(csvfile, nrows=nrows, skiprows=skiprows)
Below is a dummy example:
TIME,HDRA-1,HDRA-2,HDRA-3,HDRA-4
0.473934934,0.944026678,0.460177668,0.157028404,0.221362174
0.911384892,0.336694914,0.586014563,0.828339071,0.632790473
0.772652589,0.318146985,0.162987171,0.555896202,0.659099194
0.541382917,0.033706768,0.229596419,0.388057901,0.465507295
0.462815443,0.088206108,0.717132904,0.545779038,0.268174922
0.522861489,0.736462083,0.532785319,0.961993893,0.393424116
0.128671067,0.56740537,0.689995486,0.518493779,0.94916205
0.214026742,0.176948186,0.883636252,0.732258971,0.463732841
0.769415726,0.960761306,0.401863804,0.41823372,0.812081565
0.529750933,0.360314266,0.461615009,0.387516958,0.136616263
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.92264286,0.026312552,0.905839375,0.869477136,0.985560264
0.410573341,0.004825381,0.920616162,0.19473237,0.848603523
0.999293171,0.259955029,0.380094352,0.101050014,0.428047493
0.820216119,0.655118219,0.586754951,0.568492346,0.017038336
0.040384337,0.195101879,0.778631044,0.655215972,0.701596844
0.897559206,0.659759362,0.691643603,0.155601111,0.713735399
0.860188233,0.805013656,0.772153733,0.809025634,0.257632085
0.844167809,0.268060979,0.015993504,0.95131982,0.321210766
0.86288383,0.236599974,0.279435193,0.311005146,0.037592509
0.938348876,0.941851279,0.582434058,0.900348616,0.381844182
0.344351819,0.821571854,0.187962046,0.218234588,0.376122331
0.829766776,0.869014514,0.434165111,0.051749472,0.766748447
0.327865017,0.938176948,0.216764504,0.216666543,0.278110502
0.243953506,0.030809033,0.450110334,0.097976735,0.762393831
0.484856452,0.312943244,0.443236377,0.017201097,0.038786057
0.803696521,0.328088545,0.764850865,0.090543472,0.023363909
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.342418934,0.290979228,0.84201758,0.690964176,0.927385229
0.173485057,0.214049903,0.27438753,0.433904377,0.821778689
0.982816721,0.094490904,0.105895645,0.894103833,0.34362529
0.738593272,0.423470984,0.343551191,0.192169774,0.907698897
0.021809601,0.406001002,0.072701623,0.964640184,0.023427393
0.406226618,0.421944527,0.413150342,0.337243905,0.515996389
0.829989793,0.168974332,0.246064043,0.067662474,0.851182924
0.812736737,0.667154845,0.118274705,0.484017732,0.052666038
0.215947395,0.145078319,0.484063281,0.79414799,0.373845815
0.497877968,0.554808367,0.370429652,0.081553316,0.793608698
0.607612542,0.424703584,0.208995066,0.249033837,0.808169709
0.199613478,0.065853429,0.77236195,0.757789625,0.597225697
0.044167285,0.1024231,0.959682778,0.892311813,0.621810775
0.861175219,0.853442735,0.742542086,0.704287769,0.435969078
0.706544823,0.062501379,0.482065481,0.598698867,0.845585046
0.967217599,0.13127149,0.294860203,0.191045015,0.590202032
0.031666757,0.965674812,0.177792841,0.419935921,0.895265056
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.306849588,0.177454423,0.538670939,0.602747137,0.081221293
0.729747557,0.11762043,0.409064884,0.051577964,0.666653287
0.492543468,0.097222882,0.448642979,0.130965724,0.48613413
0.0802024,0.726352481,0.457476151,0.647556514,0.033820374
0.617976299,0.934428994,0.197735831,0.765364856,0.350880707
0.07660401,0.285816636,0.276995238,0.047003343,0.770284864
0.620820688,0.700434525,0.896417099,0.652364756,0.93838793
0.364233925,0.200229902,0.648342989,0.919306736,0.897029239
0.606100716,0.203585366,0.167232701,0.523079381,0.767224301
0.616600448,0.130377791,0.554714839,0.468486555,0.582775753
0.254480861,0.933534632,0.054558237,0.948978985,0.731855548
0.620161044,0.583061202,0.457991555,0.441254272,0.657127968
0.415874646,0.408141761,0.843133575,0.40991199,0.540792744
0.254903429,0.655739954,0.977873649,0.210656057,0.072451639
0.473680525,0.298845701,0.144989283,0.998560665,0.223980961
0.30605008,0.837920854,0.450681322,0.887787908,0.793229776
0.584644405,0.423279153,0.444505314,0.686058204,0.041154856
from io import StringIO
import pandas as pd
data ="""
TIME,HDRA-1,HDRA-2,HDRA-3,HDRA-4
0.473934934,0.944026678,0.460177668,0.157028404,0.221362174
0.911384892,0.336694914,0.586014563,0.828339071,0.632790473
0.772652589,0.318146985,0.162987171,0.555896202,0.659099194
0.541382917,0.033706768,0.229596419,0.388057901,0.465507295
0.462815443,0.088206108,0.717132904,0.545779038,0.268174922
0.522861489,0.736462083,0.532785319,0.961993893,0.393424116
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.92264286,0.026312552,0.905839375,0.869477136,0.985560264
0.410573341,0.004825381,0.920616162,0.19473237,0.848603523
0.999293171,0.259955029,0.380094352,0.101050014,0.428047493
0.820216119,0.655118219,0.586754951,0.568492346,0.017038336
0.040384337,0.195101879,0.778631044,0.655215972,0.701596844
TIME,HDRB-1,HDRB-2,HDRB-3,HDRB-4
0.342418934,0.290979228,0.84201758,0.690964176,0.927385229
0.173485057,0.214049903,0.27438753,0.433904377,0.821778689
0.982816721,0.094490904,0.105895645,0.894103833,0.34362529
0.738593272,0.423470984,0.343551191,0.192169774,0.907698897
"""
df = pd.read_csv(StringIO(data), header=None)
start_marker = 'TIME'
grouper = (df.iloc[:, 0] == start_marker).cumsum()
groups = df.groupby(grouper)
frames = [gr.T.set_index(gr.index[0]).T for _, gr in groups]

Pandas combine mutilple columns in a BQ table to generate payload for FB conversions api

I am reading from a bigquery table to generate a payload to upload to FB conversions api.
cols=["payload","client_user_agent","event_source_url"]
I am copying the column values directly from the bq table as I am unable to print the full output of the dataframe in note book.
payload="{"pageDetail":{"pageName":"Confirmation","pageContentType":"cart","pageSiteSection":"cart","breadcrumbs":[{"title":"Home","url":"/en/home.html"},{"title":"Cart","url":"/cart"},{"title":"Confirmation","url":"/order-confirmation="}],"pageCategory":"Home","pageCategory1":"Cart","pageCategory2":"Confirmation","proBtbGlobalHeader":false},"orderDetails":{"hceid":"3b94a","orderConfirmed":true,"orderDate":"2021-01-15","orderId":"0123","unique":2,"pricingSummary":{"total":54.01},"items":[{"productId":"0456","quantity":1,"shippingAddress":{"postalCode":"V4N 3X3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}},{"productId":"0789","quantity":1,"fulfillment":{"fulfillmentCost":""},"shippingAddress":{"postalCode":"A4N 3Y3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}}],"billingAddress":{"postalCode":"M$X1A7"}},"event":{"type":"Load","page":"Confirmation","timestamp":1610706772998,"language":"English","url":"https://www"}}"
client_user_agent="Mozilla/5.0"
event_source_url= "https://www.def.com="
I need the value for email=[orderDetails][hceid] and value=["orderDetails"]["pricingSummary"]["total"]
Initially all the payload I wanted was in a single column and I was able to achieve the uploads with the following code
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
FacebookAdsApi.init(access_token=access_token)
query='''SELECT JSON_EXTRACT(payload, '$') AS payload FROM `project.dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
payload = df.to_dict(orient="records")
for i in payload:
#print(type(i["payload"]))
k = json.loads(i["payload"])
email = k["orderDetails"]["hcemuid"]
user_data = UserData(email)
value=k["orderDetails"]["pricingSummary"]["total"]
order_id = k["orderDetails"]["orderId"]
custom_data = CustomData(
currency='CAD',
value=value)
event = Event(
event_name='Purchase',
event_time=int(time.time()),
user_data=user_data,
custom_data=custom_data,
event_id = order_id,
data_processing_options= [])
events = [event]
#print(events)
event_request = EventRequest(
events=events,
test_event_code='TEST8609',
pixel_id=pixel_id)
#print(event_request)
a=event_request.execute()
print(a)
Now there are additional values client_user_agent that needs to be part of user data and event_source_url as parts of events in the above code that are present as two different columns in GBQ table.
I have tried similar code as above for multiple columns but I am receiving a
TypeError: Object of type Series is not JSON serializable
So I tried concatenating the columns and then create a json serializable object but I am not able to do an upload.
Below is where I am stuck and lost and not sure how to proceed further any inputs appreciated.
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
FacebookAdsApi.init(access_token=access_token)
query='''SELECT payload AS payload,location.userAgent as client_user_agent,location.referrer as event_source_url FROM `project.Dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
df.reset_index(drop=True, inplace=True)
payload = df.to_dict(orient="records")
print(payload)
## cols = ['payload', 'client_user_agent', 'event_source_url']
## df['combined'] = df[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
## del df["payload"]
## del df["client"]
## del df["source"]
## payload = df.to_dict(orient="records")
#tried concatinating all columns in a the dataframe but not able to create a valid json object for upload
columns = ['payload', 'client_user_agent', 'event_source_url']
df['payload'] = df['payload'].str.replace(r'}"$', '')
payload = df[columns].to_dict(orient='records')
print(payload)
## df = df.drop(columns=columns)
## pd.options.display.max_rows = 4000
# #print(payload)
# for i in payload:
# print(i["payload"])
# k = json.loads(i["payload"])
# email = k["orderDetails"]["hcemuid"]
# print(email)
I am following the instructions from this page:https://developers.facebook.com/docs/marketing-api/conversions-api
I have used the bigquery json_extract_scalar function to extract data from nested column instead of pandas which is a relatively better solution for my scenario.

Webscraping several URLs into panda df

Need some help appending several webscraping resaults to a panda df.
Currently im only getting the output from one of the URLs to the DF.
I left out the URLs, if you need them i will supply them to you.
##libs
import bs4
import requests
import re
from time import sleep
import pandas as pd
from bs4 import BeautifulSoup as bs
##webscraping targets
URLs = ["URL1","URL2","URL3"]
## Get columns
column_list = []
r1 = requests.get(URLs[0])
soup1 = bs(r1.content)
data1 = soup1.find_all('dl', attrs= {"class": "border XSText rightAlignText noMarginTop highlightOnHover thickBorderBottom noTopBorder"})
columns = soup1.find_all('dt')
for col in columns:
column_list.append(col.text.strip()) # strip() removes extra space from the text
##Get values
value_list = []
for url in URLs:
r1 = requests.get(url)
soup1 = bs(r1.content)
data1 = soup1.find_all('dl', attrs= {"class": "border XSText rightAlignText noMarginTop highlightOnHover thickBorderBottom noTopBorder"})
values = soup1.find_all('dd')
for val in values:
value_list.append(val.text.strip())
df=pd.DataFrame(list(zip(column_list,value_list)))
df.transpose()
Current output only showing the resaults of one URL:
Expected output:
The problem here is with your zip function. It will only zip the values until the length of the shortest list, in this case, the column_list. Leaving all the other values unused.
If you want to append the other values to the dataframe as well you will have to iterate over then. So change the last two lines on your code to this and it should work:
result = [[i] for i in column_list]
for i, a in enumerate(value_list):
result[i % len(column_list)].extend([a])
df = pd.DataFrame(result)
df.transpose()

How do I convert NBA-API List to DataFrame

Having an issue converting NBA-API object to a DataFrame. What I get is a list of the dataframe. How do I pull the DataFrame out the list or skip the list and create the DataFrame.
## NBA API endpoints needed to obtain data
import nba_api.stats.endpoints
from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import shotchartdetail
import pandas as pd
from pandas import DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
BBplayers = players.get_players()
BBteams = teams.get_teams()
print(type(players))
print(BBplayers[0])
print(len(BBplayers))
## LEN PLAYERS = 4501 PLAYER TYPE IS DICTIONARIES INSIDE LIST
Durant = [player for player in BBplayers if player['full_name'] == 'Kevin Durant'][0]
Durant_id = Durant['id']
print(Durant_id)
## Durant ID = 201142
Thunder = [name for name in BBteams if name['full_name']=='Oklahoma City Thunder'][0]
Thunder_id = Thunder['id']
print(Thunder_id)
print(type(Thunder_id))
## Thunder ID = 1610612760
DurantShotsChart = shotchartdetail.ShotChartDetail(player_id='201142',team_id=1610612760)
print(DurantShotsChart)
NewDF=DurantShotsChart.get_data_frames()
print(type(NewDF))
print(NewDF[1])
print(len(NewDF))
According to the API documentation you should get two dataframes. And since the get_data_frames method always outputs a list of DataFrames, you can retrieve them separately by indexing:
league_averages = NewDF[0]
Which has the columns ['GRID_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'FGA', 'FGM', 'FG_PCT'].
And then
shot_chart_detail= NewDf[1]
Which has the following columns ['GRID_TYPE', 'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE', 'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE', 'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'HTM', 'VTM']
This?
df1=NewDF[0]
df2=NewDF[1]

Set Multiple Restrictions for Rows Called to Print in Pandas

import pandas as pd
import numpy as np
#load data
#data file and py file must be in same file path
df = pd.read_csv('cbp15st.txt', delimiter = ',', encoding = 'utf-8-
sig')
#define load data DataFrame columns
state = df['FIPSTATE']
industry = df['NAICS']
legal_form_of_organization = df['LFO']
suppression_flag = df['EMPFLAG']
total_establishment = df['EST']
establishment_1_4 = df['N1_4']
establishment_5_9 = df['N5_9']
establishment_10_19 = df['N10_19']
establishment_20_49 = df['N20_49']
establishment_50_99 = df['N50_99']
establishment_100_249 = df['N100_249']
establishment_250_499 = df['N250_499']
establishment_500_999 = df['N500_999']
establishment_1000_more = df['N1000']
#use df.loc to parse dataset for partiuclar value types
print(df.loc[df['EMPFLAG']=='A'], df.loc[df['FIPSTATE']==1],
df.loc[df['NAICS']=='------'])
Currently using df.loc to locate specific values from the df columns, but will read out those columns that contain all of these values, not only these values (like an or vs and statement)
Trying to find a way to place multiple restrictions on this to only get column reads that meet criteria x y and z.
Current Readout from above:
enter image description here
You can use & operator while specifying multiple filtering criteria, something like:
df1 = df.loc[(df['EMPFLAG']=='A']) & (df['FIPSTATE']==1) & (df['NAICS']=='------')]
print(df1)