Checking for duplicates in a pandas DataFrame

import pandas as pd
from io import StringIO
import requests
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline

# Fetch the airfoil coordinate file and keep only the rows of numeric pairs.
url = 'https://m-selig.ae.illinois.edu/ads/coord/b737a.dat'
response = requests.get(url).text

lines = []
for idx, line in enumerate(response.split('\n'), start=1):
    tokens = line.split()
    # all() on an empty sequence is True, so blank lines must be excluded
    # explicitly; otherwise they sneak into `lines` and become NaN rows.
    if tokens and all(x.replace('.', '').replace('-', '').isdecimal() for x in tokens):
        lines.append(line)

lines = [x.split() for x in lines]
df = pd.DataFrame(lines)
df = df.dropna(axis=0)
df = df.astype(float)
# Drop header-ish rows whose values exceed 1 (airfoil coordinates lie in [0, 1]).
# Use the keyword form: the positional `df.any(1)` was removed in pandas 2.0.
df = df[~(df > 1).any(axis=1)]
print(df)
output...
0 1
2 0.0000 0.0177
3 0.0023 0.0309
4 0.0050 0.0372
5 0.0076 0.0415
6 0.0143 0.0499
7 0.0249 0.0582
8 0.0495 0.0730
9 0.0740 0.0814
10 0.0990 0.0866
11 0.1530 0.0907
12 0.1961 0.0905
13 0.2504 0.0887
14 0.3094 0.0858
15 0.3520 0.0833
16 0.3919 0.0804
17 0.4477 0.0756
18 0.5034 0.0696
19 0.5593 0.0626
20 0.5965 0.0575
21 0.6488 0.0498
22 0.8351 0.0224
23 0.9109 0.0132
24 1.0000 0.0003
26 0.0000 0.0177
27 0.0022 0.0038
28 0.0049 -0.0018
29 0.0072 -0.0053
30 0.0119 -0.0106
31 0.0243 -0.0204
32 0.0486 -0.0342
33 0.0716 -0.0457
34 0.0979 -0.0516
35 0.1488 -0.0607
36 0.1953 -0.0632
37 0.2501 -0.0632
38 0.2945 -0.0626
39 0.3579 -0.0610
40 0.3965 -0.0595
41 0.4543 -0.0563
42 0.5050 -0.0527
43 0.5556 -0.0482
44 0.6063 -0.0427
45 0.6485 -0.0375
46 0.8317 -0.0149
47 0.9410 -0.0053
48 1.0000 -0.0003
This is my code for scraping data from a website. I'm running into a problem where the x points start from zero, go up, and come back down to zero, creating a line in the middle of the plot which I don't need.
Notice how there are two rows where df[0] = 0 (rows 2 and 26). How can I write code that detects duplicates?

Try one of the following?
Out of the loop
# Return a copy of df with duplicate rows removed, keeping the first occurrence.
df1=df.drop_duplicates(keep='first', inplace=False, ignore_index=False)
Inside your loop
# Skip duplicate numeric lines as they are read: `lines` keeps the first
# occurrence of each line, `lines1` records everything already seen.
lines = []
lines1 = []
for idx, line in enumerate(response.split('\n'), start=1):
    if all(x.replace('.', '').replace('-', '').isdecimal() for x in line.split()):
        if line not in lines1:
            lines.append(line)
        lines1.append(line)

Related

Converting a string to number in jupyter

Here is my code:
def value_and_wage_conversion(value):
    """Convert FIFA money strings like '10M' or '500K' to floats.

    Non-string values and plain numeric strings pass through as float.
    """
    if isinstance(value, str):
        # The original read an unassigned name `out` here, which raised
        # UnboundLocalError; operate on `value` itself instead.
        if 'M' in value:
            value = float(value.replace('M', '')) * 1000000
        elif 'K' in value:
            value = float(value.replace('K', '')) * 1000
    return float(value)
# apply() accepts the function directly; wrapping it in a lambda adds nothing.
fifa_18['Value'] = fifa_18['Value'].apply(value_and_wage_conversion)
fifa_18['Wage'] = fifa_18['Wage'].apply(value_and_wage_conversion)
Here is the error message:
--------------------------------------------------------------------------- UnboundLocalError Traceback (most recent call
last) in
7 return float(out)
8
----> 9 fifa_18['Value'] = fifa_18['Value'].apply(lambda x: value_and_wage_conversion(x))
10 fifa_18['Wage'] = fifa_18['Wage'].apply(lambda x: value_and_wage_conversion(x))
c:\users\brain\appdata\local\programs\python\python39\lib\site-packages\pandas\core\series.py
in apply(self, func, convert_dtype, args, **kwds) 4136
else: 4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype) 4139 4140 if len(mapped) and
isinstance(mapped[0], Series):
pandas_libs\lib.pyx in pandas._libs.lib.map_infer()
in (x)
7 return float(out)
8
----> 9 fifa_18['Value'] = fifa_18['Value'].apply(lambda x: value_and_wage_conversion(x))
10 fifa_18['Wage'] = fifa_18['Wage'].apply(lambda x: value_and_wage_conversion(x))
in value_and_wage_conversion(value)
1 def value_and_wage_conversion(value):
2 if isinstance(value,str):
----> 3 if 'M' in out:
4 out = float(out.replace('M', ''))*1000000
5 elif 'K' in value:
UnboundLocalError: local variable 'out' referenced before assignment
You were almost there but you need to fix your function
For example
import numpy as np
import pandas as pd

# Build a reproducible random sample mixing 'M'/'K'-suffixed strings with numbers.
values = ['10M', '10K', 10.5, '200M', '200K', 200]
size = 100
np.random.seed(1)

# Insertion order draws 'Value' first, then 'Wage', so the RNG consumption
# matches a column-by-column construction exactly.
df = pd.DataFrame({col: np.random.choice(values, size) for col in ('Value', 'Wage')})
print(df)
Value Wage
0 200 200
1 200M 200M
2 200K 200
3 10M 10M
4 10K 200M
.. ... ...
95 200K 200
96 200 200M
97 10.5 200K
98 200K 10.5
99 200M 10M
[100 rows x 2 columns]
Define function and apply
def value_and_wage_conversion(value):
    """Turn money strings such as '10M' / '10K' into floats; numbers pass through."""
    if not isinstance(value, str):
        return float(value)
    # 'M' is checked before 'K', matching the original elif ordering.
    for suffix, factor in (('M', 1000000), ('K', 1000)):
        if suffix in value:
            return float(value.replace(suffix, '')) * factor
    return float(value)
# Convert both money columns to numeric floats, element by element.
df['Value'] = df['Value'].apply(lambda x: value_and_wage_conversion(x))
df['Wage'] = df['Wage'].apply(lambda x: value_and_wage_conversion(x))
print(df)
Value Wage
0 200.0 200.0
1 200000000.0 200000000.0
2 200000.0 200.0
3 10000000.0 10000000.0
4 10000.0 200000000.0
.. ... ...
95 200000.0 200.0
96 200.0 200000000.0
97 10.5 200000.0
98 200000.0 10.5
99 200000000.0 10000000.0
[100 rows x 2 columns]
and check
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Value 100 non-null float64
1 Wage 100 non-null float64
dtypes: float64(2)
memory usage: 1.7 KB

Splitting a coordinate string into X and Y columns with a pandas data frame

So I created a pandas data frame showing the coordinates for an event and number of times those coordinates appear, and the coordinates are shown in a string like this.
Coordinates Occurrences x
0 (76.0, -8.0) 1 0
1 (-41.0, -24.0) 1 1
2 (69.0, -1.0) 1 2
3 (37.0, 30.0) 1 3
4 (-60.0, 1.0) 1 4
.. ... ... ..
63 (-45.0, -11.0) 1 63
64 (80.0, -1.0) 1 64
65 (84.0, 24.0) 1 65
66 (76.0, 7.0) 1 66
67 (-81.0, -5.0) 1 67
I want to create a new data frame that shows the x and y coordinates individually and shows their occurrences as well like this--
x Occurrences y Occurrences
76 ... -8 ...
-41 ... -24 ...
69 ... -1 ...
37 ... -30 ...
60 ... 1 ...
I have tried to split the string, but I don't think I am doing it correctly and don't know how to add the result to the table regardless — I think I'd have to do something like a for loop later on in my code. I scraped the data from an API; here is the code that sets up the data frame shown.
# Tally how often each (x, y) coordinate pair occurs for shots and goals.
for key in contents['liveData']['plays']['allPlays']:
    if (key['result']['event'] == "Shot"):
        scoordinates = (key['coordinates']['x'], key['coordinates']['y'])
        # dict.get with a default collapses the original if/else counting.
        shots[scoordinates] = shots.get(scoordinates, 0) + 1
    if (key['result']['event'] == "Goal"):
        gcoordinates = (key['coordinates']['x'], key['coordinates']['y'])
        goals[gcoordinates] = goals.get(gcoordinates, 0) + 1

# Create data frames using pandas ('Occurences' spelling kept: it is the
# column name downstream code refers to).
gdf = pd.DataFrame(list(goals.items()), columns=['Coordinates', 'Occurences'])
print(gdf)
sdf = pd.DataFrame(list(shots.items()), columns=['Coordinates', 'Occurences'])
print()
try this
import re

# Extract the two signed decimals from each "(x, y)" string into new columns.
# Use a raw string for the pattern: '\.' in a plain string is a deprecated
# invalid escape sequence and will eventually be a SyntaxError.
df[['x', 'y']] = df.Coordinates.apply(
    lambda c: pd.Series(dict(zip(['x', 'y'],
                                 re.findall(r'[-]?[0-9]+\.[0-9]+', c.strip())))))
using the in-built string methods to achieve this should be performant:
# Strip the surrounding parentheses, split on the comma, and convert to float.
# Use the builtin float: np.float was deprecated in NumPy 1.20 and removed in 1.24.
df[["x", "y"]] = df["Coordinates"].str.strip(r"[()]").str.split(",", expand=True).astype(float)
(this also converts x and y to float values — not explicitly requested, but probably desired)

I am sure that the type of "items_tmp_dic2" is dict, so why is this error reported?

import pandas as pd
import numpy as np

# Folder holding the Kaggle "Predict Future Sales" data files.
path = 'F:/datasets/kaggle/predict_future_sales/'

train_raw = pd.read_csv(f'{path}sales_train.csv')
items = pd.read_csv(f'{path}items.csv')

# Convenience handles on the columns used below.
item_category_id = items['item_category_id']
item_id = train_raw.item_id
train_raw.head()
date date_block_num shop_id item_id item_price item_cnt_day
0 02.01.2013 0 59 22154 999.00 1.0
1 03.01.2013 0 25 2552 899.00 1.0
2 05.01.2013 0 25 2552 899.00 -1.0
3 06.01.2013 0 25 2554 1709.05 1.0
4 15.01.2013 0 25 2555 1099.00 1.0
items.head()
item_name item_id item_category_id
0 ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D 0 40
1 !ABBYY FineReader 12 Professional Edition Full... 1 76
2 ***В ЛУЧАХ СЛАВЫ (UNV) D 2 40
3 ***ГОЛУБАЯ ВОЛНА (Univ) D 3 40
4 ***КОРОБКА (СТЕКЛО) D 4 40
Then I want to add an "item_category_id" column to train_raw, taking the data from items, so I want to create a dict mapping item_id to item_category_id.
item_category_id = items['item_category_id']
item_id = train_raw.item_id

# Build {item_id: item_category_id} from the items table.
items_tmp = items.drop(['item_name'], axis=1)
items_tmp_dic = items_tmp.to_dict('split')
items_tmp_dic = items_tmp_dic.get('data')
items_tmp_dic2 = dict(items_tmp_dic)

ic_id = []
for i in np.nditer(item_id.values[:10]):
    # np.nditer yields 0-d arrays, which are unhashable and so cannot be
    # dict keys; convert to a plain int first (this was the TypeError).
    ic_id.append(items_tmp_dic2.get(int(i)))
print(len(ic_id))
wrong
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-be637620ea6d> in <module>
6 ic_id = []
7 for i in np.nditer(item_id.values[:10]):
----> 8 ic_id.append(items_tmp_dic2.get(i))
9 print(len(ic_id))
TypeError: unhashable type: 'numpy.ndarray'
but when I run
for i in np.nditer(item_id.values[:10]):
print(i)
I get
22154
2552
2552
2554
2555
2564
2565
2572
2572
2573
I have ensured that the type of "items_tmp_dic2" is dict, so why does this happen?
I have solved it by using int()
for i in np.nditer(item_id.values[:10]):
    # int(i) turns np.nditer's 0-d array into a hashable key for the lookup.
    ic_id.append(items_tmp_dic2.get(int(i)))

Why does changing "Date" column to datetime ruin graph?

I have a dataframe with financial data in it (Date, Open, Close, Low, High).
I want to graph the date versus one column (eg. Open). When I convert the "Date" column to a date, the graph does not display correctly. Why could this be happening?
import pandas as pd
import matplotlib.pyplot as plt


def plot_one_data(df, column, title="No Title", color="black"):
    """Plot `column` against the 'Date' column and save the figure as '<title>.png'.

    NOTE(review): `color` is currently unused — kept for interface compatibility.
    """
    df.plot(x="Date", y=column, title=title)
    filename = title + ".png"
    plt.savefig(filename)
    plt.close()  # release the figure so repeated calls don't leak open figures


# Load in the csv file.
df = pd.read_csv('data/df.csv')
print(df.head())
plot_one_data(df, "Open", "Before Converting to Date - good but no dates on x axis")

# Without format='%d-%m-%y', pandas guesses the format per value and swaps
# day/month for ambiguous dates — this is the misbehaviour being demonstrated.
df['Date'] = pd.to_datetime(df["Date"])
print(df.head())
plot_one_data(df, "Open", "After Converting to Date - bad!")
Text Output:
Date Open ... Adj Close Volume
0 14-08-06 1266.670044 ... 1268.209961 2118020000
1 15-08-06 1268.189941 ... 1285.579956 2334100000
2 16-08-06 1285.270020 ... 1295.430054 2554570000
3 17-08-06 1295.369995 ... 1297.479980 2458340000
4 18-08-06 1297.479980 ... 1302.300049 2033910000
[5 rows x 7 columns]
Date Open ... Adj Close Volume
0 2006-08-14 1266.670044 ... 1268.209961 2118020000
1 2006-08-15 1268.189941 ... 1285.579956 2334100000
2 2006-08-16 1285.270020 ... 1295.430054 2554570000
3 2006-08-17 1295.369995 ... 1297.479980 2458340000
4 2006-08-18 1297.479980 ... 1302.300049 2033910000
Solution - add format
Updated line
# An explicit format stops pandas from guessing (and swapping) day/month.
df['Date'] = pd.to_datetime(df["Date"], format='%d-%m-%y')
I found that looking at the first 30 entries (before and after changing to datetime) helped. It works until the end of August 2006, then jumps to the 9th of January!
First 30 entries in original data
Date Open ... Adj Close Volume
0 14-08-06 1266.670044 ... 1268.209961 2118020000
1 15-08-06 1268.189941 ... 1285.579956 2334100000
2 16-08-06 1285.270020 ... 1295.430054 2554570000
3 17-08-06 1295.369995 ... 1297.479980 2458340000
4 18-08-06 1297.479980 ... 1302.300049 2033910000
5 21-08-06 1302.300049 ... 1297.520020 1759240000
6 22-08-06 1297.520020 ... 1298.819946 1908740000
7 23-08-06 1298.729980 ... 1292.989990 1893670000
8 24-08-06 1292.969971 ... 1296.060059 1930320000
9 25-08-06 1295.920044 ... 1295.089966 1667580000
10 28-08-06 1295.089966 ... 1301.780029 1834920000
11 29-08-06 1301.569946 ... 1304.280029 2093720000
12 30-08-06 1303.699951 ... 1305.369995 2060690000
13 31-08-06 1304.250000 ... 1303.819946 1974540000 #Smooth change from August to September
14 01-09-06 1303.800049 ... 1311.010010 1800520000
15 05-09-06 1310.939941 ... 1313.250000 2114480000
16 06-09-06 1313.040039 ... 1300.260010 2329870000
17 07-09-06 1300.209961 ... 1294.020020 2325850000
18 08-09-06 1294.020020 ... 1298.920044 2132890000
19 11-09-06 1298.859985 ... 1299.540039 2506430000
20 12-09-06 1299.530029 ... 1313.000000 2791580000
21 13-09-06 1312.739990 ... 1318.069946 2597220000
22 14-09-06 1318.000000 ... 1316.280029 2351220000
23 15-09-06 1316.280029 ... 1319.660034 3198030000
24 18-09-06 1319.849976 ... 1321.180054 2325080000
25 19-09-06 1321.170044 ... 1317.640015 2390850000
26 20-09-06 1318.280029 ... 1325.180054 2543070000
27 21-09-06 1324.890015 ... 1318.030029 2627440000
28 22-09-06 1318.030029 ... 1314.780029 2162880000
29 25-09-06 1314.780029 ... 1326.369995 2710240000
First 30 entries after changing to datetime
[30 rows x 7 columns]
Date Open ... Adj Close Volume
0 2006-08-14 1266.670044 ... 1268.209961 2118020000
1 2006-08-15 1268.189941 ... 1285.579956 2334100000
2 2006-08-16 1285.270020 ... 1295.430054 2554570000
3 2006-08-17 1295.369995 ... 1297.479980 2458340000
4 2006-08-18 1297.479980 ... 1302.300049 2033910000
5 2006-08-21 1302.300049 ... 1297.520020 1759240000
6 2006-08-22 1297.520020 ... 1298.819946 1908740000
7 2006-08-23 1298.729980 ... 1292.989990 1893670000
8 2006-08-24 1292.969971 ... 1296.060059 1930320000
9 2006-08-25 1295.920044 ... 1295.089966 1667580000
10 2006-08-28 1295.089966 ... 1301.780029 1834920000
11 2006-08-29 1301.569946 ... 1304.280029 2093720000
12 2006-08-30 1303.699951 ... 1305.369995 2060690000
13 2006-08-31 1304.250000 ... 1303.819946 1974540000 #Fine until here
14 2006-01-09 1303.800049 ... 1311.010010 1800520000 #Problem here
15 2006-05-09 1310.939941 ... 1313.250000 2114480000
16 2006-06-09 1313.040039 ... 1300.260010 2329870000
17 2006-07-09 1300.209961 ... 1294.020020 2325850000
18 2006-08-09 1294.020020 ... 1298.920044 2132890000
19 2006-11-09 1298.859985 ... 1299.540039 2506430000
20 2006-12-09 1299.530029 ... 1313.000000 2791580000
21 2006-09-13 1312.739990 ... 1318.069946 2597220000
22 2006-09-14 1318.000000 ... 1316.280029 2351220000
23 2006-09-15 1316.280029 ... 1319.660034 3198030000
24 2006-09-18 1319.849976 ... 1321.180054 2325080000
25 2006-09-19 1321.170044 ... 1317.640015 2390850000
26 2006-09-20 1318.280029 ... 1325.180054 2543070000
27 2006-09-21 1324.890015 ... 1318.030029 2627440000
28 2006-09-22 1318.030029 ... 1314.780029 2162880000
29 2006-09-25 1314.780029 ... 1326.369995 2710240000
Here is the fixed image:

Extracting table data using BeautifulSoup

Having a little trouble using BeautifulSoup to extract data (zip code and population). Any help appreciated.
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Download the Austin, TX zip-code page and locate the statistics table.
pop_source = requests.get("https://www.zip-codes.com/city/tx-austin.asp").text
soup = BeautifulSoup(pop_source, 'html5lib')
zip_pop_table = soup.find('table', class_='statTable')

austin_pop = pd.DataFrame(columns=['Zip Code', 'Population'])
for row in zip_pop_table.find_all('tr'):
    cols = row.find_all('td')
Now I'm stuck. Don't really know how to pull the data in the columns I want and append it to the columns I made in the empty dataframe.
Any help appreciated.
You just need to loop over your cols, and dump that into your austin_pop dataframe.
So I did that by making a list of the data from the cols using list comprehension:
# Grab the text of every cell in this table row.
row_list = [ data.text for data in cols ]
A list comprehension is equivalent to a for loop; you can use either:
row_list = []
for data in cols:
    # NOTE: the original appended to "rows_list" — a NameError typo for row_list.
    row_list.append(data.text)
Created a single row, kept the 2 columns you wanted, and then dumped that in to austin_pop:
# Keep the two wanted columns of the single-row frame and accumulate it.
temp_df = pd.DataFrame([row_list], columns=['Zip Code', 'type', 'county', 'Population', 'area_codes'])
temp_df = temp_df[['Zip Code', 'Population']]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way.
austin_pop = pd.concat([austin_pop, temp_df]).reset_index(drop=True)
Full Code:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Scrape the Austin, TX zip-code statistics table into a two-column frame.
url = "https://www.zip-codes.com/city/tx-austin.asp"
pop_source = requests.get(url).text
soup = BeautifulSoup(pop_source, 'html5lib')
zip_pop_table = soup.find('table', class_='statTable')

austin_pop = pd.DataFrame(columns=['Zip Code', 'Population'])
for row in zip_pop_table.find_all('tr'):
    cols = row.find_all('td')
    row_list = [data.text for data in cols]
    temp_df = pd.DataFrame([row_list], columns=['Zip Code', 'type', 'county', 'Population', 'area_codes'])
    temp_df = temp_df[['Zip Code', 'Population']]
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    austin_pop = pd.concat([austin_pop, temp_df]).reset_index(drop=True)

# Drop the repeated header row, then keep only the digits of cells like
# "ZIP Code 78681".
austin_pop = austin_pop.iloc[1:, :]
austin_pop['Zip Code'] = austin_pop['Zip Code'].apply(lambda x: x.split()[-1])
Output:
print (austin_pop)
Zip Code Population
1 73301 0
2 73344 0
3 78681 50,606
4 78701 6,841
5 78702 21,334
6 78703 19,690
7 78704 42,117
8 78705 31,340
9 78708 0
10 78709 0
11 78710 0
12 78711 0
13 78712 860
14 78713 0
15 78714 0
16 78715 0
17 78716 0
18 78717 22,538
19 78718 0
20 78719 1,764
21 78720 0
22 78721 11,425
23 78722 5,901
24 78723 28,330
25 78724 21,696
26 78725 6,083
27 78726 13,122
28 78727 26,689
29 78728 20,299
30 78729 27,108
.. ... ...
45 78746 26,928
46 78747 14,808
47 78748 40,651
48 78749 34,449
49 78750 26,814
50 78751 14,385
51 78752 18,064
52 78753 49,301
53 78754 15,036
54 78755 0
55 78756 7,194
56 78757 21,310
57 78758 44,072
58 78759 38,891
59 78760 0
60 78761 0
61 78762 0
62 78763 0
63 78764 0
64 78765 0
65 78766 0
66 78767 0
67 78768 0
68 78772 0
69 78773 0
70 78774 0
71 78778 0
72 78779 0
73 78783 0
74 78799 0
[74 rows x 2 columns]