Looping through pandas data frame and creating new column value - pandas

I'm trying to loop through a csv file which I converted into a pandas data frame.
I need to loop through each line, check the latitude and longitude data I have (two separate columns), and append a code (0, 1, or 2) to the same line depending on whether the lat/long data falls within a certain range.
I'm somewhat new to Python and would love any help y'all might have.
It's throwing quite a few errors at me.
import pandas as pd

book = 'yellow_tripdata_2014-04.csv'
write_book = 'yellow_04.csv'
yank_max_long = -73.921630300
yank_min_long = -73.931169700
yank_max_lat = 40.832823000
yank_min_lat = 40.825582000
mets_max_long = 40.760523000
mets_min_long = 40.753277000
mets_max_lat = -73.841035400
mets_min_lat = -73.850564600
df = pd.read_csv(book)
## To check for Yankee Stadium lats/longs: if within those GPS bounds then Stadium_Code = 1; if within the Mets bounds then Stadium_Code = 2
df['Stadium_Code'] = 0
for i, row in df.iterrows():
    if yank_min_lat <= float(row['dropoff_latitude']) <= yank_max_lat and yank_min_long <= float(row('dropoff_longitude')) <= yank_max_long:
        row['Stadium_Code'] == 1
    elif mets_min_lat <= float(row['dropoff_latitude']) <= mets_max_lat and mets_min_long <= float(row('dropoff_longitude')) <= mets_max_long:
        row['Stadium_Code'] == 2
I tried using the .loc command but ran into this error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-33-9a9166772646> in <module>()
----> 1 yank_mask = (df['dropoff_latitude'] > yank_min_lat) & (df['dropoff_latitude'] <= yank_max_lat) & (df['dropoff_longitude'] > yank_min_long) & (df['dropoff_longitude'] <= yank_max_long)
2
3 mets_mask = (df['dropoff_latitude'] > mets_min_lat) & (df['dropoff_latitude'] <= mets_max_lat) & (df['dropoff_longitude'] > mets_min_long) & (df['dropoff_longitude'] <= mets_max_long)
4
5 df.loc[yank_mask, 'Stadium_Code'] = 1
/Users/benjaminprice/anaconda/lib/python3.4/site-packages/pandas/core/frame.py in __getitem__(self, key)
1795 return self._getitem_multilevel(key)
1796 else:
-> 1797 return self._getitem_column(key)
1798
1799 def _getitem_column(self, key):
/Users/benjaminprice/anaconda/lib/python3.4/site-packages/pandas/core/frame.py in _getitem_column(self, key)
1802 # get column
1803 if self.columns.is_unique:
-> 1804 return self._get_item_cache(key)
1805
1806 # duplicate columns & possible reduce dimensionaility
/Users/benjaminprice/anaconda/lib/python3.4/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1082 res = cache.get(item)
1083 if res is None:
-> 1084 values = self._data.get(item)
1085 res = self._box_item_values(item, values)
1086 cache[item] = res
/Users/benjaminprice/anaconda/lib/python3.4/site-packages/pandas/core/internals.py in get(self, item, fastpath)
2849
2850 if not isnull(item):
-> 2851 loc = self.items.get_loc(item)
2852 else:
2853 indexer = np.arange(len(self.items))[isnull(self.items)]
/Users/benjaminprice/anaconda/lib/python3.4/site-packages/pandas/core/index.py in get_loc(self, key, method)
1570 """
1571 if method is None:
-> 1572 return self._engine.get_loc(_values_from_object(key))
1573
1574 indexer = self.get_indexer([key], method=method)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()
pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()
KeyError: 'dropoff_latitude'
I'm usually not too bad at figuring out what these error codes mean, but this one threw me off.

Firstly, it's pretty wasteful to iterate row-wise when there are vectorised solutions available that operate on the whole df at once.
I'd create a boolean mask for each of your two conditions and pass these to .loc to select the rows that meet the criteria, then set those rows to the desired values.
Here the masks use the bitwise operator & to AND the conditions together, and parentheses are placed around each condition because of operator precedence.
So the following should work:
yank_mask = (df['dropoff_latitude'] > yank_min_lat) & (df['dropoff_latitude'] <= yank_max_lat) & (df['dropoff_longitude'] > yank_min_long) & (df['dropoff_longitude'] <= yank_max_long)
mets_mask = (df['dropoff_latitude'] > mets_min_lat) & (df['dropoff_latitude'] <= mets_max_lat) & (df['dropoff_longitude'] > mets_min_long) & (df['dropoff_longitude'] <= mets_max_long)
df.loc[yank_mask, 'Stadium_Code'] = 1
df.loc[mets_mask, 'Stadium_Code'] = 2
If you haven't already done so, I'd read the docs on boolean indexing, as they will help you understand how the above works.
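As a footnote (my own addition, not part of the original answer), numpy.select can express the same two-way assignment in a single call, assuming the yank_mask and mets_mask defined above:
import numpy as np

# Equivalent to the two .loc assignments above: 1 where yank_mask holds,
# 2 where mets_mask holds, 0 everywhere else.
df['Stadium_Code'] = np.select([yank_mask, mets_mask], [1, 2], default=0)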

Related

Total area is zero in defuzzification

My code below is raising the error "AssertionError: Total area is zero in defuzzification!", followed by "ValueError: Crisp output cannot be calculated, likely because the system is too sparse. Check to make sure this set of input values will activate at least one connected Term in each Antecedent via the current set of Rules."
I'm trying to understand what is wrong, but I can't figure it out.
It works when I give it a single input, but when I use a loop it ends up with this error.
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl
# range input and output
text = ctrl.Antecedent(np.arange(-1, 1.1, 0.1), 'text')
hashtag = ctrl.Antecedent(np.arange(-1, 1.1, 0.1), 'hashtag')
emoji = ctrl.Antecedent(np.arange(-1, 1.1, 0.1), 'emoji')
sentiment = ctrl.Consequent(np.arange(-1, 1.1, 0.1), 'sentiment')
# status input and output
text['negative'] = fuzz.trapmf(text.universe, [-1,-1, -0.5, -0.1])
text['neutral'] = fuzz.trimf(text.universe, [ -0.2,0, 0.2])
text['positive'] = fuzz.trapmf(text.universe, [0.1, 0.5,1, 1])
hashtag['negative'] = fuzz.trapmf(hashtag.universe, [-1,-1, -0.5, -0.1])
hashtag['neutral'] = fuzz.trimf(hashtag.universe, [ -0.2,0, 0.2])
hashtag['positive'] = fuzz.trapmf(hashtag.universe, [0.1, 0.5,1, 1])
emoji['negative'] = fuzz.trapmf(emoji.universe, [-1,-1, -0.5, -0.1])
emoji['neutral'] = fuzz.trimf(emoji.universe, [ -0.2,0, 0.2])
emoji['positive'] = fuzz.trapmf(emoji.universe, [0.1, 0.5,1, 1])
sentiment['strongly negative'] = fuzz.trimf(sentiment.universe, [-1,-1, -0.6])
sentiment['negative'] = fuzz.trimf(sentiment.universe, [-0.7,-0.5,-0.2])
sentiment['neutral'] = fuzz.trimf(sentiment.universe, [-0.3,0,0.3])
sentiment['positive'] = fuzz.trimf(sentiment.universe, [0.2,0.5,0.7])
sentiment['strongly positive'] = fuzz.trimf(sentiment.universe, [0.6,1,1])
rule1 = ctrl.Rule( text['positive'] & emoji['positive'] & hashtag['positive']| text['positive'] & emoji['positive'] & hashtag['neutral']
|text['positive'] & emoji['neutral'] & hashtag['positive'],
sentiment['strongly positive'])
rule2 = ctrl.Rule(text['positive'] & emoji['negative']& hashtag['positive'] |text['positive']& emoji['neutral']& hashtag['neutral'] |text['positive']& emoji['positive'] & hashtag['negative']
|text['neutral'] & emoji['positive']& hashtag['positive']|text['negative'] & emoji['positive']& hashtag['positive']|
text['neutral'] & emoji['positive']& hashtag['neutral']| text['neutral'] & emoji['neutral']&hashtag['positive'],
sentiment['positive'])
rule3 = ctrl.Rule( text['neutral'] & emoji['neutral']& hashtag['neutral']|text['positive']& emoji['negative'] & hashtag['negative'],
sentiment['neutral'])
rule4 = ctrl.Rule(text['negative'] & emoji['positive']& hashtag['negative']| text['negative'] & emoji['neutral']& hashtag['neutral'] | text['neutral'] & emoji['neutral']& hashtag['negative']
|text['negative'] & emoji['negative']& hashtag['positive']|text['neutral'] & emoji['negative']& hashtag['negative']
|text['neutral'] & emoji['negative']& hashtag['neutral'],
sentiment['negative'])
rule5 = ctrl.Rule( text['negative'] & emoji['negative']& hashtag['negative'] |
text['negative'] & emoji['neutral'] & hashtag['negative']|
text['negative'] & emoji['negative']& hashtag['neutral'],
sentiment['strongly negative'])
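The loop below uses `bot` without showing how it was constructed; presumably it is a ControlSystemSimulation built from these rules, roughly like the following sketch (the variable name sentiment_ctrl and the initialization of l are assumptions on my part):
# Sketch (assumed, not shown in the question): wire the rules into a
# control system and create the simulation object the loop calls `bot`.
sentiment_ctrl = ctrl.ControlSystem([rule1, rule2, rule3, rule4, rule5])
bot = ctrl.ControlSystemSimulation(sentiment_ctrl)
l = []  # collects the crisp sentiment outputs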
liste is a list of dictionaries
for dic_tweets in liste:
    for key in dic_tweets:
        bot.input['text'] = dic_tweets['text']
        bot.input['hashtag'] = dic_tweets['hashtag']
        bot.input['emoji'] = dic_tweets['emoji']
        bot.compute()
        l.append(bot.output['sentiment'])
l
sentiment.view(sim=bot)
The errors are:
AssertionError Traceback (most recent call last)
~\anaconda3\lib\site-packages\skfuzzy\control\controlsystem.py in defuzz(self)
585 try:
--> 586 return defuzz(ups_universe, output_mf,
587 self.var.defuzzify_method)
~\anaconda3\lib\site-packages\skfuzzy\defuzzify\defuzz.py in defuzz(x, mfx, mode)
247 zero_truth_degree = mfx.sum() == 0 # Approximation of total area
--> 248 assert not zero_truth_degree, 'Total area is zero in defuzzification!'
249
AssertionError: Total area is zero in defuzzification!
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-72-62f88feea746> in <module>
6 bot.input['hashtag'] = dic_tweets['hashtag']
7 bot.input['emoji'] = dic_tweets['emoji']
----> 8 bot.compute()
9 l.append(bot.output['sentiment'])
10
~\anaconda3\lib\site-packages\skfuzzy\control\controlsystem.py in compute(self)
371 for consequent in self.ctrl.consequents:
372 consequent.output[self] = \
--> 373 CrispValueCalculator(consequent, self).defuzz()
374 self.output[consequent.label] = consequent.output[self]
375
~\anaconda3\lib\site-packages\skfuzzy\control\controlsystem.py in defuzz(self)
587 self.var.defuzzify_method)
588 except AssertionError:
--> 589 raise ValueError("Crisp output cannot be calculated, likely "
590 "because the system is too sparse. Check to "
591 "make sure this set of input values will "
ValueError: Crisp output cannot be calculated, likely because the system is too sparse. Check to make sure this set of input values will activate at least one connected Term in each Antecedent via the current set of Rules.

count rows with multiple criterion in pandas [duplicate]

I am trying to modify a DataFrame df to only contain rows for which the values in the column closing_price are between 99 and 101, and am trying to do this with the code below.
However, I get the error
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()
and I am wondering if there is a way to do this without using loops.
df = df[(99 <= df['closing_price'] <= 101)]
Consider also Series.between:
df = df[df['closing_price'].between(99, 101)]
You should use () to group your boolean vector to remove ambiguity.
df = df[(df['closing_price'] >= 99) & (df['closing_price'] <= 101)]
There is a nicer alternative - use the query() method:
In [58]: df = pd.DataFrame({'closing_price': np.random.randint(95, 105, 10)})
In [59]: df
Out[59]:
closing_price
0 104
1 99
2 98
3 95
4 103
5 101
6 101
7 99
8 95
9 96
In [60]: df.query('99 <= closing_price <= 101')
Out[60]:
closing_price
1 99
5 101
6 101
7 99
UPDATE: answering the comment:
I like the syntax here but fell down when trying to combine with an expression; df.query('(mean + 2*sd) <= closing_price <= (mean + 2*sd)')
In [161]: qry = "(closing_price.mean() - 2*closing_price.std())" +\
...: " <= closing_price <= " + \
...: "(closing_price.mean() + 2*closing_price.std())"
...:
In [162]: df.query(qry)
Out[162]:
closing_price
0 97
1 101
2 97
3 95
4 100
5 99
6 100
7 101
8 99
9 95
newdf = df.query('closing_price.mean() <= closing_price <= closing_price.std()')
or
mean = df['closing_price'].mean()
std = df['closing_price'].std()
newdf = df.query('@mean <= closing_price <= @std')
If one has to call pd.Series.between(l,r) repeatedly (for different bounds l and r), a lot of work is repeated unnecessarily. In this case, it's beneficial to sort the frame/series once and then use pd.Series.searchsorted(). I measured a speedup of up to 25x, see below.
def between_indices(x, lower, upper, inclusive=True):
    """
    Returns smallest and largest index i for which holds
    lower <= x[i] <= upper, under the assumption that x is sorted.
    """
    i = x.searchsorted(lower, side="left" if inclusive else "right")
    j = x.searchsorted(upper, side="right" if inclusive else "left")
    return i, j
# Sort x once before repeated calls of between()
x = x.sort_values().reset_index(drop=True)
# x = x.sort_values(ignore_index=True) # for pandas>=1.0
ret1 = between_indices(x, lower=0.1, upper=0.9)
ret2 = between_indices(x, lower=0.2, upper=0.8)
ret3 = ...
Benchmark
Measure repeated evaluations (n_reps=100) of pd.Series.between() as well as the method based on pd.Series.searchsorted(), for different arguments lower and upper. On my MacBook Pro 2015 with Python v3.8.0 and Pandas v1.0.3, the code below results in the following output:
# pd.Series.searchsorted()
# 5.87 ms ± 321 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# pd.Series.between(lower, upper)
# 155 ms ± 6.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# Logical expressions: (x>=lower) & (x<=upper)
# 153 ms ± 3.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
import numpy as np
import pandas as pd
def between_indices(x, lower, upper, inclusive=True):
    # Assumption: x is sorted.
    i = x.searchsorted(lower, side="left" if inclusive else "right")
    j = x.searchsorted(upper, side="right" if inclusive else "left")
    return i, j

def between_fast(x, lower, upper, inclusive=True):
    """
    Equivalent to pd.Series.between() under the assumption that x is sorted.
    """
    i, j = between_indices(x, lower, upper, inclusive)
    if True:
        return x.iloc[i:j]
    else:
        # Mask creation is slow.
        mask = np.zeros_like(x, dtype=bool)
        mask[i:j] = True
        mask = pd.Series(mask, index=x.index)
        return x[mask]

def between(x, lower, upper, inclusive=True):
    mask = x.between(lower, upper, inclusive=inclusive)
    return x[mask]

def between_expr(x, lower, upper, inclusive=True):
    if inclusive:
        mask = (x >= lower) & (x <= upper)
    else:
        mask = (x > lower) & (x < upper)
    return x[mask]

def benchmark(func, x, lowers, uppers):
    for l, u in zip(lowers, uppers):
        func(x, lower=l, upper=u)
n_samples = 1000
n_reps = 100
x = pd.Series(np.random.randn(n_samples))
# Sort the Series.
# For pandas>=1.0:
# x = x.sort_values(ignore_index=True)
x = x.sort_values().reset_index(drop=True)
# Assert equivalence of different methods.
assert(between_fast(x, 0, 1, True ).equals(between(x, 0, 1, True)))
assert(between_expr(x, 0, 1, True ).equals(between(x, 0, 1, True)))
assert(between_fast(x, 0, 1, False).equals(between(x, 0, 1, False)))
assert(between_expr(x, 0, 1, False).equals(between(x, 0, 1, False)))
# Benchmark repeated evaluations of between().
uppers = np.linspace(0, 3, n_reps)
lowers = -uppers
%timeit benchmark(between_fast, x, lowers, uppers)
%timeit benchmark(between, x, lowers, uppers)
%timeit benchmark(between_expr, x, lowers, uppers)
Instead of this
df = df[(99 <= df['closing_price'] <= 101)]
You should use this
df = df[(df['closing_price']>=99 ) & (df['closing_price']<=101)]
We have to use NumPy's bitwise logic operators |, &, ~, ^ to compound the queries.
Also, the parentheses are important because of operator precedence.
For more info, you can visit the link: Comparisons, Masks, and Boolean Logic
If you're dealing with multiple values and multiple inputs, you could also set up an apply function like this. In this case, filtering a dataframe for GPS locations that fall within certain ranges.
def filter_values(lat, lon):
    if abs(lat - 33.77) < .01 and abs(lon - -118.16) < .01:
        return True
    elif abs(lat - 37.79) < .01 and abs(lon - -122.39) < .01:
        return True
    else:
        return False

df = df[df.apply(lambda x: filter_values(x['lat'], x['lon']), axis=1)]

How to apply regular function to df third column?

I have a df that currently has two columns, df[['sys1','dia1']].
I created the function below (the parameters are d = df['dia1'] and s = df['sys1']).
Now I am trying to create a third column by using this function. It would look like this:
df['FirstVisitStge'] = df['FirstVisitStge'].apply(classify)
I am getting an error. I even tried using predefined parameters in the function and am still getting an error. What am I doing wrong?
def classify(d,s):
    if (d>=90 & d<100 & s<160) or (s >= 140 & s < 160 & d < 100):
        return 'Stage 1'
    elif (s >= 160 & s <180 & d <110) or (d >= 100 and d < 110 and s > 180):
        return 'Stage 2'
    elif s >= 180 or d >= 110:
        return 'hypertensive crisis'
    else:
        return 'NA'
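For reference, a two-argument function like this is normally applied row-wise with DataFrame.apply rather than with Series.apply on a column that does not exist yet; a hedged sketch using the column names above (my illustration, not the poster's code):
# Sketch: pass each row's dia1/sys1 values into classify and store the
# result as the new column.
df['FirstVisitStge'] = df.apply(lambda row: classify(row['dia1'], row['sys1']), axis=1)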

Subset two consecutive event occurrence in pandas

I'm trying to get a subset of my data whenever there is a consecutive occurrence of two events in that order. The events are time-stamped. So every time there are consecutive 2's and then consecutive 3's, I want to subset that stretch into a dataframe and append it to a dictionary. The following code does that, but I have to apply this to a very large dataframe of more than 20 million observations, and it is extremely slow using iterrows. How can I make this fast?
df = pd.DataFrame({'Date': [101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122],
'Event': [1,1,2,2,2,3,3,1,3,2,2,3,1,2,3,2,3,2,2,3,3,3]})
dfb = pd.DataFrame(columns = df.columns)
C = {}
f1 = 0
for index, row in df.iterrows():
    if ((row['Event'] == 2) & (3 not in dfb['Event'].values)):
        dfb = dfb.append(row)
        f1 = 1
    elif ((row['Event'] == 3) & (f1 == 1)):
        dfb = dfb.append(row)
    elif 3 in dfb['Event'].values:
        f1 = 0
        C[str(dfb.iloc[0,0])] = dfb
        del dfb
        dfb = pd.DataFrame(columns = df.columns)
        if row['Event'] == 2:
            dfb = dfb.append(row)
            f1 = 1
    else:
        f1 = 0
        del dfb
        dfb = pd.DataFrame(columns = df.columns)
Edit: The desired output is basically a dictionary of the subsets shown in the image: https://i.stack.imgur.com/ClWZs.png
If you want to accelerate this, you should vectorize your code. You could try it like this (df is the same as in your code):
vec = df.copy()
vec['Event_y'] = vec['Event'].shift(1).fillna(0).astype(int)
vec['Same_Flag'] = float('nan')
vec.Same_Flag.loc[(vec['Event_y'] == vec['Event']) & (vec['Event'] != 1)] = 1
vec.dropna(inplace=True)
vec.loc[:, ('Date', 'Event')]
Output is:
Date Event
3 104 2
4 105 2
6 107 3
10 111 2
18 119 2
20 121 3
21 122 3
I think that's close to what you need. You could improve based on that.
I don't understand why dates 104, 105, and 107 are not counted.
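A fully vectorized sketch of the same idea (my own addition, not from the question or the answer above): label each run of equal Event values with cumsum, then pair every run of 2's with the run of 3's that immediately follows it and build the dictionary from those pairs.
s = df['Event']
block = (s != s.shift()).cumsum()              # label consecutive runs of equal values
run_val = df.groupby(block)['Event'].first()   # the Event value of each run
# runs of 2's whose immediate successor is a run of 3's
pair_starts = run_val.index[(run_val == 2) & (run_val.shift(-1) == 3)]
# one DataFrame per 2's-then-3's stretch, keyed by the first Date in the stretch
C = {str(df.loc[block == b, 'Date'].iloc[0]): df[block.isin([b, b + 1])]
     for b in pair_starts}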

MemoryError when opening CSV file with pandas

I am trying to open a CSV file with pandas, but I'm getting a MemoryError. The file is around 300 MB. Everything works fine when I use a smaller file.
I am using Windows 10 with 64 GB of RAM. I already tried changing the custom VM options in PyCharm ("Help" >> "Edit Custom VM Options") and setting higher memory limits, but it still doesn't work.
import pandas as pd
df = pd.read_csv('report_OOP_Full.csv')
# I tried to add the following line but it doesn't help
# df.info(memory_usage='deep')
MemoryError: Unable to allocate 344. MiB for an array with shape (14, 3216774) and data type float64
Process finished with exit code 1
This may not be the most efficient way but have a go.
Reduce or increase the chunk size depending on your RAM availability.
chunks = pd.read_csv('report_OOP_Full.csv', chunksize=10000)
i = 0
chunk_list = []
for chunk in chunks:
    i += 1
    chunk_list.append(chunk)
df = pd.concat(chunk_list, sort = True)
If this doesn't work, try this:
chunks = pd.read_csv('report_OOP_Full.csv', chunksize=10000)
i = 0
chunk_list = []
for chunk in chunks:
    if i >= 10:
        break
    i += 1
    chunk_list.append(chunk)
df1 = pd.concat(chunk_list, sort = True)
chunks = pd.read_csv('report_OOP_Full.csv', skiprows = 100000, chunksize=10000)
i = 0
chunk_list = []
for chunk in chunks:
    if i >= 10:
        break
    i += 1
    chunk_list.append(chunk)
df2 = pd.concat(chunk_list, sort = True)
df3 = pd.concat([df1, df2], sort = True)
skiprows was calculated from how many rows the previous dataframe had already read in.
This will break after 10 chunks are loaded; store that as df1, then read the file in again starting at chunk 11 and append that as well.
I understand that you're working with some big data. I encourage you to take a look at this function I found; the link below explains how it works.
Credit for this function is here:
credit
import numpy as np  # needed for the integer/float range checks below

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    df[col] = df[col].astype(np.uint64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
This will make sure your dataframe uses as little memory as possible while you're working with it.
I guess another way would be to load only the rows which have the same value in the first column (in this case a string, one letter). I don't know if that is possible. For example:
A 4 5 6 3
A 3 4 5 7
A 2 1 4 9
A 1 1 8 7
B 1 2 3 1
B 2 2 3 3
C 1 2 1 2
First open a dataframe with only the rows starting with "A", then do the same with "B", "C", and so on. I don't know if that's possible, but it could help.
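A minimal sketch of that idea (my own addition, assuming the letter code sits in the first column): stream the file in chunks and keep only the rows whose first column equals "A", so the full table never has to sit in memory at once.
import pandas as pd

parts = []
for chunk in pd.read_csv('report_OOP_Full.csv', chunksize=100000):
    # keep only the rows whose first column is 'A'
    parts.append(chunk[chunk.iloc[:, 0] == 'A'])
df_a = pd.concat(parts, ignore_index=True)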