Pandas function with 2 arguments to find threshold - pandas

I need to count the people whose capital gain is greater than or equal to a threshold.
The dataframe contains a column 'capitalGain' with values such as 10, 20, 30, 50, 1000, 5000, 10000, etc.
I try :
Function:
def get_num_people_with_higher_gain(dataframe, threshold_gain):
    """Count the rows whose ``capitalGain`` is at least ``threshold_gain``.

    Args:
        dataframe: pandas DataFrame that has a ``capitalGain`` column.
        threshold_gain: Inclusive lower bound for the gain.

    Returns:
        The number of non-null ``capitalGain`` values that are greater than
        or equal to ``threshold_gain``.
    """
    gains = dataframe["capitalGain"]
    # Return the count directly instead of overwriting the threshold_gain
    # parameter with the result.
    return gains[gains >= threshold_gain].count()
Call function
df = get_num_people_with_higher_gain(dataframe, threshold_gain)
But I get the following error message:
NameError Traceback (most recent call last)
<ipython-input-50-5485c90412c8> in <module>
----> 1 df = get_num_people_with_higher_gain(dataframe, threshold_gain)
2 threshold = get_num_people_with_higher_gain(dataframe, threshold_gain)
NameError: name 'dataframe' is not defined
Since there are 2 arguments in the function (dataframe, threshold_gain), does it mean that both should be somehow defined within the function ?
Thanks

Finally,
Here is the solution
def get_num_people_with_higher_gain(dataframe, threshold_gain):
    """Return how many rows have ``capitalGain`` >= ``threshold_gain``.

    Args:
        dataframe: pandas DataFrame that has a ``capitalGain`` column.
        threshold_gain: Inclusive lower bound for the gain.

    Returns:
        The number of matching rows as a plain ``int``.
    """
    at_or_above = dataframe["capitalGain"] >= threshold_gain
    # Summing the boolean mask counts the matches without building a
    # filtered copy of the whole DataFrame.
    return int(at_or_above.sum())
result = get_num_people_with_higher_gain(dataframe,60000)
result

Related

How to filter on a column in PySpark when the column name comes from user input (df.ColumnName does not work)

My code is below in Pyspark:
Product = results.where(results.ColumnName == ProductName )
Product.show()
I want ColumnName to come from the user, like this:
ColumnName = input("enter column name")
but I get the error as below:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-64-3e0600c86491> in <module>()
----> 1 Prediction_Product_Wise = results.where(results.ColumnName == ProductName ).select(
ColumnName,'probability','prediction').orderBy("probability",ascending=False)
2 Prediction_Product_Wise.show()
/usr/local/lib/python3.6/dist-packages/pyspark/sql/dataframe.py in __getattr__(self, name)
1399 if name not in self.columns:
1400 raise AttributeError(
-> 1401 "'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
1402 jc = self._jdf.apply(name)
1403 return Column(jc)
AttributeError: 'DataFrame' object has no attribute 'ColumnName'
Basically I wanted to check whether Product column has that value in it or not.
For example
Product
apple
ball
cat
and the data frame is df, then df.Product == 'apple' should return True.
I also want to let the user choose any column name in place of Product, according to their preference.
Assuming ColumnName is a string, you can do this.
from pyspark.sql.functions import col

# ColumnName holds the column's name as a string, so resolve it with col();
# results.ColumnName would look for a column literally named "ColumnName".
# Python compares with ==; === is Scala syntax and a SyntaxError here.
Product = results.where(col(ColumnName) == ProductName)
Product.show()
You can try creating a function like this to do your task. This will return False for columns which are not there in the dataframe.
def is_value(df, column_name: str, value):
    """Report whether ``value`` occurs in column ``column_name`` of ``df``.

    Args:
        df: DataFrame to search; must expose ``columns``, ``where`` and
            column lookup by name.
        column_name: Name of the column to inspect.
        value: Value to look for in that column.

    Returns:
        ``True`` if at least one row has ``value`` in ``column_name``;
        ``False`` if no row matches or the column does not exist.
    """
    if column_name not in df.columns:
        return False
    # first() yields the first matching row, or None when nothing matches.
    return df.where(df[column_name] == value).first() is not None

AttributeError: 'Styler' object has no attribute 'merge'

I have a problem: after styling my data (conditional formatting) with pandas, I can't merge the styled objects. You can find my code and the error below.
Can anyone give me an advice?
CODE:
cm = sns.diverging_palette(10, 140, s=99, l=50,
n=9, center="light", as_cmap=True)
df_style1 = df_b.style.background_gradient(cmap=cm)
df_style2 = df_c.style.background_gradient(cmap=cm)
df_last = df_style1.merge(df_style2, on= 'EKSPER_ADI', how='left')
ERROR:
AttributeError Traceback (most recent call last)
<ipython-input-148-d1b2ae3dc7a6> in <module>
4 df_style1 = df_b.style.background_gradient(cmap=cm)
5 df_style2 = df_c.style.background_gradient(cmap=cm)
----> 6 df_last = df_style1.merge(df_style1, on= 'EKSPER_ADI', how='left')
AttributeError: 'Styler' object has no attribute 'merge'
I don't think that is possible, because a Styler is not a DataFrame and has no merge method. Merge first, then apply the style:
df = df_b.merge(df_c, on= 'EKSPER_ADI', how='left')
df_style2 = df.style.background_gradient(cmap=cm)

how to fix the calculation error which says 'DataFrame' object is not callable

I'm working on a football data set and I'm getting the following error. Please help.
#what is the win rate of HomeTeam?
n_matches = df.shape[0]
n_features = df.shape[1] -1
n_homewin = len(df(df.FTR == 'H'))
win_rate = (float(n_homewin) / (n_matches)) * 100
print ("Total number of matches,{}".format(n_matches))
print ("Number of features,{}".format(n_features))
print ("Number of maches won by hom team,{}".format (n_homewin))
print ("win rate of home team,{:.2f}%" .format(win_rate))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-122-7e4d81fc684e> in <module>
5 n_features = df.shape[1] -1
6
----> 7 n_homewin = len(df(df.FTR == 'H'))
8
9 win_rate = (float(n_homewin) / (n_matches)) * 100
TypeError: 'DataFrame' object is not callable
The expected result is to print the home team's win rate.
I think problem is with (), need [] for filter by boolean indexing:
n_homewin = len(df[df.FTR == 'H'])
Or, more simply, count the True values with sum:
n_homewin = (df.FTR == 'H').sum()
you should modify it to df[df.FTR == 'H']. The parentheses imply a function call

How to set value on MagicMock obj. so that function under test doesn't return MagicMock object

Hey I'm just getting started with unittests and mocks in Python. Trying to test a function that takes a single column dataframe and returns a float after some calculation based on the dataframe values.
import unittest
from unittest.mock import MagicMock
def avg_annual_fcf_growth_rate(fcf_data_frame):
    """Return ``(row 9 - row 0) ** (1/9) - 1`` for a single-column frame.

    Args:
        fcf_data_frame: Object whose ``iloc[row][0]`` gives the free cash
            flow of that row; rows 0 and 9 are read.

    Returns:
        The ninth root of the difference between row 9 and row 0, minus one.

    NOTE(review): a compound growth rate is usually derived from the ratio
    end/start rather than the difference -- confirm the intended formula.
    """
    # Row 9 is read before row 0; tests that feed values through an ordered
    # side_effect rely on this lookup order.
    latest = fcf_data_frame.iloc[9][0]
    earliest = fcf_data_frame.iloc[0][0]
    delta = latest - earliest
    return (delta ** (1 / 9)) - 1
class Test_DCF(unittest.TestCase):
def test_fcf_calculation(self):
mock_fcf = MagicMock()
mock_fcf.iloc[9][0].return_value = 100
mock_fcf.iloc[0][0].return_value = 10
result = avg_annual_fcf_growth_rate(mock_fcf)
expected = ((100-10)**(1/9)) - 1
self.assertEqual(result, expected)
if __name__ == '__main__':
unittest.main()
#Sample dataframe
Free Cash Flow USD Mil
2008-12 5114.0
2009-12 10909.0
2010-12 11915.0
2011-12 12285.0
2012-12 11175.0
2013-12 16617.0
2014-12 16825.0
2015-12 15409.0
2016-12 19581.0
2017-12 34068.0
I'm running into an issue where the result is a MagicMock object instead of a float. I've tried looking for answers, but not able to wrap my head around how to properly set the return value for the mock object.
Output from running test.
F
======================================================================
FAIL: test_fcf_calculation (__main__.Test_DCF)
----------------------------------------------------------------------
Traceback (most recent call last):
File "path/to/intrinsic_value_dcf_test.py", line 18, in test_fcf_calculation
self.assertEqual(result, expected)
AssertionError: <MagicMock name='mock.iloc.__getitem__().[56 chars]104'> != 0.6486864043382532
----------------------------------------------------------------------
Ran 1 test in 0.004s
FAILED (failures=1)
From what I could tell, the problem arises in repeated __getitem__ calls. The following works though:
def test_fcf_calculation(self):
    """Check the growth-rate result using a mock with chained indexing."""
    mock_fcf = MagicMock()
    # Every mock_fcf.iloc[...] lookup returns the same child mock. That
    # child's own __getitem__ hands out 100 and then 10, matching the order
    # in which the function indexes (row 9 first, then row 0).
    mock_fcf.iloc.__getitem__.return_value = MagicMock(**{'__getitem__.side_effect': [100, 10]})
    result = avg_annual_fcf_growth_rate(mock_fcf)
    expected = ((100-10)**(1/9)) - 1
    # Show the recorded calls to make the chained lookups visible.
    print(mock_fcf.mock_calls)
    self.assertEqual(result, expected)

utf_16_le_decode SystemError when inserting pandas DataFrame with fast_executemany

This is my code:
def insertDataFrameInDB(cursor, dataFrame, toTable, fieldNames=None):
    """Bulk-insert the rows of ``dataFrame`` into ``toTable``.

    Every value is converted to ``str`` and stripped of surrounding
    whitespace before it is sent to the database.

    Args:
        cursor: DB-API cursor providing ``executemany`` and ``commit``
            (the ``fast_executemany`` attribute is set on it).
        dataFrame: pandas DataFrame holding the rows to insert. It is not
            modified.
        toTable: Name of the target table.
        fieldNames: Optional list of columns to insert; when omitted or
            empty, all columns of ``dataFrame`` are used.

    NOTE: ``toTable`` and the column names are interpolated into the SQL
    text (identifiers cannot be bound as parameters), so they must come
    from a trusted source.
    """
    if fieldNames:
        frame = dataFrame[fieldNames]
    else:
        fieldNames = dataFrame.columns
        frame = dataFrame
    # Work on a copy so the caller's DataFrame keeps its values and dtypes.
    frame = frame.copy()
    for column in frame.columns:
        frame[column] = frame[column].map(str).map(str.strip)
    params = list(frame.itertuples(index=False, name=None))
    if not params:
        # Nothing to insert; pyodbc's executemany rejects an empty sequence.
        return
    fieldNameStr = ",".join(fieldNames)
    valueStr = ",".join(["?"] * len(fieldNames))
    sql = "INSERT INTO {} ({}) VALUES({})".format(toTable, fieldNameStr, valueStr)
    cursor.fast_executemany = True
    cursor.executemany(sql, params)
    cursor.commit()
insertDataFrameInDB(cursor, df, "table")
It gives the following error which I really can't address:
DataError Traceback (most recent call last)
DataError: ('String data, right truncation: length 24 buffer 20', '22001')
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\encodings\utf_16_le.py in decode(input, errors)
15 def decode(input, errors='strict'):
---> 16 return codecs.utf_16_le_decode(input, errors, True)
17
SystemError: <built-in function utf_16_le_decode> returned a result with an error set
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
SystemError: decoding with 'utf-16le' codec failed (SystemError: <built-in function utf_16_le_decode> returned a result with an error set)
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\encodings\utf_16_le.py in decode(input, errors)
15 def decode(input, errors='strict'):
---> 16 return codecs.utf_16_le_decode(input, errors, True)
17
SystemError: <built-in function utf_16_le_decode> returned a result with an error set
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
SystemError: decoding with 'utf-16le' codec failed (SystemError: <built-in function utf_16_le_decode> returned a result with an error set)
The above exception was the direct cause of the following exception:
SystemError Traceback (most recent call last)
<ipython-input-6-f73d9346f943> in <module>()
12
13 cursor = getCursor(conData)
---> 14 insertDataFrameInDB(cursor, df, "snowplow.sankey")
<ipython-input-1-69ecbca20fc8> in insertDataFrameInDB(cursor, dataFrame, toTable, fieldNames)
29 sql = "INSERT INTO {} ({}) VALUES({})".format(toTable, fieldNameStr, valueStr)
30 cursor.fast_executemany = True
---> 31 cursor.executemany(sql, params)
32 cursor.commit()
SystemError: <class 'pyodbc.Error'> returned a result with an error set
A lot of error searching makes me think it has something to do with the lack of a BOM, I tried to decode the strings in the "params" tuples, also tried str.astype('U'). Does anybody know what causes the problem and possibly how to address that?
You are using Microsoft's "ODBC Driver ... for SQL Server" so fast_executemany should work with pyodbc 4.0.21. However, you can invoke that feature while still using DataFrame#to_sql by using SQLAlchemy execution events as illustrated by this question.
Example: The following code does not take advantage of fast_executemany
import pandas as pd
from sqlalchemy import create_engine
import time
# DSN-based connection URL: the DSN name follows the "@" separator.
engine = create_engine('mssql+pyodbc://@SQL_panorama')

# test environment
num_rows = 1000
df = pd.DataFrame(
    [[x, f'row{x:03}'] for x in range(num_rows)],
    columns=['id', 'txt']
)
#
cnxn = engine.connect()
try:
    cnxn.execute("DROP TABLE df_to_sql_test")
except Exception:
    # Best-effort cleanup: the table may not exist yet on a first run.
    pass
cnxn.execute("CREATE TABLE df_to_sql_test (id INT PRIMARY KEY, txt NVARCHAR(50))")

# timing test
t0 = time.time()
df.to_sql("df_to_sql_test", engine, if_exists='append', index=False)
print(f"{num_rows} rows written in {(time.time() - t0):.1f} seconds")
Result:
1000 rows written in 25.2 seconds
Adding a SQLAlchemy execution event handler reduces the execution time significantly
import pandas as pd
from sqlalchemy import create_engine, event
import time
# DSN-based connection URL: the DSN name follows the "@" separator.
engine = create_engine('mssql+pyodbc://@SQL_panorama')


# The decorator registers the handler on the engine; without it the function
# is never called and fast_executemany is never enabled.
@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    """Enable fast_executemany on the cursor for executemany-style calls.

    Only the cursor is modified; the statement and parameters are left
    untouched. Single-statement executions are not affected.
    """
    if executemany:
        cursor.fast_executemany = True
# test environment
num_rows = 1000
df = pd.DataFrame(
    [[x, f'row{x:03}'] for x in range(num_rows)],
    columns=['id', 'txt']
)
#
cnxn = engine.connect()
try:
    cnxn.execute("DROP TABLE df_to_sql_test")
except Exception:
    # Best-effort cleanup: the table may not exist yet on a first run.
    pass
cnxn.execute("CREATE TABLE df_to_sql_test (id INT PRIMARY KEY, txt NVARCHAR(50))")

# timing test
t0 = time.time()
df.to_sql("df_to_sql_test", engine, if_exists='append', index=False)
print(f"{num_rows} rows written in {(time.time() - t0):.1f} seconds")
Result:
1000 rows written in 1.6 seconds
For a more complete discussion of this approach, see this answer.