pandas dropna is not removing nan when using np.where - pandas

I have this function
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
def test(e, n):
polygon = Polygon([(340,6638),(340,6614),(375,6620),(374,6649)])
point_instance = Point((e, n))
a = polygon.contains(point_instance)
val = np.where(a, 0, np.nan)
return pd.Series([val])
I want to apply above function in my dataframe and then remove the nan
def testData(filename):
df = pd.read_csv(filename)
df['check'] = df\
.apply(lambda x: test(x['E'], x['N']), axis=1)
# I tried both of these and neither deletes the nan values
df.dropna(axis=0, how = 'any', inplace = True)
df1 = df.dropna(axis=0, how='any', subset=['check'])
However, if I save the data to a file first and then use dropna, it works.
Sample dataframe
Id,E,N
1,5,8
2,6,9
3,7,10
This is the output I am getting
Id E N check
1 5 8 nan
2 6 9 nan
3 7 10 nan

It seems using np.nan inside np.where creates conflicting datatypes.
For that reason, pandas dropna didn't work.
I fixed using pandas map inside my function
a = pd.Series(polygon.contains(point_instance))
val = a.map({True: 0, False: np.nan})
return val

Related

I have problems replacing inf and -inf with nan in pandas

I found a code on the web to replace inf and -inf with np.nan, however it did not work on my computer.
df = pd.DataFrame({"A" : [4.6, 5., np.inf]})
new_dataframe = a_dataframe.replace([np.inf, -np.inf], np.nan)
df
My output
A
0 4.6
1 5.0
2 inf
Does somebody know a solution?
Import pandas and numpy, then assign the result of df.replace([np.inf, -np.inf], np.nan) back to the dataframe. When you call df.replace([np.inf, -np.inf], np.nan), it does not update the dataframe in place; the result needs to be assigned with = for the change to take effect.
Also, for some reason in the code you provided there is new_dataframe and a_dataframe, which have nothing to do with df. Try the code below.
import pandas as pd
import numpy as np
df = pd.DataFrame({"A": [4.6, 5., np.inf]})
df = df.replace([np.inf, -np.inf], np.nan)
print(df)

How to fill NaN's in a pandas dataframe with random 1's and 0's

How to replace the NaN values in a pandas dataframe with random 0's and 1's?
df.fillna(random.randint(0,1))
seems to fill the NaN's in certain columns with all 1's or all 0's
#Creating a dummy Dataframe
import pandas as pd
import numpy as np
cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
'Price': [100,np.nan,27000,np.nan]
}
df = pd.DataFrame(cars, columns = ['Brand', 'Price'])
# Replacing NAN in a particular column
a = df.Price.isnull()
rand_int = np.random.randint(2, size=a.sum())
df.loc[a,'Price' ] = rand_int
print(df)
# For entire dataframe
for i in df:
a = df[i].isnull()
rand_int = np.random.randint(2, size=a.sum())
df.loc[a, i ] = rand_int
print(df)

Get max value of 3 columns from pandas DataFrame?

I've a Pandas DataFrame with 3 columns:
c={'a': [['US']],'b': [['US']], 'c': [['US','BE']]}
df = pd.DataFrame(c, columns = ['a','b','c'])
Now I need the max value of these 3 columns.
I've tried:
df['max_val'] = df[['a','b','c']].max(axis=1)
The result is Nan instead of the expected output: US.
How can I get the max value for these 3 columns? (and what if one of them contains Nan)
Use:
c={'a': [['US', 'BE'],['US']],'b': [['US'],['US']], 'c': [['US','BE'],['US','BE']]}
df = pd.DataFrame(c, columns = ['a','b','c'])
from collections import Counter
df = df[['a','b','c']].apply(lambda x: list(Counter(map(tuple, x)).most_common()[0][0]), 1)
print (df)
0 [US, BE]
1 [US]
dtype: object
If, as @Erfan stated, you want the most common value in a row, then use .agg() with mode:
df.agg('mode', axis=1)
0
0 [US, BE]
1 [US]
While your data are lists, you can't use pandas mode(), because list objects are unhashable and the mode() function won't work.
a solution is converting the elements of your dataframe's row to strings and then use pandas.mode().
check this:
>>> import pandas as pd
>>> c = {'a': [['US','BE']],'b': [['US']], 'c': [['US','BE']]}
>>> df = pd.DataFrame(c, columns = ['a','b','c'])
>>> x = df.iloc[0].apply(lambda x: str(x))
>>> x.mode()
# Answer:
0 ['US', 'BE']
dtype: object
>>> d = {'a': [['US']],'b': [['US']], 'c': [['US','BE']]}
>>> df2 = pd.DataFrame(d, columns = ['a','b','c'])
>>> z = df2.iloc[0].apply(lambda z: str(z))
>>> z.mode()
# Answer:
0 ['US']
dtype: object
As I can see, you have some elements of list type, so I think the below-mentioned code will work fine.
First, append all value into an array
Then, find the most occurring element from that array.
from scipy.stats import mode
arr = []
for i in df:
for j in range(len(df[i])):
for k in range(len(df[i][j])):
arr.append(df[i][j][k])
from collections import Counter
b = Counter(arr)
print(b.most_common())
this will give you an answer as you want.

pandas iat: VS Code vs JupyterLab

I used some code that works in VS Code:
df.iat[row_marker,column_marker] = thisText
At first df is not defined, row_marker,column_marker equal zero;
It creates new line and column correctly !
I tried the same code in JupyterLab but got an IndexError: I assume it is because the column number is not known...
Why does this work in VS Code but not in JupyterLab?
What is the method to add new values manually when we don't know the size of the dataframe ?
New comment : the dataframe exist but is empty
If you have an empty dataframe you can add rows via the append method
import pandas as pd
df = pd.DataFrame()
thisText = 'Hello World'
df = df.append({'column': thisText}, ignore_index=True)
You could use this pattern to build a dataframe that can grow in both dimensions:
import pandas as pd
site_no = 0
df = pd.DataFrame()
while site_no < 4: # your website counter
if site_no < 2: # website with col0 and col1 data
row = pd.DataFrame(
{'col0:': 'Hello', 'col1': 'World'}, index=[site_no])
else: # website with col2 and col3 data
row = pd.DataFrame(
{'col2:': 'Say', 'col3': 'Goodbye'}, index=[site_no])
df = df.append(row)
site_no += 1
df
col0: col1 col2: col3
0 Hello World NaN NaN
1 Hello World NaN NaN
2 NaN NaN Say Goodbye
3 NaN NaN Say Goodbye

Pandas Symmetric Difference a list value between 2 columns

I've a following dataframe, df:
A B
0 [ACL1, ACL2, ACL3] [ACL1, ACL4, ACL2]
I want to perform a symmetric_difference on the A and B list so that the output will be [ACL3,ACL4]
df1 = df['A'].symmetric_difference(df['B'])
print (df1)
AttributeError: 'Series' object has no attribute 'symmetric_difference'
But it gives the above error... What did I do wrong? How can I accomplish the final output?
Thanks..
The problem is that symmetric_difference is a method of sets, instead you could do:
import pandas as pd
data = [[['ACL1', 'ACL2', 'ACL3'], ['ACL1', 'ACL4', 'ACL2']]]
df = pd.DataFrame(data=data, columns=['A', 'B'])
def symmetric_difference(x):
return list(set(x.A).symmetric_difference(x.B))
result = df[['A', 'B']].apply(symmetric_difference, axis=1)
print(result)
Output
0 [ACL3, ACL4]
dtype: object
If you do care about performance:
[list(set(x).symmetric_difference(set(y))) for x , y in zip (df.A,df.B)]
[['ACL3', 'ACL4']]