geopandas spatial join with conditions / additional columns - pandas

I have two DataFrames with Lat, Long columns and other additional columns. For example,
import pandas as pd
import geopandas as gpd
df1 = pd.DataFrame({
    'id': [0, 1, 2],
    'dt': ['01-01-2022', '02-01-2022', '03-01-2022'],
    'Lat': [33.155480, 33.155480, 33.155480],
    'Long': [-96.731630, -96.731630, -96.731630]
})
df2 = pd.DataFrame({
    'val': ['a', 'b', 'c'],
    'dt': ['01-01-2022', '02-01-2022', '03-01-2022'],
    'Lat': [33.155480, 33.155480, 33.155480],
    'Long': [-96.731630, -96.731630, -96.731630]
})
I'd like to do a spatial join not just on Lat/Long but also on the date column. Expected output:
id dt lat long val
0 01-01-2022 33.155480 -96.731630 a
1 02-01-2022 33.155480 -96.731630 b
2 03-01-2022 33.155480 -96.731630 c
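One possible approach (just a sketch, assuming geopandas >= 0.10 for the predicate keyword and that the dates only need to match exactly): build point geometries from the Lat/Long columns, run the spatial join, then keep only the rows whose dates agree.
# build GeoDataFrames with point geometries from the Long/Lat columns
gdf1 = gpd.GeoDataFrame(df1, geometry=gpd.points_from_xy(df1['Long'], df1['Lat']), crs='EPSG:4326')
gdf2 = gpd.GeoDataFrame(df2, geometry=gpd.points_from_xy(df2['Long'], df2['Lat']), crs='EPSG:4326')

# spatial join on the geometries, then keep only rows whose dates also match
joined = gpd.sjoin(gdf1, gdf2, how='inner', predicate='intersects')
result = joined[joined['dt_left'] == joined['dt_right']]
print(result[['id', 'dt_left', 'Lat_left', 'Long_left', 'val']])
Overlapping column names ('dt', 'Lat', 'Long') get the '_left'/'_right' suffixes during the join, which is why the date filter compares 'dt_left' with 'dt_right'.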

Related

How to count No. of rows with special characters in all columns of a PySpark DataFrame?

Assume that I have a PySpark DataFrame. Some of the cells contain only special characters.
Sample dataset:
import pandas as pd
data = {'ID': [1, 2, 3, 4, 5, 6],
        'col_1': ['A', '?', '<', ' ?', None, 'A?'],
        'col_2': ['B', ' ', '', '?>', 'B', '\\B']
        }
pdf = pd.DataFrame(data)
df = spark.createDataFrame(pdf)
I want to count the number of rows which contain only special characters (except blank cells). Values like 'A?' and '\B' and blank cells are not counted.
The expected output will be:
{'ID': 0, 'col_1': 3, 'col_2': 1}
Is there any way to do that?
Taking your sample dataset, this should do:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.getOrCreate()

data = {'ID': [1, 2, 3, 4, 5, 6],
        'col_1': ['A', '?', '<', ' ?', None, 'A?'],
        'col_2': ['B', ' ', '', '?>', 'B', '\\B']
        }
pdf = pd.DataFrame(data)
df = spark.createDataFrame(pdf)

res = {}
for col_name in df.columns:
    # a cell counts only if it contains a special character and no alphanumeric character at all
    df = df.withColumn('matched',
                       when(col(col_name).rlike(r'[^A-Za-z0-9\s]') & ~col(col_name).rlike('[A-Za-z0-9]'),
                            True).otherwise(False))
    res[col_name] = df.select('ID').where(df.matched).count()
print(res)
The trick is to combine two regular-expression conditions: the cell must contain at least one special (non-alphanumeric, non-whitespace) character and must not contain any alphanumeric character at all.
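As a side note (not part of the original answer), the per-column loop can also be collapsed into a single aggregation pass over the same df, reusing the same two regex conditions:
from pyspark.sql import functions as F

# one aggregation expression per column: sum a 1/0 flag for "special-only" cells
counts = df.agg(*[
    F.sum(
        F.when(F.col(c).rlike(r'[^A-Za-z0-9\s]') & ~F.col(c).rlike('[A-Za-z0-9]'), 1)
         .otherwise(0)
    ).alias(c)
    for c in df.columns
]).first().asDict()
print(counts)  # {'ID': 0, 'col_1': 3, 'col_2': 1}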

KeyError: Timestamp is not in index

I am trying to overlap timestamps from df2 with the time windows in df1. Whenever there is no match I get the following error. How can I get the output without this error?
Error
KeyError: "[Timestamp('2022-01-01 03:12:02')] not in index"
Input
from datetime import datetime, date
import pandas as pd

df1 = pd.DataFrame({'id': ['aa0', 'aa1', 'aa2', 'aa3'],
                    'number': [1, 2, 2, 1],
                    'color': ['blue', 'red', 'yellow', 'green'],
                    'date1': [datetime(2022, 1, 1, 1, 1, 1),
                              datetime(2022, 1, 1, 2, 4, 1),
                              datetime(2022, 1, 1, 3, 8, 1),
                              datetime(2022, 1, 1, 4, 12, 1)],
                    'date2': [datetime(2022, 1, 1, 2, 1, 1),
                              datetime(2022, 1, 1, 3, 6, 1),
                              datetime(2022, 1, 1, 3, 10, 1),
                              datetime(2022, 1, 1, 4, 14, 1)]})
Input 2
df2 = pd.DataFrame({'id': ['A', 'B', 'C', 'D'],
                    'value': [10, 20, 30, 40],
                    'date': [datetime(2022, 1, 1, 1, 12, 1),
                             datetime(2022, 1, 1, 1, 40, 1),
                             datetime(2022, 1, 1, 3, 12, 2),
                             datetime(2022, 1, 1, 4, 12, 2)]})
Expected output
(2022-01-01 01:01:01, 2022-01-01 02:01:01] 15.0
(2022-01-01 04:12:01, 2022-01-01 04:14:01] 40.0
Code
idx = pd.IntervalIndex.from_arrays(pd.to_datetime(df1['date1']),
                                   pd.to_datetime(df1['date2']))
mapper = pd.Series(idx, index=idx)
df2.groupby(mapper[pd.to_datetime(df2['date'])].values)['value'].mean()
One option is with conditional_join from pyjanitor, which solves inequality joins such as this:
# pip install pyjanitor
import pandas as pd
import janitor

df1['date1'] = pd.to_datetime(df1['date1'])
df1['date2'] = pd.to_datetime(df1['date2'])
df2['date'] = pd.to_datetime(df2['date'])

(
    df1
    .filter(like='date')
    .conditional_join(
        df2.filter(['value', 'date']),
        ('date1', 'date', '<='),
        ('date2', 'date', '>='))
    .groupby(['date1', 'date2'])
    .value
    .mean()
)
date1 date2
2022-01-01 01:01:01 2022-01-01 02:01:01 15.0
2022-01-01 04:12:01 2022-01-01 04:14:01 40.0
Name: value, dtype: float64
I think I figured it out. It is not the best but works.
# merge on the calendar day, then keep only rows whose timestamp falls inside the window
df1['day'] = pd.to_datetime(df1['date1']).dt.date
df2['day'] = pd.to_datetime(df2['date']).dt.date
df3 = pd.merge(df1, df2, on=['day'], how='left')
mask = (df3['date'] > df3['date1']) & (df3['date'] < df3['date2'])
df4 = df3.loc[mask]
df4.groupby(['date1', 'date2'])['value'].mean()
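Another way to avoid the KeyError while staying in plain pandas (a sketch, not one of the original answers, reusing df1 and df2 from the question and assuming the intervals in df1 do not overlap) is to map each timestamp with IntervalIndex.get_indexer and drop the timestamps that fall outside every window (indexer value -1):
idx = pd.IntervalIndex.from_arrays(pd.to_datetime(df1['date1']),
                                   pd.to_datetime(df1['date2']))
pos = idx.get_indexer(pd.to_datetime(df2['date']))   # -1 means "no interval contains this timestamp"
matched = df2[pos != -1].copy()
matched['interval'] = idx[pos[pos != -1]]
print(matched.groupby('interval')['value'].mean())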

Change index of multiple dataframes at once

Here is the earlier part of my code:
import pandas as pd
import itertools as it
import numpy as np

a = pd.read_excel(r'D:\Ph.D. IEE\IEE 6570 - IR Dr. Greene\3machinediverging.xlsx')
question = pd.DataFrame(a).set_index('Job')
df2 = question.index
permutations = list(it.permutations(question.index))
dfper = pd.DataFrame(permutations)

for i in range(len(dfper)):
    fr = dfper.iloc[0:len(dfper)]
    fr.index.name = ''
    print(fr)

for i in range(0, fr.shape[0], 1):
    print(fr.iloc[i:i+1].T)
This gives me 120 dataframes.
0
0 A
1 B
2 C
3 D
4 E
1
0 A
1 B
2 C
3 E
4 D
and so on...
I would like to change the index of these dataframes to the alphabet column (using a for loop). Any help would be really appreciated. Thank you.
import pandas as pd

df1 = pd.DataFrame({0: ['A', 'B', 'C', 'D', 'E']})
df2 = pd.DataFrame({1: ['A', 'B', 'C', 'D', 'E']})
df_list = [df1, df2]

for df in df_list:
    # set the first column as index
    df.set_index(df.columns[0], inplace=True)
I would organize all dataframes in a list:
df_list = [df1, df2, df3,...]
and then rebuild the list, since reassigning df inside a plain for loop would not change the dataframes stored in it:
df_list = [df.set_index(df.columns[0]) for df in df_list]

matplotlib - plot merged dataframe with group bar

I am trying to plot a grouped bar chart from a merged dataframe. With the code below the bars are stacked on top of each other; how can I place them side by side like a grouped bar chart?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C', 'D'],
    'value': [10, 6, 6, 8]})
df2 = pd.DataFrame({
    'key': ['B', 'D', 'A', 'F'],
    'value': [3, 5, 5, 7]})
df3 = pd.merge(df1, df2, how='inner', on=['key'])
print(df1)
print(df2)
print(df3)

fig, ax = plt.subplots(figsize=(12, 8))
b1 = ax.bar(df3['key'], df3['value_x'])
b2 = ax.bar(df3['key'], df3['value_y'])

pngname = "demo.png"
fig.savefig(pngname, dpi=fig.dpi)
print("[[./%s]]" % (pngname))
Current output:
The problem is that the x axis data is the same in both bar calls: in your case it isn't numbers, it's the keys "A", "B", "C", so matplotlib draws the two sets of bars at the same x positions, one on top of the other.
There is a simple way around it, as some tutorials online show: https://www.geeksforgeeks.org/create-a-grouped-bar-plot-in-matplotlib/.
So what you do is basically enumerate the keys, e.g. A=0, B=1, C=2 (np.arange does this below). After that, choose your desired bar width; I chose 0.4 as an example. Then shift one group of bars to the left by bar_width/2 and the other one to the right by bar_width/2.
Perhaps the code explains it better than I did:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C', 'D'],
    'value': [10, 6, 6, 8]})
df2 = pd.DataFrame({
    'key': ['B', 'D', 'A', 'F'],
    'value': [3, 5, 5, 7]})
df3 = pd.merge(df1, df2, how='inner', on=['key'])

fig, ax = plt.subplots(figsize=(12, 8))

# modifications
x = np.arange(len(df3['key']))  # enumerate the keys
bar_width = 0.4                 # choose the bar width
b1 = ax.bar(x - bar_width/2, df3['value_x'], width=bar_width, label='value_x')  # shift left
b2 = ax.bar(x + bar_width/2, df3['value_y'], width=bar_width, label='value_y')  # shift right
plt.xticks(x, df3['key'])       # replace x axis ticks with the keys from df3
plt.legend(['value_x', 'value_y'])
plt.show()
Result:
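As a side note (not part of the original answer), pandas can also draw the grouped layout directly once the keys are the index; a minimal sketch with the same data:
import pandas as pd
import matplotlib.pyplot as plt

df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], 'value': [10, 6, 6, 8]})
df2 = pd.DataFrame({'key': ['B', 'D', 'A', 'F'], 'value': [3, 5, 5, 7]})
df3 = pd.merge(df1, df2, how='inner', on=['key'])

# pandas draws one group of bars per row when the keys are the index
ax = df3.set_index('key')[['value_x', 'value_y']].plot.bar(rot=0, figsize=(12, 8))
ax.legend(['value_x', 'value_y'])
plt.show()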

Getting standard deviation on a specific number of dates

In this dataframe...
import pandas as pd
import numpy as np
import datetime

tf = 365
dt = datetime.datetime.now() - datetime.timedelta(days=365)
df = pd.DataFrame({
    'Cat': np.repeat(['a', 'b', 'c'], tf),
    'Date': np.tile(pd.date_range(dt, periods=tf), 3),
    'Val': np.random.rand(3*tf)
})
How can I get a dictionary of the standard deviation of 'Val' for each 'Cat' over a specific number of days, counted back from the last day, for a large dataset?
This code gives the standard deviation for the last 10 days...
today = datetime.datetime.now()  # 'today' is roughly the last date in the data
{s: np.std(df[(df.Cat == s) &
              (df.Date > today - datetime.timedelta(days=10))].Val)
 for s in df.Cat.unique()}
...but it looks clunky.
Is there a better way?
First filter with boolean indexing and then aggregate std per group; because pandas' std uses ddof=1 by default while np.std uses ddof=0, set ddof=0 to match your output:
d1 = df[df.Date > today - datetime.timedelta(days=10)].groupby('Cat')['Val'].std(ddof=0).to_dict()
print(d1)
{'a': 0.28435695432581953, 'b': 0.2908486860242955, 'c': 0.2995981283031974}
Another solution is to use a custom function:
f = lambda x: np.std(x.loc[x.Date > today - datetime.timedelta(days=10), 'Val'])
d2 = df.groupby('Cat').apply(f).to_dict()
The difference between the solutions: if no values in a group match the condition, the group is dropped entirely in the first solution, while the second solution assigns NaN:
d1 = {'b': 0.2908486860242955, 'c': 0.2995981283031974}
d2 = {'a': nan, 'b': 0.2908486860242955, 'c': 0.2995981283031974}
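If you prefer the first solution but still want every category to appear in the dictionary (a small addition, not part of the original answer), you can reindex on the unique categories so unmatched groups show up as NaN:
d1 = (df[df.Date > today - datetime.timedelta(days=10)]
        .groupby('Cat')['Val'].std(ddof=0)
        .reindex(df['Cat'].unique())   # categories with no matching rows become NaN
        .to_dict())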