Date Countdown with pandas

I'm trying to calculate the difference between a date and today, in months.
Here is what I have so far:
import pandas as pd
import numpy as np
from datetime import date

def calc_date_countdown(df):
    today = date.today()
    df['countdown'] = df['date'].apply(lambda x: (x - today) / np.timedelta64(1, 'M'))
    df['countdown'] = df['countdown'].astype(int)
    return df
Any pointers on what I'm doing wrong or maybe a more efficient way of doing it?
When I run this on my dataset, this is the error I get: TypeError: unsupported operand type(s) for -: 'Timestamp' and 'datetime.date'

Using apply here is not very efficient, since this can be done as a single array operation.
See the below example:
import pandas as pd
import numpy as np
from datetime import date

def per_array(df):
    df['months'] = ((pd.to_datetime(date.today()) - df['date']) / np.timedelta64(1, 'M')).astype(int)
    return df

def using_apply(df):
    today = date.today()
    df['months'] = df['date'].apply(lambda x: (x - pd.to_datetime(today)) / np.timedelta64(1, 'M'))
    df['months'] = df['months'].astype(int)
    return df
df = pd.DataFrame({'date': [pd.to_datetime(f"2023-0{i}-01") for i in range(1,8)]})
print(df)
# date
# 0 2023-01-01
# 1 2023-02-01
# 2 2023-03-01
# 3 2023-04-01
# 4 2023-05-01
# 5 2023-06-01
# 6 2023-07-01
Timing it:
%%timeit
per_array(df)
195 µs ± 5.14 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
using_apply(df)
384 µs ± 3.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
As you can see, it is around twice as fast to not use apply.

import pandas as pd

def calc_date_countdown(df):
    today = pd.Timestamp.today()
    df['countdown'] = df['date'].apply(lambda x: (x - today).days // 30)
    return df
This should work as long as your date column in the dataframe is a Timestamp object. If it's not, you may need to convert it using pd.to_datetime() before running the function.
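If your column needs that conversion first, a minimal sketch (assuming the raw 'date' column holds date strings or datetime.date objects):
df['date'] = pd.to_datetime(df['date'])  # convert to Timestamps before computing the countdown
df = calc_date_countdown(df)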


Dropping Duplicate Points

I have two geodataframes (or geoseries), each consisting of thousands of points.
My requirement is to append (merge) both geodataframes and drop duplicate points.
In other words, output = gdf1 all points + gdf2 points that do not intersect with gdf1 points
I tried:
output = geopandas.overlay(gdf1, gdf2, how='symmetric_difference')
However, it is very slow.
Do you know a faster way of doing it?
Here is another way of combining the dataframes using plain pandas, with timings compared against geopandas:
import pandas as pd
import numpy as np
data1 = np.random.randint(-100, 100, size=10000)
data2 = np.random.randint(-100, 100, size=10000)
df1 = pd.concat([-pd.Series(data1, name="longitude"), pd.Series(data1, name="latitude")], axis=1)
df1['geometry'] = df1.apply(lambda x: (x['latitude'], x['longitude']), axis=1)
df2 = pd.concat([-pd.Series(data2, name="longitude"), pd.Series(data2, name="latitude")], axis=1)
df2['geometry'] = df2.apply(lambda x: (x['latitude'], x['longitude']), axis=1)
df1 = df1.set_index(["longitude", "latitude"])
df2 = df2.set_index(["longitude", "latitude"])
%timeit pd.concat([df1[~df1.index.isin(df2.index)],df2[~df2.index.isin(df1.index)]])
112 ms ± 217 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
This seems a lot faster than using geopandas:
import geopandas as gp
gdf1 = gp.GeoDataFrame(
    df1, geometry=gp.points_from_xy(df1.index.get_level_values("longitude"),
                                    df1.index.get_level_values("latitude")))
gdf2 = gp.GeoDataFrame(
    df2, geometry=gp.points_from_xy(df2.index.get_level_values("longitude"),
                                    df2.index.get_level_values("latitude")))
%timeit gp.overlay(gdf1, gdf2, how='symmetric_difference')
29 s ± 317 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
But maybe you need some of the optimisations mentioned here.
The concat expression keeps the rows of each df whose index does not appear in the other df, then combines them:
df1 = pd.DataFrame([1,2,3,4],columns=['col1']).set_index("col1")
df2 = pd.DataFrame([3,4,5,6],columns=['col1']).set_index("col1")
pd.concat([df1[~df1.index.isin(df2.index)],df2[~df2.index.isin(df1.index)]])
col1
1
2
5
6
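If you need the combined result back as a GeoDataFrame, a minimal sketch reusing the construction from above (the combined frame and index names are the ones built in this answer):
combined = pd.concat([df1[~df1.index.isin(df2.index)],
                      df2[~df2.index.isin(df1.index)]])
# rebuild point geometries from the (longitude, latitude) index levels
gdf_combined = gp.GeoDataFrame(
    combined,
    geometry=gp.points_from_xy(combined.index.get_level_values("longitude"),
                               combined.index.get_level_values("latitude")))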

apply calculation in pandas column with groupby

What could be wrong with the code below?
a) I need to group by the AREA column and apply a mathematical formula across columns.
b) Also, if I have another column, let's say a date, that needs to be added to the groupby, how would it fit into the command below?
df3 = dataset.groupby('AREA')(['col1']+['col2']).sum()
The sample table was provided as an image.
I think you can sum the columns before grouping for better performance:
dataset['new'] = dataset['col1']+dataset['col2']
df3 = dataset.groupby('AREA', as_index=False)['new'].sum()
But your approach is also possible with a lambda function:
df3 = (dataset.groupby('AREA')
              .apply(lambda x: (x['col1'] + x['col2']).sum())
              .reset_index(name='SUM'))
Performance:
np.random.seed(123)
N = 100000
dataset = pd.DataFrame({'AREA': np.random.randint(1000, size=N),
                        'col1': np.random.randint(10, size=N),
                        'col2': np.random.randint(10, size=N)})
#print (dataset)
In [24]: %%timeit
...: dataset['new'] = dataset['col1']+dataset['col2']
...: df3 = dataset.groupby('AREA', as_index=False)['new'].sum()
...:
7.64 ms ± 50.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [25]: %%timeit
...: df3 = (dataset.groupby('AREA')
...: .apply(lambda x: (x['col1']+x['col2']).sum())
...: .reset_index(name='SUM'))
...:
368 ms ± 5.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
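As for b), a minimal sketch, assuming your real data has an extra column named 'date' (the generated dataset above does not): pass a list of keys to groupby.
dataset['new'] = dataset['col1'] + dataset['col2']
# 'date' is the assumed name of the extra grouping column
df3 = dataset.groupby(['AREA', 'date'], as_index=False)['new'].sum()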

Fast implementation of incrementing dates by days according to another column pandas

I have the following pandas DF:
print(df.to_dict())
{'Date_Installed': {11885: Timestamp('2018-11-15 00:00:00'), 111885: Timestamp('2018-11-15 00:00:00')}, 'days_from_instalation': {11885: 2, 111885: 3}}
I would like to create a new column that increments the 'Date_Installed' column by the number of days in the 'days_from_instalation' column.
I know that this is possible using the apply() method as following:
from datetime import timedelta
df['desired_date']=df.apply(lambda row:row['Date_Installed']+timedelta(row['days_from_instalation']), axis=1)
which produces my desired output:
print(df.to_dict())
{'Date_Installed': {11885: Timestamp('2018-11-15 00:00:00'), 111885: Timestamp('2018-11-15 00:00:00')}, 'days_from_instalation': {11885: 2, 111885: 3}, 'desired_date': {11885: Timestamp('2018-11-17 00:00:00'), 111885: Timestamp('2018-11-18 00:00:00')}}
However this method is extremely slow, and isn't realistic to apply to my full DF.
I went over several questions on incrementing dates in pandas, like this one:
pandas-increment-datetime
But they all seem to deal with incrementing by a constant, without any vectorised method to do so.
Is there any vectorised version of this type of increment?
Thanks in advance!
Add timedeltas created by to_timedelta:
df['desired_date'] = df['Date_Installed'] + pd.to_timedelta(df['days_from_instalation'], unit='d')
print (df)
Date_Installed days_from_instalation desired_date
11885 2018-11-15 2 2018-11-17
111885 2018-11-15 3 2018-11-18
Another, numpy-based solution is faster, but loses timezones (if specified):
import numpy as np

a = pd.to_timedelta(df['days_from_instalation'], unit='d').values.astype(np.int64)
df['desired_date1'] = pd.to_datetime(df['Date_Installed'].values.astype(np.int64) + a, unit='ns')
Performance:
# 20k rows
df = pd.concat([df] * 10000, ignore_index=True)
In [217]: %timeit df['desired_date1'] = pd.to_datetime(df['Date_Installed'].values.astype(np.int64) + pd.to_timedelta(df['days_from_instalation'], unit='d').values.astype(np.int64), unit='ns')
886 µs ± 9.92 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [218]: %timeit df['desired_date'] = df['Date_Installed'] + pd.to_timedelta(df['days_from_instalation'], unit='d')
1.53 ms ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

opening csv with a single row of 6M values very slow in pandas

Hi, I have a file with approximately 6M comma-separated values, all on one line.
I am trying:
import pandas as pd
import numpy as np

v = pd.read_csv(file_name, nrows=1, skiprows=3, header=None, verbose=True, dtype=np.float32)
with the file being
Name
Tue Nov 6 13:52:15 2018
Description
52.2269,52.2148,52.246,52.361,52.5263,52.7399,52.9738,53.1952,...45.4,
I get the output
Tokenization took: 0.00 ms
Type conversion took: 53023.43 ms
Parser memory cleanup took: 212.13 ms
v summary shows
1 rows × 6316057 columns
The file reading takes a lot longer than expected; I think it may be due to the data being in one row. Is there anything I can do to speed it up, or do I need a different library?
For my timings below, some dummy data:
import numpy as np
import pandas as pd

data = np.random.randn(1_000_000)
with open('tmp', 'wt') as f:
    f.write('dummy\n')
    f.write('dummy\n')
    f.write('dummy\n')
    for val in data:
        f.write(str(val) + ',')
    f.write('\n')
In general, the pandas parser is optimized for the 'long' data case rather than a single very wide row like this. You could pre-process the data, turning the delimiters into newlines, which for my example is ~40x faster.
def parse_wide_to_long(f):
    from io import StringIO
    data = open(f).read().splitlines()[-1]
    data = data.replace(',', '\n')
    return pd.read_csv(StringIO(data), header=None)
In [33]: %timeit pd.read_csv('tmp', nrows=1, skiprows=3, header=None, dtype=np.float32)
20.6 s ± 2.04 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [39]: %timeit parse_wide_to_long('tmp')
484 ms ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
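If you do not need read_csv at all, a minimal sketch of an alternative under the same assumptions (data on the last line, trailing comma); it is not benchmarked here, so treat it as a sketch rather than a measured result:
def parse_with_split(f):
    # read the single data row, drop the trailing comma, and let NumPy do the conversion
    line = open(f).read().splitlines()[-1].rstrip(',')
    return pd.Series(np.array(line.split(','), dtype=np.float32))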

Optimization - split a column of type set into multiple columns

I want to create new columns based on the elements of column Col1, which is of type set. Each element has a corresponding column name that is stored in a dict. Here is the full code:
import numpy as np
import pandas as pd
np.random.seed(123)
N = 10**4 #number of rows in the dataframe
df = pd.DataFrame({'Cnt': np.random.randint(2,10,N)})
# generate lists of random length
def f(x):
    return set(np.random.randint(101, 120, x))

df['Col1'] = df['Cnt'].apply(f)

# dictionary with column names for each element in list
d = {'Item_1': 101, 'Item_2': 102, 'Item_3': 103, 'Item_4': 104, 'Item_5': 105,
     'Item_6': 106, 'Item_7': 107, 'Item_8': 108, 'Item_9': 109, 'Item_10': 110,
     'Item_11': 111, 'Item_12': 112, 'Item_13': 113, 'Item_14': 114, 'Item_15': 115,
     'Item_16': 116, 'Item_17': 117, 'Item_18': 118, 'Item_19': 119, 'Item_20': 120}

def elem_in_set(x, e):
    return 1 if e in x else 0

def create_columns(input_data, d):
    df = input_data.copy()
    for k, v in d.items():
        df[k] = df.apply(lambda x: elem_in_set(x['Col1'], v), axis=1)
    return df
%timeit create_columns(df, d)
#5.05 s ± 78.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The problem is that the production dataframe has about 400k rows, and my solution does not scale well at all - I'm looking at around 10 minutes on my machine. The column containing all elements (Col1) could be type list instead of set, but that doesn't improve performance.
Is there a faster solution to this?
I made a small change to the apply call inside your create_columns; it seems to run much faster now.
import numpy as np
import pandas as pd
np.random.seed(123)
N = 10**4 #number of rows in the dataframe
df = pd.DataFrame({'Cnt': np.random.randint(2,10,N)})
# generate lists of random length
def f(x):
    return set(np.random.randint(101, 120, x))

df['Col1'] = df['Cnt'].apply(f)

# dictionary with column names for each element in list
d = {'Item_1': 101, 'Item_2': 102, 'Item_3': 103, 'Item_4': 104, 'Item_5': 105,
     'Item_6': 106, 'Item_7': 107, 'Item_8': 108, 'Item_9': 109, 'Item_10': 110,
     'Item_11': 111, 'Item_12': 112, 'Item_13': 113, 'Item_14': 114, 'Item_15': 115,
     'Item_16': 116, 'Item_17': 117, 'Item_18': 118, 'Item_19': 119, 'Item_20': 120}

def create_columns(input_data, d):
    df = input_data.copy()
    for k, v in d.items():
        df[k] = df.Col1.apply(lambda x: 1 if v in x else 0)
    return df
create_columns(df, d)
#191 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
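A fully vectorised alternative (not part of the answer above, and not timed here), a sketch using explode and get_dummies; it only creates columns for values that actually occur in Col1:
# explode the sets to one row per element, one-hot encode, then collapse back per original row
exploded = df['Col1'].explode()
dummies = pd.get_dummies(exploded).groupby(level=0).max().astype(int)
# map the raw values (101-119 here) back to the desired column names
dummies = dummies.rename(columns={v: k for k, v in d.items()})
out = df.join(dummies)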