Increment a time and add it in data frame column - pandas

Hi, I am new to Python and I am looking for the result below.
I have From_Curr (3 values) and To_Curr (3 values); I am building currency pairs from them and adding a new time column to my data frame.
3*3 = 9 currency pairs are created, so I want the same time for all nine pairs, and then the time incremented by 1 hour for the next block of the same pairs, as shown below.
The problem is that the time gets incremented after every row instead.
Actual df:
Expected df:
Thanks for any help; I appreciate your time.
import pandas as pd
import datetime
from datetime import timedelta

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})

init_date = datetime.datetime(1, 1, 1)
for index, row in data.iterrows():
    row['time'] = str(init_date)[11:19]
    init_date = init_date + timedelta(hours=1.0)

I don't understand why you are repeating the combinations and incrementing by one hour in the last half.
But for this case, you can do something like this:
import pandas as pd

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})

outlist = [(i, j) for i in data["From"] for j in data["To"]] * 2  # Create double combinations
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = "00:00:00"
data.loc[len(data) // 2:, "time"] = "01:00:00"  # Assign 1 hour to the last half (avoids chained indexing)
data["time"] = pd.to_datetime(data["time"]).dt.time
Update: After some clarifications
import pandas as pd

data = pd.DataFrame(
    {"From": ["EUR", "GBP", "USD"], "To": ["INR", "SGD", "HKD"], "time": ""}
)
outlist = [
    (i, j) for i in data["From"] for j in data["To"]
] * 2  # Create double combinations; for your case it would be 24 repeats instead of two
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = data.groupby(["From", "To"]).cumcount()  # Get the occurrence count of each pair
data["time"] = data["time"] * pd.Timedelta(hours=1).value  # Multiply occurrences by one hour in nanoseconds
data["time"] = pd.to_datetime(data["time"]).dt.time  # Convert to a time of day
I think this script covers all your needs, happy coding :)
Regards,

Related

discrete numpy array to continuous array

I have some discrete data in an array, such that:
arr = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
whose plot looks like:
I also have an index array, such that each unique value in arr is associated with a unique index value, like:
ind = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
What is the most pythonic way of converting arr from discrete values to continuous values, so that the array would look like this when plotted?:
In other words, I want to interpolate between the discrete points to produce continuous data.
I found a solution to this, in case anyone has a similar issue. It is maybe not the most elegant, so modifications are welcome:
import numpy as np

def ref_linear_interp(x, y):
    arr = []
    ux = np.unique(x)  # unique x values
    for u in ux:
        idx = y[x == u]
        # look one block back: previous block's value to this block's value
        try:
            lo = y[x == u - 1][0]
            hi = y[x == u][0]
        except IndexError:
            lo = y[x == u][0]
            hi = y[x == u][0]
        # look one block ahead: this block's value to the next block's value
        # (this overrides the assignment above)
        try:
            lo = y[x == u][0]
            hi = y[x == u + 1][0]
        except IndexError:
            lo = y[x == u][0]
            hi = y[x == u][0]
        if lo == hi:
            sub = np.full((len(idx)), lo)
            arr.append(sub)
        else:
            sub = np.linspace(lo, hi, len(idx))
            arr.append(sub)
    return np.concatenate(arr, axis=None).ravel()
y = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
x = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
z = np.arange(1, 16, 1)
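Presumably the function is then applied to the flattened arrays, along these lines (a hypothetical usage sketch; the call itself is not shown in the original post):

y_continuous = ref_linear_interp(x.ravel(), y.ravel())
# y_continuous has 15 values; z above can serve as the x-axis when plotting it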
Here is an answer for the symmetric solution that I would expect when reading the question:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# create the data as described
numbers = [1, 2, 3, 2, 1]
nblock = 3
df = pd.DataFrame({
    "x": np.arange(nblock * len(numbers)),
    "y": np.repeat(numbers, nblock),
    "label": np.repeat(np.arange(len(numbers)), nblock)
})
Expecting a constant block size of 3, we could use a rolling window:
df['y-smooth'] = df['y'].rolling(nblock, center=True).mean()
# fill the NaNs at both ends
df['y-smooth'] = df['y-smooth'].bfill().ffill()
plt.plot(df['x'], df['y-smooth'], marker='*')
If the block size is allowed to vary, we could determine the block centers and interpolate piecewise.
centers = df[['x', 'y', 'label']].groupby('label').mean()
df['y-interp'] = np.interp(df['x'], centers['x'], centers['y'])
plt.plot(df['x'], df['y-interp'], marker='*')
Note: You may also try
centers = df[['x', 'y', 'label']].groupby('label').min() to select the left corner of the labelled blocks.

cumsum() on running streak of data

I'm attempting to determine the sum of a column over a running period with a negative gain, i.e. I want the total for a losing streak.
I've set up a column that gives the number of consecutive losing days (Consecutive Losses), but I also want to sum up the loss over that same streak. What I have (Aggregate Consecutive Loss) 1) doesn't work, because it just cumsums() without resetting to zero at each streak, and 2) is incorrect anyway, since I should really take the Open value at the start of the streak and the Close value at the end.
How can I correctly set up this Aggregate Consecutive Loss value in pandas?
import pandas as pd
import numpy as np
import yfinance as yf

def get(symbols, group_by_ticker=False, **kwargs):
    if not isinstance(symbols, list):
        symbols = [symbols, ]
    kwargs['auto_adjust'] = True
    kwargs['prepost'] = True
    kwargs['threads'] = True
    df = None
    if group_by_ticker:
        kwargs['group_by'] = 'ticker'
    df = yf.download(symbols, **kwargs)
    for t in symbols:
        df["Change Percent", t] = df["Close", t].pct_change() * 100
        df["Gain", t] = np.where(df['Change Percent', t] > 0, True, False).astype('bool')
        a = df['Gain', t] != True
        df['Consecutive Losses', t] = a.cumsum() - a.cumsum().where(~a).ffill().fillna(0).astype(int)
        x = df['Change Percent', t].where(df['Consecutive Losses', t] > 0)
        df['Aggregate Consecutive Loss', t] = x.cumsum() - x.cumsum().where(~a).ffill().fillna(0).astype(float)
    return df

data = get(["DOW", "IDX"], period="6mo")
data[['Change Percent', 'Gain', 'Consecutive Losses', 'Aggregate Consecutive Loss']].head(50)
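As a rough sketch of one common way to make the running sum reset at each streak (my own sketch, not an answer from the thread; the column names mirror the question but the frame is a simplified single-ticker example): give every losing streak its own group id by cumulatively counting the gain days, then cumsum the percentage change within each group.

import pandas as pd

df = pd.DataFrame({"Change Percent": [1.0, -0.5, -1.2, 0.3, -0.4, -0.6, -0.1]})
df["Gain"] = df["Change Percent"] > 0
streak_id = df["Gain"].cumsum()                       # increments on gain days, so each losing streak shares one id
loss = df["Change Percent"].where(~df["Gain"], 0.0)   # keep only the losing days' changes
df["Aggregate Consecutive Loss"] = loss.groupby(streak_id).cumsum()
# the Open-at-start / Close-at-end refinement could be built along the same lines,
# e.g. with df.groupby(streak_id)["Open"].transform("first") on real data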

Pandas Timeseries: Total duration meeting a specific condition

I have a timeseries
ts = pd.Series(data=[0,1,2,3,4],index=[pd.Timestamp('1991-01-01'),pd.Timestamp('1995-01-01'),pd.Timestamp('1996-01-01'),pd.Timestamp('2010-01-01'),pd.Timestamp('2011-01-01')])
What's the fastest, most readable way to get the total duration during which the value is below 2, assuming each value is valid until the next time step indicates otherwise (no linear interpolation)? I imagine there is probably a pandas function for this.
This seems to be working quite well; however, I am still baffled that there does not seem to be a pandas function for this!
import pandas as pd
import numpy as np
ts = pd.Series(data=[0,1,2,3,4],index=[pd.Timestamp('1991-01-01'),pd.Timestamp('1995-01-01'),pd.Timestamp('1996-01-01'),pd.Timestamp('2010-01-01'),pd.Timestamp('2011-01-01')])
# making the timeseries binary. 1 = meets condition, 0 = does not
ts = ts.where(ts>=2,other=1)
ts = ts.where(ts<2,other=0)
delta_time = ts.index.to_pydatetime()[1:]-ts.index.to_pydatetime()[:-1]
time_below_2 = np.sum(delta_time[np.invert(ts.values[:-1])]).total_seconds()
time_above_2 = np.sum(delta_time[(ts.values[:-1])]).total_seconds()
The above function seems to break for certain timeframes. This option is slower, but did not break in any of my tests:
def get_total_duration_above_and_below_value(value, ts):
    # making the timeseries binary. 1 = above value, 0 = below value
    ts = ts.where(ts >= value, other=1)
    ts = ts.where(ts < value, other=0)
    time_above_value = 0
    time_below_value = 0
    for i in range(ts.size - 1):
        if ts.iloc[i] == 1:
            time_above_value += abs(pd.Timedelta(
                ts.index[i] - ts.index[i + 1]).total_seconds()) / 3600
        else:
            time_below_value += abs(pd.Timedelta(
                ts.index[i] - ts.index[i + 1]).total_seconds()) / 3600
    return time_above_value, time_below_value
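For reference, a vectorized variant in the same spirit (my own sketch, not one of the answers above): the key detail is to build a genuine boolean mask, because indexing the time deltas with an integer 0/1 array selects positions instead of filtering.

import pandas as pd
import numpy as np

ts = pd.Series(data=[0, 1, 2, 3, 4],
               index=[pd.Timestamp('1991-01-01'), pd.Timestamp('1995-01-01'),
                      pd.Timestamp('1996-01-01'), pd.Timestamp('2010-01-01'),
                      pd.Timestamp('2011-01-01')])

below = (ts < 2).to_numpy()[:-1]              # each state holds until the next timestamp
durations = np.diff(ts.index.to_numpy())      # timedeltas between consecutive timestamps
time_below_2 = durations[below].sum() / np.timedelta64(1, 's')   # seconds spent below 2
time_above_2 = durations[~below].sum() / np.timedelta64(1, 's')  # seconds spent at or above 2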

Assigning values to dataframe columns

In the code below, the dataframe df5 is not getting populated. I am just assigning values to the dataframe's columns, and I have specified the columns beforehand. When I print the dataframe, it returns an empty dataframe. Not sure whether I am missing something.
Any help would be appreciated.
import math
import pandas as pd

columns = ['ClosestLat', 'ClosestLong']
df5 = pd.DataFrame(columns=columns)

def distance(pt1, pt2):
    return math.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

for pt1 in df1:
    closestPoints = [pt1, df2[0]]
    for pt2 in df2:
        if distance(pt1, pt2) < distance(closestPoints[0], closestPoints[1]):
            closestPoints = [pt1, pt2]
    df5['ClosestLat'] = closestPoints[1][0]
    df5['ClosestLat'] = closestPoints[1][0]
    df5['ClosestLong'] = closestPoints[1][1]
    print("Point: " + str(closestPoints[0]) + " is closest to " + str(closestPoints[1]))
From the look of your code, you're trying to populate df5 with a list of latitudes and longitudes. However, you're making a couple of mistakes.
The columns of a pandas DataFrame are Series and hold sequential data. So df5['ClosestLat'] = closestPoints[1][0] broadcasts a single scalar across the column, and because df5 has no rows yet, the column stays empty.
Even if that assignment did populate the column, you would still lose data, because you overwrite the whole column on every iteration of the loop.
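A quick illustration of the first point (my own snippet, not from the original answer):

import pandas as pd

df5 = pd.DataFrame(columns=['ClosestLat', 'ClosestLong'])
df5['ClosestLat'] = 40.7   # scalar broadcast over zero rows: nothing is added
print(df5)                 # still an empty DataFrame with the two columns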
The Solution: Build a list of lats and longs, then insert into the dataframe.
import math
import pandas as pd

columns = ['ClosestLat', 'ClosestLong']
df5 = pd.DataFrame(columns=columns)

def distance(pt1, pt2):
    return math.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

lats, lngs = [], []
for pt1 in df1:
    closestPoints = [pt1, df2[0]]
    for pt2 in df2:
        if distance(pt1, pt2) < distance(closestPoints[0], closestPoints[1]):
            closestPoints = [pt1, pt2]
    # record the nearest point found for this pt1
    lats.append(closestPoints[1][0])
    lngs.append(closestPoints[1][1])

df5['ClosestLat'] = pd.Series(lats)
df5['ClosestLong'] = pd.Series(lngs)
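Equivalently, once the lists are filled, the frame can be built in a single step (a minor stylistic alternative, not from the original answer):

df5 = pd.DataFrame({'ClosestLat': lats, 'ClosestLong': lngs})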

apply generic function in a vectorized fashion using numpy/pandas

I am trying to vectorize my code and, thanks in large part to some users (https://stackoverflow.com/users/3293881/divakar, https://stackoverflow.com/users/625914/behzad-nouri), I was able to make huge progress. Essentially, I am trying to apply a generic function (in this case max_dd_array_ret) to each of the bins I found (see "vectorize complex slicing with pandas dataframe" for details on the date vectorization and "Start, End and Duration of Maximum Drawdown in Python" for the rationale behind max_dd_array_ret).
The problem is the following: I should be able to obtain the result df_2, and ranged_DD(asd_1.values, starts, ends+1) is, to some degree, what I am looking for, except for the unfortunate effect that the first two bins appear to be merged and the last one is missing, as can be seen from the results.
Any explanation and fix is very welcome.
import pandas as pd
import numpy as np
from time import time
from scipy.stats import binned_statistic

def max_dd_array_ret(xs):
    xs = (xs + 1).cumprod()
    i = np.argmax(np.maximum.accumulate(xs) - xs)  # end of the period
    j = np.argmax(xs[:i])
    max_dd = abs(xs[j] / xs[i] - 1)
    return max_dd if max_dd is not None else 0

def get_ranges_arr(starts, ends):
    # Taken from https://stackoverflow.com/a/37626057/3293881
    counts = ends - starts
    counts_csum = counts.cumsum()
    id_arr = np.ones(counts_csum[-1], dtype=int)
    id_arr[0] = starts[0]
    id_arr[counts_csum[:-1]] = starts[1:] - ends[:-1] + 1
    return id_arr.cumsum()

def ranged_DD(arr, starts, ends):
    # Get all indices and the IDs corresponding to same groups
    idx = get_ranges_arr(starts, ends)
    id_arr = np.repeat(np.arange(starts.size), ends - starts)
    slice_arr = arr[idx]
    return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret)[0]

asd_1 = pd.Series(0.01 * np.random.randn(500), index=pd.date_range('2011-1-1', periods=500)).pct_change()
index_1 = pd.to_datetime(['2011-2-2', '2011-4-3', '2011-5-1', '2011-7-2', '2011-8-3', '2011-9-1',
                          '2011-10-2', '2011-11-3', '2011-12-1', '2012-1-2', '2012-2-3', '2012-3-1'])
index_2 = pd.to_datetime(['2011-2-15', '2011-4-16', '2011-5-17', '2011-7-17', '2011-8-17', '2011-9-17',
                          '2011-10-17', '2011-11-17', '2011-12-17', '2012-1-17', '2012-2-17', '2012-3-17'])
starts = asd_1.index.searchsorted(index_1)
ends = asd_1.index.searchsorted(index_2)
df_2 = pd.DataFrame([max_dd_array_ret(asd_1.loc[i:j]) for i, j in zip(index_1, index_2)], index=index_1)
print(df_2[0].values)
print(ranged_DD(asd_1.values, starts, ends + 1))
results:
df_2
[ 1.75893509 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085
1.43863472 1.85322338 1.84767224 1.32605754 1.48688414 5.44786663]
ranged_DD(asd_1.values, starts, ends+1)
[ 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085 1.43863472
1.85322338 1.84767224 1.32605754 1.48688414]
which are identical except at the start (df_2 begins with 1.75893509, 6.08002911 while ranged_DD begins with 6.08002911) and at the end (df_2 ends with 1.48688414, 5.44786663 while ranged_DD ends with 1.48688414).
P.S.: while looking in more detail at the docs (http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html) I found that this might be the problem:
"All but the last (righthand-most) bin is half-open. In other words, if bins is [1, 2, 3, 4], then the first bin is [1, 2) (including 1, but excluding 2) and the second [2, 3). The last bin, however, is [3, 4], which includes 4. New in version 0.11.0."
The problem is that I don't know how to change this behaviour.
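One likely cause, and a possible fix (my own reading, not confirmed in the thread): binned_statistic defaults to bins=10, so the 12 integer group ids get squeezed into 10 bins and neighbouring groups end up merged. Passing explicit bin edges, one bin per group id, keeps every group separate; a sketch of the adjusted ranged_DD:

def ranged_DD(arr, starts, ends):
    # Get all indices and the IDs corresponding to same groups
    idx = get_ranges_arr(starts, ends)
    id_arr = np.repeat(np.arange(starts.size), ends - starts)
    slice_arr = arr[idx]
    # one bin per group id: edges at -0.5, 0.5, ..., starts.size - 0.5
    edges = np.arange(starts.size + 1) - 0.5
    return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret, bins=edges)[0]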