I have a plot that looks like this :
import pandas as pd
import pandas_datareader as web
import datetime as dt
from datetime import timedelta
import matplotlib.pyplot as plt
#get the data
start_date = pd.to_datetime('2019-11-1')
# pd.datetime is no longer available in pandas; use the datetime module,
# which this script already imports as dt.
end_date = dt.datetime.today()
df = web.DataReader('^gspc', 'yahoo', start_date, end_date)
df = df['Adj Close']
#build the plot
fig, ax1 = plt.subplots()
ax1.plot(df)
#set the axhline
# xmin/xmax are given here as 0 and 1, so the line spans the full axes width.
ax1.axhline(df.max(),xmin=0,xmax=1)
ax1.set_xlim(start_date,end_date + timedelta(30))
ax1.set_ylim(df.min() -200, df.max() +200)
I am trying to set the axhline so it starts on the day of the maximum value in the df. I am having issues because the index is a datetime object, and axhline needs an integer.
Here is what I've tried:
ax1.axhline(df.max(),xmin=df.idxmax(),xmax=1)
What is the most efficient way to set the xmin to the date of the max value in the df?
Thanks
axhline() uses a y position, and two x positions. Y is in data coordinates and x in axes coordinates (0 for the left margin, 1 for the right). But the desired starting position is only available in data coordinates. hlines() can work with these.
df.argmax() finds the position of the maximum. df.index[df.argmax()] or df.idxmax() gets the value of the index at that position.
import pandas as pd
import pandas_datareader as web
import datetime as dt
from datetime import timedelta
import matplotlib.pyplot as plt
start_date = pd.to_datetime('2019-11-1')
# pd.datetime is no longer available in pandas; use the datetime module,
# which this script already imports as dt.
end_date = dt.datetime.today()
# Leave 30 days of empty space to the right of the last observation.
plot_end = end_date + timedelta(30)

df = web.DataReader('^gspc', 'yahoo', start_date, end_date)
df = df['Adj Close']

fig, ax1 = plt.subplots()
ax1.plot(df)
# hlines() takes its x positions in data coordinates, so the line can start
# at the index label of the maximum (df.idxmax()) and run to the plot's end.
ax1.hlines(df.max(), df.idxmax(), plot_end, color='crimson', ls=':')
ax1.set_xlim(start_date, plot_end)
ax1.set_ylim(df.min() - 200, df.max() + 200)
plt.show()
Related
I have a DataFrame with several columns and date as index. I use sns.heatmap to plot it, with the date on the y-axis. I would like to force the ticks to display the 1st of October of every year only. I used the solution given by @Ayrton Bourn on "Date axis in heatmap seaborn", which allows me to change the frequency of ticks but not the day at which the date is displayed.
His method is the only one that allows me to choose the frequency of y-ticks so far. I tried using mdates.YearLocator() or set_major_locator without success.
With the code below, do you have any suggestion that would allow me to choose the frequency of date ticks (every year) and the day displayed (every '200x-10-01' for example) ?
import numpy as np
from datetime import date, datetime, timedelta
import pandas as pd
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from collections.abc import Iterable
from sklearn import linear_model
class AxTransformer:
    """Map axis tick values (numbers or dates) to axis tick locations.

    ``fit`` learns a linear relation between the values written on an axis'
    current tick labels and the locations of those ticks; ``transform`` then
    predicts the location for arbitrary values.
    """

    def __init__(self, datetime_vals=False):
        """Create an unfitted transformer.

        Parameters
        ----------
        datetime_vals : bool
            When true, tick values are parsed as dates and handled as
            integer nanosecond timestamps.
        """
        self.datetime_vals = datetime_vals
        self.lr = linear_model.LinearRegression()

    def process_tick_vals(self, tick_vals):
        """Return ``tick_vals`` as a NumPy array.

        A scalar or a single string is wrapped into a one-element sequence
        first. With ``datetime_vals`` enabled the values are parsed as dates
        and converted to int64 nanoseconds.
        """
        if not isinstance(tick_vals, Iterable) or isinstance(tick_vals, str):
            tick_vals = [tick_vals]

        if self.datetime_vals:
            # Pin both the time unit and the integer width: a bare
            # ``astype(int)`` depends on the platform's default int size and
            # on the resolution pandas happened to infer.
            tick_vals = (
                pd.to_datetime(tick_vals)
                .values.astype('datetime64[ns]')
                .astype('int64')
            )

        return np.array(tick_vals)

    def fit(self, ax, axis='x'):
        """Fit the linear model on the current ticks of ``ax``.

        Parameters
        ----------
        ax : object exposing ``get_xaxis()`` / ``get_yaxis()``
        axis : str
            ``'x'`` or ``'y'``; selects which axis' ticks are read.
        """
        axis = getattr(ax, f'get_{axis}axis')()

        tick_locs = axis.get_ticklocs()
        # Read label text through the public accessor rather than the
        # private ``_text`` attribute.
        tick_vals = self.process_tick_vals(
            [label.get_text() for label in axis.get_ticklabels()]
        )

        self.lr.fit(tick_vals.reshape(-1, 1), tick_locs)

    def transform(self, tick_vals):
        """Return the predicted tick locations for ``tick_vals``."""
        tick_vals = self.process_tick_vals(tick_vals)
        return self.lr.predict(tick_vals.reshape(-1, 1))
def set_date_ticks(ax, start_date, end_date, axis='y', date_format='%Y-%m-%d', **date_range_kwargs):
    """Place date ticks on ``ax`` for a pandas date range and label them.

    The dates come from ``pd.date_range(start_date, end_date,
    **date_range_kwargs)``. Their positions on the chosen axis are predicted
    by an ``AxTransformer`` fitted on the ticks the axis currently shows, and
    each tick is labelled with the date formatted by ``date_format``.

    Returns the same ``ax`` that was passed in.
    """
    tick_dates = pd.date_range(start_date, end_date, **date_range_kwargs)

    # Learn the value -> location mapping from the ticks already on the axis.
    transformer = AxTransformer(datetime_vals=True)
    transformer.fit(ax, axis=axis)

    tick_positions = transformer.transform(tick_dates)
    tick_texts = tick_dates.strftime(date_format)

    getattr(ax, f'set_{axis}ticks')(tick_positions)
    getattr(ax, f'set_{axis}ticklabels')(tick_texts)

    ax.tick_params(axis=axis, which='both', bottom=True, top=False, labelbottom=True)

    return ax
# Three years' worth of consecutive daily timestamps starting 2000-01-01.
base = datetime(2000, 1, 1)
arr = np.array([base + timedelta(days=i) for i in range(366*3)])
val = np.random.rand(len(arr),3)
df = pd.DataFrame(val, index = arr)
f, ax = plt.subplots(figsize=(20,20))
ax = sns.heatmap(df, ax = ax)
set_date_ticks(ax, '2000-01-01', '2003-12-01', freq='1Y')
# '% Y' (with a space) is not a valid strftime directive; '%Y' is the
# four-digit year.
ax.format_ydata = mdates.DateFormatter('%Y')
plt.show()
I found the solution by looking into the Kwargs.
To choose the dates displayed, the only thing to change resides in "freq =" (see here the detailed list of frequencies). In my case, in order to display yearly 1st of Octobers, I just have to change the start date to be a first of October, and the frequency to specify that I want the ticks to be at the beginning of every 12th month (the end date does not matter):
f, ax = plt.subplots(figsize=(20,20))
ax = sns.heatmap(df, ax = ax)
# Start on a 1st of October and step 12 month-starts at a time, so every
# tick lands on 1 October.
set_date_ticks(ax, '2000-10-01', '2004-10-01', freq='12MS')
# '% Y' (with a space) is not a valid strftime directive; '%Y' is the
# four-digit year.
ax.format_ydata = mdates.DateFormatter('%Y')
plt.show()
I have this dataframe:
dates;A;B;C
2018-01-31;1;2;5
2018-02-28;1;4;3
2018-03-31;1;5;5
2018-04-30;1;6;3
2018-05-31;1;6;7
2018-06-30;1;7;3
2018-07-31;1;9;9
2018-08-31;1;2;3
2018-09-30;1;2;10
2018-10-31;1;4;3
2018-11-30;1;7;11
2018-12-31;1;2;3
I read it:
dfr = pd.read_csv('test.dat', sep=';', header = 0, index_col=0, parse_dates=True)
and then I try to plot it:
# Bar width, expressed in the same units as the date numbers in x.
width = 5
dfr.index = pd.to_datetime(dfr.index)
# Convert the datetime index to matplotlib date numbers so it can be shifted
# arithmetically below.
x = date2num(dfr.index)
# Two bars per date, shifted half a width to the left and right of the date:
# the columns at positions 1 and 2 of dfr.
axs.bar(x-0.5*width,dfr.iloc[:,1], width=width)
axs.bar(x+0.5*width,dfr.iloc[:,2], width=width)
# Tell the x axis to interpret its values as dates.
axs.xaxis_date()
months = dates.MonthLocator()
# NOTE(review): the \textbf{...} wrapper looks like LaTeX markup — presumably
# this relies on LaTeX text rendering being enabled; confirm.
axs.xaxis.set_major_formatter(dates.DateFormatter(r'\textbf{%B}'))
# NOTE(review): months_f is assigned but never used in this snippet.
months_f = dates.DateFormatter('%B')
axs.xaxis.set_major_locator(months)
plt.setp( axs.xaxis.get_majorticklabels(), rotation=90)
here the modules imported:
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import datetime
import pandas as pd
import matplotlib.dates as dates
and here the result:
I do not get why x label starts with 'Feb'.
I would like to have something like 'Jan,Feb,Mar...' as x labels in the x axis.
Thanks in advance
The heights of the bar charts you made do not correspond to the labelled month, i.e. the values for Feb are actually those of Jan. Therefore, the problem is in the way you labelled the axis rather than having an incorrect plot order.
I'm not so familiar with the packages you used, so I propose a different way of making your plot:
dfr['dates'] = pd.to_datetime(dfr['dates'])

### group by months
rows_by_month = dfr.groupby(dfr['dates'].map(lambda stamp: stamp.month))
month_vals = sorted(rows_by_month, key=lambda pair: pair[0])

fig, axs = plt.subplots()
spacing = 0.15

### Split the (month, sub-frame) pairs into the month numbers and their dataframes
months, df_months = zip(*month_vals)

### Each month has exactly one entry here; summing also covers months with several rows
b_totals = [frame.loc[:, 'B'].sum() for frame in df_months]
c_totals = [frame.loc[:, 'C'].sum() for frame in df_months]
left_positions = [m - spacing for m in months]
right_positions = [m + spacing for m in months]
axs.bar(left_positions, b_totals, width=0.3)
axs.bar(right_positions, c_totals, width=0.3)

axs.set_xticks(months)
### 1900 and 1 are dummy values; the date only serves to format the month name
month_names = [datetime.date(1900, m, 1).strftime('%b') for m in months]
axs.set_xticklabels(month_names)
Output:
I have a simple dataframe I am plotting in matplotlib. However, the plot is showing the range of the dates, rather than just the two observed data points.
How can I only plot the two data points and not the range of the dates?
df structure:
Date Number
2018-01-01 12:00:00 1
2018-02-01 12:00:00 2
Output of the matplotlib code:
Here is what I expected (this was done using a string and not a date on the x-axis data):
df code:
import pandas as pd

# Two observations, one month apart, indexed by their parsed timestamps.
records = [
    ['2018-01-01 12:00:00', 1],
    ['2018-02-01 12:00:00', 2],
]
df = pd.DataFrame(records, columns=['Date', 'Number'])
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
Plot code:
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots(figsize=(4, 5), dpi=72)

# starts the bar chart creation
width = 0.75
bar_style = dict(
    align='center',
    color=('#666666', '#333333'),
    edgecolor='#FF0000',
    linewidth=2,
)
ax1.bar(df.index, df['Number'], width, **bar_style)

ax1.set_ylim(0, 3)
ax1.set_ylabel('Score')
fig.autofmt_xdate()

# Title
plt.title('Scores by group and gender')
plt.tight_layout()
plt.show()
Try adding something like:
import matplotlib.dates as mdates
# Label x ticks using the '%y-%m-%d' date pattern.
myFmt = mdates.DateFormatter('%y-%m-%d')
ax1.xaxis.set_major_formatter(myFmt)
# Use the values of df.index as the x tick locations.
plt.xticks(df.index)
I think the dates are transformed to large integers at the time of the plot, so width = 0.75 is very small. Try something bigger (like width = 20):
Matplotlib bar plots are numeric in nature. If you want a categorical bar plot instead, you may use pandas bar plots.
df.plot.bar()
You may then want to beautify the labels a bit
import matplotlib.pyplot as plt
import pandas as pd

records = [
    ['2018-01-01 12:00:00', 1],
    ['2018-02-01 12:00:00', 2],
]
df = pd.DataFrame(records, columns=['Date', 'Number'])
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

# pandas draws one bar per row (categorical), not one per point in time.
ax = df.plot.bar()
ax.tick_params(axis="x", rotation=0)

# Keep only the part of each tick label before the first whitespace.
date_only_labels = [tick.get_text().split()[0] for tick in ax.get_xticklabels()]
ax.set_xticklabels(date_only_labels)
plt.show()
Ok I'm trying to do something that should be trivial but instead I've spent more time than I'd like to admit searching google and stack overflow only to become more frustrated.
What I'm trying to do: I'd like to format my x-axis on a seaborn tsplot.
What my Stack Overflow searching has told me: matplotlib has a set_major_formatter function, but I can't seem to use it without tripping an overflow error.
What I'm looking for: a simple way to convert datetime64[ns] to a float that can be used with matplotlib's set_major_formatter.
Where I think I'm stuck:
df.date_action = df.date_action.values.astype('float')
# converts the field to a float but matplotlib expects seconds since 0001-01-01 not nano seconds since epoch
is there a simple way to do this that I'm missing?
the most helpful post I reviewed so far was
31255815 which got me 95% of the way there but not quite
here is some sample code to illustrate the issue
# standard imports
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
## generate fake data
from datetime import timedelta, date
import random
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Both arguments must support subtraction yielding a ``timedelta`` (for
    example ``datetime.date``). Nothing is yielded when ``end_date`` is not
    after ``start_date``.
    """
    # timedelta.days is already an int, so no conversion is needed.
    for day_offset in range((end_date - start_date).days):
        yield start_date + timedelta(day_offset)
start_date = date(2013, 1, 1)
end_date = date(2018, 6, 2)

# Build a random walk: every value is the previous one plus a fresh random step.
date_list = []
number_list = []
for single_date in daterange(start_date, end_date):
    date_list.append(single_date)
    step = random.random()
    number_list.append(step + number_list[-1] if number_list else step)

df = pd.DataFrame(data={'date_action': date_list, 'values': number_list})
# note my actual data comes in as a datetime64[ns]
df['date_action'] = df['date_action'].astype('datetime64[ns]')
# the following looked promising but is still offset an incorrect amount
#df.date_action = df.date_action.values.astype('float')
#df.date_action = df.date_action.to_datetime
## chart stuff
plt.clf()
import matplotlib.dates as mdates

# Constant column passed to tsplot as its unit.
df['dummy_01'] = 0

rows, cols = 1, 1
fig, axs = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 8))
ax1 = plt.subplot2grid((rows, cols), (0, 0))

# trying to format x-axis
for i in [ax1]:
    i.xaxis_date()
    i.xaxis.set_major_locator(mdates.AutoDateLocator())
    i.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

sns.tsplot(df, time='date_action', unit='dummy_01', value='values', ax=ax1)
plt.plot()
plt.show()
I am trying to annotate my plot with part of a dataframe. However, the time 00:00:00 is appearing in all the row labels. Is there a clean way to remove them since my data is daily in frequency? I have tried the normalize function but that doesn't remove the time; it just zeroes the time.
Here is what the issue looks like and the sample code to reproduce the issue.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The table helper moved from pandas.tools.plotting to pandas.plotting; fall
# back to the old location only for very old pandas releases.
try:
    from pandas.plotting import table
except ImportError:
    from pandas.tools.plotting import table
# Setup of mock data
date_range = pd.date_range('2014-01-01', '2015-01-01', freq='MS')
# np.random.rand treats its arguments as array dimensions, so rand(0, 10, n)
# builds an empty 3-D array. uniform(low, high, size) draws n values in [0, 10).
df = pd.DataFrame({'Values': np.random.uniform(0, 10, len(date_range))}, index=date_range)
# The plotting of the table
fig7 = plt.figure()
ax10 = plt.subplot2grid((1, 1), (0, 0))
table(ax10, np.round(df.tail(5), 2), loc='center', colWidths=[0.1] * 2)
fig7.show()
Simply access the .date attribute of the DatetimeIndex so that every individual element of your index is represented in datetime.date format.
The default DatetimeIndex format is datetime.datetime, which gets defined automatically even if you didn't explicitly define your index that way before.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The table helper moved from pandas.tools.plotting to pandas.plotting; fall
# back to the old location only for very old pandas releases.
try:
    from pandas.plotting import table
except ImportError:
    from pandas.tools.plotting import table
np.random.seed(42)
# Setup of mock data
date_range = pd.date_range('2014-01-01', '2015-01-01', freq='MS')
df = pd.DataFrame({'Values': np.random.rand(len(date_range))}, date_range)
# Replace the timestamps with plain datetime.date objects so the row labels
# carry no time-of-day part.
df.index = df.index.date # <------ only change here
# The plotting of the table
fig7 = plt.figure()
ax10 = plt.subplot2grid((1, 1), (0, 0))
table(ax10, np.round(df.tail(5), 2), loc='center', colWidths=[0.1] * 2)
fig7.show()