Formatting index of a pandas table in a plot - pandas

I am trying to annotate my plot with part of a dataframe. However, the time 00:00:00 is appearing in all the row labels. Is there a clean way to remove them since my data is daily in frequency? I have tried the normalize function but that doesn't remove the time; it just zeroes the time.
Here is what the issue looks like and the sample code to reproduce the issue.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# pandas.tools.plotting no longer exists; table is provided by pandas.plotting
from pandas.plotting import table

# Setup of mock data: one random value per month start in the range
date_range = pd.date_range('2014-01-01', '2015-01-01', freq='MS')
# rand() takes only dimensions, so rand(0, 10, n) would build an empty 3-D
# array; one value per date is what the column needs
df = pd.DataFrame({'Values': np.random.rand(len(date_range))}, index=date_range)

# The plotting of the table: last five rows, rounded to two decimals
fig7 = plt.figure()
ax10 = plt.subplot2grid((1, 1), (0, 0))
table(ax10, np.round(df.tail(5), 2), loc='center', colWidths=[0.1] * 2)
fig7.show()

Simply access the .date attribute of the DatetimeIndex so that every individual element of your index is represented as a datetime.date object.
By default, the elements of a DatetimeIndex are full timestamps (datetime.datetime-like), and this happens automatically even if you did not explicitly define your index that way.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# pandas.tools.plotting no longer exists; table is provided by pandas.plotting
from pandas.plotting import table

# Fixed seed so the random values are the same on every run
np.random.seed(42)

# Setup of mock data: one random value per month start in the range
date_range = pd.date_range('2014-01-01', '2015-01-01', freq='MS')
df = pd.DataFrame({'Values': np.random.rand(len(date_range))}, date_range)
# Keep only the date part of each index label  <------ only change here
df.index = df.index.date

# The plotting of the table: last five rows, rounded to two decimals
fig7 = plt.figure()
ax10 = plt.subplot2grid((1, 1), (0, 0))
table(ax10, np.round(df.tail(5), 2), loc='center', colWidths=[0.1] * 2)
fig7.show()

Related

How to make a scatterplot in pandas readable

I've been playing with Titanic dataset and working through some visualisations in Pandas using this tutorial. https://www.kdnuggets.com/2023/02/5-pandas-plotting-functions-might-know.html
I produced a scatterplot visual using this code.
import pandas as pd
# IPython/Jupyter magic: valid only in a notebook or IPython session,
# not in a plain Python script
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data set; the relative path is resolved against the current
# working directory
df = pd.read_csv('train.csv')
I was confused by the bootstrap plot result, so I went on to the scatterplot.
# Draw the scatter matrix for df on a 10x10-inch figure, then display it
pd.plotting.scatter_matrix(df, figsize=(10, 10))
plt.show()
I can sort of interpret it but I'd like to put the various variables at top and bottom of every column. Is that doable?
You can use:
# 4x3 grid of axes; each scatterplot is drawn into one cell via ax=
fig, ax = plt.subplots(4, 3, figsize=(20, 15))
# NOTE(review): `dataset` and the 'bedrooms'/'bathrooms'/'price' columns are
# not defined in this snippet -- substitute your own DataFrame and columns.
# `whis` is a boxplot option, not a scatterplot keyword, so it is not passed.
sns.scatterplot(x='bedrooms', y='price', data=dataset, ax=ax[0, 0])
sns.scatterplot(x='bathrooms', y='price', data=dataset, ax=ax[0, 1])

Creating 3D scatter chart in Taipy

I was wondering how one would create a 3D scatter chart in Taipy.
I tried this code initially:
import pandas as pd
import numpy as np
from taipy import Gui
# 100 random integer points in [0, 100) with columns x, y and z
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
# Random cluster label (0, 1 or 2) for every point
df['cluster1']=np.random.randint(0,3,100)
# NOTE(review): the chart below asks for color=cluster, but the column
# created above is named 'cluster1' -- confirm which name is intended
my_page ="""
Creation of a 3-D chart:
<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|mode=markers|color=cluster|>
"""
Gui(page=my_page).run()
This does indeed display a 3D plot, but the colors (clusters) do not show up.
Any hint?
Yes, you need some massaging of your dataframes to do it.
Here's a sample code that achieves this:
import pandas as pd
import numpy as np
from taipy import Gui
# 100 random integer points in [0, 100), each with a cluster label 0..2
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 3)), columns=list('xyz'))
df['cluster1'] = np.random.randint(0, 3, 100)

# Create a list of 3 dataframes, one per cluster
datas = [df[df['cluster1'] == i] for i in range(3)]

# Create the property dict dynamically, one set of x/y/z/name keys per
# dataframe.
# str(i) points to a dataframe index
# "/x" points to the column value in the selected dataframe
properties = {}
for i in range(len(datas)):
    properties[f"x[{i+1}]"] = str(i)+"/x"
    properties[f"y[{i+1}]"] = str(i)+"/y"
    properties[f"z[{i+1}]"] = str(i)+"/z"
    properties[f'name[{i+1}]'] = str(i+1)
print(properties)

chart = "<|{datas}|chart|type=Scatter3D|properties={properties}|mode=markers|height=800px|>"
Gui(page=chart).run()
In fact, with the new release: Taipy 1.1, this is very easy to do in a few lines of code:
import pandas as pd
import numpy as np
from taipy import Gui
# Color assigned to each cluster label
color_map = {0: "blue", 1: 'green', 2: "red"}
# 100 random integer points in [0, 100), each with a cluster label 0..2
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 3)), columns=list('xyz'))
df['cluster1'] = np.random.randint(0, 3, 100)
# Look up each label's color directly on the column instead of applying a
# Python function row by row
df['cluster_colors'] = df['cluster1'].map(color_map)
# The marker color is taken from the 'cluster_colors' column
marker = {"color": "cluster_colors"}
chart = "<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|marker={marker}|mode=markers|height=800px|>"
Gui(page=chart).run()
If you want to leave it to Taipy to pick the colors for you, then you can simply use:
import pandas as pd
import numpy as np
from taipy import Gui
# 100 random integer points in [0, 100) with columns x, y and z
df = pd.DataFrame(np.random.randint(0,100,size=(100, 3)), columns=list('xyz'))
# Random cluster label (0, 1 or 2) for every point
df['cluster1'] = np.random.randint(0,3,100)
# Point the marker color at the numeric cluster column itself rather than
# at a column of color names
marker = {"color":"cluster1"}
# `df` and `marker` are referenced by name inside the chart markup
chart = "<|{df}|chart|type=Scatter3D|x=x|y=y|z=z|marker={marker}|mode=markers|height=800px|>"
Gui(page=chart).run()

seaborn.swarmplot problem with symlog scale: zeros are not expanded

I have a data set of positive values and zeros that I would like to show on a log scale. To represent the zeros I use the 'symlog' option, but all zero values are mapped onto a single point in the swarmplot. How can I fix it?
import numpy as np
import seaborn as sns
import pandas as pd
import random
import matplotlib.pyplot as plt
n = 100
# n exact zeros, n values in [0, 1], n copies of 5, then two wider ranges
x = np.concatenate(
    ([0]*n, np.linspace(0, 1, n), [5]*n, np.linspace(10, 100, n), np.linspace(100, 1000, n)),
    axis=None,
)
# Each value is assigned to one of four categories at random
data = pd.DataFrame({'value': x, 'category': random.choices([0, 1, 2, 3], k=len(x))})
f, ax = plt.subplots(figsize=(10, 6))
# The axis-specific keyword `linthreshy` was removed from matplotlib;
# `linthresh` is the current name for the linear-region threshold
ax.set_yscale("symlog", linthresh=1.e-2)
ax.set_ylim(ymax=1000)
# Draw onto the axes configured above
sns.swarmplot(x="category", y="value", data=data, ax=ax)
sns.despine(left=True)
link to the resulting plot

How can I graph a candlestick chart with a DataFrame in Python?

I am not able to figure out how to graph a candlestick OHLC chart with python. Ever since matplotlib.finance was deprecated I've had this issue... Thanks for your help!
The DataFrame "quotes" is loaded from an Excel file (which I can't paste here), and has the following columns:
Index(['Date', 'Open', 'High', 'Low', 'Close'], dtype='object')
I also have a default index. The 'Date' column is a pandas._libs.tslibs.timestamps.Timestamp
When I run the code I get the following error:
File "", line 30, in
candlestick_ohlc(ax, zip(mdates.date2num(quotes.index.to_pydatetime()),
AttributeError: 'RangeIndex' object has no attribute 'to_pydatetime'
Here is my code:
import datetime
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
# Parentheses are required for an import list that continues on the next line
from matplotlib.dates import (MONDAY, DateFormatter, DayLocator,
                              WeekdayLocator)
from mpl_finance import candlestick_ohlc

date1 = "2004-2-1"
date2 = "2004-4-12"
mondays = WeekdayLocator(MONDAY)        # used for the major ticks
alldays = DayLocator()                  # used for the minor ticks
weekFormatter = DateFormatter('%b %d')  # month abbreviation and day
dayFormatter = DateFormatter('%d')      # day of month only
fig, ax = plt.subplots()
fig.subplots_adjust(bottom=0.2)
ax.xaxis.set_major_locator(mondays)
ax.xaxis.set_minor_locator(alldays)
ax.xaxis.set_major_formatter(weekFormatter)
# NOTE: `quotes` is the DataFrame described in the question; it is not
# created in this snippet. With a default RangeIndex, the call below raises
# the AttributeError shown in the question, because only a datetime index
# provides to_pydatetime().
candlestick_ohlc(ax, zip(mdates.date2num(quotes.index.to_pydatetime()),
                         quotes['Open'], quotes['High'],
                         quotes['Low'], quotes['Close']),
                 width=0.6)
ax.xaxis_date()
ax.autoscale_view()
# Rotate the tick labels so that they do not overlap
plt.setp(plt.gca().get_xticklabels(), rotation=45,
         horizontalalignment='right')
plt.show()
If you don't specify an index while building your DataFrame, it will default to a RangeIndex that just numbers your rows consecutively. This RangeIndex is obviously not convertible to a date -- hence the error. The read_excel function takes index_col as a parameter to specify which column to use as an index. You might also have to provide parse_dates=True.

convert pandas datetime64[ns] to matplotlib date-float for date x-axis in seaborn tsplot

Ok I'm trying to do something that should be trivial but instead I've spent more time than I'd like to admit searching google and stack overflow only to become more frustrated.
What I'm trying to do: I'd like to format my x-axis on a seaborn tsplot.
What my Stack Overflow searching has told me: matplotlib has a set_major_formatter function, but I can't seem to use it without tripping an overflow error.
What I'm looking for: a simple way to convert datetime64[ns] to a float that can be used with matplotlib's set_major_formatter.
Where I think I'm stuck:
df.date_action = df.date_action.values.astype('float')
# converts the field to a float, but the result is nanoseconds since the
# epoch, which is not the unit matplotlib's date axis expects
# NOTE(review): matplotlib date numbers are documented as days (not seconds)
# since its own epoch -- confirm against the matplotlib.dates documentation
is there a simple way to do this that I'm missing?
the most helpful post I reviewed so far was
31255815 which got me 95% of the way there but not quite
here is some sample code to illustrate the issue
# standard imports
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
## generate fake data
from datetime import timedelta, date
import random
def daterange(start_date, end_date):
    """Yield each date from start_date up to, but not including, end_date.

    Args:
        start_date: first date to yield.
        end_date: exclusive upper bound; nothing is yielded when it is not
            later than start_date.

    Yields:
        start_date advanced by 0, 1, 2, ... whole days.
    """
    # timedelta.days is already an int, so no int() conversion is needed
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)
start_date = date(2013, 1, 1)
end_date = date(2018, 6, 2)
date_list = []
number_list = []
# One entry per day; each value is the previous value plus a random step
# in [0, 1), so the series only ever increases
for single_date in daterange(start_date, end_date):
    date_list.append(single_date)
    if number_list:
        number_list.append(random.random() + number_list[-1])
    else:
        # First value: there is no previous entry to build on
        number_list.append(random.random())
df = pd.DataFrame(data={'date_action': date_list, 'values': number_list})
# note my actual data comes in as a datetime64[ns]
df['date_action'] = df['date_action'].astype('datetime64[ns]')
# the following looked promising but is still offset an incorrect amount
#df.date_action = df.date_action.values.astype('float')
#df.date_action = df.date_action.to_datetime
## chart stuff
plt.clf()
import matplotlib.dates as mdates
# Constant column passed to tsplot as the `unit` argument below
df['dummy_01'] = 0
rows = 1
cols = 1
fig, axs = plt.subplots(nrows=rows, ncols=cols, figsize=(10, 8))
ax1 = plt.subplot2grid((rows, cols), (0, 0))
for i in [ax1]:  # trying to format x-axis
    i.xaxis_date()
    i.xaxis.set_major_locator(mdates.AutoDateLocator())
    i.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# NOTE(review): sns.tsplot is not available in recent seaborn releases --
# confirm that the installed version still provides it
sns.tsplot(df, time='date_action', unit='dummy_01',
           value='values', ax=ax1)
plt.plot()
plt.show()