dataframe style gradient shading based on another dataframe - pandas

I'm trying to apply gradient shading to a dataframe based on the values of another dataframe (same dimensions).
I have the code below, based on an answer to a similar question. However, I need the shading to have the effect of "axis=None", whereas the code below applies a column-wise shade.
# Imports come first: the frames below need both numpy and pandas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors

A = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])
B = pd.DataFrame(np.random.randn(6, 3), columns=['a', 'b', 'c'])


def b_g(s, cmap='PuBu', low=0, high=0):
    """Return background-color CSS for one column, shaded by dataframe A.

    Parameters
    ----------
    s : pandas.Series
        The column being styled; only ``s.name`` is used, to select the
        column of the same name from ``A``.
    cmap : str
        Name of the Matplotlib colormap to sample.
    low, high : float
        Fractions of the column's value range by which the normalisation
        bounds are extended below the minimum / above the maximum.

    Returns
    -------
    list of str
        One ``'background-color: #rrggbb'`` entry per row of the column.
    """
    # Pass the columns from Dataframe A
    a = A.loc[:, s.name].copy()
    rng = a.max() - a.min()
    # The bounds are computed per column, which is why the shading is
    # column-wise rather than global.
    norm = colors.Normalize(a.min() - (rng * low),
                            a.max() + (rng * high))
    normed = norm(a.values)
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # recent Matplotlib releases.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]


B.style.apply(b_g, cmap='PuBu')

Related

Not able to create a 3x3 grid of subplots to visualize 9 Series individually

I want to have a 3x3 grid of subplots to visualize each Series individually.
I first created some toy data:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='whitegrid', rc={"figure.figsize": (14, 6)})

# Seeded generator so the toy data is reproducible.
rs = np.random.RandomState(444)
dates = pd.date_range(start="2009-01-01", end='2019-12-31', freq='1D')
columns = ['a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm']
# One cumulative random walk per column. The shape is derived from the
# index and the column list (4017 x 12 here) instead of being hard-coded,
# so changing the date range cannot desynchronise it.
values = rs.randn(len(dates), len(columns)).cumsum(axis=0)
data = pd.DataFrame(values, dates, columns=columns)
Here is the first code I wrote:
# First attempt (kept as written to show the problem).
# n_cols is the list of column names to plot; it is defined with the
# second attempt further down.
fig, ax = plt.subplots(3, 3, sharex=True, sharey=True)
for col in n_cols:
    # No ax= is passed, so the subplot grid created above is never
    # addressed explicitly and every call draws onto the same Axes.
    # Rebinding `ax` here also discards the 3x3 array of Axes.
    ax = data[col].plot()
With these lines of code the problem is that I get the 3x3 grid, but all the columns have been plotted on the same subplot Axes, in the bottom right corner.
Bottom Right Corner with all Lines
Here is the second thing I tried:
# Second attempt (kept as written to show the problem).
n_cols = ['a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j']
fig, ax = plt.subplots(3, 3, sharex=True, sharey=True)
for col in n_cols:
    # The two inner loops visit all nine Axes for *every* column, so each
    # subplot ends up containing all nine series.
    for i in range(3):
        for j in range(3):
            ax[i, j].plot(data[col])
But now I get all the columns plotted on every single subplot Axes.
All AxesSubplot with same lines
And if I try something like this:
# Third attempt (kept as written to show the error).
# plt.subplots() is called without a grid shape here, so `ax` is a single
# Axes object rather than an array; indexing it with ax[i, j] fails.
fig, ax = plt.subplots(sharex=True, sharey=True)
for col in n_cols:
    for i in range(3):
        for j in range(3):
            ax[i, j].add_subplot(data[col])
But I get:
TypeError: 'AxesSubplot' object is not subscriptable
I am sorry but can't figure out what to do.
Currently you're plotting each series in each of the subplots:
# The outer loop runs once per column and the inner loops touch every
# subplot, so each of the nine Axes receives every series.
for col in n_cols:
    for i in range(3):
        for j in range(3):
            ax[i, j].plot(data[col])
Following your example code, here is a way to only plot a single series per subplot:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rs = np.random.RandomState(444)
dates = pd.date_range(start="2009-01-01", end='2019-12-31', freq='1D')
values = rs.randn(4017, 12).cumsum(axis=0)
data = pd.DataFrame(values, dates, columns=['a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm'])

# The nine columns to show, one per subplot.
n_cols = ['a', 'b', 'c', 'd', 'e', 'f', 'h', 'i', 'j']
fig, ax = plt.subplots(3, 3, sharex=True, sharey=True)

# ax is a 3x3 array of Axes. ax.flat walks it row by row, which is the
# same order as indexing ax[i, j] with n_cols[i*3 + j], so each subplot
# is paired with exactly one column.
for subplot, col_name in zip(ax.flat, n_cols):
    subplot.plot(data[col_name])
plt.show()

matplotlib scatter with c=date

How can I plot a pandas dataframe like the one below with x on the x-axis, the values on the y-axis (one line per row) and the lines colored by date?
import pandas as pd

# One row of y-values per date; the columns are the shared x positions.
values = [[0.2, 3.1, 17.4, 28.9, 57.7, 76.9, 82.8, 87.6, 92.4, 98.9, 100.0],
          [0.2, 2.1, 15.5, 26.0, 54.2, 75.6, 82.1, 87.4, 92.4, 98.9, 100.0]]
x = [0.1, 0.2, 0.315, 0.4, 0.63, 1, 1.25, 1.6, 2, 3.15, 4]
# One consecutive day per row, so the date column always matches the
# number of rows in `values`.
dates = pd.date_range(start='2017-07-01', freq='D', periods=len(values))
data = pd.DataFrame(data=values, columns=x)
data['dates'] = dates
edit: sorry for not being precise.
Is there a way to set the colors of the lines according to a column of Timestamps using data[x].T.plot(kind='line', legend=False)?
If this is not possible, how to set "c" in plt.scatter to an array of Timestamps?
edit: the plot should look like this but should have a colorbar instead of a legend
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# create test data with a structure similar to the real data
x_values = np.linspace(1, 10, 8)
dat = np.random.randn(100, 8)
df = pd.DataFrame(data=np.abs(dat), columns=x_values)
df = df.cumsum(axis=1)
# scale every row so that its largest value is 1
df = df.divide(df.max(axis=1), axis='index')

# create a discontinuous date range and add it to the data frame
dates = pd.date_range(start='2016-01-01', end='2017-05-01', freq='D')
dates = dates[(dates < '2016-07-01') | (dates > '2017-03-01')]
df['date'] = sorted(random.sample(dates.date.tolist(), 100))

# create a dataframe with a continuous date range (see df) and corresponding colors
drange = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
colors = list(plt.cm.jet(np.linspace(0, 1, drange.shape[0])))
# Build the frame from a dict of columns. Stacking the dates and the RGBA
# rows into a single numpy array creates a ragged array, which current
# NumPy versions reject.
cdf = pd.DataFrame({'date': drange.date, 'colors': colors})

# and merge colors to data (joins on the shared 'date' column)
data = pd.merge(df, cdf)

# plot all data row by row with the color of each line
# matching the date column
fig, ax = plt.subplots()
for idx in data.index:
    ax.plot(x_values, data.loc[idx, x_values],
            linestyle='-', alpha=0.75,
            color=data.loc[idx, 'colors'],
            label=data.loc[idx, 'date'])

# reduce entries of legend: keep roughly every tenth line
handles, labels = ax.get_legend_handles_labels()
# A slice step of 0 is invalid, so never go below 1 (fewer than 10 rows).
entries = max(1, data.shape[0] // 10)
handles = handles[::entries]
labels = labels[::entries]
ax.legend(handles, labels)

Color code a pandas plot based on column values

I have a large pandas dataframe that I want to create a plot - here is a simplified example:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Iterator over four evenly spaced rainbow colours; each next(colors)
# call hands out the following one.
colors = iter(cm.rainbow(np.linspace(0, 1, num=4)))

# X restarts at 1 whenever a new line segment begins.
sample = pd.DataFrame(
    {
        'X': [1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3],
        'Y': [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4],
    }
)
I want to create a color coded line plot, with the color changing based on the X column values (the X values always repeat the same numbers, but the length is not always the same) - the default plot is:
# Default behaviour: the whole frame is drawn as one single-coloured line.
plt.plot(sample['X'], sample['Y'], linestyle='-')
but I want to automate the process, so that every time the X column restarts, the line gets a new color - this is the result that I want to have:
# Desired result, written out by hand: one plot call per run of X values,
# each taking the next colour from the iterator.
plt.plot(sample['X'].iloc[0:3], sample['Y'].iloc[0:3], linestyle='-', color=next(colors))
plt.plot(sample['X'].iloc[3:6], sample['Y'].iloc[3:6], linestyle='-', color=next(colors))
plt.plot(sample['X'].iloc[6:8], sample['Y'].iloc[6:8], linestyle='-', color=next(colors))
plt.plot(sample['X'].iloc[8:], sample['Y'].iloc[8:], linestyle='-', color=next(colors))
any suggestion on how to achieve this?
I would build on what you already proposed:
# Row ranges of the individual segments (each run of X values).
slices = [slice(0, 3), slice(3, 6), slice(6, 8), slice(8, None)]
# zip pairs every segment with the next colour and stops at the shorter
# of the two sequences.
for _slice, color in zip(slices, colors):
    plt.plot(sample['X'][_slice], sample['Y'][_slice], c=color)
plt.show()
alternatively, if you add an extra column to your data:
# Same data with an explicit group id per segment. G is used directly as
# an index into the colour table below, so it must hold 0-based integers.
sample2 = pd.DataFrame({'X': [1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3],
                        'Y': [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4],
                        'G': [0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3]})
# One colour per distinct group instead of a hard-coded count.
colors = cm.rainbow(np.linspace(0, 1, sample2['G'].nunique()))
# Group by the scalar key 'G' (not the list ['G']) so that `name` is the
# plain group value rather than a one-element tuple.
for name, group in sample2.groupby('G'):
    plt.plot(group['X'], group['Y'], c=colors[name])
plt.show()

Plotting lists with different number of elements in matplotlib

I have a list of numpy arrays, each potentially having a different number of elements, such as:
[array([55]),
array([54]),
array([], dtype=float64),
array([48, 55]),]
I would like to plot this, where each array has an abscissa (x value) assigned, such as [1,2,3,4] so that the plot should show the following points: [[1,55], [2, 54], [4, 48], [4, 55]].
Is there a way I can do that with matplotlib? Or how can I transform the data with numpy or pandas first so that it can be plotted?
What you want to do is chain the original arrays and generate a new array with the "abscissas". There are many ways to concatenate; one of the most efficient is using itertools.chain.
import itertools
from numpy import array

x = [array([55]), array([54]), array([]), array([48, 55])]

# Flatten the arrays into one list of ordinates; empty arrays simply
# contribute nothing.
# this will be [55, 54, 48, 55]
ys = list(itertools.chain.from_iterable(x))

# generate abscissas: the 1-based position of each array, repeated once
# per element it contains
xs = [position for position, group in enumerate(x, start=1) for _ in group]
Now you can just plot easily with matplotlib as below
import matplotlib.pyplot as plt
# Draw markers only: the data are separate points, and the default line
# style would join unrelated points with line segments.
plt.plot(xs, ys, 'o')
plt.show()
If you want to have different markers for different groups of data (the colours are automatically cycled by matplotlib):
import numpy as np
import matplotlib.pyplot as plt
markers = ['o',  # 'circle',
           'v',  # 'triangle_down',
           '^',  # 'triangle_up',
           '<',  # 'triangle_left',
           '>',  # 'triangle_right',
           '1',  # 'tri_down',
           '2',  # 'tri_up',
           '3',  # 'tri_left',
           '4',  # 'tri_right',
           '8',  # 'octagon',
           's',  # 'square',
           'p',  # 'pentagon',
           'h',  # 'hexagon1',
           'H',  # 'hexagon2',
           'D',  # 'diamond',
           'd',  # 'thin_diamond'
           ]
n_markers = len(markers)

# One random data set per marker, each with a random length of 0-9 values
# in [0, 10). range replaces the Python 2-only xrange.
a = [10. * np.random.random(int(np.random.random() * 10)) for _ in range(n_markers)]

fig = plt.figure()
ax = fig.add_subplot(111)
for i, data in enumerate(a):
    xs = data.shape[0] * [i]  # makes the abscissas list
    marker = markers[i % n_markers]  # picks a valid marker
    ax.plot(xs, data, marker, label='data %d, %s' % (i, marker))

# The axis limits, legend and layout are set once, after all groups are drawn.
ax.set_xlim(-1, 1.4 * len(a))
ax.set_ylim(0, 10)
ax.legend(loc=None)
fig.tight_layout()
Notice the limits to y scale are hard coded, change accordingly. The 1.4*len(a) is meant to leave room on the right side of the graph for the legend.
The example above has no point in the x=0 (would be dark blue circles) as the randomly assigned size for its data set was zero, but you can easily place a +1 if you don't want to use x=0.
Using pandas to create a numpy array with nans inserted when an array is empty or shorter than the longest array in the list...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
arr_list = [
    np.array([55]),
    np.array([54]),
    np.array([], dtype='float64'),
    np.array([48, 55]),
]

# Rows are the input arrays; shorter or empty ones are padded with NaN
# up to the width of the longest array.
df = pd.DataFrame(arr_list)
list_len, repeats = df.shape

# Row-major flattening keeps the values of one input array adjacent ...
vals = df.to_numpy().flatten()
# ... so each 1-based row number is repeated once per column.
xax = np.repeat(np.arange(1, list_len + 1), repeats)
df_plot = pd.DataFrame({'xax': xax, 'vals': vals})
# Scatter the ordinates against their abscissas.
plt.scatter(df_plot['xax'], df_plot['vals'])
with x your list :
# A plain loop instead of a list comprehension used only for its side
# effects. Abscissas start at 1 to match the [1, 2, 3, 4] requested in
# the question.
for i, group in enumerate(x, start=1):
    plt.plot(np.repeat(i, len(group)), group, '.')
plt.show()
@Alessandro Mariani's answer based on itertools made me think of another way to generate an array containing the data I needed. In some cases it may be more compact. It is also based on itertools.chain:
import itertools
from numpy import array

y = [array([55]), array([54]), array([]), array([48, 55])]
x = array([1, 2, 3, 4])

# Pair every abscissa with each value of its array; an empty array
# produces no pairs. The pairs are then stacked into an (n, 2) array.
d = array(list(itertools.chain.from_iterable(
    itertools.product([abscissa], group) for abscissa, group in zip(x, y)
)))
d is now the following array:
array([[ 1, 55],
[ 2, 54],
[ 4, 48],
[ 4, 55]])

Pandas bar plot -- specify bar color by column

Is there a simple way to specify bar colors by column name using the Pandas DataFrame.plot(kind='bar') method?
I have a script that generates multiple DataFrames from several different data files in a directory. For example it does something like this:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
data_files = ['a', 'b', 'c', 'd']

# Two frames whose column sets overlap only partly: a-c and b-d.
df1 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[:-1])
df2 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[1:])

# Side-by-side bar charts in a 1x2 grid.
df1.plot(kind='bar', ax=plt.subplot(1, 2, 1))
df2.plot(kind='bar', ax=plt.subplot(1, 2, 2))
plt.show()
With the following output:
Unfortunately, the column colors aren't consistent for each label in the different plots. Is it possible to pass in a dictionary of (filenames:colors), so that any particular column always has the same color. For example, I could imagine creating this by zipping up the filenames with the Matplotlib color_cycle:
data_files = ['a', 'b', 'c', 'd']
# The 'axes.color_cycle' rcParam no longer exists; the default colours
# are read from 'axes.prop_cycle' instead.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# print is a function on Python 3 and zip() is lazy, so materialise the
# pairs before printing.
# NOTE: the sample output shown below comes from the old single-letter
# default colours; newer Matplotlib defaults print hex strings instead.
print(list(zip(data_files, colors)))
[('a', u'b'), ('b', u'g'), ('c', u'r'), ('d', u'c')]
I could figure out how to do this directly with Matplotlib: I just thought there might be a simpler, built-in solution.
Edit:
Below is a partial solution that works in pure Matplotlib. However, I'm using this in an IPython notebook that will be distributed to non-programmer colleagues, and I'd like to minimize the amount of excessive plotting code.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
data_files = ['a', 'b', 'c', 'd']
# The 'axes.color_cycle' rcParam no longer exists; the default colours
# are read from 'axes.prop_cycle' instead.
mpl_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Fixed file-name -> colour mapping shared by every plot.
colors = dict(zip(data_files, mpl_colors))
def bar_plotter(df, colors, sub):
    """Draw ``df`` as grouped bars in one panel of a 1x2 subplot grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; one bar group per index value, one bar per column.
        The index values are used as x positions, so they must be numeric.
    colors : mapping
        Maps each column name of ``df`` to the colour of its bars.
    sub : int
        1-based position of the panel in the 1x2 grid.
    """
    ncols = df.shape[1]
    # Leave a gap worth two bar widths between neighbouring groups.
    width = 1. / (ncols + 2.)
    # Left edge of the first bar, so that each group is centred on its
    # index value.
    starts = df.index.values - width * ncols / 2.
    plt.subplot(1, 2, sub)
    for n, col in enumerate(df):
        # align='edge' because `starts` holds left edges; with plt.bar's
        # default centre alignment every group would be shifted by half a
        # bar width.
        plt.bar(starts + width * n, df[col].values, color=colors[col],
                width=width, label=col, align='edge')
    plt.xticks(df.index.values)
    plt.grid()
    plt.legend()
df1 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[:-1])
df2 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[1:])

# Left panel for df1, right panel for df2, both using the shared
# name -> colour mapping.
for panel, frame in enumerate((df1, df2), start=1):
    bar_plotter(frame, colors, panel)
plt.show()
You can pass a list as the colors. This will require a little bit of manual work to get it to line up, unlike if you could pass a dictionary, but may be a less cluttered way to accomplish your goal.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
data_files = ['a', 'b', 'c', 'd']
df1 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[:-1])
df2 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[1:])

# Colours in the same order as data_files.
color_list = ['b', 'g', 'r', 'c']

# df1 holds columns a-c, so it takes the list from the start; df2 holds
# columns b-d, so its colours start one position later.
df1.plot(kind='bar', ax=plt.subplot(1, 2, 1), color=color_list)
df2.plot(kind='bar', ax=plt.subplot(1, 2, 2), color=color_list[1:])
plt.show()
EDIT
Ajean came up with a simple way to return a list of the correct colors from a dictionary:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
data_files = ['a', 'b', 'c', 'd']
color_list = ['b', 'g', 'r', 'c']
# Column name -> colour lookup.
d2c = dict(zip(data_files, color_list))
df1 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[:-1])
df2 = pds.DataFrame(np.random.rand(4, 3), columns=data_files[1:])
# Build real lists of colours: on Python 3 map() returns a one-shot
# iterator rather than a list. Indexing with d2c[col] also fails loudly
# on a column that has no colour assigned instead of passing None.
df1.plot(kind='bar', ax=plt.subplot(121), color=[d2c[col] for col in df1.columns])
df2.plot(kind='bar', ax=plt.subplot(122), color=[d2c[col] for col in df2.columns])
plt.show()
Pandas version 1.1.0 makes this easier. You can pass a dictionary to specify different color for each column in the pandas.DataFrame.plot.bar() function:
Here is an example:
# This snippet uses the `pd` alias, so import it here to keep the example
# self-contained.
import pandas as pd

df1 = pd.DataFrame({'a': [1.2, .8, .9], 'b': [.2, .9, .7]})
df2 = pd.DataFrame({'b': [0.2, .5, .4], 'c': [.5, .6, .7], 'd': [1.1, .6, .7]})
# Column name -> colour; the same dict is passed to both plots.
color_dict = {'a': 'green', 'b': 'red', 'c': 'blue', 'd': 'cyan'}
df1.plot.bar(color=color_dict)
df2.plot.bar(color=color_dict)