coloring histogram with multiple colors where each color has a specified range - matplotlib

I was trying to load Excel data into Python and plot it as a histogram. My aim is to color the histogram according to specific ranges: every number smaller than 4 = yellow, numbers between 4 and 12 = orange, and so on. I encountered two problems. First, I don't get 4 separate histograms; the program plots everything in one graph. Second, one of the loops is obviously wrong, because it shows everything in yellow.
Could somebody help me find out which loop is wrong and why? Is there a better way to do this? I appreciate any help; I'm pretty new to programming.
import matplotlib.pyplot as plt
import pandas as pd
import openpyxl
# Load the workbook
path='C:/Users/akosw/OneDrive/Desktop/Programmieren/Geotechnikstammdaten.xlsx'
#workbook = openpyxl.load_workbook("Geotechnikstammdaten.xlsx","C:/Users/akosw/OneDrive/Desktop/Programmieren")
workbook = openpyxl.load_workbook(path)
# Select the sheet
sheet = workbook['Tabelle3']
# Extract the values from each column
columns = [[cell.value for cell in column] for column in zip(*sheet.rows)]
# Iterate over the columns
for i, values in enumerate(columns):
# Create the histogram
plt.hist(values, bins=50)
# Color the bars according to the specified rules
for patch in plt.gca().patches:
if patch.get_height() < 4:
patch.set_facecolor('yellow')
elif patch.get_height() < 12:
patch.set_facecolor('orange')
elif patch.get_height() < 26:
patch.set_facecolor('green')
elif patch.get_height() < 51:
patch.set_facecolor('blue')
else:
patch.set_facecolor('red')
plt.title(f'Column {i+1}')
plt.show()
# Save the histogram to a file
# plt.savefig(f'histogram_{i+1}.png')
Here is roughly what I'm trying to achieve, but instead of a bar chart I want a histogram.
from bisect import bisect
import matplotlib.pyplot as plt
import numpy as np
# Upper bounds of the value classes. bisect() returns, for a value v, the
# index of the class v falls into, which is also its index in OM_COLORS.
OM_VALUES = [4, 12, 26, 51]
# One color per class: len(OM_VALUES) + 1 entries (the last one is used for
# values >= 51).
OM_COLORS = ["yellow", "orange", "blue", "green", "red"]
data = [4, 6, 7, 7, 11, 16, 23, 30, 30, 27, 1, 3, 4, 33, 37, 39, 45, 51]
labels = range(len(data))
# Look the colors up through the OM_* constants defined above; the previous
# OENORM_* names were never defined and raised a NameError.
bar_colors = [OM_COLORS[bisect(OM_VALUES, v)] for v in data]
plt.barh(labels, data, height=1.0, color=bar_colors)
plt.title('Counts per depth')
plt.xlabel('Value')
plt.ylabel('Depth')
#plt.savefig('counts_depth.png')
plt.show()

Related

How to plot the relation between an array's columns and rows mean value

I'm a newcomer to Pandas and Matplotlib, trying to plot a relation between the mean value of my array's rows and columns. The result I'm looking for is something like this:
"linhas" refers to the rows and "colunas" refers to the columns. The Y label refers to the means and the X label refers to the number of columns in my array
I came up with some solutions, as shown below:
print(arr)
df = pd.DataFrame(arr)
display(df)
num_cols = [df.shape[1]]
print(type(num_cols))
print(num_cols)
cols = df.count(axis=1)
lcols = cols.tolist()
print(type(lcols))
col_mean = df[:].mean(axis=0)
print(type(col_mean))
col_mean.tolist()
row_mean = df[:].mean(axis=1)
print(type(row_mean))
row_mean.tolist()
print(type(row_mean))
print(row_mean)
dados = pd.DataFrame({
'Colunas': col_mean,
'Linhas': row_mean
}, index=lcols)
lines = dados.plot.line()
What I was looking after is something like this:
"linhas" refers to the rows and "colunas" refers to the columns. The Y label refers to the means and the X label refers to the number of columns in my array
Unfortunately, my output is totally wrong, as follows:
My output
Any help would be deeply appreciated, as I'm a bit lost right now.
Thanks in advance!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# just a dummy array
arr = np.array([[37, 68, 1, 19, 6],
[ 0, 14, 32, 73, 53],
[37, 85, 67, 30, 91],
[42, 52, 6, 42, 85],
[82, 26, 44, 38, 48],
[54, 55, 23, 46, 78]])
n_rows, n_cols = arr.shape
df = pd.DataFrame(arr)
col_mean = df.mean(axis=0)
row_mean = df.mean(axis=1)
plt.plot(range(1, n_rows+1), row_mean, marker='^', c='orange', label='rows')
plt.plot(range(1, n_cols+1), col_mean, marker='o', c='blue', label='cols')
plt.xlabel('Label x axis')
plt.ylabel('Label y axis')
plt.title('Title plotting')
plt.legend()

Setting independent colorbar scale to y-values of plot using matplotlib and proplot

I have a series of histograms that I plot over the top of each other using a for loop:
import matplotlib as plt
import proplot as pplt
cmap = colormap
fig = pplt.figure(figsize=(12, 10), dpi=300)
jj = [ 4, 3, 2, 1, 0]
for j in jj:
plt.fill_between(p[:,j], s[:, j], y2=0, alpha = 0.6, color = colormap[:,4-j], edgecolor=[0,0,0], linewidth=1.5)
The colormap in question is a manually specified list of RGB triplets (from Fabio Crameri's 'lajolla' map):
0.64566 0.823453 0.895061 0.924676 0.957142
0.277907 0.386042 0.526882 0.657688 0.803006
0.259453 0.301045 0.317257 0.331596 0.408285
Each color corresponds to data recorded under different conditions. I want the colorbar to have manually specified ticks corresponding to this variable (e.g. c = 30, 35, 40, 45, 50), but I can't seem to configure the colormap to not just pull the indices of the cmap matrix (0, 1, 2, 3, 4) as the values of the mapped variable. Trying to set the ticks outside of this range just result in them not being shown.
cbar = fig.colorbar(np.transpose(cmap))
cbar.set_ticks([30, 35, 40, 45, 50])
cbar.set_ticklabels([30, 35, 40, 45, 50])
Any idea how I can resolve this?
Tried shifting indices of colormap but this doesn't seem to work.
Trying to get the colorbar with ticks corresponding to the '30, 35, 40, 45, 50' values quoted above.

Plotting ways (linestrings) over a map in Python

This is my second try at the same question, and I really hope that someone can help me...
Even though some really nice people tried to help me, there is a lot I couldn't figure out despite their help.
From the beginning:
I created a dataframe. This dataframe is huge and gives information about travellers in a city. The dataframe looks like this. This is only the head.
In origin and destination you have the ids of the citylocations, in move how many travelled from origin to destination. longitude and latitude is where the exact point is and the linestring the combination of the points..
I created the linestring with this code:
erg2['Linestring'] = erg2.apply(lambda x: LineString([(x['latitude_origin'], x['longitude_origin']), (x['latitude_destination'], x['longitude_destination'])]), axis = 1)
Now my question is how to plot the ways over a map. Even though I tried all the examples from the geopandas documentation etc., I can't figure it out myself.
I cant show you what I already plotted because it doesnt make sense and I guess it would be smarter to start plotting from the beginning.
You see that in the column move there are some 0. This means that no one travelled this route. So this I dont need to plot..
I have to plot the lines with the information where the traveller started origin and where he went destination.
also I need to outline the different lines depending on movements..
with this plotting code
fig = px.line_mapbox(erg2, lat="latitude_origin", lon="longitude_origin", color="move",
hover_name= gdf["origin"] + " - " + gdf["destination"],
center =dict(lon=13.41053,lat=52.52437), zoom=3, height=600
)
fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=4, mapbox_center_lat = 52.52437,
margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
Maybe someone has an idea???
I tried it with this code:
import requests, io, json
import geopandas as gpd
import shapely.geometry
import pandas as pd
import numpy as np
import itertools
import plotly.express as px
# get some public addressess - hospitals. data that has GPS lat / lon
dfhos = pd.read_csv(io.StringIO(requests.get("http://media.nhschoices.nhs.uk/data/foi/Hospital.csv").text),
sep="¬",engine="python",).loc[:, ["OrganisationName", "Latitude", "Longitude"]]
a = np.arange(len(dfhos))
np.random.shuffle(a)
# establish N links between hospitals
N = 10
df = (
pd.DataFrame({0:a[0:N], 1:a[25:25+N]}).merge(dfhos,left_on=0,right_index=True)
.merge(dfhos,left_on=1, right_index=True, suffixes=("_origin", "_destination"))
)
# build a geopandas data frame that has LineString between two hospitals
gdf = gpd.GeoDataFrame(
data=df,
geometry=df.apply(
lambda r: shapely.geometry.LineString(
[(r["Longitude_origin"], r["Latitude_origin"]),
(r["Longitude_destination"], r["Latitude_destination"]) ]), axis=1)
)
# sample code https://plotly.com/python/lines-on-mapbox/#lines-on-mapbox-maps-from-geopandas
lats = []
lons = []
names = []
for feature, name in zip(gdf.geometry, gdf["OrganisationName_origin"] + " - " + gdf["OrganisationName_destination"]):
if isinstance(feature, shapely.geometry.linestring.LineString):
linestrings = [feature]
elif isinstance(feature, shapely.geometry.multilinestring.MultiLineString):
linestrings = feature.geoms
else:
continue
for linestring in linestrings:
x, y = linestring.xy
lats = np.append(lats, y)
lons = np.append(lons, x)
names = np.append(names, [name]*len(y))
lats = np.append(lats, None)
lons = np.append(lons, None)
names = np.append(names, None)
fig = px.line_mapbox(lat=lats, lon=lons, hover_name=names)
fig.update_layout(mapbox_style="stamen-terrain",
mapbox_zoom=4,
mapbox_center_lon=gdf.total_bounds[[0,2]].mean(),
mapbox_center_lat=gdf.total_bounds[[1,3]].mean(),
margin={"r":0,"t":0,"l":0,"b":0}
)
which looks like the perfect code but I cant really use it for my data..
I am very new to coding. So please be patient a bit;))
Thanks a lot in advance.
All the best
I previously answered this question: How to plot visualize a Linestring over a map with Python?. I suggested that you update that question, and I still recommend that you do.
Line strings IMHO are not the way to go. Plotly does not use line strings, so encoding to line strings only to decode them back into numpy arrays is extra complexity. Check out the examples in the official documentation: https://plotly.com/python/lines-on-mapbox/. There it is very clear that geopandas is just a source that has to be encoded into numpy arrays.
data
your sample data it appears should be one Dataframe and has no need for geopandas or line strings
almost all of your sample data is unusable as every row where origin and destination are different have move of zero which you note should be excluded
import pandas as pd
import numpy as np
import plotly.express as px
df = pd.DataFrame({"origin": [88, 88, 88, 88, 88, 87],
"destination": [88, 89, 110, 111, 112, 83],
"move": [20, 0, 5, 0, 0, 10],
"longitude_origin": [13.481016, 13.481016, 13.481016, 13.481016, 13.481016, 13.479667],
"latitude_origin": [52.457055, 52.457055, 52.457055, 52.457055, 52.457055, 52.4796],
"longitude_destination": [13.481016, 13.504075, 13.613772, 13.586891, 13.559341, 13.481016],
"latitude_destination": [52.457055, 52.443923, 52.533194, 52.523562, 52.507418, 52.457055]})
solution
have further refined line_array() function so it can be used to encode hover and color parameters from simplified solution I previously provided
# lines in plotly are delimited by none
def line_array(data, cols=None, empty_val=None):
    """Flatten *data* row by row into a 1-D array with a separator after each row.

    Each row becomes one line segment: its values are followed by a single
    ``empty_val`` entry that delimits it from the next segment.

    Args:
        data: A ``pandas.DataFrame`` or ``pandas.Series``.
            For a DataFrame, the values of the columns named in ``cols`` are
            used (in that order). For a Series, every value is duplicated so
            that it lines up with a two-point segment.
        cols: Column labels to read from a DataFrame. When omitted or empty,
            all columns are used. Ignored for a Series.
        empty_val: Separator value appended after each row.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``n_rows * (n_values_per_row + 1)``.

    Raises:
        TypeError: If ``data`` is neither a DataFrame nor a Series.
    """
    if isinstance(data, pd.DataFrame):
        use_subset = cols is not None and len(cols) > 0
        selected = data.loc[:, list(cols)] if use_subset else data
        vals = selected.to_numpy()
    elif isinstance(data, pd.Series):
        a = data.to_numpy()
        # Turn the (n,) values into an (n, 2) array by repeating each value.
        vals = np.pad(a.reshape(a.shape[0], -1), [(0, 0), (0, 1)], mode="edge")
    else:
        raise TypeError(
            f"data must be a pandas DataFrame or Series, not {type(data).__name__}"
        )
    # Append the separator column, then flatten row-major. The size is taken
    # from the padded array itself, not from any module-level frame.
    padded = np.pad(vals, [(0, 0), (0, 1)], constant_values=empty_val)
    return padded.reshape(-1)
# only draw lines where move > 0 and destination is different to origin
df = df.loc[df["move"].gt(0) & (df["origin"]!=df["destination"])]
lons = line_array(df, ["longitude_origin", "longitude_destination"])
lats = line_array(df, ["latitude_origin", "latitude_destination"])
fig = px.line_mapbox(
lat=lats,
lon=lons,
hover_name=line_array(
df.loc[:, ["origin", "destination"]].astype(str).apply(" - ".join, axis=1)
),
hover_data={
"move": line_array(df, ["move", "move"], empty_val=-99),
"origin": line_array(df, ["origin", "origin"], empty_val=-99),
},
color=line_array(df, ["origin", "origin"], empty_val=-99),
).update_traces(visible=False, selector={"name": "-99"})
fig.update_layout(
mapbox={
"style": "stamen-terrain",
"zoom": 9.5,
"center": {"lat": lats[0], "lon": lons[0]},
},
margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

How to plot two data sets into using same bin size?

I have two different data sets. I want to plot histograms of the two data sets while keeping the bins the same: the width and range of each bin should be the same.
Data1 = np.array([1,2,3,3,5,6,7,8])
Data2 = np.array[1,2,3,4,6,7,8,8]
n,bins,patches = plt.hist(Data1,bins=20)
plt.ylabel("no of states")
plt.xlabel("bins")
plt.savefig("./DOS")`
You can look at the documentation for matplotlib.pyplot.hist and you will see that the bins argument can be an integer (defining the number of bins) or a sequence (defining the edges of the bins themselves).
Therefore, you need to manually define the bins you want to use and pass these to plt.hist:
import matplotlib.pyplot as plt
import numpy as np
bin_edges = [0, 2, 4, 6, 8]
data = np.random.rand(50) * 8
plt.hist(data, bins=bin_edges)
You can pass the bins returned from your first histogram plot as an argument to the second histogram to make sure both have the same bin sizes.
Complete answer:
import numpy as np
import matplotlib.pyplot as plt
Data1 = np.array([1, 2, 3, 3, 5, 6, 7, 8])
Data2 = np.array([1, 2, 3, 4, 6, 7, 8, 8])
n, bins, patches = plt.hist(Data1, bins=20, label='Data 1')
plt.hist(Data2, bins=bins, label='Data 2')
plt.ylabel("no of states")
plt.xlabel("bins")
plt.legend()
plt.show()

Percentile Distribution Graph

Does anyone have an idea how to change X axis scale and ticks to display a percentile distribution like the graph below? This image is from MATLAB, but I want to use Python (via Matplotlib or Seaborn) to generate.
From the pointer by #paulh, I'm a lot closer now. This code
import matplotlib
matplotlib.use('Agg')
import numpy as np
import matplotlib.pyplot as plt
import probscale
import seaborn as sns
clear_bkgd = {'axes.facecolor':'none', 'figure.facecolor':'none'}
sns.set(style='ticks', context='notebook', palette="muted", rc=clear_bkgd)
fig, ax = plt.subplots(figsize=(8, 4))
x = [30, 60, 80, 90, 95, 97, 98, 98.5, 98.9, 99.1, 99.2, 99.3, 99.4]
y = np.arange(0, 12.1, 1)
ax.set_xlim(40, 99.5)
ax.set_xscale('prob')
ax.plot(x, y)
sns.despine(fig=fig)
Generates the following plot (notice the re-distributed X-Axis)
Which I find much more useful than the standard scale:
I contacted the author of the original graph and they gave me some pointers. It is actually a log scale graph, with x axis reversed and values of [100-val], with manual labeling of the x axis ticks. The code below recreates the original image with the same sample data as the other graphs here.
import matplotlib
matplotlib.use('Agg')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
clear_bkgd = {'axes.facecolor':'none', 'figure.facecolor':'none'}
sns.set(style='ticks', context='notebook', palette="muted", rc=clear_bkgd)
x = [30, 60, 80, 90, 95, 97, 98, 98.5, 98.9, 99.1, 99.2, 99.3, 99.4]
y = np.arange(0, 12.1, 1)
# Number of intervals to display.
# Later calculations add 2 to this number to pad it to align with the reversed axis
num_intervals = 3
x_values = 1.0 - 1.0/10**np.arange(0,num_intervals+2)
# Start with hard-coded lengths for 0,90,99
# Rest of array generated to display correct number of decimal places as precision increases
lengths = [1,2,2] + [int(v)+1 for v in list(np.arange(3,num_intervals+2))]
# Build the label string by trimming on the calculated lengths and appending %
labels = [str(100*v)[0:l] + "%" for v,l in zip(x_values, lengths)]
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_xscale('log')
plt.gca().invert_xaxis()
# Labels have to be reversed because axis is reversed
ax.xaxis.set_ticklabels( labels[::-1] )
ax.plot([100.0 - v for v in x], y)
ax.grid(True, linewidth=0.5, zorder=5)
ax.grid(True, which='minor', linewidth=0.5, linestyle=':')
sns.despine(fig=fig)
plt.savefig("test.png", dpi=300, format='png')
This is the resulting graph:
These type of graphs are popular in the low-latency community for plotting latency distributions. When dealing with latencies most of the interesting information tends to be in the higher percentiles, so a logarithmic view tends to work better. I've first seen these graphs used in https://github.com/giltene/jHiccup and https://github.com/HdrHistogram/.
The cited graph was generated by the following code
n = ceil(log10(length(values)));
p = 1 - 1./10.^(0:0.01:n);
percentiles = prctile(values, p * 100);
semilogx(1./(1-p), percentiles);
The x-axis was labelled with the code below
labels = cell(n+1, 1);
for i = 1:n+1
labels{i} = getPercentileLabel(i-1);
end
set(gca, 'XTick', 10.^(0:n));
set(gca, 'XTickLabel', labels);
% {'0%' '90%' '99%' '99.9%' '99.99%' '99.999%' '99.9999%' '99.99999%'}
function label = getPercentileLabel(i)
% GETPERCENTILELABEL  Tick label for decade i of the percentile axis.
%   i = 0 gives '0%', i = 1 gives '90%' and i = 2 gives '99%'. Any other i
%   gives '99.' followed by one '9' per element of 1:i-2 and a closing '%'
%   (for example i = 3 -> '99.9%', i = 4 -> '99.99%').
    switch (i)
        case 0
            label = '0%';
        case 1
            label = '90%';
        case 2
            label = '99%';
        otherwise
            % Same number of nines as appending one per iteration of k = 1:i-2.
            nines = repmat('9', 1, numel(1:i-2));
            label = ['99.' nines '%'];
    end
end
The following Python code uses Pandas to read a csv file that contains a list of recorded latency values (in milliseconds), then it records those latency values (as microseconds) in an HdrHistogram, and saves the HdrHistogram to an hgrm file, that will then be used by Seaborn to plot the latency distribution graph.
import pandas as pd
from hdrh.histogram import HdrHistogram
from hdrh.dump import dump
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import sys
import argparse
# Parse the command line arguments.
parser = argparse.ArgumentParser()
parser.add_argument('csv_file')
parser.add_argument('hgrm_file')
parser.add_argument('png_file')
args = parser.parse_args()
csv_file = args.csv_file
hgrm_file = args.hgrm_file
png_file = args.png_file
# Read the csv file into a Pandas data frame and generate an hgrm file.
csv_df = pd.read_csv(csv_file, index_col=False)
USECS_PER_SEC = 1000000
# Trackable value range of the histogram, in microseconds.
MIN_LATENCY_USECS = 1
MAX_LATENCY_USECS = 24 * 60 * 60 * USECS_PER_SEC  # 24 hours
# Alternative upper bound derived from the largest observed value:
# MAX_LATENCY_USECS = int(csv_df['response-time'].max()) * USECS_PER_SEC
LATENCY_SIGNIFICANT_DIGITS = 5
histogram = HdrHistogram(MIN_LATENCY_USECS, MAX_LATENCY_USECS, LATENCY_SIGNIFICANT_DIGITS)
# Each 'response-time' entry is scaled by USECS_PER_SEC before being recorded.
for latency_sec in csv_df['response-time'].tolist():
    histogram.record_value(latency_sec * USECS_PER_SEC)
    # histogram.record_corrected_value(latency_sec*USECS_PER_SEC, 10)
TICKS_PER_HALF_DISTANCE = 5
# Close the output file explicitly so that its contents are flushed to disk
# before the file is read back below.
with open(hgrm_file, 'wb') as hgrm_out:
    histogram.output_percentile_distribution(hgrm_out, USECS_PER_SEC, TICKS_PER_HALF_DISTANCE)
# Read the generated hgrm file into a Pandas data frame.
# NOTE(review): usecols=[0, 3] presumably selects the value column and the
# 1/(1-Percentile) column of the hgrm layout - confirm against the file.
hgrm_df = pd.read_csv(hgrm_file, comment='#', skip_blank_lines=True, sep=r"\s+", engine='python', header=0, names=['Latency', 'Percentile'], usecols=[0, 3])
# Plot the latency distribution using Seaborn and save it as a png file.
sns.set_theme()
sns.set_style("dark")
sns.set_context("paper")
sns.set_color_codes("pastel")
fig, ax = plt.subplots(1,1,figsize=(20,15))
fig.suptitle('Latency Results')
sns.lineplot(x='Percentile', y='Latency', data=hgrm_df, ax=ax)
ax.set_title('Latency Distribution')
ax.set_xlabel('Percentile (%)')
ax.set_ylabel('Latency (seconds)')
ax.set_xscale('log')
ax.set_xticks([1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
ax.set_xticklabels(['0', '90', '99', '99.9', '99.99', '99.999', '99.9999', '99.99999'])
fig.tight_layout()
fig.savefig(png_file)