I am trying to create a heatmap displaying correlation coefficient values. I'm quite new at this, but the code below would annotate in multiple decimal places, whereas i'm trying to narrow down to 2 d.p.
Does anyone have experience with this?
import pandas_datareader.data as web
import pandas as pd
import datetime as dt
import csv
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns
style.use('ggplot')
def visualize_data():
df = pd.read_csv('sti_joined.csv')
df.set_index('Date', inplace=True)
df_corr = df.pct_change().corr()
print(df_corr.head())
data = df_corr.values
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# heatmap = ax.pcolor(data, cmap=plt.cm.get_cmap('RdYlGn'))
heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
fig.colorbar(heatmap)
ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
ax.invert_yaxis()
ax.xaxis.tick_top()
for y in range(data.shape[0]):
for x in range(data.shape[1]):
plt.text(x + 0.5, y + 0.5, '%.4f' % data[y, x],
horizontalalignment='center',
verticalalignment='center',
)
column_labels = df_corr.columns
row_labels = df_corr.index
ax.set_xticklabels(column_labels)
ax.set_yticklabels(row_labels)
plt.xticks(rotation=90)
heatmap.set_clim(-1,1)
plt.tight_layout()
plt.show()
visualize_data()
Instead of '%.4f' % data[y, x], you can try using something like
'{0:.2f}'.format(data[y,x])
Related
I want to connect airplanes in origin (lat_1 lon_1) to dest(lat_2 lon_2). I use these data.
callsign
latitude_1
longitude_1
latitude_2
longitude_2
0
HBAL102
-4.82114
-76.3194
-4.5249
-79.0103
1
AUA1028
-33.9635
151.181
48.1174
16.55
2
ABW120
41.9659
-87.8832
55.9835
37.4958
3
CSN461
33.9363
-118.414
50.0357
8.5723
4
ETH3730
25.3864
55.4221
50.6342
5.43903
But unfortunately, I would get an incorrect result when creating LineString with shapely. I used everything like rotate and affine but it didn't correct.
Code:
cols = pd.read_csv("/content/dirct_lines.csv",sep=";")
line = cols[["callsign","latitude_1","longitude_1","latitude_2","longitude_2"]].dropna()
line['geometry'] = line.apply(lambda x: [(x['latitude_1'],
x['longitude_1']),
(x['latitude_2'],
x['longitude_2'])], axis = 1)
geoline = gpd.GeoDataFrame(line,geometry="geometry",
crs="EPSG:4326")
import matplotlib.pyplot as plt
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(figsize=(14,9),
color='white', edgecolor='black')
geoline.plot(figsize=(14,9),ax=ax,facecolor = 'lightgrey', linewidth = 1.75,
edgecolor = 'red',
alpha = 2)
plt.show()
Shapely Output:
something that was interesting for me was that when I use Matplotlib to create lines everything is correct.
Code:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection=ccrs.PlateCarree())
ax.stock_img()
org_lon, org_lat = cols["longitude_1"], cols["latitude_1"]
dst_lon, dst_lat = cols["longitude_2"], cols["latitude_2"]
plt.plot([org_lon, dst_lon], [org_lat, dst_lat],
color='black', linewidth=0.5, marker='_',
transform=ccrs.PlateCarree()
)
plt.savefig(f"fight_path.png",dpi=60,facecolor = None, bbox_inches = 'tight', pad_inches = None)
plt.show()
Matplotlib Output:
What is the problem?
why isn't correct by shapely?
it's just the way you are creating the geometry. Below works correctly.
import io
import geopandas as gpd
import pandas as pd
import shapely.geometry
df = pd.read_csv(
io.StringIO(
"""callsign,latitude_1,longitude_1,latitude_2,longitude_2
HBAL102,-4.82114,-76.3194,-4.5249,-79.0103
AUA1028,-33.9635,151.181,48.1174,16.55
ABW120,41.9659,-87.8832,55.9835,37.4958
CSN461,33.9363,-118.414,50.0357,8.5723
ETH3730,25.3864,55.4221,50.6342,5.43903
"""
)
)
geoline = gpd.GeoDataFrame(
geometry=[
shapely.geometry.LineString(points)
for points in zip(
gpd.points_from_xy(df["longitude_1"], df["latitude_1"]),
gpd.points_from_xy(df["longitude_2"], df["latitude_2"]),
)
],
data=df,
)
import matplotlib.pyplot as plt
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
ax = world.plot(figsize=(14, 9), color="white", edgecolor="black")
geoline.plot(
figsize=(14, 9),
ax=ax,
facecolor="lightgrey",
linewidth=1.75,
edgecolor="red",
)
plt.show()
So this is my code, it's written a little messy and my result is absolutely ridiculous. I have no idea how to fix it.
Also, the seaborn library does not work on my computer in any way.
.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv('Data.csv',encoding="latin1",sep=";",engine="python")
table = data.replace(0, 0.1)
plt.plot(table["RMDM"], table["BSURF"], color="#03012d", marker=".", ls="None", markersize=3, label="")
data['RMDM'] = data['RMDM'].astype(float)
data['BSURF'] = data['BSURF'].astype(float)
fig, ax = plt.subplots()
x=data['BSURF']
y=data['RMDM']
ax.set_yscale('log')
ax.set_xscale('log')
plt.style.use('classic')
plt.xlabel('B_LC')
plt.ylabel('RM/DM')
plt.plot(x,y, 'og')
from scipy.stats import linregress
df = data.loc[(data['RMDM'] >0) & (data['BSURF'] >0)]
stats = linregress(np.log10(df["RMDM"]),np.log10(df["BSURF"]))
m = stats.slope
b = stats.intercept
r = stats.rvalue
x = np.logspace(-1, 5, base=10)
y = (m*x+b)
plt.plot(x, y, c='orange', label="fit")
plt.legend()
#m,c=np.polyfit(x,y,1)
#plt.plot(x,m*x+c)
plt.grid()
plt.show()
lmplot can be used to create a linear line through your data. you correctly used np.log for the linear regression data. keep x in terms of the log.
df['log_col1']=np.log(df['col1'])
sns.lmplot(x='log_col1','y='target', data=df, ci=None)
sns.scatterplot(y='target',x='log_col1',data=df)
plt.show()
Does anyone know how to show the labels of the minor ticks on a logarithmic scale with Python/Matplotlib?
You can use plt.tick_params(axis='y', which='minor') to set the minor ticks on and format them with the matplotlib.ticker FormatStrFormatter. For example,
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
x = np.linspace(0,4,1000)
y = np.exp(x)
plt.plot(x, y)
ax = plt.gca()
ax.set_yscale('log')
plt.tick_params(axis='y', which='minor')
ax.yaxis.set_minor_formatter(FormatStrFormatter("%.1f"))
plt.show()
One option is to use matplotlib.ticker.LogLocator
import numpy
import pylab
import matplotlib.pyplot
import matplotlib.ticker
## setup styles
from matplotlib import rc
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Times-Roman']})
rc('text', usetex = True)
matplotlib.rcParams['text.latex.preamble'] = [r"\usepackage{amsmath}"]
## make figure
figure, ax = matplotlib.pyplot.subplots(1, sharex = True, squeeze = True)
x = numpy.linspace(0.0, 20.0, 1000)
y = numpy.exp(x)
ax.plot(x, y)
ax.set_yscale('log')
## set y ticks
y_major = matplotlib.ticker.LogLocator(base = 10.0, numticks = 5)
ax.yaxis.set_major_locator(y_major)
y_minor = matplotlib.ticker.LogLocator(base = 10.0, subs = numpy.arange(1.0, 10.0) * 0.1, numticks = 10)
ax.yaxis.set_minor_locator(y_minor)
ax.yaxis.set_minor_formatter(matplotlib.ticker.NullFormatter())
## save figure
pylab.tight_layout()
pylab.savefig('./test.png', dpi = 200)
you would get
the only thing you need to manually adjust is the numticks input for both major and minor ticks, they both have to be a fraction of total possible number of major ticks.
I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()
The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()
Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)
I want to draw a scatter trend line on matplot. How can I do that?
Python
import pandas as pd
import matplotlib.pyplot as plt
csv = pd.read_csv('/tmp/test.csv')
data = csv[['fee', 'time']]
x = data['fee']
y = data['time']
plt.scatter(x, y)
plt.show()
CSV
fee,time
100,650
90,700
80,860
70,800
60,1000
50,1200
time is integer value.
Scatter chart
I'm sorry I found the answer by myself.
How to add trendline in python matplotlib dot (scatter) graphs?
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
csv = pd.read_csv('/tmp/test.csv')
data = csv[['fee', 'time']]
x = data['fee']
y = data['time']
plt.scatter(x, y)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()
Chart
With text:
from sklearn.metrics import r2_score
plt.plot(x,y,"+", ms=10, mec="k")
z = np.polyfit(x, y, 1)
y_hat = np.poly1d(z)(x)
plt.plot(x, y_hat, "r--", lw=1)
text = f"$y={z[0]:0.3f}\;x{z[1]:+0.3f}$\n$R^2 = {r2_score(y,y_hat):0.3f}$"
plt.gca().text(0.05, 0.95, text,transform=plt.gca().transAxes,
fontsize=14, verticalalignment='top')
You also can use Seaborn lmplot:
import seaborn as sns
import pandas as pd
from io import StringIO
textfile = StringIO("""fee,time
100,650
90,700
80,860
70,800
60,1000
50,1200""")
df = pd.read_csv(textfile)
_ = sns.lmplot(x='fee', y='time', data=df, ci=None)
Output: