How can I draw scatter trend line on matplot? Python-Pandas - pandas

I want to draw a scatter trend line on matplot. How can I do that?
Python
import pandas as pd
import matplotlib.pyplot as plt
csv = pd.read_csv('/tmp/test.csv')
data = csv[['fee', 'time']]
x = data['fee']
y = data['time']
plt.scatter(x, y)
plt.show()
CSV
fee,time
100,650
90,700
80,860
70,800
60,1000
50,1200
time is integer value.
Scatter chart

I'm sorry I found the answer by myself.
How to add trendline in python matplotlib dot (scatter) graphs?
Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
csv = pd.read_csv('/tmp/test.csv')
data = csv[['fee', 'time']]
x = data['fee']
y = data['time']
plt.scatter(x, y)
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()
Chart

With text:
from sklearn.metrics import r2_score
plt.plot(x,y,"+", ms=10, mec="k")
z = np.polyfit(x, y, 1)
y_hat = np.poly1d(z)(x)
plt.plot(x, y_hat, "r--", lw=1)
text = f"$y={z[0]:0.3f}\;x{z[1]:+0.3f}$\n$R^2 = {r2_score(y,y_hat):0.3f}$"
plt.gca().text(0.05, 0.95, text,transform=plt.gca().transAxes,
fontsize=14, verticalalignment='top')

You also can use Seaborn lmplot:
import seaborn as sns
import pandas as pd
from io import StringIO
textfile = StringIO("""fee,time
100,650
90,700
80,860
70,800
60,1000
50,1200""")
df = pd.read_csv(textfile)
_ = sns.lmplot(x='fee', y='time', data=df, ci=None)
Output:

Related

I cannot fit my data logarithmically, How can I add log trendline?

So this is my code, it's written a little messy and my result is absolutely ridiculous. I have no idea how to fix it.
Also, the seaborn library does not work on my computer in any way.
.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv('Data.csv',encoding="latin1",sep=";",engine="python")
table = data.replace(0, 0.1)
plt.plot(table["RMDM"], table["BSURF"], color="#03012d", marker=".", ls="None", markersize=3, label="")
data['RMDM'] = data['RMDM'].astype(float)
data['BSURF'] = data['BSURF'].astype(float)
fig, ax = plt.subplots()
x=data['BSURF']
y=data['RMDM']
ax.set_yscale('log')
ax.set_xscale('log')
plt.style.use('classic')
plt.xlabel('B_LC')
plt.ylabel('RM/DM')
plt.plot(x,y, 'og')
from scipy.stats import linregress
df = data.loc[(data['RMDM'] >0) & (data['BSURF'] >0)]
stats = linregress(np.log10(df["RMDM"]),np.log10(df["BSURF"]))
m = stats.slope
b = stats.intercept
r = stats.rvalue
x = np.logspace(-1, 5, base=10)
y = (m*x+b)
plt.plot(x, y, c='orange', label="fit")
plt.legend()
#m,c=np.polyfit(x,y,1)
#plt.plot(x,m*x+c)
plt.grid()
plt.show()
lmplot can be used to create a linear line through your data. you correctly used np.log for the linear regression data. keep x in terms of the log.
df['log_col1']=np.log(df['col1'])
sns.lmplot(x='log_col1','y='target', data=df, ci=None)
sns.scatterplot(y='target',x='log_col1',data=df)
plt.show()

How to annotate in 2 decimal places using Matplotlib

I am trying to create a heatmap displaying correlation coefficient values. I'm quite new at this, but the code below would annotate in multiple decimal places, whereas i'm trying to narrow down to 2 d.p.
Does anyone have experience with this?
import pandas_datareader.data as web
import pandas as pd
import datetime as dt
import csv
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns
style.use('ggplot')
def visualize_data():
df = pd.read_csv('sti_joined.csv')
df.set_index('Date', inplace=True)
df_corr = df.pct_change().corr()
print(df_corr.head())
data = df_corr.values
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# heatmap = ax.pcolor(data, cmap=plt.cm.get_cmap('RdYlGn'))
heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
fig.colorbar(heatmap)
ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
ax.invert_yaxis()
ax.xaxis.tick_top()
for y in range(data.shape[0]):
for x in range(data.shape[1]):
plt.text(x + 0.5, y + 0.5, '%.4f' % data[y, x],
horizontalalignment='center',
verticalalignment='center',
)
column_labels = df_corr.columns
row_labels = df_corr.index
ax.set_xticklabels(column_labels)
ax.set_yticklabels(row_labels)
plt.xticks(rotation=90)
heatmap.set_clim(-1,1)
plt.tight_layout()
plt.show()
visualize_data()
Instead of '%.4f' % data[y, x], you can try using something like
'{0:.2f}'.format(data[y,x])

Colorbar scientific notation, change e^ to 10^

I am using scientific notation in a colorbar within a 2D plot. I want to write 10^{-3} instead of e-3. I tried to change that (see code below) but it does not work...
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker
x = np.random.rand(100)
y = np.random.rand(100)
z = np.random.rand(100)*0.001
x=x.reshape((10,10))
y=y.reshape((10,10))
z=z.reshape((10,10))
fig, ax = plt.subplots(figsize=(8,6))
cs = ax.contourf(x,y,z, 10)
plt.xticks(fontsize=16,rotation=0)
plt.yticks(fontsize=16,rotation=0)
cbar = plt.colorbar(cs,)
cbar.set_label("test",fontsize = 22)
cbar.formatter.set_scientific(True)
cbar.formatter.set_powerlimits((0, 0))
cbar.ax.tick_params(labelsize=16)
cbar.ax.yaxis.get_offset_text().set_fontsize(22)
cbar.ax.xaxis.major.formatter._useMathText = True
cbar.update_ticks()
plt.savefig("test.png")
It seems you want a ScalarFormatter with mathtext in use.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker
x = np.tile(np.arange(10), 10).reshape((10,10))
y = np.repeat(np.arange(10),10).reshape((10,10))
z = np.sort(np.random.rand(100)*0.001).reshape((10,10))
fig, ax = plt.subplots(figsize=(8,6))
cs = ax.contourf(x,y,z, 10)
fmt = matplotlib.ticker.ScalarFormatter(useMathText=True)
fmt.set_powerlimits((0, 0))
cbar = plt.colorbar(cs,format=fmt)
plt.show()

controlling the number of x ticks in pyplot

I want to display all 13 x ticks, but the graph only shows 7 of them having two intervals.
plt.locator_params(axis='x',nbins=13)
Why doesn't above code work??
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as dates
y = [0, 0.86, 0.826, 0.816, 0.807, 0.803, 0.804, 0.803, 0.802,0.81, 0.813, 0.813, 0.813]
times = pd.date_range('2015-02-25', periods=13)
fig, ax = plt.subplots(1)
fig.autofmt_xdate()
xfmt = dates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(xfmt)
plt.locator_params(axis='x',nbins=13)
ax.plot_date(times.to_pydatetime(), y, 'v-')
ax.xaxis.set_minor_locator(dates.WeekdayLocator(byweekday=(1),
interval=1))
ax.xaxis.set_minor_formatter(dates.DateFormatter('%d\n%a'))
ax.xaxis.grid(True, which="minor")
ax.yaxis.grid()
plt.tight_layout()
plt.show()
The warning should give you some clue why this is happening:
UserWarning: 'set_params()' not defined for locator of type <class 'pandas.tseries.converter.PandasAutoDateLocator'>
str(type(self)))
Use plt.xticks(times.to_pydatetime()) instead:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as dates
y = [0, 0.86, 0.826, 0.816, 0.807, 0.803, 0.804, 0.803, 0.802,0.81, 0.813, 0.813, 0.813]
times = pd.date_range('2015-02-25', periods=13)
fig, ax = plt.subplots(1)
fig.autofmt_xdate()
xfmt = dates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(xfmt)
ax.plot_date(times.to_pydatetime(), y, 'v-')
ax.xaxis.set_minor_locator(dates.WeekdayLocator(byweekday=(1),
interval=1))
plt.xticks(times.to_pydatetime())
ax.xaxis.set_minor_formatter(dates.DateFormatter('%d\n%a'))
ax.xaxis.grid(True, which="minor")
ax.yaxis.grid()
plt.tight_layout()
plt.show()

name "plot" is not defined

I'm trying to plot some data using matplotlib with the code below.
import matplotlib.pyplot as plt
import numpy as np
data_x = np.linspace(0, 10, 100)
data_y = 10 * np.exp(-data_x)
np.savetxt('tabelle1.txt', np.column_stack([data_x, data_y]), header='U I')
x, y = np.genfromtxt('tabelle1', unpack=True)
plt.plot(x, y, 'rx')
plt.xlabel(r'$x$')
plt.ylabel(r'$y$')
plt.yscale('log')
plt.tight_layout()
plt.savefig('loesung.pdf')
However, this generates an error saying NameError: name plot is not defined.
How can I fix this?
please try
#Add this script
import matplotlib
#Before
import matplotlib.pyplot as plt