This is my dataframe
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
import matplotlib.pyplot as plt
d = {"STATE" : [ "NJ", "NJ", "NJ", "NJ"],
"CATEGORY": ["A", "B", "C", "D"],
"LATITUDE" : [ 40.794856, 40.790176, 40.826762, 40.495150],
"LONGITUDE" : [ -74.149086, -74.255100, -74.101990, -74.442890]}
df = pd.DataFrame(data=d)
df.plot(kind="scatter", x="LONGITUDE", y="LATITUDE", alpha=0.4)
plt.show()
I want to calculate the distance between points based on category column:
A -> B
A -> C
A -> D
and connect the dots with distance displayed between them as a label
#creating point object so I can calculate distance bwtween coordinates
df["point"] = [Point(xy) for xy in zip(df['LONGITUDE'], df['LATITUDE'])]
#Formula I use for calculating distance between two points, this works when I have two seperate columns
#df['lat_long_diff'] = df.apply(lambda x : geodesic((x['LATITUDE_A'],x['LONGITUDE_A']),(x['LATITUDE_other_points'],x['LONGITUDE_other_points'])).miles,axis=1)
Also if I could include the Map of New Jersey as background it would be great, just the outline map would do.
import cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from matplotlib.text import Text
import geopandas as gpd
import pandas as pd
import geopy.distance
def calc_dist(point_a, point_b):
return(geopy.distance.geodesic(point_a, point_b).miles)
def draw_lines(x, y, p1 , p2):
x1, x2 = x[p1], x[p2]
y1, y2 = y[p1], y[p2]
ax.plot([x1,x2],[y1,y2])
d = {"STATE" : [ "NJ", "NJ", "NJ", "NJ"],
"CATEGORY": ["A", "B", "C", "D"],
"LATITUDE" : [ 40.794856, 40.790176, 40.826762, 40.495150],
"LONGITUDE" : [ -74.149086, -74.255100, -74.101990, -74.442890]}
df = pd.DataFrame(d)
dist_list = []
for i in [1,2,3]:
dist = calc_dist((df.at[0, 'LATITUDE'], df.at[0, 'LONGITUDE']),
(df.at[i, 'LATITUDE'], df.at[i, 'LONGITUDE']))
dist_list.append(dist)
proj = ccrs.PlateCarree(central_longitude=0)
fig, ax = plt.subplots(subplot_kw=dict(projection=proj), figsize=(16,16))
ax.set_extent([df['LONGITUDE'].min()-1,
df['LONGITUDE'].max()+ 1,
df['LATITUDE'].min()- 1,
df['LATITUDE'].max()+1],
crs=ccrs.PlateCarree())
x = df['LONGITUDE'].tolist()
y = df['LATITUDE'].tolist()
ax.scatter(x, y)
draw_lines(x, y, 0, 1)
draw_lines(x, y, 0, 2)
draw_lines(x, y, 0, 3)
ax._add_text(Text(text=str(round(dist_list[0], 2)), x=-74.20, y=40.82))
ax._add_text(Text(text=str(round(dist_list[1], 2)), x=-74.12, y= 40.8))
ax._add_text(Text(text=str(round(dist_list[2], 2)), x=-74.29, y= 40.64))
ax.add_feature(cfeature.STATES.with_scale('10m'), zorder=0)
fig.canvas.draw()
fig.tight_layout()
plt.show()
Related
I want to connect airplanes in origin (lat_1 lon_1) to dest(lat_2 lon_2). I use these data.
callsign
latitude_1
longitude_1
latitude_2
longitude_2
0
HBAL102
-4.82114
-76.3194
-4.5249
-79.0103
1
AUA1028
-33.9635
151.181
48.1174
16.55
2
ABW120
41.9659
-87.8832
55.9835
37.4958
3
CSN461
33.9363
-118.414
50.0357
8.5723
4
ETH3730
25.3864
55.4221
50.6342
5.43903
But unfortunately, I would get an incorrect result when creating LineString with shapely. I used everything like rotate and affine but it didn't correct.
Code:
cols = pd.read_csv("/content/dirct_lines.csv",sep=";")
line = cols[["callsign","latitude_1","longitude_1","latitude_2","longitude_2"]].dropna()
line['geometry'] = line.apply(lambda x: [(x['latitude_1'],
x['longitude_1']),
(x['latitude_2'],
x['longitude_2'])], axis = 1)
geoline = gpd.GeoDataFrame(line,geometry="geometry",
crs="EPSG:4326")
import matplotlib.pyplot as plt
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(figsize=(14,9),
color='white', edgecolor='black')
geoline.plot(figsize=(14,9),ax=ax,facecolor = 'lightgrey', linewidth = 1.75,
edgecolor = 'red',
alpha = 2)
plt.show()
Shapely Output:
something that was interesting for me was that when I use Matplotlib to create lines everything is correct.
Code:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(projection=ccrs.PlateCarree())
ax.stock_img()
org_lon, org_lat = cols["longitude_1"], cols["latitude_1"]
dst_lon, dst_lat = cols["longitude_2"], cols["latitude_2"]
plt.plot([org_lon, dst_lon], [org_lat, dst_lat],
color='black', linewidth=0.5, marker='_',
transform=ccrs.PlateCarree()
)
plt.savefig(f"fight_path.png",dpi=60,facecolor = None, bbox_inches = 'tight', pad_inches = None)
plt.show()
Matplotlib Output:
What is the problem?
why isn't correct by shapely?
it's just the way you are creating the geometry. Below works correctly.
import io
import geopandas as gpd
import pandas as pd
import shapely.geometry
df = pd.read_csv(
io.StringIO(
"""callsign,latitude_1,longitude_1,latitude_2,longitude_2
HBAL102,-4.82114,-76.3194,-4.5249,-79.0103
AUA1028,-33.9635,151.181,48.1174,16.55
ABW120,41.9659,-87.8832,55.9835,37.4958
CSN461,33.9363,-118.414,50.0357,8.5723
ETH3730,25.3864,55.4221,50.6342,5.43903
"""
)
)
geoline = gpd.GeoDataFrame(
geometry=[
shapely.geometry.LineString(points)
for points in zip(
gpd.points_from_xy(df["longitude_1"], df["latitude_1"]),
gpd.points_from_xy(df["longitude_2"], df["latitude_2"]),
)
],
data=df,
)
import matplotlib.pyplot as plt
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
ax = world.plot(figsize=(14, 9), color="white", edgecolor="black")
geoline.plot(
figsize=(14, 9),
ax=ax,
facecolor="lightgrey",
linewidth=1.75,
edgecolor="red",
)
plt.show()
I have this simple dataframe:
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
And as I have many columns (all of them numeric), I did this loop in order to do a specific plot:
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.vlines(x=data.mean(),ymin=0, ymax=0.01, linestyles="dotted")
plt.show()
However, I'm having trouble trying to generalize the ymax argument of plt.vlines(), as I need to get the maximum y-axis value of each density plot in order to plot the mean vline of each plot accordingly. I have tried with np.argmax(), but it doesn't seem to work.
Any suggestions?
pandas.DataFrame.plot() returns matplotlib.axes.Axes object. You can use get_ylim() function to get ymin and ymax.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
for i in df.columns:
data = df[i]
ax = data.plot(kind="kde")
ymin, ymax = ax.get_ylim()
plt.vlines(x=data.mean(),ymin=ymin, ymax=ymax, linestyles="dotted")
plt.show()
To get the value of the kde corresponding to the mean, you could extract the curve from the plot and interpolate it at the position of the mean:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": 20 + np.random.randint(-1, 2, size=100).cumsum(),
"Y": 30 + np.random.randint(-1, 2, size=100).cumsum(),
"Z": 40 + np.random.randint(-1, 2, size=100).cumsum()})
fig, ax = plt.subplots()
for col in df.columns:
data = df[col]
data.plot(kind="kde", ax=ax)
x = data.mean()
kdeline = ax.lines[-1]
ymax = np.interp(x, kdeline.get_xdata(), kdeline.get_ydata())
ax.vlines(x=data.mean(), ymin=0, ymax=ymax, linestyles="dotted")
ax.set_ylim(ymin=0) # ax.vlines() moves the bottom ylim; set it back to 0
plt.show()
Use plt.axvline. You specify the limits as numbers in the range [0,1], 0 being the bottom of the plot, 1 being the top.
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.axvline(data.mean(), 0, 1, linestyle='dotted', color='black')
plt.show()
A simple example is as follows:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
N = 1000
x = np.linspace(1, 5, N)
y = npr.randint(1e16, size = N) / 1e16
y = np.sort(y)
fig, ax = plt.subplots()
ax.loglog(x, y, '.')
ax.grid(True, 'both')
Where I want to replace the xticks. So far everything I tried, failed to work:
ax.set_xticks([2, 3, 4], ['a', 'b', 'c'])
ax.xaxis.set_ticks_position('none')
ax.set_xticks([])
None of the above showed any effect. My goal is to replace the ticks with custom defined ticks (strings or integers). So instead of 2 x 10⁰ it should only be 2. Similar for other xticks.
Probably this is what you're after:
import numpy as np
import matplotlib.pyplot as plt
N = 1000
x = np.linspace(1, 5, N)
y = np.random.rand(N)
y = np.sort(y)
fig, ax = plt.subplots()
ax.loglog(x, y, '.')
ax.grid(True, 'both')
ax.set_xticks([2, 3, 4])
ax.set_xticklabels(['a', 'b', 'c'])
ax.minorticks_off()
plt.show()
I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()
The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()
Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)
How do I get the internally created colorbar instance of a plot created by pandas.DataFrame.plot?
Here is an example for generating a colored scatter plot:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools as it
# [ (0,0), (0,1), ..., (9,9) ]
xy_positions = list( it.product( range(10), range(10) ) )
df = pd.DataFrame( xy_positions, columns=['x','y'] )
# draw 100 floats
df['score'] = np.random.random( 100 )
ax = df.plot( kind='scatter',
x='x',
y='y',
c='score',
s=500)
ax.set_xlim( [-0.5,9.5] )
ax.set_ylim( [-0.5,9.5] )
plt.show()
which gives me a figure like this:
How do I get the colorbar instance in order to manipulate it, for instance for changing the label or setting the ticks?
pandas does not return the axis for the colorbar, therefore we have to locate it:
1st, let's get the figure instance: i.e., use plt.gcf()
In [61]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools as it
# [ (0,0), (0,1), ..., (9,9) ]
xy_positions = list( it.product( range(10), range(10) ) )
df = pd.DataFrame( xy_positions, columns=['x','y'] )
# draw 100 floats
df['score'] = np.random.random( 100 )
ax = df.plot( kind='scatter',
x='x',
y='y',
c='score',
s=500)
ax.set_xlim( [-0.5,9.5] )
ax.set_ylim( [-0.5,9.5] )
f = plt.gcf()
2, how many axes does this figure have?
In [62]:
f.get_axes()
Out[62]:
[<matplotlib.axes._subplots.AxesSubplot at 0x120a4d450>,
<matplotlib.axes._subplots.AxesSubplot at 0x120ad0050>]
3, The first axes (that is, the first one created), contains the plot
In [63]:
ax
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x120a4d450>
4, Therefore, the second axis is the colorbar axes
In [64]:
cax = f.get_axes()[1]
#and we can modify it, i.e.:
cax.set_ylabel('test')
It's not quite the same but you could just plot using matplotlib:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools as it
# [ (0,0), (0,1), ..., (9,9) ]
xy_positions = list( it.product( range(10), range(10) ) )
df = pd.DataFrame( xy_positions, columns=['x','y'] )
# draw 100 floats
df['score'] = np.random.random( 100 )
fig = plt.figure()
ax = fig.add_subplot(111)
s = ax.scatter(df.x, df.y, c=df.score, s=500)
cb = plt.colorbar(s)
cb.set_label('desired_label')
ax.set_xlim( [-0.5,9.5] )
ax.set_ylim( [-0.5,9.5] )
plt.show()