Which plotting library does this plot belong to? - matplotlib

I'm not very good at plotting graphs, but this is how I came up with what I was looking for:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
data_dict = {'Best fit': [395.0, 401.0, 358.0, 443.0, 357.0, 378.0, 356.0, 356.0, 403.0, 380.0, 397.0, 406.0, 409.0, 414.0, 350.0, 433.0, 345.0, 376.0, 374.0, 379.0, 9.0, 13.0, 10.0, 13.0, 16.0, 12.0, 6.0, 11.0, 20.0, 10.0, 12.0, 11.0, 15.0, 11.0, 11.0, 11.0, 15.0, 10.0, 8.0, 18.0, 864.0, 803.0, 849.0, 858.0, 815.0, 856.0, 927.0, 878.0, 834.0, 837.0, 811.0, 857.0, 848.0, 869.0, 861.0, 820.0, 887.0, 842.0, 834.0, np.nan], 'MDP': [332, 321, 304, 377, 304, 313, 289, 314, 341, 321, 348, 334, 361, 348, 292, 362, 285, 316, 291, 318, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 770, 770, 819, 751, 822, 842, 758, 825, 886, 830, 774, 839, 779, 821, 812, 850, 822, 786, 874, 831], 'Q-Learning': [358, 329, 309, 381, 302, 319, 296, 315, 343, 318, 338, 336, 360, 357, 299, 363, 287, 337, 301, 334, 3, 6, 5, 5, 4, 5, 4, 3, 8, 6, 4, 0, 8, 1, 4, 0, 9, 5, 3, 8, 771, 833, 757, 837, 831, 784, 806, 890, 843, 775, 838, 776, 824, 830, 834, 827, 791, 868, 816, 806], 'parametrized_factor': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]}
data2 = pd.DataFrame(data_dict)
# figure size
plt.figure(figsize=(12, 8))
ax = sns.pointplot(y="Best fit", x="parametrized_factor", data=data2, linestyles='-.', color='g', capsize=.1, scale=.2, errwidth=.5)
ax = sns.pointplot(y="MDP", x="parametrized_factor", data=data2, linestyles='-', color='r', capsize=.12, scale=.2, errwidth=.5)
ax = sns.pointplot(y="Q-Learning", x="parametrized_factor", data=data2, linestyles=':', color='k', capsize=.15, scale=.5, errwidth=.5)
# Build the legend once, placed outside the axes, then color each label text.
legend = ax.legend(['Best fit', 'MDP', 'Q-Learning'],
                   bbox_to_anchor=(1.05, 1), loc='upper left')
colors = ['green', 'red', 'black']
for text, color in zip(legend.get_texts(), colors):
    text.set_color(color)
# Set the axis labels.
ax.set_ylabel('Rejection rate')
ax.set_xlabel('Parametrized factor')
plt.setp(ax.get_legend().get_texts(), fontsize='12') # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='12') # for legend title
plt.show()
I still have not figured out how to change the color of the markers in the legend.
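One approach that should work (a minimal sketch, reusing the colors and line styles from the pointplot calls above) is to build proxy Line2D handles and pass them to ax.legend, so the legend marks carry the right color directly:
import matplotlib.lines as mlines
# One proxy handle per series; each carries the color and line style
# used in the corresponding pointplot call.
handles = [mlines.Line2D([], [], color=c, linestyle=ls, label=lab)
           for c, ls, lab in zip(['green', 'red', 'black'],
                                 ['-.', '-', ':'],
                                 ['Best fit', 'MDP', 'Q-Learning'])]
ax.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc='upper left')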

Most efficient method to loop through a dataframe and return a filtered array of values based on multiple criteria

I have a dataset of events, where each event includes various elements whose positional data is recorded at various points in time. The total dataset is very large, covering many such events.
For each element at each point in time, I want to find the closest other element. To start, I was going to build an array of the positional data of all other elements at a specific time period and include it in the same row of the original dataframe (to perform further calculations on later).
I made two attempts at coding this, included below. Both take too long on such a large dataset, so any way to make this more efficient would be greatly appreciated.
import pandas as pd
import numpy as np
def func1(db, val, frame):
    # Keep the rows matching this (val, frameId) pair, restricted to the
    # three positional columns, and return them as a nested list.
    return db.loc[(db['val'] == val) & (db['frameId'] == frame),
                  ['displayName', 'x', 'y']].reset_index(drop=True).values.tolist()
d = pd.DataFrame({'displayName': ['Bob', 'Jane', 'Alice',
                                  'Bob', 'Jane', 'Alice'],
                  'x': [90, 88, 86, 94, 91, 92],
                  'y': [24, 13, 18, 20, 15, 16],
                  'val': [201801, 201801, 201801, 201801, 201801, 201801],
                  'frameId': [1, 1, 1, 2, 2, 2]})
res = d.apply(lambda row: func1(d, row['val'], row['frameId']), axis=1)
Approach 2:
def func2(db, val, frame):
    # db is a NumPy object array; keep columns 0-2 of every row whose
    # val (column 3) and frameId (column 4) match.
    return [l[[0, 1, 2]] for l in db if l[3] == val and l[4] == frame]
res = d.apply(lambda row: func2(np.array(d), row['val'], row['frameId']), axis=1)
The result (res) will thus be an array that looks like this:
[[['Bob', 90, 24], ['Jane', 88, 13], ['Alice', 86, 18]],
[['Bob', 90, 24], ['Jane', 88, 13], ['Alice', 86, 18]],
[['Bob', 90, 24], ['Jane', 88, 13], ['Alice', 86, 18]],
[['Bob', 94, 20], ['Jane', 91, 15], ['Alice', 92, 16]],
[['Bob', 94, 20], ['Jane', 91, 15], ['Alice', 92, 16]],
[['Bob', 94, 20], ['Jane', 91, 15], ['Alice', 92, 16]]]
However over the large dataset this is very time consuming to produce under both methods so any way to reduce time complexity would be welcomed.
If the order along the first dimension of the 3D array does not matter, then you can use the snippet below. (If it does matter, you will have to create a series that groups by displayName or the index and takes the cumcount, sort by that, and then drop it. Let me know.)
import pandas as pd
import numpy as np
d = pd.DataFrame({'displayName': ['Bob', 'Jane', 'Alice',
                                  'Bob', 'Jane', 'Alice'],
                  'x': [90, 88, 86, 94, 91, 92],
                  'y': [24, 13, 18, 20, 15, 16],
                  'val': [201801, 201801, 201801, 201801, 201801, 201801],
                  'frameId': [1, 1, 1, 2, 2, 2]})
n = d['frameId'].max() + 1
x = d['displayName'].nunique()
# Repeat the name/x/y columns n times and reshape to (rows, names, 3).
pd.concat([d.iloc[:, 0:3]] * n).to_numpy().reshape(d.shape[0], x, 3)
Out[1]:
array([[['Bob', 90, 24],
        ['Jane', 88, 13],
        ['Alice', 86, 18]],

       [['Bob', 94, 20],
        ['Jane', 91, 15],
        ['Alice', 92, 16]],

       [['Bob', 90, 24],
        ['Jane', 88, 13],
        ['Alice', 86, 18]],

       [['Bob', 94, 20],
        ['Jane', 91, 15],
        ['Alice', 92, 16]],

       [['Bob', 90, 24],
        ['Jane', 88, 13],
        ['Alice', 86, 18]],

       [['Bob', 94, 20],
        ['Jane', 91, 15],
        ['Alice', 92, 16]]], dtype=object)
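If the frames are not consecutively numbered, or the repetition above is too rigid, a groupby-based variant (an alternative sketch, not part of the original answer) computes each (val, frameId) group once and then looks it up per row, instead of re-filtering the whole frame for every row:
# Materialize every (val, frameId) group once as a nested list,
# then map each row to its precomputed group.
groups = {key: g[['displayName', 'x', 'y']].values.tolist()
          for key, g in d.groupby(['val', 'frameId'])}
res = d.apply(lambda row: groups[(row['val'], row['frameId'])], axis=1)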

bilstm and attention to find the topic representation of text

# Preprocessing of data (read_data and get_max_seq_len are my own helpers)
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df = pd.read_csv("small_quac.csv")
df = df.drop(['Unnamed: 0'], axis = 1)
shared_topic, section_title, for_tokenize = read_data(df)
# Define x_train and x_test
x_train = np.asarray(shared_topic)
y_train = np.asarray(section_title)
# Find max_seq_len
max_seq_len_x = get_max_seq_len(x_train, remove_stopwords=False)
max_seq_len_y = get_max_seq_len(y_train, remove_stopwords=False)
max_seq_len = max(max_seq_len_x, max_seq_len_y)
tokenizer = Tokenizer(filters='\n')
tokenizer.fit_on_texts(for_tokenize)
vocab_size = len(tokenizer.word_index) + 1
X = tokenizer.texts_to_sequences(x_train)
y = tokenizer.texts_to_sequences(y_train)
# print(X[0])
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
fdist = tokenizer.word_counts
X = pad_sequences(X, maxlen=max_seq_len_x, padding='post')
y = pad_sequences(y, maxlen=max_seq_len_y, padding='post')
# from here modelling starts
rnn_cell_size = 128
max_seq_len_y = 14
max_seq_len_x = 139
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # Bahdanau-style additive attention over the LSTM outputs.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
sequence_input = tf.keras.layers.Input(shape=(max_seq_len_x,))
# embedding_matrix is my precomputed 300-d GloVe weight matrix
embedded_sequences = tf.keras.layers.Embedding(
    vocab_size, 300, weights=[embedding_matrix], trainable=False,
    mask_zero=True, name='Encoder-Word-Embedding')(sequence_input)
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(rnn_cell_size, dropout=0.3, return_sequences=True,
                         return_state=True, recurrent_activation='relu',
                         recurrent_initializer='glorot_uniform'),
    name="bi_lstm_0")(embedded_sequences)
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(rnn_cell_size, dropout=0.2, return_sequences=True,
                         return_state=True, recurrent_activation='relu',
                         recurrent_initializer='glorot_uniform'))(lstm)
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
context_vector, attention_weights = Attention(32)(lstm, state_h)
output = keras.layers.Dense(max_seq_len_y, activation='softmax')(context_vector)
model = keras.Model(inputs=sequence_input, outputs=output)
# summarize layers
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x=X, y=y, epochs=30)
Also, I am using GloVe embeddings with 300 dimensions.
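(For reference, a minimal sketch of how an embedding_matrix like the one referenced above is typically built from a GloVe file; the file name is an assumption, adjust to your setup:)
embedding_dim = 300
embeddings = {}
# Hypothetical path to the 300-d GloVe vectors.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
# One row per vocabulary index; words missing from GloVe stay all zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word2idx.items():
    vector = embeddings.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector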
Here my X is a matrix of shape (59, 139), where 59 is the number of samples and 139 is the length of the longest sentence in my text rows. These 139 values are filled with word2idx indices from my vocabulary.
y is a matrix of shape (59, 14), where 59 is as above and 14 is the length of my longest title, likewise filled with word2idx indices.
For example I want this:
Input:
array([293, 40, 294, 129, 75, 130, 129, 131, 295, 296, 132, 297, 298,
2, 299, 34, 12, 76, 300, 27, 301, 15, 1, 302, 133, 4,
77, 303, 3, 134, 304, 78, 34, 305, 11, 306, 307, 4, 1,
132, 135, 22, 10, 308, 11, 136, 4, 1, 309, 50, 4, 310,
11, 78, 311, 312, 3, 77, 1, 313, 130, 10, 137, 11, 12,
109, 7, 314, 315, 7, 1, 76, 316, 4, 317, 318, 34, 138,
319, 139, 320, 3, 77, 321, 79, 322, 4, 1, 323, 324, 4,
1, 325, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0])
Output:
array([1040, 1041, 2, 1042, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Please help me out; I have spent so many days trying to find the right approach, but I am unable to find it.

How to skip 'for' loop when dealing with numpy arrays

Here is my code:
import numpy as np
>>> x
array([[ 1, 57],
[ 2, 21],
[ 4, 34],
...,
[3348, 29],
[3350, 23],
[3353, 11]])
>>> x.shape
(1310, 2)
>>> pic # greyscale image
array([[223, 222, 225, ..., 217, 219, 214],
[224, 222, 219, ..., 220, 219, 216],
[223, 224, 220, ..., 219, 215, 213],
...,
[228, 226, 231, ..., 224, 228, 229],
[229, 227, 227, ..., 216, 225, 227],
[226, 228, 225, ..., 218, 225, 230]], dtype=uint8)
pic = np.stack((pic,pic,pic), axis=2)
>>> pic.shape
(2208, 2752, 3)
>>> labels.shape
(2208, 2752)
color = [0, 0, 255]
for i in x:
    # i[0] is a label id and i[1] is the pixel count for that label
    B = np.full((i[1], 3), color).astype('int')
    pic[labels == i[0]] = B
This colors blue (RGB 0, 0, 255) every pixel of the grayscale image (pic) whose entry in labels matches one of the ids in x. It is very slow because of the for loop (for i in x).
Is there an efficient NumPy way, without the for loop, that would therefore be much faster? Thanks for your kind help!
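Since every matching pixel gets the same color, the whole loop can collapse into one boolean mask. A minimal sketch, assuming (as above) that x[:, 0] holds the label ids to color:
# Build one mask covering every label id at once; broadcasting then
# fills each selected pixel with the color, so the counts in x[:, 1]
# are no longer needed.
mask = np.isin(labels, x[:, 0])
pic[mask] = color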

TypeError: Expected binary or unicode string

I want to use tf.data to input my image data. I have read all the images in a folder into a np.array, and then used that np.array to create a tf.data.Dataset object. However, I got a TypeError. My code is shown as follows.
import os
from scipy.misc import imread
import numpy as np
import glob
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
image = []
img_dir = 'data/ILSVRC2012_test/*'
images = np.array([np.array(imread(data)) for data in glob.glob(img_dir)])
image_data = tf.data.Dataset.from_tensor_slices(images)
And the following block is the error information (array contents truncated):
TypeError: Expected binary or unicode string, got array([[[184, 210, 225],
        [184, 210, 225],
        [184, 210, 225],
        ...,
        [160, 185, 205],
        [159, 184, 204],
        [159, 184, 204]],
       ...,
       [[ 71,  71,  17],
        [ 73,  74,  17],
        [ 77,  78,  20],
        ...,
        [ 85,  86,  20],
        [ 85,  85,  21],
        [ 75,  74,  20]]], dtype=uint8)
Any help would be appreciated!
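A common cause of this error is that the images have different shapes, so NumPy builds an object array of sub-arrays, which from_tensor_slices cannot convert into a single tensor. A minimal sketch of one usual workaround, assuming the files are JPEGs: slice over the file paths and decode inside the pipeline instead.
import glob
import tensorflow as tf
# Slice over the file names (plain strings), then read and decode each
# image lazily inside the dataset pipeline.
paths = glob.glob('data/ILSVRC2012_test/*')
image_data = (tf.data.Dataset.from_tensor_slices(paths)
              .map(lambda p: tf.image.decode_jpeg(tf.io.read_file(p), channels=3)))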

Font type in Matplotlib

I am pretty new to Matplotlib. I have been wading through their very nice webpage and managed to generate something very similar to what I wanted. I am including the code for the plot; the only issue I can't fix is getting the xlabel, ylabel, and legend in bold as well.
Any suggestions would be very welcome, many thanks.
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
import numpy as np
from matplotlib.font_manager import FontProperties
from pylab import *
font = FontProperties()
font = {'family': 'serif',
        'weight': 'bold',
        'size': 12}
fig = plt.figure(figsize=(6, 8))
matplotlib.rc('font', **font)
ax = host_subplot(211, axes_class=AA.Axes)
X=[0.061, 0.12, 0.17, 0.23, 0.29, 0.34, 0.4, 0.46, 0.51]
Y=[3.2, 4, 5.6, 7.4, 11.2, 18.6, 28.9, 42.5, 55.9]
Z=[3.2, 4.1, 5.7, 7.6, 11.3, 18.5, 27, 35.6, 46.9]
A=[3.2, 4, 5.6, 7.6, 11.3, 19.2, 30.4, 44.6, 57.7]
B=[3.2, 3.5, 4.8, 6.5, 10.4, 19.7, 32.9, 53.8, 84.2]
C=[3.1, 3.8, 5.6, 8, 13, 26.1, 41.1, 64.3, 103.7]
ax.plot(X, Y, color="red", linewidth=2.5, marker="v", markersize=7)
ax.plot(X, Z, color="orange", linewidth=2.5, marker="p", markersize=7)
ax.plot(X, A, color="yellow", linewidth=2.5, marker="s", markersize=7)
ax.plot(X, B, color="#33CC33", linewidth=2.5, marker="h", markersize=7)
ax.plot(X, C, color="green", linewidth=2.5, marker="D", markersize=7)
ax2 = ax.twin() # ax2 is responsible for "top" axis and "right" axis
ax2.set_xticks([0.061, 0.12, 0.17, 0.23, 0.29, 0.34, 0.4, 0.46, 0.51])
ax2.set_xticklabels(["4", "3", "2.4", "2", "1.9", "1.7", "1.6", "1.5", r"1.4$\AA$"])
ax2.axis["right"].major_ticklabels.set_visible(False)
ax.set_xlim(0.05, 0.52)
ax.set_ylim(0, 109)
plt.xlabel('1/d$^2$', fontsize=14, fontweight='bold')
plt.ylabel('R$_{meas}$')
plt.legend(("0.05deg/0.05s", "SUM20", "SUM40", "SUM80", "SUM160"))
fontsize=12, fancybox=True, shadow=True)
ax = host_subplot(212, axes_class=AA.Axes)
X=[0.061, 0.12, 0.17, 0.23, 0.29, 0.34, 0.4, 0.46, 0.51]
Y=[65, 62, 46, 35, 23, 13, 8, 4, 2]
Z=[65, 62, 47, 35, 23, 14, 9, 5, 3]
A=[66, 62, 47, 35, 23, 13, 8, 4, 2]
B=[71, 66, 48, 36, 23, 13, 8, 4, 2]
C=[70, 65, 48, 36, 23, 13, 8, 4, 2]
ax.plot(X, Y, color="red", linewidth=2.5, marker="v", markersize=7)
ax.plot(X, Z, color="orange", linewidth=2.5, marker="p", markersize=7)
ax.plot(X, A, color="yellow", linewidth=2.5, marker="s", markersize=7)
ax.plot(X, B, color="#33CC33", linewidth=2.5, marker="h", markersize=7)
ax.plot(X, C, color="green", linewidth=2.5, marker="D", markersize=7)
ax2 = ax.twin() # ax2 is responsible for "top" axis and "right" axis
ax2.set_xticks([0.061, 0.12, 0.17, 0.23, 0.29, 0.34, 0.4, 0.46, 0.51])
ax2.set_xticklabels(["4", "3", "2.4", "2", "1.9", "1.7", "1.6", "1.5", r"1.4$\AA$"])
ax2.axis["right"].major_ticklabels.set_visible(False)
ax.set_xlim(0.05, 0.52)
fig.subplots_adjust(hspace=0.3)
plt.xlabel('1/d$^2$', fontweight='bold')
plt.ylabel('I/sigma', fontdict=font)
plt.legend(("0.05deg/0.05s", "SUM20", "SUM40", "SUM80", "SUM160"))
plt.show()
Text objects have a weight property: http://matplotlib.org/users/text_props.html
plt.xlabel('This is my label', weight='bold')
plt.title('This is my title', weight='bold')
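For the legend specifically, one option (a sketch reusing the labels from the question) is to pass a font-properties dict, which makes the legend text bold as well:
# prop applies font settings to every legend entry.
plt.legend(("0.05deg/0.05s", "SUM20", "SUM40", "SUM80", "SUM160"),
           prop={'weight': 'bold', 'size': 12})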