Related
import numpy as np
from matplotlib import pyplot as plt
data = np.array([[0.8, 2.4, 2.5, 3.9, 0.0, 4.0, 0.0],
[2.4, 0.0, 4.0, 1.0, 2.7, 0.0, 0.0],
[1.1, 2.4, 0.8, 4.3, 1.9, 4.4, 0.0],
[0.6, 0.0, 0.3, 0.0, 3.1, 0.0, 0.0],
[0.7, 1.7, 0.6, 2.6, 2.2, 6.2, 0.0],
[1.3, 1.2, 0.0, 0.0, 0.0, 3.2, 5.1],
[0.1, 2.0, 0.0, 1.4, 0.0, 1.9, 6.3]])
plt.figure(figsize=(6, 4))
im = plt.imshow(data, cmap="YlGn")
linewidth = 2
for axis in ['top', 'bottom', 'left', 'right']:
plt.gca().spines[axis].set_linewidth(linewidth)
plt.gca().set_xticks(np.arange(data.shape[1] + 1) - .5, minor=True)
plt.gca().set_yticks(np.arange(data.shape[0] + 1) - .5, minor=True)
plt.gca().grid(which="minor", color="black", linewidth=linewidth)
plt.gca().tick_params(which="minor", bottom=False, left=False)
plt.tight_layout()
plt.gca().set_xticks(ticks=[])
plt.gca().set_yticks(ticks=[])
plt.savefig("test.pdf",
bbox_inches="tight",
transparent="True",
pad_inches=1.0/72.0 * linewidth / 2.0)
This code will output the following pdf, but you can see that there are white borders on the left and bottom, so the pdf is not centered after being inserted into LaTex. How to solve this problem?
plt result:
import numpy as np
from matplotlib import pyplot as plt
data = np.array([[0.8, 2.4, 2.5, 3.9, 0.0, 4.0, 0.0],
[2.4, 0.0, 4.0, 1.0, 2.7, 0.0, 0.0],
[1.1, 2.4, 0.8, 4.3, 1.9, 4.4, 0.0],
[0.6, 0.0, 0.3, 0.0, 3.1, 0.0, 0.0],
[0.7, 1.7, 0.6, 2.6, 2.2, 6.2, 0.0],
[1.3, 1.2, 0.0, 0.0, 0.0, 3.2, 5.1],
[0.1, 2.0, 0.0, 1.4, 0.0, 1.9, 6.3]])
plt.figure(figsize=(6, 4))
im = plt.imshow(data, cmap="YlGn")
linewidth = 2
for axis in ['top', 'bottom', 'left', 'right']:
plt.gca().spines[axis].set_linewidth(linewidth)
plt.gca().set_xticks(np.arange(data.shape[1] + 1) - .5, minor=True)
plt.gca().set_yticks(np.arange(data.shape[0] + 1) - .5, minor=True)
plt.gca().grid(which="minor", color="black", linewidth=linewidth)
plt.gca().tick_params(which="minor", bottom=False, left=False)
plt.tight_layout()
plt.gca().set_xticks(ticks=[])
plt.gca().set_yticks(ticks=[])
plt.gca().tick_params(axis="both",
which="major",
left=False,
bottom=False,
labelleft=False,
labelbottom=False)
plt.savefig("test.pdf",
bbox_inches="tight",
transparent="True",
pad_inches=1.0 / 72.0 * linewidth / 2.0)
It was an issue with ticks, solved now.
I have a large dataset with thousands of rows though fewer columns, i have ordered them by row values so that each of the 'objects' are grouped together, just like the dataset in Table1 below:
#Table1 :
data = [['ALFA', 351740.00, 0.31, 0.22, 0.44, 0.19, 0.05],
['ALFA', 401740.00, 0.43, 0.26, 0.23, 0.16, 0.09],
['ALFA', 892350.00, 0.58, 0.24, 0.05, 0.07, 0.4],
['Bravo', 511830.00, 0.52, 0.16, 0.08, 0.26, 0],
['Charlie', 590030.00, 0.75, 0.2, 0.14, 0.37, 0.06],
['Charlie', 590030.00, 0.75, 0.2, 0.27, 0.2, 0.01],
['Charlie', 590030.00, 0.75, 0.2, 0.29, 0.11, 0.04],
['Charlie', 590030.00, 0.75, 0.2, 0.27, 0.2, 0.01],
['Charlie', 401740.00, 0.43, 0.26, 0.14, 0.37, 0.06],
['Charlie', 511830.00, 0.52, 0.16, 0.13, 0.22, 0.01],
['Delta', 590030.00, 0.75, 0.2, 0.34, 0.3, 0],
['Delta', 590030.00, 0.75, 0.2, 0, 0.28, 0],
['Delta', 351740.00, 0.31, 0.22, 0.44, 0.19, 0.05],
['Echo', 892350.00, 0.58, 0.24, 0.23, 0.16, 0.09],
['Echo', 590030.00, 0.75, 0.2, 0.05, 0.07, 0.4],
['Echo', 590030.00, 0.75, 0.2, 0.08, 0.26, 0],
['Echo', 590030.00, 0.75, 0.2, 0.14, 0.37, 0.06],
['Foxtrot', 401740.00, 0.43, 0.26, 0.27, 0.2, 0.01],
['Foxtrot', 511830.00, 0.52, 0.16, 0.29, 0.11, 0.04],
['Golf', 590030.00, 0.75, 0.2, 0.27, 0.2, 0.01],
['Golf', 590030.00, 0.75, 0.2, 0.14, 0.37, 0.06],
['Golf', 351740.00, 0.31, 0.22, 0.13, 0.22, 0.01],
['Hotel', 892350.00, 0.58, 0.24, 0.34, 0.3, 0],
['Hotel', 590030.00, 0.75, 0.2, 0, 0.28, 0],
['Hotel', 590030.00, 0.75, 0.2, 0.29, 0.11, 0.04]]
df = pd.DataFrame(data, columns= ['Objects', 'Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6'])
df
However i would like to write a query to go through the dataset, partition the data by these objects and get only the averages for all the columns (for each object) in a separate table much like the Table2 below:
#Table2:
data2 = [['ALFA', 548610.00, 0.44, 0.24, 0.24, 0.14, 0.18],
['Bravo', 511830.00, 0.52, 0.16, 0.08, 0.26, 0],
['Charlie', 545615.00, 0.66, 0.20, 0.21, 0.25, 0.03],
['Delta', 510600.00, 0.60, 0.21, 0.26, 0.26, 0.02],
['Echo', 665610.00, 0.71, 0.21, 0.13, 0.22, 0.14],
['Foxtrot', 456785.00, 0.48, 0.21, 0.28, 0.16, 0.03],
['Golf', 510600.00, 0.60, 0.21, 0.18, 0.26, 0.03],
['Hotel', 690803.33, 0.69, 0.21, 0.21, 0.23, 0.01]]
df2 = pd.DataFrame(data, columns= ['Objects', 'Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6'])
df2
Please note that the number of the objects vary across the dataset so the query should be able to count the number of objects and use that number to get the average of all the columns for each object and then present all these values in a new table (much Like what partition windows function does).
For instance note that the '548610.00' alues in Table2 for ALFA(column1) is merely an addition of Column1 values of ALFA in Table1 (351740.00 + 401740.00 + 401740.00) and divide by the count of ALFA being '3'
I believe a simple avg() function should answer your question
SELECT Objects,
AVG(Column1),
AVG(Column2),
AVG(Column3),
AVG(Column4),
AVG(Column5),
AVG(Column6)
FROM tableA
GROUP BY Objects
db fiddle link
I'm trying to train LSTM model in Keras using data of variable timestep, for example, the data looks like:
<tf.RaggedTensor [[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
[[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]], ...,
[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
[[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]]>
and its corresponding label:
<tf.RaggedTensor [[6, 6], [7, 7], [8], ..., [6], [11, 11, 11, 11, 11], [24, 24, 24, 24, 24]]>
Each input data have 13 features, so for each time step, the model receives a 1 x 13 vector. I wonder if it is possible to do so? I don't mind doing this on pytorch either.
I try to align them with no reshape layer.
However, my input for each time step in the LSTM layer is a vector of dimension 13. And each sample has variable-length of these vectors, which means the time step is not constant for each sample. Can you show me a code example of how to train such model? –
TurquoiseJ
First of all, the concept of windows length and time steps is they take the same amount of the input with a higher number of length and time.
We assume the input to extract features can be divide by multiple times of windows travels along with axis, please see the attached for idea.
[Codes]:
batched_features = tf.constant( [ [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], ], shape=( 2, 1, 13 ) )
batched_labels = tf.constant( [[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]], shape=( 2, 13 ) )
dataset = tf.data.Dataset.from_tensor_slices((batched_features, batched_labels))
dataset = dataset.batch(10)
batched_features = dataset
[Sample]:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
bidirectional (Bidirectiona (None, 1, 64) 11776
l)
bidirectional_1 (Bidirectio (None, 64) 24832
nal)
dense (Dense) (None, 13) 845
=================================================================
Total params: 37,453
Trainable params: 37,453
Non-trainable params: 0
_________________________________________________________________
<BatchDataset element_spec=(TensorSpec(shape=(None, 1, 13), dtype=tf.int32, name=None), TensorSpec(shape=(None, 13), dtype=tf.int32, name=None))>
Epoch 1/100
2022-03-28 05:19:04.116345: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100
1/1 [==============================] - 8s 8s/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 0.0000e+00 - val_accuracy: 1.0000
Epoch 2/100
1/1 [==============================] - 0s 38ms/step - loss: 0.0000e+00 - accuracy: 1.0000 - val_loss: 0.0000e+00 - val_accuracy: 1.0000
Assume each windows consume about 13 level of the input :
batched_features = tf.constant( [ [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], ], shape=( 2, 1, 13 ) )
batched_labels = tf.constant( [[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]], shape=( 2, 13 ) )
Adding more windows is easy by
batched_features = tf.constant( [ [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], ], shape=( 3, 1, 13 ) )
batched_labels = tf.constant( [[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]], shape=( 3, 13 ) )
dataset = tf.data.Dataset.from_tensor_slices((batched_features, batched_labels))
dataset = dataset.batch(10)
batched_features = dataset
At least you tell me what is the purpose they can use reverse windows to have certain results. ( Apmplitues frequency )
The results will look like these for each windows :
[ Output ] : 2 and 3 Windows
# Sequence types with timestep #1:
# <BatchDataset element_spec=(TensorSpec(shape=(None, 1, 13), dtype=tf.int32, name=None), TensorSpec(shape=(None, 13), dtype=tf.int32, name=None))>
# Sequence types with timestep #2:
# <BatchDataset element_spec=(TensorSpec(shape=(None, 1, 13), dtype=tf.int32, name=None), TensorSpec(shape=(None, 13), dtype=tf.int32, name=None))>
[ Result ]:
I'm using the tf.unsorted_segment_sum method of TensorFlow and it works.
For example:
tf.unsorted_segment_sum(tf.constant([0.2, 0.1, 0.5, 0.7, 0.8]),
tf.constant([0, 0, 1, 2, 2]), 3)
Gives the right result:
array([ 0.3, 0.5 , 1.5 ], dtype=float32)
I want to get:
array([0.3, 0.3, 0.5, 1.5, 1.5], dtype=float32)
I've solved it.
data = tf.constant([0.2, 0.1, 0.5, 0.7, 0.8])
gr_idx = tf.constant([0, 0, 1, 2, 2])
y, idx, count = tf.unique_with_count(gr_idx)
group_sum = tf.segment_sum(data, gr_idx)
group_sup = tf.gather(group_sum, idx)
answer:
array([0.3, 0.3, 0.5, 1.5, 1.5], dtype=float32)
What I want to do is "mask" a subset of an array of j elements, from range 0 to k. Eg. For this array:
[0.2, 0.1, 0.3, 0.4, 0.5]
Masking the first 2 elements it becomes
[NaN, NaN, 0.3, 0.4, 0.5]
Does masked_array support this operation?
In [51]: arr=np.ma.array([0.2, 0.1, 0.3, 0.4, 0.5],mask=[True,True,False,False,False])
In [52]: print(arr)
[-- -- 0.3 0.4 0.5]
Or, if you already have a numpy array, you could use np.ma.masked_less_equal (see the link for a variety of other operations for masking particular elements):
In [53]: arr=np.array([0.2, 0.1, 0.3, 0.4, 0.5])
In [56]: np.ma.masked_less_equal(arr,0.2)
Out[57]:
masked_array(data = [-- -- 0.3 0.4 0.5],
mask = [ True True False False False],
fill_value = 1e+20)
Or, if you wish to mask the first two elements:
In [67]: arr=np.array([0.2, 0.1, 0.3, 0.4, 0.5])
In [68]: arr=np.ma.array(arr,mask=False)
In [69]: arr.mask[:2]=True
In [70]: arr
Out[70]:
masked_array(data = [-- -- 0.3 0.4 0.5],
mask = [ True True False False False],
fill_value = 1e+20)
I found this:
ma.array([1,2,3,4], mask=[1,1,0,0])
masked_array(data = [-- -- 3 4],
mask = [ True True False False],
fill_value = 999999)