Loop through fields and break - awk

My file looks like this:
1000074493 1D # # # # #
1000098165 1D # # # # #
1000105360 1D # # # # #
1000115763 1D 2D # # # #
1000345208 1D # # # # #
1000470774 1D 2D # 4D # #
1000487544 # # 3D # 5D #
1000499657 1D # # # # #
1000531456 1D # # # # #
1000561333 # # # # 5D #
I want to loop per record through fields 2:NF
print if $NF != #
and stop reading the line but continue in next line.
In other words,
find the first field after the first which isn't #, then print only the first field and that field, and skip to the next line.
So the expected result would be:
1000074493 1D
1000098165 1D
1000105360 1D
1000115763 1D
1000345208 1D
1000470774 1D
1000487544 3D
1000499657 1D
1000531456 1D
1000561333 5D
My code is:
awk '{for(i=2; i<=NF; i++) {if($i != "#" ) print $1,$i }}' $FILE
which gives me:
1000074493 1D
1000098165 1D
1000105360 1D
1000115763 1D
1000115763 2D
1000345208 1D
1000470774 1D
1000470774 2D
1000470774 4D
1000487544 3D
1000487544 5D
1000499657 1D
1000531456 1D
1000561333 5D
What do I need to change?

Like your original question articulation should already have suggested, the keyword you are looking for is break.
awk '{for(i=2; i<=NF; i++) if($i != "#" ) { print $1,$i; break }}' "$FILE"
Demo: https://ideone.com/hWRM9K
As an aside, avoid useless uses of cat and use lower case for your private variables, and quote file name variables.

With awk you can do this:
awk '{gsub(/#/,""); print $1,$2}' file
1000074493 1D
1000098165 1D
1000105360 1D
1000115763 1D
1000345208 1D
1000470774 1D
1000487544 3D
1000499657 1D
1000531456 1D
1000561333 5D

the following applied to your file gave me your expected result
awk '{i=2; while ($i == "#") i++; print $1 " " $i}' $FILE

Found out finally by myself:
awk '{for(i=2; i<=NF; i++) {if($i != "#" ) print $1,$i }}' $FILE|awk '$1 != p {print $1,$2}{p=$1}'
if anyone knows how to combine both awk statements in one, it would be appreciated!

Related

Numba "LLVM IR parsing error" seemingly because numba.complex128[:,:].shape has a default type?

Please bear with the long question.
Numba encounters "LLVM IR parsing error" in my code, seemingly due to default typing of np.complex128.shape, but I could not find any documentation saying that np.complex128.shape, numba.complex128.shape or prange have default types.
Minimal workable reproduction:
import numpy as np
from numba import jit, njit, prange
from numba import complex128, int32 # import jit value types
# Invert an (n,n) submatrix of a (m>n,n) rectangular matrix by taking the first
# n rows. "Taking the first n rows" is motivated by the RHS being rank n.
#
# -- Input --
# (m,n) matrix A
#
# -- Return --
# (m,m) matrix A_inv
@njit(complex128[:,:](complex128[:,:]))
def inv_square_jit(in_matrix):
    """Invert the leading square block of a tall matrix and zero-pad the result.

    The first ``n_col`` rows of ``in_matrix`` form an (n_col, n_col) matrix
    that is inverted with ``np.linalg.inv``.  The inverse is written into the
    top-left corner of an otherwise zero (n_row, n_row) complex128 array,
    which is returned.

    Raises ValueError if the input is not 2-d, or if it does not have strictly
    more rows than columns.
    """
    if in_matrix.ndim != 2:
        raise ValueError("Input should be 2d array")
    n_row, n_col = in_matrix.shape
    if n_row <= n_col:
        raise ValueError("Input should have more rows than cols")
    # Drop the extra rows so that the remaining block is square.
    square_inverse = np.linalg.inv(in_matrix[:n_col, :])
    side = len(square_inverse)
    result = np.zeros((n_row, n_row), dtype=np.complex128)
    result[:side, :side] = square_inverse
    return result
# Solve degenerate underdetermined equation system
# -- Input --
# (m,n+1), (m,n), rank-n 2d np arrays A, B
# n-dim np array-like vb
# or
# (m,n+1), rank n+1 A,
# m-dim np array-like v_rhs
#
# vb can be any array-like item, and is not necessarily 1d.
# Implemented with ChiPhiFunc in mind.
#
# -- Return --
# n+1 np array-like va
#
# -- Note --
# For recursion relations with ChiPhiFunc's, A and B should come from
# convolution matrices. That still needs implementation.
@njit(complex128[:](complex128[:,:], complex128[:]))
def solve_degenerate_jit(A, v_rhs):
    """Solve the degenerate system ``A @ va = v_rhs`` for ``va``.

    ``A`` is a 2-d array with more rows than columns and ``v_rhs`` is a 1-d
    array with one entry per row of ``A``.  The padded inverse returned by
    ``inv_square_jit(A)`` is applied to ``v_rhs`` and the first ``A.shape[1]``
    entries of the product are returned.

    Raises ValueError if ``A`` and ``v_rhs`` have different numbers of rows.
    """
    n_dim = A.shape[1]
    if A.shape[0] != v_rhs.shape[0]:
        raise ValueError("solve_underdetermined: A, v_rhs must have the same number of rows")
    A_inv = np.ascontiguousarray(inv_square_jit(A))
    # The product is m-dimensional, with m-n blank elements at the end; only
    # the leading n_dim entries carry the solution.  The operator here must be
    # the matrix product `@` (a stray `#` would comment out the rest of the
    # line and leave the parenthesis unbalanced).
    va = (A_inv @ np.ascontiguousarray(v_rhs))[:n_dim]
    return va
# @njit(complex128[:](complex128[:,:], complex128[:,:], complex128[:]))
# def solve_degenerate_jit(A, B, vb):
#     B_cont = np.ascontiguousarray(B)
#     vb_cont = np.ascontiguousarray(vb)
#     return(solve_degenerate_jit(A, B_cont@vb_cont))
# Generate convolution operator from a for an n_dim vector.
@njit(complex128[:,:](complex128[:], int32))
def conv_matrix(vec, n_dim):
    """Return the matrix that convolves ``vec`` with an ``n_dim``-long vector.

    The result has shape (len(vec) + n_dim - 1, n_dim) and dtype complex128:
    column ``k`` is a copy of ``vec`` shifted down by ``k`` rows, with zeros
    everywhere else.
    """
    vec_len = len(vec)
    shifted_rows = np.zeros((n_dim, vec_len + n_dim - 1), dtype=np.complex128)
    for shift in prange(n_dim):
        # Row `shift` of the transposed operator is vec moved right by `shift`.
        shifted_rows[shift, shift:shift + vec_len] = vec
    return shifted_rows.T
# For solving a*va = v_rhs, where va, vb have the same number of dimensions.
# In the context below, "#dim" represents number of chi mode components.
#
# -- Input --
# v_source_A: 2d matrix, content of ChiPhiFuncGrid, #dim = a
# v_rhs: 2d matrix, content of ChiPhiFuncGrid, #dim = m
# rank_rhs: int, rank of v_rhs (and correct answer)
# -- Output --
# va: 2d matrix, content of ChiPhiFuncGrid. Has #dim = rank_rhs
@njit(complex128[:,:](complex128[:,:], complex128[:,:], int32), parallel=True)
def batch_degen_jit(v_source_A, v_rhs, rank_rhs):
    """Deconvolve ``v_rhs`` by ``v_source_A`` independently at every phi grid point.

    Both inputs are 2-d arrays whose axis 0 is the chi mode and whose axis 1
    is the phi grid.  For each phi index the convolution matrix of the
    corresponding column of ``v_source_A`` is built with ``conv_matrix`` and
    the system is solved with ``solve_degenerate_jit``.  The result has
    ``rank_rhs`` rows (chi modes) and one column per phi grid point.

    Raises ValueError if the two inputs have different numbers of phi grid
    points, or if ``len(v_source_A) + rank_rhs - 1 != len(v_rhs)``.
    """
    # Contiguous transposes: axis 0 becomes the phi grid, axis 1 the chi mode.
    a_by_phi = np.ascontiguousarray(v_source_A.T)
    rhs_by_phi = np.ascontiguousarray(v_rhs.T)
    solution_by_phi = np.zeros((len(a_by_phi), rank_rhs), dtype=np.complex128)
    if len(a_by_phi) != len(rhs_by_phi):
        raise ValueError('batch_underdetermined_deconv: A, v_rhs must have the same number of phi grids.')
    if len(v_source_A) + rank_rhs - 1 != len(v_rhs):
        raise ValueError('batch_underdetermined_deconv: #dim_A + rank_rhs - 1 = #dim_v_rhs must hold.')
    for i in prange(len(a_by_phi)):
        conv_matrix_i = conv_matrix(a_by_phi[i], rank_rhs)
        # ********** Removing this line somehow makes it compile **********
        solution_by_phi[i, :] = solve_degenerate_jit(conv_matrix_i, rhs_by_phi[i])
        # ********** Removing this line somehow makes it compile **********
    return solution_by_phi.T
The code compiles fine with parallel=False for the last method. However, with parallel=True, error occurs in for i in prange(len(A_slices)): of def batch_degen_jit(v_source_A, v_rhs, rank_rhs):, seemingly because solve_degenerate_jit(complex128[:,:], complex128[:], int32) accepts int32, but the prange(len(A_slices)) produces int64. Replacing all int32 with int64 solves the problem. Removing the *-marked line also makes it compile.
Error:
LoweringError: Failed in nopython mode pipeline (step: nopython mode backend)
Failed in nopython mode pipeline (step: nopython mode backend)
LLVM IR parsing error
<string>:1278:34: error: '%.777' defined with type 'i64' but expected 'i32'
%".778" = icmp eq i32 %".776", %".777"
^
File "<ipython-input-24-fa65c2d527fa>", line 104:
def batch_degen_jit(v_source_A, v_rhs, rank_rhs):
<source elided>
raise ValueError('batch_underdetermined_deconv: #dim_A + rank_rhs - 1 = #dim_v_rhs must hold.')
for i in prange(len(A_slices)):
^
During: lowering "id=17[LoopNest(index_variable = parfor_index.1805, range = (0, $10call_method.4_size0.1767, 1))]{120: <ir.Block at <ipython-input-24-fa65c2d527fa> (104)>}Var(parfor_index.1805, <ipython-input-24-fa65c2d527fa>:104)" at <ipython-input-24-fa65c2d527fa> (104)
Why is this the case?
Thank you!
(P.S. here's a test case for the methods:
# Build a right-hand side with a known answer: every row of `rhs` is the
# convolution of the matching rows of `convolver` and `correct_answer`.
convolver = np.random.rand(10, 3)
correct_answer = np.random.rand(10, 5)
rhs = np.zeros((10, 7))
for row, (kernel, answer) in enumerate(zip(convolver, correct_answer)):
    rhs[row] = np.convolve(kernel, answer)
# The printed result is expected to reproduce correct_answer.T.
print(batch_degen_jit(np.complex128(convolver).T, np.complex128(rhs).T, 5))
)

How to avoid overdispersed Poisson regression overfitting?

I have a dataset including three variables including company id (there are 96 companies), expert id (there are 38 experts) and points given by experts to companies. Points are discrete values from 0 to 100. I tried fitting an overdispersed poisson to model points given by the experts. But I don't know why the model overfits although I am using a linear likelihood. Here is my JAGS code:
model_code <- "
# Overdispersed Poisson model: every score y[i] is Poisson with rate exp(mu[i]).
model
{
# Likelihood
for (i in 1:N) {
y[i] ~ dpois(exp(mu[i]))
# mu[i] is a separate latent node for every observation, centred on the sum of
# the effect indexed by company[i] and the effect indexed by expert[i].
# The second argument of dnorm is a precision, hence sigma^-2.
mu[i] ~ dnorm(alpha[company[i]] + beta[expert[i]] , sigma^-2)
}
# Priors
# alpha[1..J]: one effect per level of the company index.
for (j in 1:J){
alpha[j] ~ dnorm (mu.a, sigma.a^-2)
}
# beta[1..K]: one effect per level of the expert index.  NOTE(review): this
# reuses the same hyperparameters mu.a and sigma.a as alpha -- confirm that
# sharing them is intended.
for (k in 1:K){
beta[k] ~ dnorm (mu.a, sigma.a^-2)
}
# Uniform(0, 100) hyperpriors on the shared mean and on both standard deviations.
mu.a ~ dunif (0, 100)
sigma.a ~ dunif (0, 100)
sigma ~ dunif(0, 100)
}
"
Anyone knows why this model overfits and how to fix it?

WinBUGS Examples Vol 1, Dyes example returns error

Currently going through examples volume 1 and came across an error with the dyes example.
When I try to load inits from the example it returns "this chain contains uninitialized variables". I am not sure which part of it is not right, as at first sight theta, tau.btw and tau.with are all specified and nothing is left out.
I am using the code directly from Examples Vol 1 under help tab. The same error happened to all three choices of priors for between-variation.
I would really appreciate any advice on the problem. Thanks in advance.
Below is the code I copied directly from the dyes example.
# Hierarchical normal model: batch means mu[i] are drawn around theta, and the
# observations y[i, j] of batch i are drawn around mu[i].
model
{
for( i in 1 : batches ) {
# mu[i] is a stochastic node in its own right (it is given a distribution here).
mu[i] ~ dnorm(theta, tau.btw)
for( j in 1 : samples ) {
y[i , j] ~ dnorm(mu[i], tau.with)
}
}
theta ~ dnorm(0.0, 1.0E-10)
# prior for within-variation
sigma2.with <- 1 / tau.with
tau.with ~ dgamma(0.001, 0.001)
# Choice of priors for between-variation
# (exactly one of the three alternatives below is left uncommented; here Prior 2)
# Prior 1: uniform on SD
#sigma.btw~ dunif(0,100)
#sigma2.btw<-sigma.btw*sigma.btw
#tau.btw<-1/sigma2.btw
# Prior 2: Uniform on intra-class correlation coefficient,
# ICC=sigma2.btw / (sigma2.btw+sigma2.with)
ICC ~ dunif(0,1)
sigma2.btw <- sigma2.with *ICC/(1-ICC)
tau.btw<-1/sigma2.btw
# Prior 3: gamma(0.001, 0.001) NOT RECOMMENDED
#tau.btw ~ dgamma(0.001, 0.001)
#sigma2.btw <- 1 / tau.btw
}
Data
list(batches = 6, samples = 5,
y = structure(
.Data = c(1545, 1440, 1440, 1520, 1580,
1540, 1555, 1490, 1560, 1495,
1595, 1550, 1605, 1510, 1560,
1445, 1440, 1595, 1465, 1545,
1595, 1630, 1515, 1635, 1625,
1520, 1455, 1450, 1480, 1445), .Dim = c(6, 5)))
Inits1
list(theta=1500, tau.with=1, sigma.btw=1)
Inits2
list(theta=1500, tau.with=1,ICC=0.5)
Inits3
list(theta=1500, tau.with=1, tau.btw=1)
That is not an error per se. Yes you have provided the inits for the parameters of interest.
However there are the six mu[i] variables that are not data, but are variables drawn from mu[i] ~ dnorm(theta, tau.btw).
You could provide initial values for these as well, but it is best imo to just click on gen inits if you are using WinBUGS from the GUI - this will provide initial values for those.

Comparison of GradientDescent algorithm in tensorflow with the implementation of Michael Nielsen

First I will give an overview of my problem. I have two setups:
1) A net which is based on tensorflow
2) A net which is based on code from Michael Nielsen's Book http://neuralnetworksanddeeplearning.com/index.html
Both nets are completely equal. They both have
3 hidden layers a 30 neurons
2 inputs neurons, one output neuron
All activations are sigmoid
Stochastic Gradient descent algorithm as learning algorithm with eta=3.0
quadratic cost function : cost_function = tf.scalar_mul(1.0/(N_training_set*2.0),tf.reduce_sum(tf.squared_difference(y,y_)))
batch_size of 10
weight initialization: The weights which connect the lth and l+1th layer are initialized with sigma=1/sqrt(N_l), where N_l is the number of neurons in the lth layer.
My problem is, that the tensorflow results are very bad ( a factor 10 worse than the results one obtains if I use the Nielsen code).
So before I post my complete code: Does anybody know whether there is a bug in the tensorflow StochasticGradientDescent algorithm? (Or does anybody have a reference for how the learning rate of the Stochastic Gradient Descent in tensorflow is defined? I cannot find anything in the API.)
Here is my code for the tensorflow net:
regression.py
import readData
import matplotlib.pyplot as plt
import numpy as np
from random import randint
import random
from root_numpy import fill_hist
from ROOT import TCanvas, TH2F, TText, TF1 ,TH1D
import ROOT
import tensorflow as tf
import math
# # # # # # ##
#Read in data#
# #
# Flags handed through to readData.read_data_set.  Up to now the "function"
# applied to the inputs / outputs is just a normalisation; the network is
# trained on the modified (normalised) values.
function_inputs = True
function_outputs = True

# The ROOT file and histogram name are shared by all three data sets.
_ROOT_PATH = "./TH2D_A00_TB10.root"
_HIST_NAME = "LHCChi2_CMSSM_nObs1061_A00_TB10"

full_set = readData.read_data_set(
    _ROOT_PATH, _HIST_NAME, "full_set", function_inputs, function_outputs)

# Hold back a fixed number of points for validation; train on the rest.
N_full_set = full_set.get_N()
N_validation_set = 10000
N_training_set = N_full_set - N_validation_set

# Random split: shuffle all indices once, then cut the shuffled list in two.
full = range(0, N_full_set)
random.shuffle(full)
training_subset = full[:N_training_set]  # indices for training set
validation_subset = full[N_training_set:N_training_set + N_validation_set]  # indices for validation set

training_set = readData.read_data_set(
    _ROOT_PATH, _HIST_NAME, "training_set",
    function_inputs, function_outputs,
    full_set=full_set, subset=training_subset)
validation_set = readData.read_data_set(
    _ROOT_PATH, _HIST_NAME, "validation_set",
    function_inputs, function_outputs,
    full_set=full_set, subset=validation_subset)
#overwiew of full data set, training_data set and validation_data set. The modified members( normalized in this case) can be accessed with the x_mod() and y_mod() member functions
#the normalized data (input and output) will be used to train the net
print "full_data_set:"
print "x (inputs)"
print full_set.get_x()
print "y (outputs)"
print full_set.get_y()
print "x_mod"
print full_set.get_x_mod()
print "y_mod"
print full_set.get_y_mod()
print "------------------"
print "training_data_set:"
print "x (inputs)"
print training_set.get_x()
print "y (outputs)"
print training_set.get_y()
print "x_mod"
print training_set.get_x_mod()
print "y_mod"
print training_set.get_y_mod()
print "------------------"
print "evaluation_data_set:"
print "x (inputs)"
print validation_set.get_x()
print "y (outputs)"
print validation_set.get_y()
print "x_mod"
print validation_set.get_x_mod()
print "y_mod"
print validation_set.get_y_mod()
print "------------------"
# # # # # # # # # # # ##
#setting up the network#
# #
# Training hyper-parameters.
N_epochs = 20
learning_rate = 3.0
batch_size = 10
# Layer widths: N1 inputs, three hidden layers, N5 outputs.
N1 = 2 #equals N_inputs
N2 = 30
N3 = 30
N4 = 30
N5 = 1
N_in = N1
N_out = N5
# Everything is computed for all elements of a batch at once.  English summary
# of the example below: with N_in=2, N_out=3 and a mini batch of 5, the output
# matrix has 5 rows (one per example of the mini batch) and 3 columns (one per
# output neuron); b2 is added to every row of x_in*W2, and W2 is the transpose
# of the matrix defined in the book.
"""example: N_in=2,N_out=3, mini_batch_size=5, activation function=linear. In der output matrix gibt es 5Zeilen,jede fuer ein mini batch. Jede Zeile hat 3 Spalten fuer ein output neuron jeweils
W2
[[-0.31917086 -0.03908769 0.5792625 ]
[ 1.34563279 0.03904691 0.39674851]]
b2
[ 0.40960133 -0.5495823 -0.97048181]
x_in
[[ 23.2 12.2 ]
[ 0. 1.1 ]
[ 2.3 3.3 ]
[ 23.22222 24.44444]
[ 333. 444. ]]
y=x_in*W2+b2
[[ 9.42155647 -0.98004436 17.30874062]
[ 1.88979745 -0.50663072 -0.53405845]
[ 4.1160965 -0.51062918 1.67109203]
[ 25.8909874 -0.50280523 22.17957497]
[ 491.5866394 3.77104688 368.08026123]]
hier wird klar, dass b2 auf jede Zeile der Matrix x_in*w2 draufaddiert wird.
W2 ist die transponierte der atrix, die im Buch definiert ist.
"""


def _sigmoid_layer(inputs, n_in, n_out):
    """Create one fully connected sigmoid layer; return (weights, biases, activation).

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 1/sqrt(n_in), i.e. scaled by the number of neurons in the layer
    before; biases use tf.random_normal's defaults.
    """
    weights = tf.Variable(
        tf.random_normal([n_in, n_out], mean=0.0, stddev=1.0 / math.sqrt(n_in * 1.0)))
    biases = tf.Variable(tf.random_normal([n_out]))
    return weights, biases, tf.sigmoid(tf.matmul(inputs, weights) + biases)


# No fixed batch dimension on the placeholders: the same graph is fed with
# mini batches during training and with whole data sets for evaluation.
x = tf.placeholder(tf.float32, [None, N1])
W2, b2, a2 = _sigmoid_layer(x, N1, N2)  # x plays the role of a1
W3, b3, a3 = _sigmoid_layer(a2, N2, N3)
W4, b4, a4 = _sigmoid_layer(a3, N3, N4)
W5, b5, y = _sigmoid_layer(a4, N4, N5)
y_ = tf.placeholder(tf.float32, [None, N_out])
# # # # # # # # # # # # # #
#initializing and training#
# #
cost_function = tf.scalar_mul(1.0/(N_training_set*2.0),tf.reduce_sum(tf.squared_difference(y,y_)))
error_to_desired_output= y-y_
abs_error_to_desired_output= tf.abs(y-y_)
sum_abs_error_to_desired_output= tf.reduce_sum(tf.abs(y-y_))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
init = tf.initialize_all_variables()
#launch the graph
sess = tf.Session()
sess.run(init)
N_training_batch=training_set.get_N()/batch_size #rounds to samllest integer
out_mod_validation=[0]*N_epochs # output of net, when inputting x_mod of validation data. Will be saved after each epoch.
error_mod_validation_data= [0]*N_epochs #absolute error on mod validation data after each epoch
diff_mod_validation=[0]*N_epochs # error vector of validation data after each epoch. i.e. y-y_
cost_training_data=[0]*N_epochs
for i in range(0,N_epochs):
for j in range(0,N_training_batch):
batch_xs, batch_ys, epochs_completed = training_set.next_batch(batch_size)#always gives the modified x's and y's. If one does not want to modifie them the function has to be set to identity
sess.run(train_step, feed_dict={x: batch_xs,
y_: batch_ys})
cost_training_data[i]=sess.run(cost_function, feed_dict={
x: training_set.get_x_mod(), y_: training_set.get_y_mod()})
out_mod_validation[i]= sess.run(y, feed_dict={
x: validation_set.get_x_mod()})# output of net, when imputting x_mod of validation data after each training epoch
diff_mod_validation[i]=sess.run(error_to_desired_output, feed_dict={
x: validation_set.get_x_mod(),y_: validation_set.get_y_mod()})
error_mod_validation_data[i]=sess.run(sum_abs_error_to_desired_output, feed_dict={
x: validation_set.get_x_mod(),y_: validation_set.get_y_mod()})
print "epochs completed: "+str(i)
#now calculate everything for the unmodified/unnormalized outputs
out_validation=[0]*N_epochs # output of net, when inputting x_mod of validation data and making the normalization of the output backwards, saved after each epoch
error_validation_data=[0.0]*N_epochs
diff_validation=[0.0]*N_epochs
#make the transformation on the outputs backwards
for i in range(0,N_epochs):
out_validation[i]=np.ndarray(shape=(validation_set.get_N(),1))
for j in range(0,len(out_mod_validation[i])):
out_validation[i][j]=out_mod_validation[i][j]#do this, because otherwise we will produce only a reference
readData.apply_inverse_function_to_outputs(out_mod_validation[i],out_validation[i],full_set.get_y_max())# second argument will be changed!
diff_validation[i]=np.subtract(out_validation[i],validation_set.get_y())
error_validation_data[i]=np.sum(np.absolute(np.subtract(out_validation[i],validation_set.get_y())))
#print at 10 examples how good the output matches the desired output
for i in range(0,10):
print "desired output"
print validation_set.get_y()[i][0]
print "actual output after last training epoch"
print out_validation[-1][i][0]
print "-------"
print "total error on validation_data set after last training"
print error_validation_data[-1]
# # # # ##
#printing#
# #
# Epoch numbers 1..N_epochs for the x axis of both plots.  Built with range()
# rather than `[x+1 for x in ...]`: in Python 2 a list comprehension leaks its
# loop variable, which would silently overwrite the TensorFlow placeholder `x`.
x_range = list(range(1, N_epochs + 1))

# Figure 1: cost on the (modified) training data after each epoch.
plt.figure(1)
plt.title("Costfunction of (modified) Training-data")
plt.xlabel("epochs")
plt.ylabel("cost function")
plt.plot(x_range, cost_training_data)
plt.savefig("cost_on_training_data.png")

# Figure 2: summed absolute error on the validation data after each epoch.
plt.figure(2)
plt.title("f data")
plt.xlabel("epochs")
plt.ylabel("total error on validation data")
plt.plot(x_range, error_validation_data)
plt.savefig("error_on_val_data.png")

# ROOT histogram of the per-example errors after the last training epoch.
error_on_validation_data_after_training = diff_validation[-1].reshape((1, validation_set.get_N()))
hist = TH1D('hist', "Errors on val data after last training epoch", 200, -10000, 10000)
fill_hist(hist, error_on_validation_data_after_training[0])
canvas = TCanvas()
hist.GetXaxis().SetTitle("desired Chi^2- outputted Chi^2")
hist.Draw()
canvas.SaveAs('error_on_val_data_hist.png')
readData.py
import numpy as np
import root_numpy
from ROOT import TFile, TH2D, TCanvas
import itertools
def apply_function_to_inputs(x,x_mod,x_max):
    """Normalise the inputs: store ``x[i][j] / x_max[j]`` in ``x_mod[i][j]``.

    ``x_mod`` is modified in place (arguments are passed by reference) and
    nothing is returned.
    """
    for row_idx, row in enumerate(x):
        for col_idx, value in enumerate(row):
            x_mod[row_idx][col_idx] = value / x_max[col_idx]
def apply_inverse_function_to_inputs(x,x_mod,x_max):
    """Undo the input normalisation: store ``x[i][j] * x_max[j]`` in ``x_mod[i][j]``.

    ``x_mod`` is modified in place (arguments are passed by reference) and
    nothing is returned.
    """
    for row_idx, row in enumerate(x):
        for col_idx, value in enumerate(row):
            x_mod[row_idx][col_idx] = value * x_max[col_idx]
def apply_function_to_outputs(y,y_mod,y_max):
    """Normalise the outputs: store ``y[i][j] / y_max[j]`` in ``y_mod[i][j]``.

    ``y_mod`` is modified in place (arguments are passed by reference) and
    nothing is returned.
    """
    for row_idx, row in enumerate(y):
        for col_idx, value in enumerate(row):
            y_mod[row_idx][col_idx] = value / y_max[col_idx]
def apply_inverse_function_to_outputs(y,y_mod,y_max):
    """Undo the output normalisation: store ``y[i][j] * y_max[j]`` in ``y_mod[i][j]``.

    ``y_mod`` is modified in place (arguments are passed by reference) and
    nothing is returned.
    """
    for row_idx, row in enumerate(y):
        for col_idx, value in enumerate(row):
            y_mod[row_idx][col_idx] = value * y_max[col_idx]
class Dataset(object):
    """(x, y) samples taken from a 2-d ROOT histogram, with mini-batch access.

    Each sample has two inputs (the x and y bin centres) and one output (the
    bin content).  Next to the raw values the class keeps "modified" copies
    ``x_mod`` / ``y_mod``, which are the raw values divided by the per-column
    maxima ``x_max`` / ``y_max``.

    Example of the stored arrays:

        self._x   np.ndarray(shape=(N_points, 2))
            [[   10.    95.]
             [   10.   100.]
             ...,
             [ 2490.  1195.]]
        self._y   np.ndarray(shape=(N_points, 1))
            [[ 0.00000000e+00]
             ...,
             [ 6.34848448e-06]]
    """

    def __init__(self,path,hist_name,kind_of_set,function_inputs,function_outputs,full_set,subset):
        """Load the full histogram or copy a subset out of an existing full set.

        path, hist_name: ROOT file and name of the histogram inside it (the
            file is opened for every kind of set).
        kind_of_set: "full_set" reads every bin of the histogram;
            "training_set", "validation_set" and "test_set" copy the rows
            listed in ``subset`` from ``full_set``.
        function_inputs, function_outputs: accepted but not used here.
        full_set: Dataset to copy from (subset kinds only).
        subset: sequence of row indices into ``full_set`` (subset kinds only).

        Raises ValueError for any other ``kind_of_set``.
        """
        self._kind_of_set = kind_of_set
        rfile = TFile(path)
        histogram = rfile.Get(hist_name)
        # now prepare data for training:
        if kind_of_set == "full_set":
            self._load_full_set(histogram)
        elif kind_of_set in ("training_set", "validation_set", "test_set"):
            self._copy_subset(full_set, subset)
        else:
            raise ValueError("unknown kind_of_set: %r" % (kind_of_set,))
        # -1 marks "unknown" when the set has no entries.
        self._N_input = len(self._x[0]) if len(self._x) else -1
        self._N_output = len(self._y[0]) if len(self._y) else -1
        # If 2 mini batches of the epoch have been handed out already, this
        # is 2*batch_size.
        self._index_in_epoch = 0
        self._epochs_completed = 0

    def _load_full_set(self, histogram):
        """Fill the arrays from every regular bin of ``histogram``."""
        x_axis = histogram.GetXaxis()
        y_axis = histogram.GetYaxis()
        n_x_bins = x_axis.GetNbins()
        n_y_bins = y_axis.GetNbins()
        n_points = n_x_bins * n_y_bins  # number of points in the full set
        self._N = n_points
        self._y = np.ndarray(shape=(n_points, 1))
        self._x = np.ndarray(shape=(n_points, 2))
        self._y_mod = np.ndarray(shape=(n_points, 1))  # function applied to outputs
        self._x_mod = np.ndarray(shape=(n_points, 2))  # function applied to inputs
        # The maxima start at 0.0 and are only computed for the full set.
        self._y_max = np.zeros(1)
        self._x_max = np.zeros(2)
        i = 0
        # ROOT numbers regular bins 1..nbins; bin 0 is the underflow bin and
        # nbins+1 the overflow bin, so the loops must run from 1 to nbins.
        for x_bin in range(1, n_x_bins + 1):
            for y_bin in range(1, n_y_bins + 1):
                self._x[i][0] = x_axis.GetBinCenter(x_bin)
                self._x[i][1] = y_axis.GetBinCenter(y_bin)
                self._y[i][0] = histogram.GetBinContent(x_bin, y_bin)
                for j in range(len(self._x[i])):
                    if self._x[i][j] > self._x_max[j]:
                        self._x_max[j] = self._x[i][j]
                for j in range(len(self._y[i])):
                    if self._y[i][j] > self._y_max[j]:
                        self._y_max[j] = self._y[i][j]
                i += 1
        # apply function to inputs and outputs (currently a normalisation)
        apply_function_to_inputs(self._x, self._x_mod, self._x_max)
        apply_function_to_outputs(self._y, self._y_mod, self._y_max)

    def _copy_subset(self, full_set, subset):
        """Copy the rows ``subset`` of ``full_set``; the maxima are shared with it."""
        rows = np.asarray(subset, dtype=np.intp)
        self._N = len(subset)  # number of elements of the data set
        # Indexing with an integer array returns copies, not views.
        self._x = full_set.get_x()[rows]
        self._y = full_set.get_y()[rows]
        self._x_mod = full_set.get_x_mod()[rows]
        self._y_mod = full_set.get_y_mod()[rows]
        self._y_max = full_set.get_y_max()
        self._x_max = full_set.get_x_max()

    def get_N_input_nodes(self):
        """Return the number of inputs per sample (-1 for an empty set)."""
        return self._N_input

    def get_N_output_nodes(self):
        """Return the number of outputs per sample (-1 for an empty set)."""
        return self._N_output

    def get_N(self):
        """Return the number of samples."""
        return self._N

    def get_x(self):
        """Return the raw inputs."""
        return self._x

    def get_y(self):
        """Return the raw outputs."""
        return self._y

    def get_x_max(self):
        """Return the per-column maxima used to normalise the inputs."""
        return self._x_max

    def get_y_max(self):
        """Return the per-column maxima used to normalise the outputs."""
        return self._y_max

    def get_x_mod(self):
        """Return the modified (normalised) inputs."""
        return self._x_mod

    def get_y_mod(self):
        """Return the modified (normalised) outputs."""
        return self._y_mod

    def next_batch(self, batch_size, fake_x=False):
        """Return the next mini batch as ``(x_mod, y_mod, epochs_completed)``.

        Batches are consecutive slices of the modified data.  Once fewer than
        ``batch_size`` samples are left, the epoch counter is incremented, all
        arrays are shuffled with one common permutation and the first batch
        of the new epoch is returned.  ``fake_x`` is accepted but unused.

        Raises AssertionError if ``batch_size`` is larger than the data set.
        """
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        # Strictly greater: a batch that ends exactly on the last sample is
        # still a complete batch and must be handed out before reshuffling.
        if self._index_in_epoch > self._N:
            # Finished epoch
            self._epochs_completed += 1
            # Shuffle raw and modified data alike so that they stay aligned.
            perm = np.arange(self._N)
            np.random.shuffle(perm)
            self._x = self._x[perm]
            self._y = self._y[perm]
            self._x_mod = self._x_mod[perm]
            self._y_mod = self._y_mod[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            # An exception is thrown if batch_size is larger than self._N.
            assert batch_size <= self._N
        end = self._index_in_epoch
        return self._x_mod[start:end], self._y_mod[start:end], self._epochs_completed
def read_data_set(path,hist_name,kind_of_set,function_inputs,function_outputs,full_set=None,subset=None):
    """Build and return a ``Dataset`` from the given arguments.

    ``full_set`` and ``subset`` default to None; all arguments are forwarded
    unchanged, in order, to the ``Dataset`` constructor.
    """
    dataset = Dataset(
        path,
        hist_name,
        kind_of_set,
        function_inputs,
        function_outputs,
        full_set,
        subset,
    )
    return dataset
I have uploaded the corresponding data input file to
https://github.com/kanban1992/GradientDescent_Comparison

word2vec, sum or average word embeddings?

I'm using word2vec to represent a small phrase (3 to 4 words) as a unique vector, either by adding each individual word embedding or by calculating the average of word embeddings.
From the experiments I've done I always get the same cosine similarity. I suspect it has to do with the word vectors generated by word2vec being normed to unit length (Euclidean norm) after training? or either I have a BUG in the code, or I'm missing something.
Here is the code:
import numpy as np
from nltk import PunktWordTokenizer
from gensim.models import Word2Vec
from numpy.linalg import norm
from scipy.spatial.distance import cosine
def pattern2vector(tokens, word2vec, AVG=False):
    """Combine the embeddings of ``tokens`` into one vector.

    tokens: sequence of word strings; each is stripped before the lookup.
    word2vec: model exposing ``layer1_size`` (the vector dimension) and
        ``word2vec[word]`` lookup that raises KeyError for unknown words.
    AVG: if true, return the mean of the found vectors instead of their sum.

    Words missing from the model are skipped.  If no word is found (or
    ``tokens`` is empty) the zero vector is returned; the average is only
    taken when at least one word was found, so the result never contains NaN.
    A new array is always returned, never the model's own storage.
    """
    pattern_vector = np.zeros(word2vec.layer1_size)
    n_words = 0
    for token in tokens:
        try:
            vector = word2vec[token.strip()]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing.
            continue
        pattern_vector = np.add(pattern_vector, vector)
        n_words += 1
    if AVG and n_words > 0:
        pattern_vector = np.divide(pattern_vector, n_words)
    return pattern_vector
def main():
print "Loading word2vec model ...\n"
word2vecmodelpath = "/data/word2vec/vectors_200.bin"
word2vec = Word2Vec.load_word2vec_format(word2vecmodelpath, binary=True)
pattern_1 = 'founder and ceo'
pattern_2 = 'co-founder and former chairman'
tokens_1 = PunktWordTokenizer().tokenize(pattern_1)
tokens_2 = PunktWordTokenizer().tokenize(pattern_2)
print "vec1", tokens_1
print "vec2", tokens_2
p1 = pattern2vector(tokens_1, word2vec, False)
p2 = pattern2vector(tokens_2, word2vec, False)
print "\nSUM"
print "dot(vec1,vec2)", np.dot(p1,p2)
print "norm(p1)", norm(p1)
print "norm(p2)", norm(p2)
print "dot((norm)vec1,norm(vec2))", np.dot(norm(p1),norm(p2))
print "cosine(vec1,vec2)", np.divide(np.dot(p1,p2),np.dot(norm(p1),norm(p2)))
print "\n"
print "AVG"
p1 = pattern2vector(tokens_1, word2vec, True)
p2 = pattern2vector(tokens_2, word2vec, True)
print "dot(vec1,vec2)", np.dot(p1,p2)
print "norm(p1)", norm(p1)
print "norm(p2)", norm(p2)
print "dot(norm(vec1),norm(vec2))", np.dot(norm(p1),norm(p2))
print "cosine(vec1,vec2)", np.divide(np.dot(p1,p2),np.dot(norm(p1),norm(p2)))
if __name__ == "__main__":
main()
and here is the output:
Loading word2vec model ...
Dimensions 200
vec1 ['founder', 'and', 'ceo']
vec2 ['co-founder', 'and', 'former', 'chairman']
SUM
dot(vec1,vec2) 5.4008677771
norm(p1) 2.19382594282
norm(p2) 2.87226958166
dot((norm)vec1,norm(vec2)) 6.30125952303
cosine(vec1,vec2) 0.857109242583
AVG
dot(vec1,vec2) 0.450072314758
norm(p1) 0.731275314273
norm(p2) 0.718067395416
dot(norm(vec1),norm(vec2)) 0.525104960252
cosine(vec1,vec2) 0.857109242583
I'm using the cosine similarity as defined here Cosine Similarity (Wikipedia). The values for the norms and dot products are indeed different.
Can anyone explain why the cosine is the same?
Thank you,
David
Cosine measures the angle between two vectors and does not take the length of either vector into account. When you divide by the length of the phrase, you are just shortening the vector, not changing its angular position. So your results look correct to me.