I implemented the LSDD changepoint detection method decribed in [1] in Julia, to see if I could make it faster than the existing python implementation [2], which is based on a grid search that looks for the optimal parameters.
I obtain the desired results but despite my best efforts, my grid search version of it takes about the same time to compute as the python one, which is still way too long for real applications.
I also tried using the Optimize package which only makes things worse (2 or 3 times slower).
Here is the grid search that I implemented :
using Random
using LinearAlgebra
function squared_distance(X::Array{Float64,1},C::Array{Float64,1})
sqd = zeros(length(X),length(C))
for i in 1:length(X)
for j in 1:length(C)
sqd[i,j] = X[i]^2 + C[j]^2 - 2*X[i]*C[j]
end
end
return sqd
end
function lsdd(x::Array{Float64,1},y::Array{Float64,1}; folds = 5, sigma_list = nothing , lambda_list = nothing)
lx,ly = length(x), length(y)
b = min(lx+ly,300)
C = shuffle(vcat(x,y))[1:b]
CC_dist2 = squared_distance(C,C)
xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)
Tx,Ty = length(x) - div(lx,folds), length(y) - div(ly,folds)
#Define the training and testing data sets
cv_split1, cv_split2 = floor.(collect(1:lx)*folds/lx), floor.(collect(1:ly)*folds/ly)
cv_index1, cv_index2 = shuffle(cv_split1), shuffle(cv_split2)
tr_idx1,tr_idx2 = [findall(x->x!=i,cv_index1) for i in 1:folds], [findall(x->x!=i,cv_index2) for i in 1:folds]
te_idx1,te_idx2 = [findall(x->x==i,cv_index1) for i in 1:folds], [findall(x->x==i,cv_index2) for i in 1:folds]
xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]
if sigma_list == nothing
sigma_list = [0.25, 0.5, 0.75, 1, 1.2, 1.5, 2, 2.5, 2.2, 3, 5]
end
if lambda_list == nothing
lambda_list = [1.00000000e-03, 3.16227766e-03, 1.00000000e-02, 3.16227766e-02,
1.00000000e-01, 3.16227766e-01, 1.00000000e+00, 3.16227766e+00,
1.00000000e+01]
end
#memory prealocation
score_cv = zeros(length(sigma_list),length(lambda_list))
H = zeros(b,b)
hx_tr, hy_tr = [zeros(b,1) for i in 1:folds], [zeros(b,1) for i in 1:folds]
hx_te, hy_te = [zeros(1,b) for i in 1:folds], [zeros(1,b) for i in 1:folds]
#h_tr,h_te = zeros(b,1), zeros(1,b)
theta = zeros(b)
for (sigma_idx,sigma) in enumerate(sigma_list)
#the expression of H is different for higher dimension
#H = sqrt((sigma^2)*pi)*exp.(-CC_dist2/(4*sigma^2))
set_H(H,CC_dist2,sigma,b)
#check if the sum is performed along the right dimension
set_htr(hx_tr,xTr_dist,sigma,Tx), set_htr(hy_tr,yTr_dist,sigma,Ty)
set_hte(hx_te,xTe_dist,sigma,lx-Tx), set_hte(hy_te,yTe_dist,sigma,ly-Ty)
for i in 1:folds
h_tr = hx_tr[i] - hy_tr[i]
h_te = hx_te[i] - hy_te[i]
#set_h(h_tr,hx_tr[i],hy_tr[i],b)
#set_h(h_te,hx_te[i],hy_te[i],b)
for (lambda_idx,lambda) in enumerate(lambda_list)
set_theta(theta,H,lambda,h_tr,b)
score_cv[sigma_idx,lambda_idx] += dot(theta,H*theta) - 2*dot(theta,h_te)
end
end
end
#retrieve the value of the optimal parameters
sigma_chosen = sigma_list[findmin(score_cv)[2][2]]
lambda_chosen = lambda_list[findmin(score_cv)[2][2]]
#calculating the new "optimal" solution
H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
H_lambda = H + lambda_chosen*Matrix{Float64}(I, b, b)
h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
theta_final = H_lambda\transpose(h)
f = transpose(theta_final).*sum(exp.(-vcat(xC_dist2,yC_dist2)/(2*sigma_chosen^2)),dims = 1)
L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
return L2
end
function set_H(H::Array{Float64,2},dist::Array{Float64,2},sigma::Float64,b::Int16)
for i in 1:b
for j in 1:b
H[i,j] = sqrt((sigma^2)*pi)*exp(-dist[i,j]/(4*sigma^2))
end
end
end
function set_theta(theta::Array{Float64,1},H::Array{Float64,2},lambda::Float64,h::Array{Float64,2},b::Int64)
Hl = (H + lambda*Matrix{Float64}(I, b, b))
LAPACK.posv!('L', Hl, h)
theta = h
end
function set_htr(h::Array{Float64,1},dists::Array{Float64,2},sigma::Float64,T::Int16)
for (CVidx,dist) in enumerate(dists)
for (idx,value) in enumerate((1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1))
h[CVidx][idx] = value
end
end
end
function set_hte(h::Array{Float64,1},dists::Array{Float64,2},sigma::Array{Float64,1},T::Int16)
for (CVidx,dist) in enumerate(dists)
for (idx,value) in enumerate((1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1))
h[CVidx][idx] = value
end
end
end
function set_h(h,h1,h2,b)
for i in 1:b
h[i] = h1[i] - h2[i]
end
end
The set_H, set_h and set_theta functions are there because I read somewhere that modifying prealocated memory in place with a function was faster, but it did not make a great difference.
To test it, I use two random distribution as input data :
x,y = rand(500),1.5*rand(500)
lsdd(x,y) #returns a value around 0.3
Now here is the version of the code where I try to use Optimizer :
function Theta(sigma::Float64,lambda::Float64,x::Array{Float64,1},y::Array{Float64,1},folds::Int8)
lx,ly = length(x), length(y)
b = min(lx+ly,300)
C = shuffle(vcat(x,y))[1:b]
CC_dist2 = squared_distance(C,C)
xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)
#the subsets are not be mutually exclusive !
Tx,Ty = length(x) - div(lx,folds), length(y) - div(ly,folds)
shuffled_x, shuffled_y = [shuffle(1:lx) for i in 1:folds], [shuffle(1:ly) for i in 1:folds]
cv_index1, cv_index2 = floor.(collect(1:lx)*folds/lx)[shuffle(1:lx)], floor.(collect(1:ly)*folds/ly)[shuffle(1:ly)]
tr_idx1,tr_idx2 = [i[1:Tx] for i in shuffled_x], [i[1:Ty] for i in shuffled_y]
te_idx1,te_idx2 = [i[Tx:end] for i in shuffled_x], [i[Ty:end] for i in shuffled_y]
xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]
score_cv = 0
Id = Matrix{Float64}(I, b, b)
H = sqrt((sigma^2)*pi)*exp.(-CC_dist2/(4*sigma^2))
hx_tr, hy_tr = [transpose((1/Tx)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in xTr_dist], [transpose((1/Ty)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in yTr_dist]
hx_te, hy_te = [(lx-Tx)*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in xTe_dist], [(ly-Ty)*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in yTe_dist]
for i in 1:folds
h_tr, h_te = hx_tr[i] - hy_tr[i], hx_te[i] - hy_te[i]
#theta = (H + lambda * Id)\h_tr
theta = copy(h_tr)
Hl = (H + lambda*Matrix{Float64}(I, b, b))
LAPACK.posv!('L', Hl, theta)
score_cv += dot(theta,H*theta) - 2*dot(theta,h_te)
end
return score_cv,(CC_dist2,xC_dist2,yC_dist2)
end
function cost(params::Array{Float64,1},x::Array{Float64,1},y::Array{Float64,1},folds::Int8)
s,l = params[1],params[2]
return Theta(s,l,x,y,folds)[1]
end
"""
Performs the optinization
"""
function lsdd3(x::Array{Float64,1},y::Array{Float64,1}; folds = 4)
start = [1,0.1]
b = min(length(x)+length(y),300)
lx,ly = length(x),length(y)
#result = optimize(params -> cost(params,x,y,folds),fill(0.0,2),fill(50.0,2),start, Fminbox(LBFGS(linesearch=LineSearches.BackTracking())); autodiff = :forward)
result = optimize(params -> cost(params,x,y,folds),start, BFGS(),Optim.Options(f_calls_limit = 5, iterations = 5))
#bboptimize(rosenbrock2d; SearchRange = [(-5.0, 5.0), (-2.0, 2.0)])
#result = optimize(cost,[0,0],[Inf,Inf],start, Fminbox(AcceleratedGradientDescent()))
sigma_chosen,lambda_chosen = Optim.minimizer(result)
CC_dist2, xC_dist2, yC_dist2 = Theta(sigma_chosen,lambda_chosen,x,y,folds)[2]
H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
theta_final = (H + lambda_chosen*Matrix{Float64}(I, b, b))\transpose(h)
f = transpose(theta_final).*sum(exp.(-vcat(xC_dist2,yC_dist2)/(2*sigma_chosen^2)),dims = 1)
L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
return L2
end
No matter, which kind of option I use in the optimizer, I always end up with something too slow. Maybe the grid search is the best option, but I don't know how to make it faster... Does anyone have an idea how I could proceed further ?
[1] : http://www.mcduplessis.com/wp-content/uploads/2016/05/Journal-IEICE-2014-CLSDD-1.pdf
[2] : http://www.ms.k.u-tokyo.ac.jp/software.html
I'm trying to do a Gaussian bell using the data I am obtaining from a matrix but everytime I try to run the program I obtain this message:
"Error: syntax error, unexpected identifier, expecting end"
The data used to obtain the gaussina bell is a matrix which includes the last point of every n displacements, which are the last position of a particle. I want to know if there is an easier way to obtain the gaussian bell in scilab because I have to also do a fit with an histogram using the same data.
function bla7()
t=4000
n=1000
l=0.067
p=%pi*2
w1=zeros(t,1);
w2=zeros(t,1);
for I=1:t
a=(grand(n,1,"unf",0,p));
x=l*cos(a)
y=l*sin(a)
z1=zeros(n,1);
z2=zeros(n,1);
for i=2:n
z1(i)=z1(i-1)+x(i);
z2(i)=z2(i-1)+y(i);
end
w1(I)=z1($)
w2(I)=z2($)
end
n=10000
w10=zeros(t,1);
w20=zeros(t,1);
for I=1:t
a=(grand(n,1,"unf",0,p));
x=l*cos(a)
y=l*sin(a)
z1=zeros(n,1);
z2=zeros(n,1);
for i=2:n
z1(i)=z1(i-1)+x(i);
z2(i)=z2(i-1)+y(i);
end
w10(I)=z1($)
w20(I)=z2($)
end
n=100
w100=zeros(t,1);
w200=zeros(t,1);
for I=1:t
a=(grand(n,1,"unf",0,p));
x=l*cos(a)
y=l*sin(a)
z1=zeros(n,1);
z2=zeros(n,1);
for i=2:n
z1(i)=z1(i-1)+x(i);
z2(i)=z2(i-1)+y(i);
end
w100(I)=z1($)
w200(I)=z2($)
end
k=70
v=12/k
c1=zeros(k,1)
for r=1:t
c=w1(r)
m=-6+v
n=-6
for g=1:k
if (c<m & c>=n) then
c1(g)=c1(g)+1
m=m+v
n=n+v
else
m=m+v
n=n+v
end
end
end
c2=zeros(k,1)
c2(1)=-6+(6/k)
for b=2:k
c2(b)=c2(b-1)+v
end
y = stdev(w1)
normal1=zeros(k,1)
normal2=zeros(k,1)
bb=-6
bc=-6+v
for wa=1:k
bd=(bb+bc)/2
gauss1=(1/(y*sqrt(2*%pi)))exp(-0.5(bb/y)^2)
gauss2=(1/(y*sqrt(2*%pi)))exp(-0.5(bc/y)^2)
gauss3=(1/(y*sqrt(2*%pi)))exp(-0.5(bd/y)^2)
gauss4=((bc-bb)/6)*(gauss1+gauss2+4*gauss3)
bb=bb+v
bc=bc+v
normal2(wa,1)=gauss4
end
normal3=normal2*4000
k=100
v=24/k
c10=zeros(k,1)
for r=1:t
c=w10(r)
m=-12+v
n=-12
for g=1:k
if (c<m & c>=n) then
c10(g)=c10(g)+1
m=m+v
n=n+v
else
m=m+v
n=n+v
end
end
end
c20=zeros(k,1)
c20(1)=-12+(12/k)
for b=2:k
c20(b)=c20(b-1)+v
end
y = stdev(w10)
normal10=zeros(k,1)
normal20=zeros(k,1)
bb=-12
bc=-12+v
for wa=1:k
bd=(bb+bc)/2
gauss10=(1/(y*sqrt(2*%pi)))exp(-0.5(bb/y)^2)
gauss20=(1/(y*sqrt(2*%pi)))exp(-0.5(bc/y)^2)
gauss30=(1/(y*sqrt(2*%pi)))exp(-0.5(bd/y)^2)
gauss40=((bc-bb)/6)*(gauss10+gauss20+4*gauss30)
bb=bb+v
bc=bc+v
normal20(wa,1)=gauss40
end
normal30=normal20*4000
k=70
v=12/k
c100=zeros(k,1)
for r=1:t
c=w100(r)
m=-6+v
n=-6
for g=1:k
if (c<m & c>=n) then
c100(g)=c100(g)+1
m=m+v
n=n+v
else
m=m+v
n=n+v
end
end
end
c200=zeros(k,1)
c200(1)=-6+(6/k)
for b=2:k
c200(b)=c200(b-1)+v
end
y = stdev(w100)
normal100=zeros(k,1)
normal200=zeros(k,1)
bb=-6
bc=-6+v
for wa=1:k
bd=(bb+bc)/2
gauss100=(1/(y*sqrt(2*%pi)))exp(-0.5(bb/y)^2)
gauss200=(1/(y*sqrt(2*%pi)))exp(-0.5(bc/y)^2)
gauss300=(1/(y*sqrt(2*%pi)))exp(-0.5(bd/y)^2)
gauss400=((bc-bb)/6)*(gauss100+gauss200+4*gauss300)
bb=bb+v
bc=bc+v
normal200(wa,1)=gauss400
end
normal300=normal200*4000
bar(c20,c10,1.0,'white')
plot(c20, normal30, 'b-')
bar(c2,c1,1.0,'white')
plot(c2, normal3, 'r-')
bar(c200,c100,1.0,'white')
plot(c200, normal300, 'm-')
poly1.thickness=3;
xlabel(["x / um"]);
ylabel("molecules");
gcf().axes_size=[500,500]
a=gca();
a.zoom_box=[-12,12;0,600];
a.font_size=4;
a.labels_font_size=5;
a.x_label.font_size = 5;
a.y_label.font_size = 5;
ticks = a.x_ticks
ticks.labels =["-12";"-10";"-8";"-6";"-4";"-2";"0";"2";"4";"6";"8";"10";"12"]
ticks.locations = [-12;-10;-8;-6;-4;-2;0;2;4;6;8;10;12]
a.x_ticks = ticks
endfunction
Each and every one of your gauss variables are missing the multiplication operator in two places. Check every line at it will run. For example, this:
gauss1=(1/(y*sqrt(2*%pi)))exp(-0.5(bb/y)^2)
should be this:
gauss1=(1/(y*sqrt(2*%pi))) * exp(-0.5 * (bb/y)^2)
As for the Gaussian bell, there is no standard function in Scilab. However, you could define a new function to make things more clear in your case:
function x = myGauss(s,b_)
x = (1/(s*sqrt(2*%pi)))*exp(-0.5*(b_/s)^2)
endfunction
Actually, while we're at it, your whole code is really difficult to read. You should define functions instead of repeating code: it helps clarify what you mean, and if there is a mistake, you need to fix only one place. Also, I personally do not recommend that you enclose everything in a function like bla7() because it makes things harder to debug. Your example could be rewritten like this:
The myGauss function;
A function w_ to calculate w1, w2, w10, w20, w100 and w200;
A function c_ to calculate c1, c2, c10, c20, c100 and c200;
A function normal_ to calculate normal1, normal2, normal10, normal20, normal100 and normal200;
Call all four functions as many times as needed with different inputs for different results.
If you do that, your could will look like this:
function x = myGauss(s,b_)
x = (1 / (s * sqrt(2 * %pi))) * exp(-0.5 * (b_/s)^2);
endfunction
function [w1_,w2_] = w_(t_,l_,n_,p_)
w1_ = zeros(t_,1);
w2_ = zeros(t_,1);
for I = 1 : t_
a = (grand(n_,1,"unf",0,p_));
x = l_ * cos(a);
y = l_ * sin(a);
z1 = zeros(n_,1);
z2 = zeros(n_,1);
for i = 2 : n_
z1(i) = z1(i-1) + x(i);
z2(i) = z2(i-1) + y(i);
end
w1_(I) = z1($);
w2_(I) = z2($);
end
endfunction
function [c1_,c2_] = c_(t_,k_,v_,w1_,x_)
c1_ = zeros(k_,1)
for r = 1 : t_
c = w1_(r);
m = -x_ + v_;
n = -x_;
for g = 1 : k_
if (c < m & c >= n) then
c1_(g) = c1_(g) + 1;
m = m + v_;
n = n + v_;
else
m = m + v_;
n = n + v_;
end
end
end
c2_ = zeros(k_,1);
c2_(1) = -x_ + (x_/k_);
for b = 2 : k_
c2_(b) = c2_(b-1) + v_;
end
endfunction
function [normal1_,normal2_,normal3_] = normal_(k_,bb_,bc_,v_,w1_)
y = stdev(w1_);
normal1_ = zeros(k_,1);
normal2_ = zeros(k_,1);
for wa = 1 : k_
bd_ = (bb_ + bc_) / 2;
gauss1 = myGauss(y,bb_);
gauss2 = myGauss(y,bc_);
gauss3 = myGauss(y,bd_);
gauss4 = ((bc_ - bb_) / 6) * (gauss1 + gauss2 + 4 * gauss3);
bb_ = bb_ + v_;
bc_ = bc_ + v_;
normal2_(wa,1) = gauss4;
end
normal3_ = normal2_ * 4000;
endfunction
t = 4000;
l = 0.067;
p = 2 * %pi;
n = 1000;
k = 70;
v = 12 / k;
x = 6;
bb = -x;
bc = -x + v;
[w1,w2] = w_(t,l,n,p);
[c1,c2] = c_(t,k,v,w1,x);
[normal1,normal2,normal3] = normal_(k,bb,bc,v,w1);
bar(c2,c1,1.0,'white');
plot(c2, normal3, 'r-');
n = 10000;
k = 100;
v = 24 / k;
x = 12;
bb = -x;
bc = -x + v;
[w10,w20] = w_(t,l,n,p);
[c10,c20] = c_(t,k,v,w10,x);
[normal10,normal20,normal30] = normal_(k,bb,bc,v,w10);
bar(c20,c10,1.0,'white');
plot(c20, normal30, 'b-');
n = 100;
k = 70;
v = 12 / k;
x = 6;
bb = -x;
bc = -x + v;
[w100,w200] = w_(t,l,n,p);
[c100,c200] = c_(t,k,v,w100,x);
[normal100,normal200,normal300] = normal_(k,bb,bc,v,w100);
bar(c200,c100,1.0,'white');
plot(c200, normal300, 'm-');
poly1.thickness=3;
xlabel(["x / um"]);
ylabel("molecules");
gcf().axes_size=[500,500]
a=gca();
a.zoom_box=[-12,12;0,600];
a.font_size=4;
a.labels_font_size=5;
a.x_label.font_size = 5;
a.y_label.font_size = 5;
ticks = a.x_ticks
ticks.labels =["-12";"-10";"-8";"-6";"-4";"-2";"0";"2";"4";"6";"8";"10";"12"]
ticks.locations = [-12;-10;-8;-6;-4;-2;0;2;4;6;8;10;12]
a.x_ticks = ticks