I implemented the LSDD changepoint detection method decribed in [1] in Julia, to see if I could make it faster than the existing python implementation [2], which is based on a grid search that looks for the optimal parameters.
I obtain the desired results but despite my best efforts, my grid search version of it takes about the same time to compute as the python one, which is still way too long for real applications.
I also tried using the Optimize package which only makes things worse (2 or 3 times slower).
Here is the grid search that I implemented :
using Random
using LinearAlgebra
function squared_distance(X::Array{Float64,1},C::Array{Float64,1})
sqd = zeros(length(X),length(C))
for i in 1:length(X)
for j in 1:length(C)
sqd[i,j] = X[i]^2 + C[j]^2 - 2*X[i]*C[j]
end
end
return sqd
end
function lsdd(x::Array{Float64,1},y::Array{Float64,1}; folds = 5, sigma_list = nothing , lambda_list = nothing)
lx,ly = length(x), length(y)
b = min(lx+ly,300)
C = shuffle(vcat(x,y))[1:b]
CC_dist2 = squared_distance(C,C)
xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)
Tx,Ty = length(x) - div(lx,folds), length(y) - div(ly,folds)
#Define the training and testing data sets
cv_split1, cv_split2 = floor.(collect(1:lx)*folds/lx), floor.(collect(1:ly)*folds/ly)
cv_index1, cv_index2 = shuffle(cv_split1), shuffle(cv_split2)
tr_idx1,tr_idx2 = [findall(x->x!=i,cv_index1) for i in 1:folds], [findall(x->x!=i,cv_index2) for i in 1:folds]
te_idx1,te_idx2 = [findall(x->x==i,cv_index1) for i in 1:folds], [findall(x->x==i,cv_index2) for i in 1:folds]
xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]
if sigma_list == nothing
sigma_list = [0.25, 0.5, 0.75, 1, 1.2, 1.5, 2, 2.5, 2.2, 3, 5]
end
if lambda_list == nothing
lambda_list = [1.00000000e-03, 3.16227766e-03, 1.00000000e-02, 3.16227766e-02,
1.00000000e-01, 3.16227766e-01, 1.00000000e+00, 3.16227766e+00,
1.00000000e+01]
end
#memory prealocation
score_cv = zeros(length(sigma_list),length(lambda_list))
H = zeros(b,b)
hx_tr, hy_tr = [zeros(b,1) for i in 1:folds], [zeros(b,1) for i in 1:folds]
hx_te, hy_te = [zeros(1,b) for i in 1:folds], [zeros(1,b) for i in 1:folds]
#h_tr,h_te = zeros(b,1), zeros(1,b)
theta = zeros(b)
for (sigma_idx,sigma) in enumerate(sigma_list)
#the expression of H is different for higher dimension
#H = sqrt((sigma^2)*pi)*exp.(-CC_dist2/(4*sigma^2))
set_H(H,CC_dist2,sigma,b)
#check if the sum is performed along the right dimension
set_htr(hx_tr,xTr_dist,sigma,Tx), set_htr(hy_tr,yTr_dist,sigma,Ty)
set_hte(hx_te,xTe_dist,sigma,lx-Tx), set_hte(hy_te,yTe_dist,sigma,ly-Ty)
for i in 1:folds
h_tr = hx_tr[i] - hy_tr[i]
h_te = hx_te[i] - hy_te[i]
#set_h(h_tr,hx_tr[i],hy_tr[i],b)
#set_h(h_te,hx_te[i],hy_te[i],b)
for (lambda_idx,lambda) in enumerate(lambda_list)
set_theta(theta,H,lambda,h_tr,b)
score_cv[sigma_idx,lambda_idx] += dot(theta,H*theta) - 2*dot(theta,h_te)
end
end
end
#retrieve the value of the optimal parameters
sigma_chosen = sigma_list[findmin(score_cv)[2][2]]
lambda_chosen = lambda_list[findmin(score_cv)[2][2]]
#calculating the new "optimal" solution
H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
H_lambda = H + lambda_chosen*Matrix{Float64}(I, b, b)
h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
theta_final = H_lambda\transpose(h)
f = transpose(theta_final).*sum(exp.(-vcat(xC_dist2,yC_dist2)/(2*sigma_chosen^2)),dims = 1)
L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
return L2
end
function set_H(H::Array{Float64,2},dist::Array{Float64,2},sigma::Float64,b::Int16)
for i in 1:b
for j in 1:b
H[i,j] = sqrt((sigma^2)*pi)*exp(-dist[i,j]/(4*sigma^2))
end
end
end
function set_theta(theta::Array{Float64,1},H::Array{Float64,2},lambda::Float64,h::Array{Float64,2},b::Int64)
Hl = (H + lambda*Matrix{Float64}(I, b, b))
LAPACK.posv!('L', Hl, h)
theta = h
end
function set_htr(h::Array{Float64,1},dists::Array{Float64,2},sigma::Float64,T::Int16)
for (CVidx,dist) in enumerate(dists)
for (idx,value) in enumerate((1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1))
h[CVidx][idx] = value
end
end
end
function set_hte(h::Array{Float64,1},dists::Array{Float64,2},sigma::Array{Float64,1},T::Int16)
for (CVidx,dist) in enumerate(dists)
for (idx,value) in enumerate((1/T)*sum(exp.(-dist/(2*sigma^2)),dims = 1))
h[CVidx][idx] = value
end
end
end
function set_h(h,h1,h2,b)
for i in 1:b
h[i] = h1[i] - h2[i]
end
end
The set_H, set_h and set_theta functions are there because I read somewhere that modifying prealocated memory in place with a function was faster, but it did not make a great difference.
To test it, I use two random distribution as input data :
x,y = rand(500),1.5*rand(500)
lsdd(x,y) #returns a value around 0.3
Now here is the version of the code where I try to use Optimizer :
function Theta(sigma::Float64,lambda::Float64,x::Array{Float64,1},y::Array{Float64,1},folds::Int8)
lx,ly = length(x), length(y)
b = min(lx+ly,300)
C = shuffle(vcat(x,y))[1:b]
CC_dist2 = squared_distance(C,C)
xC_dist2, yC_dist2 = squared_distance(x,C), squared_distance(y,C)
#the subsets are not be mutually exclusive !
Tx,Ty = length(x) - div(lx,folds), length(y) - div(ly,folds)
shuffled_x, shuffled_y = [shuffle(1:lx) for i in 1:folds], [shuffle(1:ly) for i in 1:folds]
cv_index1, cv_index2 = floor.(collect(1:lx)*folds/lx)[shuffle(1:lx)], floor.(collect(1:ly)*folds/ly)[shuffle(1:ly)]
tr_idx1,tr_idx2 = [i[1:Tx] for i in shuffled_x], [i[1:Ty] for i in shuffled_y]
te_idx1,te_idx2 = [i[Tx:end] for i in shuffled_x], [i[Ty:end] for i in shuffled_y]
xTr_dist, yTr_dist = [xC_dist2[i,:] for i in tr_idx1], [yC_dist2[i,:] for i in tr_idx2]
xTe_dist, yTe_dist = [xC_dist2[i,:] for i in te_idx1], [yC_dist2[i,:] for i in te_idx2]
score_cv = 0
Id = Matrix{Float64}(I, b, b)
H = sqrt((sigma^2)*pi)*exp.(-CC_dist2/(4*sigma^2))
hx_tr, hy_tr = [transpose((1/Tx)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in xTr_dist], [transpose((1/Ty)*sum(exp.(-dist/(2*sigma^2)),dims = 1)) for dist in yTr_dist]
hx_te, hy_te = [(lx-Tx)*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in xTe_dist], [(ly-Ty)*sum(exp.(-dist/(2*sigma^2)),dims = 1) for dist in yTe_dist]
for i in 1:folds
h_tr, h_te = hx_tr[i] - hy_tr[i], hx_te[i] - hy_te[i]
#theta = (H + lambda * Id)\h_tr
theta = copy(h_tr)
Hl = (H + lambda*Matrix{Float64}(I, b, b))
LAPACK.posv!('L', Hl, theta)
score_cv += dot(theta,H*theta) - 2*dot(theta,h_te)
end
return score_cv,(CC_dist2,xC_dist2,yC_dist2)
end
function cost(params::Array{Float64,1},x::Array{Float64,1},y::Array{Float64,1},folds::Int8)
s,l = params[1],params[2]
return Theta(s,l,x,y,folds)[1]
end
"""
Performs the optinization
"""
function lsdd3(x::Array{Float64,1},y::Array{Float64,1}; folds = 4)
start = [1,0.1]
b = min(length(x)+length(y),300)
lx,ly = length(x),length(y)
#result = optimize(params -> cost(params,x,y,folds),fill(0.0,2),fill(50.0,2),start, Fminbox(LBFGS(linesearch=LineSearches.BackTracking())); autodiff = :forward)
result = optimize(params -> cost(params,x,y,folds),start, BFGS(),Optim.Options(f_calls_limit = 5, iterations = 5))
#bboptimize(rosenbrock2d; SearchRange = [(-5.0, 5.0), (-2.0, 2.0)])
#result = optimize(cost,[0,0],[Inf,Inf],start, Fminbox(AcceleratedGradientDescent()))
sigma_chosen,lambda_chosen = Optim.minimizer(result)
CC_dist2, xC_dist2, yC_dist2 = Theta(sigma_chosen,lambda_chosen,x,y,folds)[2]
H = sqrt((sigma_chosen^2)*pi)*exp.(-CC_dist2/(4*sigma_chosen^2))
h = (1/lx)*sum(exp.(-xC_dist2/(2*sigma_chosen^2)),dims = 1) - (1/ly)*sum(exp.(-yC_dist2/(2*sigma_chosen^2)),dims = 1)
theta_final = (H + lambda_chosen*Matrix{Float64}(I, b, b))\transpose(h)
f = transpose(theta_final).*sum(exp.(-vcat(xC_dist2,yC_dist2)/(2*sigma_chosen^2)),dims = 1)
L2 = 2*dot(theta_final,h) - dot(theta_final,H*theta_final)
return L2
end
No matter, which kind of option I use in the optimizer, I always end up with something too slow. Maybe the grid search is the best option, but I don't know how to make it faster... Does anyone have an idea how I could proceed further ?
[1] : http://www.mcduplessis.com/wp-content/uploads/2016/05/Journal-IEICE-2014-CLSDD-1.pdf
[2] : http://www.ms.k.u-tokyo.ac.jp/software.html
I have a sorted Series like this:
[2, 4, 5, 6, 8, 9]
I want to produce another Series or ndarray of the same length, where the first two odd numbers and the first two even numbers are labeled sequentially:
[0, 1, 2, _, _, 3]
The _ values I don't really care about. They can be zero.
Now I do it this way:
src = pd.Series([2, 4, 5, 6, 8, 9])
odd = src % 2 != 0
where = np.hstack((np.where(odd)[0][:2], np.where(~odd)[0][:2]))
where.sort() # maintain ordering - thanks to #hpaulj
res = np.zeros(len(src), int)
res[where] = np.arange(len(where))
Can you do it more concisely? The input will never be empty, but there might be no odds or no evens (in which case the result could have length 1, 2, or 3 instead of 4).
Great Problem! I'm still exploring and learning.
I've basically stuck with what you've done so far with modest tweaks for efficiency. I'll update if I think of anything else cool.
conclusions
So far, I've thrashed around alot and haven't improved much.
my answer
fast
odd = src.values % 2
even = 1 - odd
res = ((odd.cumsum() * odd) < 3) * ((even.cumsum() * even) < 3)
(res.cumsum() - 1) * res
alternative 1
pretty fast
a = src.values
odd = (a % 2).astype(bool)
rng = np.arange(len(a))
# same reason these are 2, we have 4 below
where = np.append(rng[~odd][:2], rng[odd][:2])
res = np.zeros(len(a), int)
# nature of the problem necessitates that this is 4
res[where] = np.arange(4)
alternative 2
not as quick, but creative
a = src.values
odd = a % 2
res = np.zeros(len(src), int)
b = np.arange(2)
c = b[:, None] == odd
res[(c.cumsum(1) * c <= 2).all(0)] = np.arange(4)
alternative 3
still slow
odd = src.values % 2
a = (odd[:, None] == [0, 1])
b = ((a.cumsum(0) * a) <= 2).all(1)
(b.cumsum() - 1) * b
timing
code
def pir3(src):
odd = src.values % 2
a = (odd[:, None] == [0, 1])
b = ((a.cumsum(0) * a) <= 2).all(1)
return (b.cumsum() - 1) * b
def pir0(src):
odd = src.values % 2
even = 1 - odd
res = ((odd.cumsum() * odd) < 3) * ((even.cumsum() * even) < 3)
return (res.cumsum() - 1) * res
def pir2(src):
a = src.values
odd = a % 2
res = np.zeros(len(src), int)
c = b[:, None] == odd
res[(c.cumsum(1) * c <= 2).all(0)] = np.arange(4)
return res
def pir1(src):
a = src.values
odd = (a % 2).astype(bool)
rng = np.arange(len(a))
where = np.append(rng[~odd][:2], rng[odd][:2])
res = np.zeros(len(a), int)
res[where] = np.arange(4)
return res
def john0(src):
odd = src % 2 == 0
where = np.hstack((np.where(odd)[0][:2], np.where(~odd)[0][:2]))
res = np.zeros(len(src), int)
res[where] = np.arange(len(where))
return res
def john1(src):
odd = src.values % 2 == 0
where = np.hstack((np.where(odd)[0][:2], np.where(~odd)[0][:2]))
res = np.zeros(len(src), int)
res[where] = np.arange(len(where))
return res
src = pd.Series([2, 4, 5, 6, 8, 9])
src = pd.Series([2, 4, 5, 6, 8, 9] * 10000)
I am experiencing a problem with the outputs from my loop. As the sub is running I can see that the results from the final IF statement are being overwritten by the results from the second one. My code is structured as follows:
for i = 1 to 5
for j = 1 to 50
for each events.value in eventArray
if events.value = arrayElem then
if cells(i,j).value = "x" then
type = "col1"
elseif cells(i,j).value = "y" then
date = "col2"
elseif cells(i,j).value = "z" then
num = "col3"
end if
count = count + 1
activeworkbook.worksheets("output").cells(count + 1, 1) = type
activeworkbook.worksheets("output").cells(count + 1, 2) = date
activeworkbook.worksheets("output").cells(count + 1, 3) = num
end if
next arrayElem
if cells(i,j).value = "a" then
name = "row1"
elseif cells(i,j).value = "b" then
size = "row2"
elseif cells(i,j).value = "c" then
height = "row3"
end if
activeworkbook.worksheets("output").cells(count + 2, 1) = name
activeworkbook.worksheets("output").cells(count + 2, 2) = size
activeworkbook.worksheets("output").cells(count + 2, 3) = height
next j
next i
Obviously these are dumby variables and results, but the overall structure is the same as the real code. I can see "name","size", and "height" being printed, but then they get replaced by "type", "date", and "num". How do I prevent this from happening? Each time a new event is found I need it to print its associated characteristics printed into a new row in the "output" sheet.
Consider the following simplified version of your code:
For i = 1 To 100
If x = y Then
rowNum = rowNum + 1
Cells(rowNum + 1, 1) = "A"
End If
Cells(rowNum + 2, 1) = "B"
Next
Each time through the loop you are writing out either one or two things (two if x = y is true, one if it isn't) but you are only incrementing the row number by zero or one (one if x = y is true, zero if it isn't). Even if you know that x will always equal y, you are still trying to write two rows of information out but only increasing the row counter by one.
Assuming you are not trying to replace the "B"s in my example with the "A"s from the next iteration through the loop, you should change the code to something like:
For i = 1 To 100
If x = y Then
rowNum = rowNum + 1
Cells(rowNum, 1) = "A"
End If
rowNum = rowNum + 1
Cells(rowNum, 1) = "B"
Next