#using Pkg
#if ! haskey(Pkg.dependencies(), "Distributions")
# Pkg.add("Distributions")
#end
using LinearAlgebra
using Random
using Statistics
using DelimitedFiles
#using Distributions
using BetaML.Clustering
K = 3
X = [1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4]
9×2 Array{Float64,2}: 1.0 10.5 1.5 10.8 1.8 8.0 1.7 15.0 3.2 40.0 3.6 32.0 3.3 38.0 5.1 -2.3 5.2 -2.4
Z₀ = initRepresentatives([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.6 38],2,initStrategy="grid")
2×2 Array{Float64,2}: 1.0 8.0 3.6 40.0
(clIdx,Z) = kmeans([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3)
([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.15 -2.3499999999999996; 1.5 11.075; 3.366666666666667 36.666666666666664])
(clIdx,Z) = kmedoids([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,dist = (x,y) -> norm(x-y)^2,initStrategy="grid")
([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.1 -2.3; 1.5 10.8; 3.3 38.0])
clusters = em([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,verbosity=FULL)
clusters.pₙₖ
Iter. 1: Var. of the post 2.163665445135426 Log-likelihood -57.57647125026064 Iter. 2: Var. of the post 0.8859993363738548 Log-likelihood -55.411635885609805 Iter. 3: Var. of the post 0.4631675430495836 Log-likelihood -52.363399702650966 Iter. 4: Var. of the post 0.9539686982943021 Log-likelihood -48.45986544166918 Iter. 5: Var. of the post 0.2801904268692464 Log-likelihood -33.125175201316154 Iter. 6: Var. of the post 7.287993222852859e-22 Log-likelihood -30.93024764051858 Iter. 7: Var. of the post 0.0 Log-likelihood -30.93024764051858
9×3 Array{Float64,2}: 1.0 7.7918e-78 5.95934e-37 1.0 3.90859e-62 1.48106e-28 1.0 3.2796e-51 2.19577e-26 1.0 6.51017e-58 2.83748e-21 1.51765e-28 8.63292e-51 1.0 2.45605e-27 7.29549e-33 1.0 3.04396e-28 2.61747e-46 1.0 4.63159e-60 1.0 4.03358e-42 2.78564e-63 1.0 8.12962e-44
cd(@__DIR__)
K = [1,12]
seeds = [0,1,2,3,4]
5-element Array{Int64,1}: 0 1 2 3 4
# Test data
baseDir = "assets/netflix/toy_data/"
X = readdlm(joinpath(baseDir,"toy_data.txt"))
X = map(x -> x == 0 ? missing : x, X)
(n,d) = size(X)
for k in K
ulL = -Inf
bestSeed = -1
bestOut = nothing
for s in seeds
println("[INFO] Working with (k,seed) = ($(k), $(s))")
μ₀ = readdlm(joinpath(baseDir,"init_mu_$(k)_$(s).csv"), ' ')
σ²₀ = dropdims(readdlm(joinpath(baseDir,"init_var_$(k)_$(s).csv"), ' '),dims=2)
p₀ = dropdims(readdlm(joinpath(baseDir,"init_p_$(k)_$(s).csv"), ' '),dims=2)
emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)
lL = emOut.lL
if lL > ulL
ulL = lL
bestSeed = s
bestOut = emOut
end
end
println("Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))")
end
[INFO] Working with (k,seed) = (1, 0) [INFO] Working with (k,seed) = (1, 1) [INFO] Working with (k,seed) = (1, 2) [INFO] Working with (k,seed) = (1, 3) [INFO] Working with (k,seed) = (1, 4) Upper logLikelihood with 1 clusters: -1307.2234317600933 (seed 0) [INFO] Working with (k,seed) = (12, 0) [INFO] Working with (k,seed) = (12, 1) [INFO] Working with (k,seed) = (12, 2) [INFO] Working with (k,seed) = (12, 3) [INFO] Working with (k,seed) = (12, 4) Upper logLikelihood with 12 clusters: -1118.6190434326675 (seed 2)
# Full NetFlix dataset.. may take time !!!!
baseDir = "assets/netflix/full/"
X = convert(Array{Int64,2},readdlm(joinpath(baseDir,"netflix_incomplete.txt")))
(n,d) = size(X)
X = map(x -> x == 0 ? missing : x, X)
1200×1200 Array{Union{Missing, Int64},2}: 2 4 5 missing … 4 4 4 3 5 5 3 5 3 4 2 missing 4 3 4 4 3 4 3 4 4 4 4 4 2 2 5 4 4 missing missing 3 missing missing 4 … 2 5 4 1 4 5 4 4 5 4 2 missing 5 4 5 3 3 3 5 missing 5 2 5 5 missing missing missing missing missing 5 5 3 missing 5 3 … missing 3 missing 2 4 5 missing 4 3 missing missing 4 5 missing 3 4 3 ⋮ ⋱ missing missing 5 3 missing 5 4 3 2 5 missing 5 missing missing missing 4 5 4 … 3 5 4 missing 2 5 3 4 4 missing 3 4 5 3 3 4 3 missing missing 4 4 missing 4 4 3 2 5 4 3 3 2 2 4 5 3 … 5 5 3 missing 4 5 missing 3 3 3 2 3 5 4 3 4 2 3 4 missing 4 4 5 4 3 missing 5 4 5 4 missing
for k in K
ulL = -Inf
bestSeed = -1
bestOut = nothing
for s in seeds
println("[INFO] Working with (k,seed) = ($(k), $(s))")
μ₀ = readdlm(joinpath(baseDir,"init_mu_$(k)_$(s).csv"), ' ')
σ²₀ = dropdims(readdlm(joinpath(baseDir,"init_var_$(k)_$(s).csv"), ' '),dims=2)
p₀ = dropdims(readdlm(joinpath(baseDir,"init_p_$(k)_$(s).csv"), ' '),dims=2)
emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)
lL = emOut.lL
if lL > ulL
ulL = lL
bestSeed = s
bestOut = emOut
end
end
println("Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))")
end
[INFO] Working with (k,seed) = (1, 0) [INFO] Working with (k,seed) = (1, 1) [INFO] Working with (k,seed) = (1, 2) [INFO] Working with (k,seed) = (1, 3) [INFO] Working with (k,seed) = (1, 4) Upper logLikelihood with 1 clusters: -1.5210609539852452e6 (seed 0) [INFO] Working with (k,seed) = (12, 0) [INFO] Working with (k,seed) = (12, 1) [INFO] Working with (k,seed) = (12, 2) [INFO] Working with (k,seed) = (12, 3) [INFO] Working with (k,seed) = (12, 4) Upper logLikelihood with 12 clusters: -1.3902809991574623e6 (seed 1)
X = [1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4]
9×2 Array{Union{Missing, Float64},2}: 1.0 10.5 1.5 missing 1.8 8.0 1.7 15.0 3.2 40.0 missing missing 3.3 38.0 missing -2.3 5.2 -2.4
cFOut = predictMissing(X,3,mixtures=[SphericalGaussian() for i in 1:3],verbosity=FULL,minVariance=0.25)
cFOut.X̂
Iter. 1: Var. of the post 2.658995979045184 Log-likelihood -46.60839895825246 Iter. 2: Var. of the post 0.5972000744973046 Log-likelihood -34.3316693714349 Iter. 3: Var. of the post 0.3033156311165382 Log-likelihood -32.64175983202529 Iter. 4: Var. of the post 0.3218644885234808 Log-likelihood -29.812356340540394 Iter. 5: Var. of the post 0.044179093958920966 Log-likelihood -27.683492280198745 Iter. 6: Var. of the post 0.008700550767783852 Log-likelihood -27.681894241887314 Iter. 7: Var. of the post 0.0033482623120286137 Log-likelihood -27.681809792362692 Iter. 8: Var. of the post 0.0015987062820508016 Log-likelihood -27.681790513684003
9×2 Array{Union{Missing, Float64},2}: 1.0 10.5 1.5 14.1872 1.8 8.0 1.7 15.0 3.2 40.0 2.86281 15.1282 3.3 38.0 5.2 -2.3 5.2 -2.4