Notebook

Clustering and collaborative filtering (via clustering) algorithms¶

Importable source code (most up-to-date version) - Julia Package
Demonstrative static notebook
Demonstrative live notebook (temporary personal online computational environment on myBinder) - it can takes minutes to start with!
Theory based on MITx 6.86x - Machine Learning with Python: from Linear Models to Deep Learning (Unit 4)
New to Julia? A concise Julia tutorial - Julia Quick Syntax Reference book

In [12]:

#using Pkg
#if ! haskey(Pkg.dependencies(), "Distributions")
#    Pkg.add("Distributions")
#end
using LinearAlgebra
using Random
using Statistics
using DelimitedFiles
#using Distributions
using BetaML.Clustering

In [13]:

K = 3
X = [1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4]

Out[13]:

9×2 Array{Float64,2}:
 1.0  10.5
 1.5  10.8
 1.8   8.0
 1.7  15.0
 3.2  40.0
 3.6  32.0
 3.3  38.0
 5.1  -2.3
 5.2  -2.4

In [14]:

Z₀ = initRepresentatives([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.6 38],2,initStrategy="grid")

Out[14]:

2×2 Array{Float64,2}:
 1.0   8.0
 3.6  40.0

In [15]:

(clIdx,Z) = kmeans([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3)

Out[15]:

([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.15 -2.3499999999999996; 1.5 11.075; 3.366666666666667 36.666666666666664])

In [16]:

(clIdx,Z) = kmedoids([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,dist = (x,y) -> norm(x-y)^2,initStrategy="grid")

Out[16]:

([2, 2, 2, 2, 3, 3, 3, 1, 1], [5.1 -2.3; 1.5 10.8; 3.3 38.0])

In [17]:

clusters = em([1 10.5;1.5 10.8; 1.8 8; 1.7 15; 3.2 40; 3.6 32; 3.3 38; 5.1 -2.3; 5.2 -2.4],3,verbosity=FULL)
clusters.pₙₖ

Iter. 1:	Var. of the post  2.163665445135426 	  Log-likelihood -57.57647125026064
Iter. 2:	Var. of the post  0.8859993363738548 	  Log-likelihood -55.411635885609805
Iter. 3:	Var. of the post  0.4631675430495836 	  Log-likelihood -52.363399702650966
Iter. 4:	Var. of the post  0.9539686982943021 	  Log-likelihood -48.45986544166918
Iter. 5:	Var. of the post  0.2801904268692464 	  Log-likelihood -33.125175201316154
Iter. 6:	Var. of the post  7.287993222852859e-22 	  Log-likelihood -30.93024764051858
Iter. 7:	Var. of the post  0.0 	  Log-likelihood -30.93024764051858

Out[17]:

9×3 Array{Float64,2}:
 1.0          7.7918e-78   5.95934e-37
 1.0          3.90859e-62  1.48106e-28
 1.0          3.2796e-51   2.19577e-26
 1.0          6.51017e-58  2.83748e-21
 1.51765e-28  8.63292e-51  1.0
 2.45605e-27  7.29549e-33  1.0
 3.04396e-28  2.61747e-46  1.0
 4.63159e-60  1.0          4.03358e-42
 2.78564e-63  1.0          8.12962e-44

In [18]:

cd(@__DIR__)
K = [1,12]
seeds = [0,1,2,3,4]

Out[18]:

5-element Array{Int64,1}:
 0
 1
 2
 3
 4

In [28]:

# Test data
baseDir = "assets/netflix/toy_data/"
X = readdlm(joinpath(baseDir,"toy_data.txt"))
X = map(x -> x == 0 ? missing : x, X)
(n,d) = size(X)
for k in K
    ulL = -Inf
    bestSeed = -1
    bestOut = nothing
    for s in seeds
        println("[INFO] Working with (k,seed) = ($(k), $(s))")
        μ₀ = readdlm(joinpath(baseDir,"init_mu_$(k)_$(s).csv"), ' ')
        σ²₀ = dropdims(readdlm(joinpath(baseDir,"init_var_$(k)_$(s).csv"), ' '),dims=2)
        p₀ = dropdims(readdlm(joinpath(baseDir,"init_p_$(k)_$(s).csv"), ' '),dims=2)
        emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)
        lL  = emOut.lL
        if lL > ulL
            ulL = lL
            bestSeed = s
            bestOut = emOut
        end
    end
    println("Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))")
end

[INFO] Working with (k,seed) = (1, 0)
[INFO] Working with (k,seed) = (1, 1)
[INFO] Working with (k,seed) = (1, 2)
[INFO] Working with (k,seed) = (1, 3)
[INFO] Working with (k,seed) = (1, 4)
Upper logLikelihood with 1 clusters: -1307.2234317600933 (seed 0)
[INFO] Working with (k,seed) = (12, 0)
[INFO] Working with (k,seed) = (12, 1)
[INFO] Working with (k,seed) = (12, 2)
[INFO] Working with (k,seed) = (12, 3)
[INFO] Working with (k,seed) = (12, 4)
Upper logLikelihood with 12 clusters: -1118.6190434326675 (seed 2)

In [36]:

# Full NetFlix dataset.. may take time !!!!
baseDir = "assets/netflix/full/"
X = convert(Array{Int64,2},readdlm(joinpath(baseDir,"netflix_incomplete.txt")))
(n,d) = size(X)
X = map(x -> x == 0 ? missing : x, X)

Out[36]:

1200×1200 Array{Union{Missing, Int64},2}:
 2         4         5          missing  …  4         4         4
 3         5         5         3            5         3         4
 2          missing  4         3            4         4         3
 4         3         4         4            4         4         4
 2         2         5         4            4          missing   missing
 3          missing   missing  4         …  2         5         4
 1         4         5         4            4         5         4
 2          missing  5         4            5         3         3
 3         5          missing  5            2         5         5
  missing   missing   missing   missing      missing  5         5
 3          missing  5         3         …   missing  3          missing
 2         4         5          missing     4         3          missing
  missing  4         5          missing     3         4         3
 ⋮                                       ⋱                      
  missing   missing  5         3             missing  5         4
 3         2         5          missing     5          missing   missing
  missing  4         5         4         …  3         5         4
  missing  2         5         3            4         4          missing
 3         4         5         3            3         4         3
  missing   missing  4         4             missing  4         4
 3         2         5         4            3         3         2
 2         4         5         3         …  5         5         3
  missing  4         5          missing     3         3         3
 2         3         5         4            3         4         2
 3         4          missing  4            4         5         4
 3          missing  5         4            5         4          missing

In [37]:

for k in K
    ulL = -Inf
    bestSeed = -1
    bestOut = nothing
    for s in seeds
        println("[INFO] Working with (k,seed) = ($(k), $(s))")
        μ₀  = readdlm(joinpath(baseDir,"init_mu_$(k)_$(s).csv"), ' ')
        σ²₀ = dropdims(readdlm(joinpath(baseDir,"init_var_$(k)_$(s).csv"), ' '),dims=2)
        p₀  = dropdims(readdlm(joinpath(baseDir,"init_p_$(k)_$(s).csv"), ' '),dims=2)
        emOut = em(X,k;p₀=p₀,mixtures=[SphericalGaussian(μ₀[i,:],σ²₀[i]) for i in 1:k],verbosity=NONE,minVariance=0.25)
        lL  = emOut.lL
        if lL > ulL
            ulL = lL
            bestSeed = s
            bestOut = emOut
        end
    end
    println("Upper logLikelihood with $(k) clusters: $(ulL) (seed $(bestSeed))")
end

[INFO] Working with (k,seed) = (1, 0)
[INFO] Working with (k,seed) = (1, 1)
[INFO] Working with (k,seed) = (1, 2)
[INFO] Working with (k,seed) = (1, 3)
[INFO] Working with (k,seed) = (1, 4)
Upper logLikelihood with 1 clusters: -1.5210609539852452e6 (seed 0)
[INFO] Working with (k,seed) = (12, 0)
[INFO] Working with (k,seed) = (12, 1)
[INFO] Working with (k,seed) = (12, 2)
[INFO] Working with (k,seed) = (12, 3)
[INFO] Working with (k,seed) = (12, 4)
Upper logLikelihood with 12 clusters: -1.3902809991574623e6 (seed 1)

In [38]:

X = [1 10.5;1.5 missing; 1.8 8; 1.7 15; 3.2 40; missing missing; 3.3 38; missing -2.3; 5.2 -2.4]

Out[38]:

9×2 Array{Union{Missing, Float64},2}:
 1.0       10.5
 1.5         missing
 1.8        8.0
 1.7       15.0
 3.2       40.0
  missing    missing
 3.3       38.0
  missing  -2.3
 5.2       -2.4

In [39]:

cFOut = predictMissing(X,3,mixtures=[SphericalGaussian() for i in 1:3],verbosity=FULL,minVariance=0.25)
cFOut.X̂

Iter. 1:	Var. of the post  2.658995979045184 	  Log-likelihood -46.60839895825246
Iter. 2:	Var. of the post  0.5972000744973046 	  Log-likelihood -34.3316693714349
Iter. 3:	Var. of the post  0.3033156311165382 	  Log-likelihood -32.64175983202529
Iter. 4:	Var. of the post  0.3218644885234808 	  Log-likelihood -29.812356340540394
Iter. 5:	Var. of the post  0.044179093958920966 	  Log-likelihood -27.683492280198745
Iter. 6:	Var. of the post  0.008700550767783852 	  Log-likelihood -27.681894241887314
Iter. 7:	Var. of the post  0.0033482623120286137 	  Log-likelihood -27.681809792362692
Iter. 8:	Var. of the post  0.0015987062820508016 	  Log-likelihood -27.681790513684003

Out[39]:

9×2 Array{Union{Missing, Float64},2}:
 1.0      10.5
 1.5      14.1872
 1.8       8.0
 1.7      15.0
 3.2      40.0
 2.86281  15.1282
 3.3      38.0
 5.2      -2.3
 5.2      -2.4

In [ ]: