using TextAnalysis, Languages, Clustering
# Absolute path to the sample data file, resolved relative to this script's directory.
filename = joinpath(dirname(@__FILE__), "data", "highlights.csv")
"data/highlights.csv"
"""
    get_headers(source)

Split a raw table whose first row contains column names into
`(headers, rows)`: a `Vector{String}` of header names and the matrix
of remaining data rows.
"""
function get_headers(source)
    names = String[]
    for cell in source[1, :]
        push!(names, String(cell))
    end
    rows = source[2:end, :]  # drop the header row from the data
    return names, rows
end
# Read the CSV into a cell matrix. NOTE(review): `readcsv` is Julia <= 0.6
# only; on Julia >= 0.7 this would be `DelimitedFiles.readdlm(filename, ',')`.
source = readcsv(filename)
# Separate the header row from the data rows (see `get_headers` above).
headers, source = get_headers(source)
(String["updatedAt", "_typeName", "reactions", "sourceUrl", "text", "id", "createdAt", "embedUrl"], Any["2018-03-02T12:15:51.000Z" "Highlight" … "2018-03-02T11:27:37.000Z" ""; "2018-03-02T11:27:52.000Z" "Highlight" … "2018-03-02T11:27:52.000Z" ""; … ; "2018-03-02T12:18:56.000Z" "Production" … "2018-03-02T12:17:55.000Z" "https://www.youtube.com/embed/Vbaf9yJ6HBc"; "2018-03-02T12:18:39.000Z" "Production" … "2018-03-02T12:18:39.000Z" "https://www.youtube.com/embed/sxKcEsT9qLM"])
# Column 5 is the free-text "text" field (per the headers printed above:
# index 5 of the header vector is "text").
sample = source[:,5]
# Row count — evaluated for its notebook output (19, shown below).
length(sample)
19
# Dead experiments left from earlier attempts — candidates for deletion.
#fd = FileDocument(sample)
#sd = StringDocument(fd)
# Concatenate every highlight's text into one comma-separated string so the
# whole column fits in a single StringDocument. NOTE(review): this collapses
# all 19 rows into ONE document, which affects the TF-IDF/clustering below.
text = join(sample, ",")
sd = StringDocument(text)
# Corpus containing just that single document.
crps = Corpus([sd])
A Corpus
remove_punctuation!(sd)
update_lexicon!(crps)
lexicon(crps)
Dict{String,Int64} with 252 entries: "gut" => 1 "Bei" => 1 "regelkonformer" => 1 "hat" => 4 "Vincenz'" => 1 "Handelskrieg" => 1 "Zöllen" => 1 "mal" => 2 "das" => 5 "des" => 4 "zurück" => 1 "jedem" => 1 "verstehen" => 1 "Schutzinseln" => 1 "Unternehmen" => 1 "Tag" => 1 "Gisel" => 1 "Markierungen" => 1 "von" => 6 "Trottoirkanten" => 1 "hin" => 1 "Mütchen" => 1 "kleinste" => 2 "bis" => 2 "Klaus" => 1 ⋮ => ⋮
# Build the term -> document-index lookup table for the corpus.
update_inverse_index!(crps)
inverse_index(crps)
# Attach the default text-hashing function to the corpus.
hash_function!(crps, TextHashFunction())
# Look up which document(s) contain the term "Handelskrieg"
# (returns [1] — the single document, see output below).
crps["Handelskrieg"]
1-element Array{Int64,1}: 1
# Document-term matrix over the corpus lexicon.
m = DocumentTermMatrix(crps)
# Dense documents-by-terms count matrix.
D = dtm(m, :dense)
# TF-IDF weighting. NOTE(review): with a single document, idf = log(1/1) = 0,
# so every weight is 0 — visible in the all-zero k-means result below.
T = tf_idf(D)
# k-means with 5 clusters; with all-zero input every point lands in
# cluster 1 (counts [252, 0, 0, 0, 0] in the output below).
cl = kmeans(T, 5)
WARNING: min(x::AbstractArray{T1}, y::AbstractArray{T2}) where {T1 <: Real, T2 <: Real} is deprecated, use min.(x, y) instead. Stacktrace: [1] depwarn(::String, ::Symbol) at ./deprecated.jl:70 [2] min(::Array{Float64,1}, ::Array{Float64,1}) at ./deprecated.jl:57 [3] repick_unused_centers(::Array{Float64,2}, ::Array{Float64,1}, ::Array{Float64,2}, ::Array{Int64,1}) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:373 [4] _kmeans!(::Array{Float64,2}, ::Void, ::Array{Float64,2}, ::Array{Int64,1}, ::Array{Float64,1}, ::Array{Int64,1}, ::Array{Float64,1}, ::Int64, ::Float64, ::Int64, ::Distances.SqEuclidean) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:108 [5] #kmeans!#1(::Void, ::Int64, ::Float64, ::Symbol, ::Distances.SqEuclidean, ::Function, ::Array{Float64,2}, ::Array{Float64,2}) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:38 [6] (::Clustering.#kw##kmeans!)(::Array{Any,1}, ::Clustering.#kmeans!, ::Array{Float64,2}, ::Array{Float64,2}) at ./<missing>:0 [7] #kmeans#2(::Void, ::Symbol, ::Int64, ::Float64, ::Symbol, ::Distances.SqEuclidean, ::Function, ::Array{Float64,2}, ::Int64) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:55 [8] kmeans(::Array{Float64,2}, ::Int64) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:51 [9] include_string(::String, ::String) at ./loading.jl:522 [10] include_string(::Module, ::String, ::String) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Compat/src/Compat.jl:71 [11] execute_request(::ZMQ.Socket, ::IJulia.Msg) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/IJulia/src/execute_request.jl:158 [12] (::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})() at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Compat/src/Compat.jl:385 [13] 
eventloop(::ZMQ.Socket) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/IJulia/src/eventloop.jl:8 [14] (::IJulia.##14#17)() at ./task.jl:335 while loading In[17], in expression starting on line 4
Clustering.KmeansResult{Float64}([0.0 0.0 … 0.0 0.0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1 … 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [252, 0, 0, 0, 0], [252.0, 6.91213e-310, 6.91213e-310, 6.91213e-310, 6.91213e-310], 0.0, 1, true)
# Rebuild the document-term matrix for topic modelling.
m = DocumentTermMatrix(crps)
k = 2 # number of topics
iteration = 1000 # number of Gibbs sampling iterations
alpha = 0.1 # Dirichlet hyperparameter (per-document topic distribution)
beta = 0.1 # Dirichlet hyperparameter (per-topic word distribution)
l = lda(m, k, iteration, alpha, beta) # l is k x word matrix.
# value is probability of occurrence of a word in a topic.
2×252 SparseMatrixCSC{Float64,Int64} with 263 stored entries: [1 , 1] = 0.00512821 [1 , 2] = 0.00512821 [2 , 3] = 0.00531915 [2 , 4] = 0.00531915 [2 , 5] = 0.00531915 [1 , 6] = 0.00512821 [2 , 7] = 0.00531915 [2 , 8] = 0.00531915 [2 , 9] = 0.00531915 [1 , 10] = 0.00512821 ⋮ [2 , 242] = 0.0106383 [1 , 243] = 0.00512821 [1 , 244] = 0.00512821 [2 , 245] = 0.00531915 [1 , 246] = 0.00512821 [1 , 247] = 0.00512821 [1 , 248] = 0.00512821 [1 , 249] = 0.00512821 [1 , 250] = 0.00512821 [2 , 251] = 0.0106383 [1 , 252] = 0.00512821