using TextAnalysis, Languages, Clustering
# Absolute path to the sample data file, resolved relative to this script's directory.
filename = joinpath(dirname(@__FILE__), "data", "highlights.csv")
"data/highlights.csv"
"""
    get_headers(source)

Split a raw table whose first row contains column names into
`(headers, rows)`: a `Vector{String}` of header names and the matrix
of remaining data rows.
"""
function get_headers(source)
    names = String[]
    for cell in source[1, :]
        push!(names, String(cell))
    end
    rows = source[2:end, :]  # drop the header row from the data
    return names, rows
end
# Read the CSV into a cell matrix. NOTE(review): `readcsv` is Julia <= 0.6
# only; on Julia >= 0.7 this would be `DelimitedFiles.readdlm(filename, ',')`.
source = readcsv(filename)
# Separate the header row from the data rows (see `get_headers` above).
headers, source = get_headers(source)
(String["updatedAt", "_typeName", "reactions", "sourceUrl", "text", "id", "createdAt", "embedUrl"], Any["2018-03-02T12:15:51.000Z" "Highlight" … "2018-03-02T11:27:37.000Z" ""; "2018-03-02T11:27:52.000Z" "Highlight" … "2018-03-02T11:27:52.000Z" ""; … ; "2018-03-02T12:18:56.000Z" "Production" … "2018-03-02T12:17:55.000Z" "https://www.youtube.com/embed/Vbaf9yJ6HBc"; "2018-03-02T12:18:39.000Z" "Production" … "2018-03-02T12:18:39.000Z" "https://www.youtube.com/embed/sxKcEsT9qLM"])
# Column 5 is the free-text "text" field (per the headers printed above:
# index 5 of the header vector is "text").
sample = source[:,5]
# Row count — evaluated for its notebook output (19, shown below).
length(sample)
19
# Dead experiments left from earlier attempts — candidates for deletion.
#fd = FileDocument(sample)
#sd = StringDocument(fd)
# Concatenate every highlight's text into one comma-separated string so the
# whole column fits in a single StringDocument. NOTE(review): this collapses
# all 19 rows into ONE document, which affects the TF-IDF/clustering below.
text = join(sample, ",")
sd = StringDocument(text)
# Corpus containing just that single document.
crps = Corpus([sd])
A Corpus
remove_punctuation!(sd)
update_lexicon!(crps)
lexicon(crps)
Dict{String,Int64} with 252 entries: "gut" => 1 "Bei" => 1 "regelkonformer" => 1 "hat" => 4 "Vincenz'" => 1 "Handelskrieg" => 1 "Zöllen" => 1 "mal" => 2 "das" => 5 "des" => 4 "zurück" => 1 "jedem" => 1 "verstehen" => 1 "Schutzinseln" => 1 "Unternehmen" => 1 "Tag" => 1 "Gisel" => 1 "Markierungen" => 1 "von" => 6 "Trottoirkanten" => 1 "hin" => 1 "Mütchen" => 1 "kleinste" => 2 "bis" => 2 "Klaus" => 1 ⋮ => ⋮
# Build the term -> document-index lookup table for the corpus.
update_inverse_index!(crps)
inverse_index(crps)
# Attach the default text-hashing function to the corpus.
hash_function!(crps, TextHashFunction())
# Look up which document(s) contain the term "Handelskrieg"
# (returns [1] — the single document, see output below).
crps["Handelskrieg"]
1-element Array{Int64,1}: 1
# Document-term matrix over the corpus lexicon.
m = DocumentTermMatrix(crps)
# Dense documents-by-terms count matrix.
D = dtm(m, :dense)
# TF-IDF weighting. NOTE(review): with a single document, idf = log(1/1) = 0,
# so every weight is 0 — visible in the all-zero k-means result below.
T = tf_idf(D)
# k-means with 5 clusters; with all-zero input every point lands in
# cluster 1 (counts [252, 0, 0, 0, 0] in the output below).
cl = kmeans(T, 5)
WARNING: min(x::AbstractArray{T1}, y::AbstractArray{T2}) where {T1 <: Real, T2 <: Real} is deprecated, use min.(x, y) instead. Stacktrace: [1] depwarn(::String, ::Symbol) at ./deprecated.jl:70 [2] min(::Array{Float64,1}, ::Array{Float64,1}) at ./deprecated.jl:57 [3] repick_unused_centers(::Array{Float64,2}, ::Array{Float64,1}, ::Array{Float64,2}, ::Array{Int64,1}) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:373 [4] _kmeans!(::Array{Float64,2}, ::Void, ::Array{Float64,2}, ::Array{Int64,1}, ::Array{Float64,1}, ::Array{Int64,1}, ::Array{Float64,1}, ::Int64, ::Float64, ::Int64, ::Distances.SqEuclidean) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:108 [5] #kmeans!#1(::Void, ::Int64, ::Float64, ::Symbol, ::Distances.SqEuclidean, ::Function, ::Array{Float64,2}, ::Array{Float64,2}) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:38 [6] (::Clustering.#kw##kmeans!)(::Array{Any,1}, ::Clustering.#kmeans!, ::Array{Float64,2}, ::Array{Float64,2}) at ./<missing>:0 [7] #kmeans#2(::Void, ::Symbol, ::Int64, ::Float64, ::Symbol, ::Distances.SqEuclidean, ::Function, ::Array{Float64,2}, ::Int64) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:55 [8] kmeans(::Array{Float64,2}, ::Int64) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Clustering/src/kmeans.jl:51 [9] include_string(::String, ::String) at ./loading.jl:522 [10] include_string(::Module, ::String, ::String) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Compat/src/Compat.jl:71 [11] execute_request(::ZMQ.Socket, ::IJulia.Msg) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/IJulia/src/execute_request.jl:158 [12] (::Compat.#inner#17{Array{Any,1},IJulia.#execute_request,Tuple{ZMQ.Socket,IJulia.Msg}})() at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/Compat/src/Compat.jl:385 [13] 
eventloop(::ZMQ.Socket) at /home/oleg/Localdev/anaconda/envs/julia/share/julia/site/v0.6/IJulia/src/eventloop.jl:8 [14] (::IJulia.##14#17)() at ./task.jl:335 while loading In[17], in expression starting on line 4
Clustering.KmeansResult{Float64}([0.0 0.0 … 0.0 0.0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1 … 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 … 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [252, 0, 0, 0, 0], [252.0, 6.91213e-310, 6.91213e-310, 6.91213e-310, 6.91213e-310], 0.0, 1, true)
# Rebuild the document-term matrix for topic modelling.
m = DocumentTermMatrix(crps)
k = 2 # number of topics
iteration = 1000 # number of Gibbs sampling iterations
alpha = 0.1 # Dirichlet hyperparameter (per-document topic distribution)
beta = 0.1 # Dirichlet hyperparameter (per-topic word distribution)
l = lda(m, k, iteration, alpha, beta) # l is k x word matrix.
# value is probability of occurrence of a word in a topic.
2×252 SparseMatrixCSC{Float64,Int64} with 263 stored entries: [1 , 1] = 0.00512821 [1 , 2] = 0.00512821 [2 , 3] = 0.00531915 [2 , 4] = 0.00531915 [2 , 5] = 0.00531915 [1 , 6] = 0.00512821 [2 , 7] = 0.00531915 [2 , 8] = 0.00531915 [2 , 9] = 0.00531915 [1 , 10] = 0.00512821 ⋮ [2 , 242] = 0.0106383 [1 , 243] = 0.00512821 [1 , 244] = 0.00512821 [2 , 245] = 0.00531915 [1 , 246] = 0.00512821 [1 , 247] = 0.00512821 [1 , 248] = 0.00512821 [1 , 249] = 0.00512821 [1 , 250] = 0.00512821 [2 , 251] = 0.0106383 [1 , 252] = 0.00512821