using TextAnalysis, Languages, Clustering

# Location of the highlights CSV, anchored at the directory that contains this
# script rather than at the current working directory.
filename = let script_dir = dirname(@__FILE__)
    joinpath(script_dir, "data", "highlights.csv")
end

"""
    get_headers(source)

Split a table whose first row holds the column names into its header row and
its data rows.

# Arguments
- `source`: a two-dimensional table indexable as `source[row, col]` with at
  least one row; row 1 is taken to be the header row.

# Returns
A tuple `(headers, rows)` where `headers` is a `Vector{String}` with one entry
per column and `rows` is `source[2:end, :]`, i.e. everything below the header.
"""
function get_headers(source)
    # `string` (not the `String` constructor) so that header cells a CSV reader
    # parsed as numbers or other non-string values are accepted as well.
    headers = String[string(cell) for cell in source[1, :]]
    rows = source[2:end, :] # drop the header row
    return headers, rows
end

# Load the CSV and peel off the header row; `source` ends up holding only the
# data rows.
headers, source = get_headers(readcsv(filename))

# Column 5 is the one analysed below (presumably the highlight text; confirm
# against `headers`).
sample = source[:, 5]
length(sample) # number of data rows, handy when stepping through in the REPL

#fd = FileDocument(sample)
#sd = StringDocument(fd)

# Merge every highlight into one document. The separator must be whitespace:
# a punctuation separator such as "," is stripped again by
# `remove_punctuation!` below, which would fuse the last word of one highlight
# with the first word of the next into a single bogus token.
text = join(sample, " ")
sd = StringDocument(text)
crps = Corpus([sd])

# Strip punctuation before any corpus statistics are derived (assumes the
# corpus references `sd` rather than a copy; TODO confirm).
remove_punctuation!(sd)
update_lexicon!(crps)
lexicon(crps) # inspect the lexicon interactively

update_inverse_index!(crps)
inverse_index(crps) # inspect the inverse index interactively
hash_function!(crps, TextHashFunction())

# Look up a sample term in the corpus.
crps["Handelskrieg"]

# k-means clustering on tf-idf weights of the corpus.
m = DocumentTermMatrix(crps)
D = dtm(m, :dense) # dense form of the document-term matrix
T = tf_idf(D)
# NOTE(review): `crps` is built from a single document (`Corpus([sd])`), so
# presumably `T` has just one row. Clustering.kmeans is documented to treat
# each column as an observation, so this would cluster terms rather than
# documents. Confirm that this is the intent.
cl = kmeans(T, 5) # request 5 clusters

# Topic modelling with LDA on the document-term matrix.
#   k         - number of topics
#   iteration - number of Gibbs sampling iterations
#   alpha     - hyperparameter
#   beta      - hyperparameter
m = DocumentTermMatrix(crps)
k, iteration = 2, 1000
alpha, beta = 0.1, 0.1

# `l` is a k x word matrix; each value is the probability of occurrence of a
# word in a topic.
l = lda(m, k, iteration, alpha, beta)