import json
import glob
import collections
import os

# Page ids 0 .. docs_number - 1 are probed on disk; ids with no file are skipped.
docs_number = 30000

# Output file: one "<author id> <page id>" line per page that was found.
# NOTE(review): code further down this file reads
# '...\\Habr_classification\\habr_code\\authors.txt', which is a different
# location from the one written here -- confirm which path is intended.
path_to_authors = 'C:\\ML_Strijov\\Habr_classification\\authors.txt'

# Each page is a JSON file named by its integer id.
path_to_page = 'C:\\ML_Strijov\\Habr_classification\\habr_pages\\%d'


def read_page_authors(page_template, page_count):
    """Return ``[(page_id, author), ...]`` for every page file that exists.

    ``page_template`` is a ``%d`` format string yielding the file name for a
    page id; ids ``0 .. page_count - 1`` are tried in increasing order.
    Each file is parsed as JSON and its ``'author'`` value is taken, so a
    malformed file or a missing key raises (ValueError / KeyError).
    """
    found = []
    for page_id in range(page_count):
        filename = page_template % page_id
        if not os.path.exists(filename):
            continue
        # Binary mode: the json module decodes the bytes itself, so the
        # result does not depend on the platform's default text encoding.
        with open(filename, 'rb') as f:
            post = json.load(f)
        found.append((page_id, post['author']))
    return found


def number_authors(page_authors, first_id=1):
    """Map each distinct author to a consecutive id in first-seen order.

    Ids start at ``first_id``.  Membership is tested on the dict itself
    (constant time) rather than on a freshly built list of its keys.
    """
    ids = {}
    for _, name in page_authors:
        if name not in ids:
            ids[name] = first_id + len(ids)
    return ids


def write_author_index(out_path, page_authors, author_ids):
    """Write one ``"<author id> <page id>\\n"`` line per page to ``out_path``.

    The file is opened in binary mode so that lines always end with a bare
    ``\\n``, whatever the platform.
    """
    with open(out_path, 'wb') as out:
        for page_id, name in page_authors:
            line = '%d %d\n' % (author_ids[name], page_id)
            out.write(line.encode('ascii'))


# Single pass over the page files; the pairs are reused for both the id
# assignment and the output, so no file is parsed twice.
page_authors = read_page_authors(path_to_page, docs_number)

authors = number_authors(page_authors)
# Next unused author id, kept for code that expects this counter.
author_id = len(authors) + 1

print ("List of authors is formed")

write_author_index(path_to_authors, page_authors, authors)

import codecs
import os
import re
import json
import operator
from collections import defaultdict

# Number of topic columns consumed from every line of the theta file.
topic_count = 10

# File locations used by the rest of this script.
mrec_data_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_recomendations_2.tsv'
author_topic_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_topic.txt'
theta_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\theta.txt'
author_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\authors.txt'


def defaultdict_float():
    """Return a new mapping whose missing keys read as 0.0."""
    return defaultdict(float)


# Authors in the order their lines are read (one entry per line).
author_list = list()

# doc id -> {author id: 1}.  The inner defaultdict has no default factory,
# so it behaves like a plain dict.
authors = defaultdict(defaultdict)

# author id -> {key: float}, every missing value starting at 0.0.
author_doc_evaluation = defaultdict(defaultdict_float)
author_topic_distribution = defaultdict(defaultdict_float)


# Read the author index: each line holds two whitespace-separated integers,
# the author id followed by the document id.  Every line adds the author to
# that document's entry in `authors` and appends the author to `author_list`.
with open(author_path, 'r') as authors_file:
    for record in authors_file:
        fields = record.split()
        author, doc_id = int(fields[0]), int(fields[1])
        author_list.append(author)
        authors[doc_id][author] = 1
        
# Sum, for every author, the topic weights of the documents attributed to
# them.  Each theta line is "<doc id> <weight of topic 0> <weight of topic 1> ...";
# only the first `topic_count` weights are used.
with open(theta_path, 'r') as theta:
    for index, line in enumerate(theta):
        splitted = line.split()

        if index > 0 and index % 1000 == 0:
            print('Processed {0} documents'.format(index))

        doc_id = int(splitted[0])
        distribution = tuple(map(float, splitted[1:]))

        # .get() rather than authors[doc_id]: indexing the defaultdict would
        # silently insert an empty entry for every document that has no
        # known author, growing `authors` with junk keys.
        doc_authors = authors.get(doc_id)
        if not doc_authors:
            continue

        for author, weight in doc_authors.items():
            # Look the author's accumulator up once instead of once per topic.
            topic_totals = author_topic_distribution[author]
            for topic in range(topic_count):
                topic_totals[topic] += distribution[topic] * weight
                

print('Mrec data counting...')

# `author_list` holds one entry per line of the author file, so the same
# author can occur several times among its first 100 entries.  Iterating the
# raw slice would write identical rows once per repetition; keep each author
# once, in first-seen order.
target_authors = []
seen_authors = set()
for author in author_list[:100]:
    if author not in seen_authors:
        seen_authors.add(author)
        target_authors.append(author)

# The accumulated per-author topic weights do not change while the scores
# are written, so read them once up front instead of once per document.
author_weights = []
for author in target_authors:
    weights = []
    for topic in range(topic_count):
        weights.append(author_topic_distribution[author][topic])
    author_weights.append((author, weights))

# Output rows are "<author>\t<doc id>\t<score>".  A row is written for each
# (author, document, topic) whose score exceeds 1, so one author/document
# pair may span several rows.
# NOTE(review): confirm the consumer of this file expects per-topic rows
# rather than a single score summed over topics.
with open(mrec_data_path, 'w') as mrec_data_out:
    with open(theta_path, 'r') as theta:
        for index, line in enumerate(theta):
            splitted = line.split()

            if index > 0 and index % 10000 == 0:
                print('Processed {0} documents'.format(index))

            doc_id = int(splitted[0])
            distribution = tuple(map(float, splitted[1:]))

            for author, weights in author_weights:
                for topic in range(topic_count):
                    evaluation = distribution[topic] * weights[topic] * 100.0
                    if evaluation > 1:
                        mrec_data_out.write(
                            '%s\t%s\t%s\n' % (author, doc_id, evaluation))