import json
import glob
import collections
import os
# --- Pass 1: assign a dense 1-based integer id to every distinct author. ---
# Scans the downloaded Habr pages (one JSON file per page id); page ids that
# were never downloaded simply have no file and are skipped.
docs_number = 30000
path_to_authors = 'C:\\ML_Strijov\\Habr_classification\\authors.txt'
authors = dict()  # author name -> integer id (1-based, in order of first appearance)
author_id = 1
for page_id in range(docs_number):  # range: works on both Python 2 and 3
    filename = 'C:\\ML_Strijov\\Habr_classification\\habr_pages\\%d' % page_id
    if not os.path.exists(filename):
        continue
    with open(filename) as f:
        post = json.load(f)
    # membership test on the dict itself is O(1); '.keys()' builds a list on Py2
    if post['author'] not in authors:
        authors[post['author']] = author_id
        author_id += 1
print("List of authors is formed")
# --- Pass 2: dump one "author_id page_id" line per downloaded page. ---
# Opened in text mode 'w' (the original 'wb' only worked by accident on
# Python 2, where text and bytes were the same type).
with open(path_to_authors, 'w') as out:
    for page_id in range(docs_number):
        filename = 'C:\\ML_Strijov\\Habr_classification\\habr_pages\\%d' % page_id
        if not os.path.exists(filename):
            continue
        with open(filename) as f:
            post = json.load(f)
        # Py2 "print>>out, a, b" emitted "a b\n"; replicate that portably.
        out.write('%d %d\n' % (authors[post['author']], page_id))
# notebook output: List of authors is formed
import codecs
import os
import re
import json
import operator
from collections import defaultdict
# --- Recommendation pipeline configuration and accumulators. ---
topic_count = 10  # number of topics per document row in theta.txt
author_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\authors.txt'
theta_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\theta.txt'
author_topic_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_topic.txt'
mrec_data_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_recomendations_2.tsv'

# doc_id -> {author_id: weight}; the loader stores weight 1 per pair.
authors = defaultdict(defaultdict)


def defaultdict_float():
    # Named factory instead of a lambda assignment (PEP 8 E731).
    return defaultdict(float)


author_topic_distribution = defaultdict(defaultdict_float)  # author -> topic -> accumulated mass
author_doc_evaluation = defaultdict(defaultdict_float)      # author -> doc -> score (not filled in this view)
author_list = []  # author ids in the order they appear in authors.txt
# --- Load the (author_id, doc_id) pairs produced by the first script. ---
with open(author_path, 'r') as authors_file:
    seen_authors = set()
    for line in authors_file:
        author_field, doc_field = line.split()
        author = int(author_field)
        doc_id = int(doc_field)
        authors[doc_id][author] = 1
        # The file holds one line per (author, document) pair, so the same
        # author recurs; keep author_list duplicate-free in first-seen order
        # (the original appended every occurrence, which later caused each
        # author to be scored — and written to the tsv — multiple times).
        if author not in seen_authors:
            seen_authors.add(author)
            author_list.append(author)
# --- Accumulate per-author topic mass: for every document, add its topic
# distribution to each of its authors, weighted by the stored pair weight
# (always 1 as written by the loader above). ---
with open(theta_path, 'r') as theta:
    for index, line in enumerate(theta):
        fields = line.split()
        if index > 0 and index % 1000 == 0:
            print('Processed {0} documents'.format(index))
        doc_id = int(fields[0])
        distribution = tuple(map(float, fields[1:]))
        # iterate items() to avoid a second dict lookup per (author, topic)
        for author, weight in authors[doc_id].items():
            for topic in range(topic_count):
                author_topic_distribution[author][topic] += distribution[topic] * weight
# --- Score the first 100 distinct authors against every document and dump
# rows "author<TAB>doc_id<TAB>score" whenever the score exceeds 1. ---
print('Mrec data counting...')
with open(mrec_data_path, 'w') as mrec_data_out:
    with open(theta_path, 'r') as theta:
        top_authors = author_list[:100]  # hoisted: slice once, not per theta line
        for index, line in enumerate(theta):
            fields = line.split()
            if index > 0 and index % 10000 == 0:
                print('Processed {0} documents'.format(index))
            doc_id = int(fields[0])
            distribution = tuple(map(float, fields[1:]))
            for author in top_authors:
                topic_weights = author_topic_distribution[author]  # hoist invariant lookup
                for topic in range(topic_count):
                    evaluation = distribution[topic] * topic_weights[topic] * 100.0
                    if evaluation > 1:
                        # same layout the Py2 "print>>" line produced
                        mrec_data_out.write('{0}\t{1}\t{2}\n'.format(author, doc_id, evaluation))
# notebook output: "Processed 1000 documents" ... "Processed 50000 documents"
# (every 1000), then "Mrec data counting..." and "Processed 10000 documents"
# ... "Processed 50000 documents" (every 10000).