# ---------------------------------------------------------------------------
# Part 1: give every post author a numeric id and dump "author_id page_id"
# pairs, one pair per line, to `path_to_authors`.
#
# The code below is written to run unchanged on Python 2 and Python 3
# (no print statements, no xrange, explicit byte output).
# ---------------------------------------------------------------------------
import json
import glob
import collections
import os

docs_number = 30000
path_to_authors = 'C:\\ML_Strijov\\Habr_classification\\authors.txt'
page_path_template = 'C:\\ML_Strijov\\Habr_classification\\habr_pages\\%d'

# author name (value of post['author']) -> numeric id, assigned from 1 in
# order of first appearance while scanning page ids in increasing order.
authors = dict()
author_id = 1

# (author id, page id) for every page that exists on disk.  Collected in the
# same pass that builds `authors`, so each page file is parsed only once.
page_authors = []

for page_id in range(docs_number):
    filename = page_path_template % page_id
    if not os.path.exists(filename):
        # Page ids are sparse; missing pages are simply skipped.
        continue
    # Binary mode: json.load decodes the bytes itself, independent of the
    # platform's default text encoding.
    with open(filename, 'rb') as f:
        post = json.load(f)
    page_author = post['author']
    if page_author not in authors:
        authors[page_author] = author_id
        author_id += 1
    page_authors.append((authors[page_author], page_id))

print("List of authors is formed")

# 'wb' is kept from the original so that line endings stay "\n" on Windows;
# the record is encoded explicitly because a binary file needs bytes.
with open(path_to_authors, 'wb') as out:
    for page_author_id, page_id in page_authors:
        out.write(('%d %d\n' % (page_author_id, page_id)).encode('ascii'))

# ---------------------------------------------------------------------------
# Part 2: accumulate a per-author topic profile from the document/topic
# matrix in `theta_path`, then score documents against the first 100 authors
# and write "author<TAB>doc_id<TAB>score" rows to `mrec_data_path`.
#
# NOTE(review): part 1 writes authors.txt directly under Habr_classification\
# while this part reads it from Habr_classification\habr_code\ -- confirm the
# file is copied between the two steps.
# ---------------------------------------------------------------------------
import codecs
import os
import re
import json
import operator
from collections import defaultdict

topic_count = 10
author_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\authors.txt'
theta_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\theta.txt'
author_topic_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_topic.txt'
mrec_data_path = 'C:\\ML_Strijov\\Habr_classification\\habr_code\\author_recomendations_2.tsv'


def defaultdict_float():
    """Return a fresh mapping whose missing keys read as 0.0."""
    return defaultdict(float)


# doc id -> {author id: weight}.  Rebinds the `authors` name used in part 1.
# A factory-less defaultdict (as in the original) behaves like a plain dict.
authors = defaultdict(defaultdict)
# author id -> {topic index: accumulated weight}
author_topic_distribution = defaultdict(defaultdict_float)
author_doc_evaluation = defaultdict(defaultdict_float)

# Distinct author ids in the order they first appear in the authors file.
# The file has one line per document, so without de-duplication an author
# with several documents would occupy several slots of the [:100] cap below
# and produce identical output rows more than once.
author_list = []
seen_authors = set()

with open(author_path, 'r') as authors_file:
    for line in authors_file:
        splitted = line.split()
        if not splitted:
            # Tolerate blank lines instead of failing on splitted[0].
            continue
        author = int(splitted[0])
        doc_id = int(splitted[1])
        authors[doc_id][author] = 1
        if author not in seen_authors:
            seen_authors.add(author)
            author_list.append(author)

# Pass 1 over theta: each line is "doc_id v0 v1 ...".  Add the document's
# topic values to the profile of every author recorded for that document.
with open(theta_path, 'r') as theta:
    for index, line in enumerate(theta):
        splitted = line.split()
        if not splitted:
            continue
        if index > 0 and index % 1000 == 0:
            print('Processed {0} documents'.format(index))
        doc_id = int(splitted[0])
        distribution = tuple(map(float, splitted[1:]))
        for author in authors[doc_id]:
            for topic in range(topic_count):
                author_topic_distribution[author][topic] += (
                    distribution[topic] * authors[doc_id][author]
                )

print('Mrec data counting...')

# Only the first 100 authors are scored.
scored_authors = author_list[:100]

# Pass 2 over theta: score every document against every selected author.
# NOTE(review): the "> 1" threshold is applied per topic, so a single
# (author, doc) pair can yield several rows -- confirm that this is what the
# consumer of the TSV expects.
with open(mrec_data_path, 'w') as mrec_data_out:
    with open(theta_path, 'r') as theta:
        for index, line in enumerate(theta):
            splitted = line.split()
            if not splitted:
                continue
            if index > 0 and index % 10000 == 0:
                print('Processed {0} documents'.format(index))
            doc_id = int(splitted[0])
            distribution = tuple(map(float, splitted[1:]))
            for author in scored_authors:
                for topic in range(topic_count):
                    evaluation = (
                        distribution[topic]
                        * author_topic_distribution[author][topic]
                        * 100.0
                    )
                    if evaluation > 1:
                        mrec_data_out.write(
                            str(author) + '\t' + str(doc_id) + '\t' + str(evaluation) + '\n'
                        )