%matplotlib inline %autosave 10 import gensim import cPickle as pickle from sklearn import * import numpy from matplotlib import pyplot articles = pickle.load(open('data/plos_biology_articles_unfurled.list','r')) dois = pickle.load(open('data/plos_biology_dois.list','r')) articles[0][:10] dois[0] corpus = gensim.corpora.MmCorpus('data/plos_biology_corpus.mm') dictionary = dictionary = gensim.corpora.dictionary.Dictionary().load('data/plos_biology.dict') corpus_mat = gensim.matutils.corpus2csc(corpus) corpus_mat = corpus_mat.T print corpus_mat.shape svd = decomposition.TruncatedSVD(n_components=2) corpus_mat_transform = svd.fit_transform(corpus_mat) pyplot.scatter(corpus_mat_transform[:,0], corpus_mat_transform[:,1]) pyplot.scatter(numpy.median(corpus_mat_transform, axis=0)[0], numpy.median(corpus_mat_transform, axis=1)[1], color='red') corpus_mat_transform[corpus_mat_transform[:,0]>150] numpy.where(corpus_mat_transform[:,0]>150) for index in numpy.where(corpus_mat_transform[:,0]>150)[0]: print 'http://www.plosbiology.org/article/info:doi/%s' % dois[index]