#!/usr/bin/env python
# coding: utf-8
"""Exploratory word-frequency analysis of a categorized plaintext corpus.

Exported from a notebook; the ``# In[n]:`` markers are the original cell
boundaries.  Each ``*.txt`` file under the corpus root is one category,
named after the file's stem (see ``cat_pattern`` below).
"""

# In[2]:

import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/media/storage/dpla-data/new/words-6mill/colls/',
    r'.*\.txt',
    cat_pattern=r'(\w+)\.txt',
)


# In[3]:

# Bare expressions are kept from the notebook: they display a value in an
# interactive session and are no-ops when run as a plain script.
reader.categories('gpo.txt')


# In[4]:

gpowords = reader.words('gpo.txt')


# In[5]:

len(gpowords)


# In[6]:

# Number of distinct case-folded tokens; reuses the view bound above rather
# than asking the reader for the same file a second time.
len({w.lower() for w in gpowords})


# In[7]:

# `import nltk` now lives with the other import at the top of the file
# (the original cell read `import nlt`, which cannot be imported).


# In[9]:

# One FreqDist per genre, keyed by genre name.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in ['gpo', 'artstor']
    for word in reader.words(categories=genre)
)


# In[10]:

# ConditionalFreqDist has no most_common() of its own; it maps each
# condition to a FreqDist, so ask every per-genre FreqDist in turn.
{genre: cfd[genre].most_common() for genre in cfd.conditions()}


# In[16]:

fd = nltk.FreqDist(word for word in reader.words(categories='gpo'))
# fd.most_common()
# NOTE(review): this looks up the integer key 1, while the samples come from
# reader.words(); presumably a different lookup was intended -- confirm.
fd[1]


# In[18]:

fd.plot()


# In[ ]:

# Unexecuted scratch cell from the notebook: rebuilds the same conditional
# distribution as cell 9.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in ['gpo', 'artstor']
    for word in reader.words(categories=genre)
)