import nltk
from nltk.corpus import gutenberg
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Note: you can use gutenberg.sents
to access individual sentences.
Let's select some authors and get some of their works into the training set. Some of the largest number of works in this collection are for Jane Austen and William Shakespeare, so for the rest of the chapter let's stick with those:
# Training data for author 1 (Jane Austen): the sentences of two novels,
# concatenated into one sequence.
author1_train = gutenberg.sents('austen-emma.txt') + gutenberg.sents('austen-persuasion.txt')
# Preview the data, then report the number of sentences.
print (author1_train)
print (len(author1_train))
[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...] 11499
# Held-out test data for author 1: a novel that is not part of the training set.
author1_test = gutenberg.sents('austen-sense.txt')
# Preview the data, then report the number of sentences.
print (author1_test)
print (len(author1_test))
[['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', 'Austen', '1811', ']'], ['CHAPTER', '1'], ...] 4999
# Training data for author 2 (William Shakespeare): the sentences of two plays,
# concatenated into one sequence. Macbeth is kept aside as the test set.
author2_train = gutenberg.sents('shakespeare-caesar.txt') + gutenberg.sents('shakespeare-hamlet.txt')
# Preview the data, then report the number of sentences.
print(author2_train)
print(len(author2_train))
[['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599', ']'], ['Actus', 'Primus', '.'], ...] 5269
# Held-out test data for author 2: a play that is not part of the training set.
author2_test = gutenberg.sents('shakespeare-macbeth.txt')
# Preview the data, then report the number of sentences.
print (author2_test)
print (len(author2_test))
[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...] 1907
Finally, let's check if the two authors produce markedly different texts: estimate the average number of characters per word, number of words per sentence, and diversity of author's vocabulary – average number of times each word occurs in a text by the author:
def statistics(gutenberg_data):
    """Print basic style statistics for each work in *gutenberg_data*.

    *gutenberg_data* is an iterable of Gutenberg corpus file ids. For each
    one a single line is printed with the rounded average word length in
    characters, the rounded average sentence length in words, the rounded
    average number of occurrences per distinct (lower-cased) word, and the
    file id itself.
    """
    for work in gutenberg_data:
        words = gutenberg.words(work)  # fetched once and reused below
        num_chars = len(gutenberg.raw(work))
        num_words = len(words)
        num_sents = len(gutenberg.sents(work))
        num_vocab = len({w.lower() for w in words})
        print(round(num_chars / num_words),  # average word length in characters
              round(num_words / num_sents),  # average sentence length in words
              round(num_words / num_vocab),  # average number of occurrences per distinct word
              work)


gutenberg_data = ['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt',
                  'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt']
statistics(gutenberg_data)
5 25 26 austen-emma.txt 5 26 17 austen-persuasion.txt 5 28 22 austen-sense.txt 4 12 9 shakespeare-caesar.txt 4 12 8 shakespeare-hamlet.txt 4 12 7 shakespeare-macbeth.txt
To fairly test generalization behavior, let's set additional test data from within the same set of works as we are training the algorithm on. By comparing the algorithm's performance on the set of sentences coming from the same literary works to its performances on a different set of works, you will be able to tell how well the algorithm generalizes above the words it has seen in the training data.
First, put the sentences with the author labels together:
# Pair every training sentence with its author label and pool both authors.
all_sents = [(sentence, "austen") for sentence in author1_train]
all_sents.extend((sentence, "shakespeare") for sentence in author2_train)
print(f"Dataset size = {len(all_sents)} sentences")
Dataset size = 16768 sentences
Next, shuffle the data and split it, keeping the proportion of the author-specific data consistent across the training set and the same-data test set. Let's call the test set coming from the same data the pretest set.
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
# Stratify on the author label so that both portions keep the same
# proportion of sentences per author.
values = [author for (_, author) in all_sents]
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, pretest) index pair.
train_index, pretest_index = next(split.split(all_sents, values))
strat_train_set = [all_sents[index] for index in train_index]
strat_pretest_set = [all_sents[index] for index in pretest_index]
Let's check that the proportions are kept the same across the two data portions:
def cat_proportions(data, cat):
    """Return the fraction of items in *data* whose label equals *cat*.

    Each item is indexed as ``item[1]`` to obtain its category label.
    An empty *data* yields ``0.0`` rather than raising ZeroDivisionError.
    """
    if not data:
        return 0.0
    matching = sum(1 for item in data if item[1] == cat)
    return matching / len(data)
categories = ["austen", "shakespeare"]
# First row is the header; one further row per category.
rows = [["Category", "Overall", "Stratified train", "Stratified pretest"]]
for cat in categories:
    rows.append([cat,
                 f"{cat_proportions(all_sents, cat):.6f}",
                 f"{cat_proportions(strat_train_set, cat):.6f}",
                 f"{cat_proportions(strat_pretest_set, cat):.6f}"])

# Pad every cell to the width of the longest entry in its column.
column_widths = [max(len(item) for item in col) for col in zip(*rows)]
for row in rows:
    print(''.join(' {:{width}} '.format(cell, width=cell_width)
                  for cell, cell_width in zip(row, column_widths)))
Category Overall Stratified train Stratified pretest austen 0.685771 0.685776 0.685748 shakespeare 0.314229 0.314224 0.314252
Now also initialize the test set in the same way, by adding author labels to the sentences:
# Label the held-out works in the same way as the training data.
test_set = [(sent, "austen") for sent in author1_test]
test_set += [(sent, "shakespeare") for sent in author2_test]

# First row is the header; one further row per category.
rows = [["Category", "Overall", "Stratified train", "Stratified pretest", "Test"]]
for cat in categories:
    rows.append([cat,
                 f"{cat_proportions(all_sents, cat):.6f}",
                 f"{cat_proportions(strat_train_set, cat):.6f}",
                 f"{cat_proportions(strat_pretest_set, cat):.6f}",
                 f"{cat_proportions(test_set, cat):.6f}"])

# Pad every cell to the width of the longest entry in its column.
column_widths = [max(len(item) for item in col) for col in zip(*rows)]
for row in rows:
    print(''.join(' {:{width}} '.format(cell, width=cell_width)
                  for cell, cell_width in zip(row, column_widths)))
Category Overall Stratified train Stratified pretest Test austen 0.685771 0.685776 0.685748 0.723863 shakespeare 0.314229 0.314224 0.314252 0.276137
Naive Bayes model from Chapter 2 with words as features can be used as a reasonable approach to set up the benchmark result. Let's first extract the word features:
def get_features(text):
    """Return a bag-of-words feature dict mapping each word in *text* to True.

    Repeated words collapse into a single key; keys keep first-seen order.
    """
    return {word: True for word in text}
# Turn every (sentence, author) pair into a (feature dict, author) pair.
train_features = [(get_features(sentence), author)
                  for sentence, author in strat_train_set]
pretest_features = [(get_features(sentence), author)
                    for sentence, author in strat_pretest_set]
13414 {'Pol': True, '.': True} {'And': True, 'as': True, 'to': True, 'my': True, 'father': True, ',': True, 'I': True, 'really': True, 'should': True, 'not': True, 'have': True, 'thought': True, 'that': True, 'he': True, 'who': True, 'has': True, 'kept': True, 'himself': True, 'single': True, 'so': True, 'long': True, 'for': True, 'our': True, 'sakes': True, 'need': True, 'be': True, 'suspected': True, 'now': True, '.': True}
Now train NLTK's Naive Bayes classifier on the training data and test it on the pretest data:
from nltk import NaiveBayesClassifier, classify
print(f"Training set size = {len(train_features)} sentences")
print(f"Pretest set size = {len(pretest_features)} sentences")
# train the classifier
classifier = NaiveBayesClassifier.train(train_features)
print(f"Accuracy on the training set = {classify.accuracy(classifier, train_features)}")
print(f"Accuracy on the pretest set = {classify.accuracy(classifier, pretest_features)}")
# check which words are most informative for the classifier
classifier.show_most_informative_features(50)
Training set size = 13414 sentences Pretest set size = 3354 sentences Accuracy on the training set = 0.9786789920978083 Accuracy on the pretest set = 0.9636255217650567 Most Informative Features been = True austen : shakes = 257.7 : 1.0 King = True shakes : austen = 197.1 : 1.0 thou = True shakes : austen = 191.3 : 1.0 Lord = True shakes : austen = 61.2 : 1.0 doth = True shakes : austen = 60.4 : 1.0 d = True shakes : austen = 58.9 : 1.0 quite = True austen : shakes = 55.6 : 1.0 Tis = True shakes : austen = 51.6 : 1.0 She = True austen : shakes = 43.2 : 1.0 think = True austen : shakes = 39.9 : 1.0 back = True austen : shakes = 34.4 : 1.0 has = True austen : shakes = 34.2 : 1.0 father = True austen : shakes = 32.3 : 1.0 coming = True austen : shakes = 29.5 : 1.0 moment = True austen : shakes = 29.1 : 1.0 looking = True austen : shakes = 28.6 : 1.0 l = True shakes : austen = 28.4 : 1.0 mind = True austen : shakes = 28.3 : 1.0 far = True austen : shakes = 26.1 : 1.0 years = True austen : shakes = 25.8 : 1.0 known = True austen : shakes = 25.5 : 1.0 mother = True austen : shakes = 25.5 : 1.0 Nor = True shakes : austen = 25.5 : 1.0 carriage = True austen : shakes = 24.9 : 1.0 hardly = True austen : shakes = 24.9 : 1.0 party = True austen : shakes = 24.5 : 1.0 ere = True shakes : austen = 24.0 : 1.0 few = True austen : shakes = 23.7 : 1.0 account = True austen : shakes = 23.7 : 1.0 poor = True austen : shakes = 23.0 : 1.0 feeling = True austen : shakes = 22.8 : 1.0 she = True austen : shakes = 22.7 : 1.0 among = True austen : shakes = 22.1 : 1.0 brother = True austen : shakes = 21.8 : 1.0 assure = True austen : shakes = 21.2 : 1.0 Brother = True shakes : austen = 21.1 : 1.0 seen = True austen : shakes = 20.6 : 1.0 afterwards = True austen : shakes = 19.7 : 1.0 manners = True austen : shakes = 19.7 : 1.0 Mark = True shakes : austen = 19.6 : 1.0 whether = True austen : shakes = 19.1 : 1.0 care = True austen : shakes = 18.5 : 1.0 mean = True austen : shakes = 18.5 : 1.0 3 = 
True shakes : austen = 18.2 : 1.0 4 = True shakes : austen = 18.2 : 1.0 Letters = True shakes : austen = 18.2 : 1.0 beginning = True austen : shakes = 17.6 : 1.0 husband = True austen : shakes = 17.6 : 1.0 company = True austen : shakes = 17.3 : 1.0 imagine = True austen : shakes = 17.3 : 1.0
Compare this performance to the performance on the new test set:
# Same word features for the held-out works, scored with the trained classifier.
test_features = [(get_features(sentence), author) for sentence, author in test_set]
print(f"Test set size = {len(test_features)} sentences")
print(f"Accuracy on the test set = {classify.accuracy(classifier, test_features)}")
Test set size = 6906 sentences Accuracy on the test set = 0.895742832319722
Let's visualize the accuracy across the three datasets with bar charts using matplotlib
. This will help you better understand the differences in accuracy scores.
%matplotlib inline
import matplotlib.pyplot as plt
a = ["Train", "Pretest", "Test"]
index = range(len(a))
b = [97.87, 96.36, 89.57]  # Accuracy scores for the datasets
fig, ax = plt.subplots()
axes = plt.gca()
# Let's set 68 as the lower bound as the majority class baseline is at 68.58 for the original set
axes.set_ylim([68, 100])
# One bar per dataset, labelled on the x axis below.
plt.bar(index, b, color=['#0A40A4', '#61A4F6', '#DB025B'])
plt.xticks(index, a)
import matplotlib
<function matplotlib.pyplot.legend(*args, **kwargs)>
Let's apply a different classifier – DecisionTreeClassifier
– to this task. This classifier will struggle with this high number of features (over 13K words), so let's try to narrow the number of features down. A useful heuristic is to take into account the words that are neither too frequent (e.g., occur in all or most texts) nor too rare (because they will make for very sparse and therefore not very useful features). Let's first estimate how often each word occurs across all texts, i.e. estimate their document frequencies:
from collections import Counter
words = []
def extract_words(text, words):
    """Append the distinct words of *text* to the list *words* and return it.

    *words* is extended in place, so the returned object is the same list
    that was passed in. Each distinct word of *text* is added exactly once,
    even if it is already present in *words*.
    """
    words.extend(set(text))
    return words
# Each sentence contributes every one of its distinct words once, so the
# resulting count per word is the number of sentences it occurs in
# (its document frequency).
for (sents, label) in strat_train_set:
    words = extract_words(sents, words)
counts = Counter(words)
print(len(counts))  # =13553
Counter({'.': 9108, ',': 7126, 'to': 4382, 'the': 4119, 'and': 3996, 'of': 3823, 'a': 3078, 'I': 2967, 'in': 2473, 'not': 2450, ';': 2411, 'was': 2317, 'it': 2269, 'be': 2149, 'that': 1949, '"': 1932, 'you': 1884, 'her': 1877, "'": 1702, 'had': 1595, 'for': 1582, 'she': 1542, 'with': 1489, 'is': 1480, 'but': 1439, 'as': 1381, 'he': 1372, 'have': 1316, 'his': 1241, '."': 1238, 'at': 1178, 'very': 1150, 'all': 1106, 's': 1097, 'him': 1071, 'so': 1028, 'Mr': 1009, 'my': 990, 'could': 957, 'on': 904, 'would': 886, '--': 863, 'me': 856, '?': 856, ':': 853, 'been': 844, 'by': 800, 'were': 791, 'no': 785, 'this': 742, 'which': 725, 'Mrs': 716, 'She': 713, 'do': 709, 'will': 706, '-': 700, '!': 687, 'from': 677, 'must': 672, 'any': 666, 'Emma': 657, 'more': 616, 'The': 613, 'or': 610, 'them': 607, 'He': 598, 'what': 590, 'an': 590, 'are': 586, 'they': 577, 'And': 563, 'much': 561, 'there': 551, 'your': 535, 'It': 531, 'said': 530, 'one': 517, 'than': 514, ...}) Let's visualize how many words occur in more than a certain percentage of documents. To convert raw document counts into percentages of the total, you need to divide the number of documents in which a word occurs (in counts
) by the total number of documents in strat_train_set
. This total equals $13414$.
from numpy import arange
percentages = {}
# Total number of documents (sentences) in the training set.
maximum = float(len(strat_train_set))
# Let's explore the document frequency bands
# NOTE: band boundaries are fractions of the total (0.05 means 5% of texts),
# even though the labels below append a "%" sign. Both ends of a band are
# inclusive, so a value exactly on a boundary is counted in two bands.
for _, doc_count in counts.items():
    perc = float(doc_count) / maximum
    # Fine-grained bands below 0.05, in steps of 0.0125.
    for freq in arange(0.00, 0.05, 0.0125):
        if freq <= perc <= freq + 0.0125:
            freq_range = str(freq)[:6] + "%-" + str(freq + 0.0125)[:6] + "%"
            percentages[freq_range] = percentages.get(freq_range, 0) + 1
    # Coarser bands from 0.05 upwards, in steps of 0.05.
    for freq in arange(0.05, 1.00, 0.05):
        if freq <= perc <= freq + 0.05:
            freq_range = str(freq)[:4] + "%-" + str(freq + 0.05)[:4] + "%"
            percentages[freq_range] = percentages.get(freq_range, 0) + 1
# Print out these frequency bands
for key in sorted(percentages.keys()):
    print(key + " texts: " + str(percentages.get(key)) + " words")
0.0%-0.0125% texts: 13355 words 0.0125%-0.025% texts: 84 words 0.025%-0.0375% texts: 33 words 0.0375%-0.05% texts: 22 words 0.05%-0.1% texts: 32 words 0.1%-0.15% texts: 13 words 0.15%-0.2% texts: 6 words 0.2%-0.25% texts: 2 words 0.25%-0.3% texts: 2 words 0.3%-0.35% texts: 2 words 0.5%-0.55% texts: 1 words 0.65%-0.70% texts: 1 words
Now let's visualize word document frequencies. For that, let's first arrange the word frequencies in texts in descending order and assign each word a rank: for example, the most frequent word that occurs in most texts would get a rank of 1, the second most frequent word would get a rank of 2, and so on. Next, let's plot these ranks against the total number of occurrences of each word in documents: for example, the most frequent word (full stop, ".") with rank 1 occurs in $9108$ texts, the word with rank 2 (comma, ",") occurs in $7126$ in total, and so on. Let's also select a word, for instance, "happy" and get its rank.
import operator
# Based on
def visualize(word_doc_map, word):
    """Plot document frequency against rank and report the rank of *word*.

    *word_doc_map* maps each word to the number of documents it occurs in.
    Words are ranked from 1 (highest count) downwards. The rank of *word*
    is shown in the x-axis label; it is 0 if *word* is not in the map.
    """
    # Ascending sort reversed, so the highest count comes first.
    sorted_map = (sorted(word_doc_map.items(), key=operator.itemgetter(1)))[::-1]
    occurrences = []
    ranks = []
    word_rank = 0
    # Collect the rank and document frequency of every word
    for rank, (token, doc_count) in enumerate(sorted_map, start=1):
        if token == word:
            word_rank = rank
        ranks.append(rank)
        occurrences.append(doc_count)
    # Plot word frequencies against their ranks
    plt.title("Word document frequencies")
    plt.ylabel("Total number of document occurrences")
    plt.xlabel("Word ranks (rank of word \"" + word + "\" is " + str(word_rank) + ")")
    plt.plot(ranks, occurrences)
visualize(counts, "happy")
Frequencies drop very fast, as this graph shows. You can apply logarithmic function to the absolute frequency values to smooth the curve and make the changes in frequencies clearer:
import operator
# Based on
def visualize(word_doc_map, word):
    """Plot document frequency against rank on log-log axes.

    *word_doc_map* maps each word to the number of documents it occurs in.
    Words are ranked from 1 (highest count) downwards. The rank of *word*
    is shown in the x-axis label; it is 0 if *word* is not in the map.
    """
    # Ascending sort reversed, so the highest count comes first.
    sorted_map = (sorted(word_doc_map.items(), key=operator.itemgetter(1)))[::-1]
    occurrences = []
    ranks = []
    word_rank = 0
    # Collect the rank and document frequency of every word
    for rank, (token, doc_count) in enumerate(sorted_map, start=1):
        if token == word:
            word_rank = rank
        ranks.append(rank)
        occurrences.append(doc_count)
    # Plot word frequencies against their ranks
    plt.title("Word document frequencies (log)")
    plt.ylabel("Total number of document occurrences")
    plt.xlabel("Word ranks (rank of word \"" + word + "\" is " + str(word_rank) + ")")
    # Logarithms help present the frequency/rank information concisely.
    # Base 10 is the default; the old `basex` keyword is not accepted by
    # newer Matplotlib releases, so it is left out.
    plt.loglog(ranks, occurrences)
visualize(counts, "happy")
These graphs exemplify Zipf's law – an empirical law formulated by George Kingsley Zipf, which states that the frequency of any word is inversely proportional to its rank in the frequency table. Originally, it states that the most frequent word will occur approximately twice as often as the second most frequent word, three times as often as the third most frequent word, and so on. The particular proportion is a rough estimate and depends on the data (for instance, in this case, you are looking into document rather than total word frequencies, and the ratio between the first and the second ranks is not exactly 1/2), but what matters is that the rank-frequency distribution is an inverse relation. In plain terms, this means that a small amount of very frequent words will occur in most documents, and a much larger amount of words (so-called long tail of the distribution) will be seen very rarely.
One of the problems with this distribution for classifiers like Decision Trees is that such rare words add to the complexity of the algorithm but do not help classification. Here is your first example of feature selection practices: let's filter out rare words and consider as features only the words that occur within a certain proportion or number of documents. The code below uses a minimum frequency threshold of $200$ documents for the word to be considered as a feature, and a maximum frequency threshold of $20\%$ of the texts. Feel free to modify these values.
from nltk import DecisionTreeClassifier
# Total number of documents (sentences) in the training set.
maximum = float(len(strat_train_set))
selected_words = []
for token, doc_count in counts.items():
    count = float(doc_count)
    # Keep words seen in more than 200 documents but in under 20% of them.
    if count > 200 and count / maximum < 0.2:
        selected_words.append(token)
print(len(selected_words))
def get_features(text, selected_words):
    """Return a feature dict with True for each word of *text* in *selected_words*.

    Words that are not in *selected_words* are ignored; repeated words
    collapse into a single key.
    """
    return {word: True for word in text if word in selected_words}
# Rebuild all three feature sets using only the selected words.
train_features = [(get_features(sentence, selected_words), author)
                  for sentence, author in strat_train_set]
pretest_features = [(get_features(sentence, selected_words), author)
                    for sentence, author in strat_pretest_set]
test_features = [(get_features(sentence, selected_words), author)
                 for sentence, author in test_set]
# Train on the training portion, then score every portion.
classifier = DecisionTreeClassifier.train(train_features)
print(f"Accuracy on the training set = {classify.accuracy(classifier, train_features)}")
print(f"Accuracy on the pretest set = {classify.accuracy(classifier, pretest_features)}")
print(f"Accuracy on the test set = {classify.accuracy(classifier, test_features)}")
166 Accuracy on the training set = 0.8099746533472492 Accuracy on the pretest set = 0.7960644007155635 Accuracy on the test set = 0.8066898349261512
Finally, let's visualize these accuracy scores. Note that despite the results being overall lower than those you achieved with the benchmark model, they are consistent across all three datasets, which shows that the classifier generalizes well.
a = ["Train", "Pretest", "Test"]
index = range(len(a))
b = [81.00, 79.64, 80.79]  # Accuracy scores for the datasets
fig, ax = plt.subplots()
axes = plt.gca()
# Let's set 68 as the lower bound as the majority class baseline is at 68.58 for the original set
axes.set_ylim([68, 100])
# One bar per dataset, labelled on the x axis below.
plt.bar(index, b, color=['#0A40A4', '#61A4F6', '#DB025B'])
plt.xticks(index, a)
import matplotlib
<function matplotlib.pyplot.legend(*args, **kwargs)>
# average word length in characters
def avg_number_chars(text):
    """Return the mean length, in characters, of the words in *text*.

    *text* is a sequence of word strings. An empty *text* yields ``0.0``
    rather than raising ZeroDivisionError.
    """
    if not text:
        return 0.0
    return float(sum(len(word) for word in text)) / float(len(text))
# length in terms of words
def number_words(text):
    """Return the number of words in *text* as a float."""
    return float(len(text))
print(avg_number_chars(["Not", "so", "happy", ",", "yet", "much", "happyer"]))
print(number_words(["Not", "so", "happy", ",", "yet", "much", "happyer"]))
3.5714285714285716 7.0
Represent all data sets with their feature sets. You will need to initialize a feature set for each text and map it to the author. In addition, let's switch to numerical representation of author labels:
def initialize_dataset(source):
    """Build feature vectors and numeric targets from labelled sentences.

    *source* is an iterable of ``(sentence, label)`` pairs. Every sentence
    becomes a two-element feature list: average word length in characters
    and sentence length in words. The label "austen" maps to 0 and any
    other label to 1.

    Returns ``(all_features, targets)``, two lists of equal length.
    """
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = [avg_number_chars(sent), number_words(sent)]
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
train_data, train_targets = initialize_dataset(strat_train_set)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set)
test_data, test_targets = initialize_dataset(test_set)
# Sanity check: each portion must have one feature vector per target.
for _features, _labels in ((train_data, train_targets),
                           (pretest_data, pretest_targets),
                           (test_data, test_targets)):
    print(len(_features), len(_labels))
13414 13414 3354 3354 6906 6906
Now apply classification with the sklearn
's Decision Trees
from sklearn.tree import DecisionTreeClassifier
# Fit the tree on the training portion, then predict authors for the pretest portion.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(pretest_data)
Run evaluation including the following metrics: accuracy, confusion matrix, precision, recall and F1:
import numpy as np
from sklearn import metrics
def evaluate(predicted, targets):
    """Print accuracy, confusion matrix and classification report.

    *predicted* and *targets* are equal-length sequences of class labels.
    Both are converted to arrays first so the accuracy comparison is
    element-wise even when plain lists are passed.
    """
    predicted = np.asarray(predicted)
    targets = np.asarray(targets)
    print(np.mean(predicted == targets))
    print(metrics.confusion_matrix(targets, predicted))
    print(metrics.classification_report(targets, predicted))
evaluate(predicted, pretest_targets)
0.7975551580202743 [[2133 167] [ 512 542]] precision recall f1-score support 0 0.81 0.93 0.86 2300 1 0.76 0.51 0.61 1054 accuracy 0.80 3354 macro avg 0.79 0.72 0.74 3354 weighted avg 0.79 0.80 0.78 3354
And on the test set:
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)
0.8049522154648132 [[4605 394] [ 953 954]] precision recall f1-score support 0 0.83 0.92 0.87 4999 1 0.71 0.50 0.59 1907 accuracy 0.80 6906 macro avg 0.77 0.71 0.73 6906 weighted avg 0.80 0.80 0.79 6906
This looks like a much more generalizable set of features – the performance on both sets is very close ($0.7973$ vs. $0.8050$). Besides, it contains only $2$ features as opposed to over $13K$ words! However, now the performance is much lower than with words. Let's try and improve it further. Let's visualize these results with matplotlib:
# Adapted from:
pretestAcc = (96.36, 79.72)
testAcc = (89.57, 80.49)
ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.2  # the width of the bars
fig, ax = plt.subplots()
# Two bars per feature set, placed side by side around each group location.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
ax.set_xticks(ind)  # tick positions must be set before the labels
ax.set_xticklabels(('Benchmark', 'F1-2'))
ax.legend()
Feature type 3 – count of stopwords: Add spaCy
functionality and see how only a handful of frequent words are distributed in texts:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_md')
# a very general method that can be applied to any type of words
def word_counts(text):
    """Return a dict mapping each lower-cased word in *text* to its count."""
    counts = {}
    for word in text:
        key = word.lower()
        counts[key] = counts.get(key, 0) + 1
    return counts
Now let's augment our feature set with the counts of stopwords only:
def initialize_dataset(source):
    """Build feature vectors and numeric targets from labelled sentences.

    *source* is an iterable of ``(sentence, label)`` pairs. Every feature
    list holds the average word length, the sentence length in words, and
    then one count per entry of STOP_WORDS (0 when the stopword is absent).
    The label "austen" maps to 0 and any other label to 1.

    Returns ``(all_features, targets)``, two lists of equal length.
    """
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = [avg_number_chars(sent), number_words(sent)]
        counts = word_counts(sent)
        # One column per stopword, so every vector has the same length.
        for word in STOP_WORDS:
            feature_list.append(counts.get(word, 0))
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
train_data, train_targets = initialize_dataset(strat_train_set)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set)
test_data, test_targets = initialize_dataset(test_set)
# Sanity check: each portion must have one feature vector per target.
for _features, _labels in ((train_data, train_targets),
                           (pretest_data, pretest_targets),
                           (test_data, test_targets)):
    print(len(_features), len(_labels))
13414 13414 3354 3354 6906 6906
Now train and test on both pretest and test data:
# Fit the tree on the training portion, then evaluate on pretest and test.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(pretest_data)
evaluate(predicted, pretest_targets)
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)
0.8127608825283243 [[1967 333] [ 295 759]] precision recall f1-score support 0 0.87 0.86 0.86 2300 1 0.70 0.72 0.71 1054 accuracy 0.81 3354 macro avg 0.78 0.79 0.78 3354 weighted avg 0.81 0.81 0.81 3354 0.8087170576310455 [[4225 774] [ 547 1360]] precision recall f1-score support 0 0.89 0.85 0.86 4999 1 0.64 0.71 0.67 1907 accuracy 0.81 6906 macro avg 0.76 0.78 0.77 6906 weighted avg 0.82 0.81 0.81 6906
There is a slight improvement in accuracy of about $0.005$ (or $0.5\%$), up to $0.812$-$0.815$ and $0.806$-$0.81$. However, what is most interesting about this feature is that there is now a more considerable improvement in performance metrics on the minority class (shakespeare): on the pretest set recall rises from $0.51$ to $0.72$ and F1 from $0.61$ to $0.71$ – a whole of $10$ points; on the test set the improvement in recall is from $0.50$ to $0.71$ and in F1 from $0.59$ to $0.67$.
pretestAcc = (96.36, 79.72, 81.18)  # use the last accuracy score as the last value
testAcc = (89.57, 80.49, 80.96)  # use the last accuracy score as the last value
ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.25  # the width of the bars
fig, ax = plt.subplots()
# Two bars per feature set, placed side by side around each group location.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
ax.set_xticks(ind)  # tick positions must be set before the labels
ax.set_xticklabels(('Benchmark', 'F1-2', 'F1-3'))
ax.legend()
Feature type 4 – proportion of stopwords: Estimate what proportion of words in sentence are stopwords:
def proportion_words(text, wordlist):
    """Return the fraction of words in *text* whose lower-case form is in *wordlist*.

    An empty *text* yields ``0.0`` rather than raising ZeroDivisionError.
    """
    if not text:
        return 0.0
    count = sum(1 for word in text if word.lower() in wordlist)
    return float(count) / float(len(text))
def initialize_dataset(source):
    """Build feature vectors and numeric targets from labelled sentences.

    *source* is an iterable of ``(sentence, label)`` pairs. Every feature
    list holds the average word length, the sentence length in words, one
    count per entry of STOP_WORDS (0 when absent), and finally the
    proportion of the sentence's words that are stopwords. The label
    "austen" maps to 0 and any other label to 1.

    Returns ``(all_features, targets)``, two lists of equal length.
    """
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = [avg_number_chars(sent), number_words(sent)]
        counts = word_counts(sent)
        # One column per stopword, so every vector has the same length.
        for word in STOP_WORDS:
            feature_list.append(counts.get(word, 0))
        feature_list.append(proportion_words(sent, STOP_WORDS))
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
train_data, train_targets = initialize_dataset(strat_train_set)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set)
test_data, test_targets = initialize_dataset(test_set)
# Sanity check: each portion must have one feature vector per target.
for _features, _labels in ((train_data, train_targets),
                           (pretest_data, pretest_targets),
                           (test_data, test_targets)):
    print(len(_features), len(_labels))
13414 13414 3354 3354 6906 6906
As before, train and test on both pretest and test data:
# Fit the tree on the training portion, then evaluate on pretest and test.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(pretest_data)
evaluate(predicted, pretest_targets)
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)
0.8106738223017292 [[1985 315] [ 320 734]] precision recall f1-score support 0 0.86 0.86 0.86 2300 1 0.70 0.70 0.70 1054 accuracy 0.81 3354 macro avg 0.78 0.78 0.78 3354 weighted avg 0.81 0.81 0.81 3354 0.8124818997972777 [[4275 724] [ 571 1336]] precision recall f1-score support 0 0.88 0.86 0.87 4999 1 0.65 0.70 0.67 1907 accuracy 0.81 6906 macro avg 0.77 0.78 0.77 6906 weighted avg 0.82 0.81 0.81 6906
We see an even further small improvement: $0.812$-$0.815$ and $0.815$-$0.819$. Moreover, performance on both pretest and test sets is very similar now. However, overall performance is still not as good as what you've got with words, so let's keep going. Let's plot these values, too:
pretestAcc = (96.36, 79.72, 81.22)  # use the last accuracy score as the last value
testAcc = (89.57, 80.49, 81.83)  # use the last accuracy score as the last value
ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.25  # the width of the bars
fig, ax = plt.subplots()
# Two bars per feature set, placed side by side around each group location.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
ax.set_xticks(ind)  # tick positions must be set before the labels
ax.set_xticklabels(('Benchmark', 'F1-2', 'F1-4'))
ax.legend()
Feature type 5 – proportion of words of specific parts of speech: Just like you added proportion of stopwords, add proportions of words of specific parts of speech. For that, first, preprocess the sentences and for each of them keep a dictionary mapping each sentence to its spaCy
's representation with all language-related fields (this might take some time due to processing run by spaCy
, so let's add some code to track how many sentences have been processed):
def preprocess(source):
    """Run ``nlp`` over every sentence in *source* and collect the results.

    *source* is an iterable of ``(sentence, label)`` pairs, where a
    sentence is a list of word strings. The words are joined with single
    spaces, and the joined text is used as the key for the value returned
    by ``nlp``. A progress line is printed every 2000 sentences.

    Returns the dict of processed sentences. Identical sentences share a
    single entry.
    """
    source_docs = {}
    for index, (sent, _) in enumerate(source):
        text = " ".join(sent)
        source_docs[text] = nlp(text)
        if index > 0 and index % 2000 == 0:
            print(str(index) + " texts processed")
    print("Dataset processed")
    return source_docs
# Process the three portions one after another, in the same order as before.
train_docs, pretest_docs, test_docs = (
    preprocess(portion)
    for portion in (strat_train_set, strat_pretest_set, test_set)
)
2000 texts processed 4000 texts processed 6000 texts processed 8000 texts processed 10000 texts processed 12000 texts processed Dataset processed 2000 texts processed Dataset processed 2000 texts processed 4000 texts processed 6000 texts processed Dataset processed
Now add the PoS distributions as features:
from collections import Counter
pos_list = ["C", "D", "E", "F", "I", "J", "M", "N", "P", "R", "T", "U", "V", "W"]
def pos_counts(text, source_docs, pos_list):
    """Count the tokens of a sentence per part-of-speech group.

    *text* is a list of word strings. Its space-joined form is looked up in
    *source_docs* to obtain the processed sentence. Each token is reduced
    to the first character of its ``tag_`` attribute, and these initials
    are counted.

    Returns a dict with one entry per element of *pos_list*, 0 when no
    token falls into that group.
    """
    doc = source_docs.get(" ".join(text))
    # NOTE(review): assumes `tag_` holds tags whose first letter identifies
    # the group (e.g. "NN" -> "N"), matching the letters in pos_list; confirm.
    tag_initials = Counter(str(word.tag_)[:1] for word in doc)
    return {pos: tag_initials.get(pos, 0) for pos in pos_list}
def initialize_dataset(source, source_docs):
    """Build feature vectors and numeric targets from labelled sentences.

    *source* is an iterable of ``(sentence, label)`` pairs and
    *source_docs* is the mapping that ``pos_counts`` uses to look up each
    processed sentence. Every feature list holds the average word length,
    the sentence length in words, one count per entry of STOP_WORDS (0
    when absent), the proportion of stopwords, and one proportion per
    part-of-speech group in ``pos_list``. The label "austen" maps to 0 and
    any other label to 1.

    Returns ``(all_features, targets)``, two lists of equal length.
    """
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = [avg_number_chars(sent), number_words(sent)]
        counts = word_counts(sent)
        # One column per stopword, so every vector has the same length.
        for word in STOP_WORDS:
            feature_list.append(counts.get(word, 0))
        feature_list.append(proportion_words(sent, STOP_WORDS))
        p_counts = pos_counts(sent, source_docs, pos_list)
        # NOTE(review): group counts are normalised by the sentence length
        # in words to give proportions; confirm the intended denominator.
        for pos in p_counts.keys():
            feature_list.append(float(p_counts.get(pos)) / float(len(sent)))
        all_features.append(feature_list)
        targets.append(0 if label == "austen" else 1)
    return all_features, targets
train_data, train_targets = initialize_dataset(strat_train_set, train_docs)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set, pretest_docs)
test_data, test_targets = initialize_dataset(test_set, test_docs)
# Sanity check: each portion must have one feature vector per target.
for _features, _labels in ((train_data, train_targets),
                           (pretest_data, pretest_targets),
                           (test_data, test_targets)):
    print(len(_features), len(_labels))
13414 13414 3354 3354 6906 6906
And, as before, train, test and evaluate:
# Fit the tree on the training portion, then evaluate on pretest and test.
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)
predicted = text_clf.predict(pretest_data)
evaluate(predicted, pretest_targets)
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)
0.8208109719737626 [[1999 301] [ 300 754]] precision recall f1-score support 0 0.87 0.87 0.87 2300 1 0.71 0.72 0.72 1054 accuracy 0.82 3354 macro avg 0.79 0.79 0.79 3354 weighted avg 0.82 0.82 0.82 3354 0.8284100781928757 [[4326 673] [ 512 1395]] precision recall f1-score support 0 0.89 0.87 0.88 4999 1 0.67 0.73 0.70 1907 accuracy 0.83 6906 macro avg 0.78 0.80 0.79 6906 weighted avg 0.83 0.83 0.83 6906
An improvement with this feature reaches $0.82$-$0.83$ on both sets.
For convenience, let's pack up the datasets initialization, training, testing and evaluation into a method, since we don't change any code in this bit:
def run():
    """Rebuild the datasets, train a decision tree and evaluate it.

    Uses whichever ``initialize_dataset`` is currently defined, so re-running
    it after redefining the feature extraction picks up the new features.
    Prints the size of each split, a blank line, and then the evaluation of
    the pretest and test predictions.
    """
    train_data, train_targets = initialize_dataset(strat_train_set, train_docs)
    pretest_data, pretest_targets = initialize_dataset(strat_pretest_set, pretest_docs)
    test_data, test_targets = initialize_dataset(test_set, test_docs)

    print(len(train_data), len(train_targets))
    print(len(pretest_data), len(pretest_targets))
    print(len(test_data), len(test_targets))
    print()

    # The listing had merged the constructor and the fit() call into one
    # unparsable line; they are separate statements again here.
    text_clf = DecisionTreeClassifier(random_state=42)
    text_clf.fit(train_data, train_targets)

    predicted = text_clf.predict(pretest_data)
    evaluate(predicted, pretest_targets)
    predicted = text_clf.predict(test_data)
    evaluate(predicted, test_targets)
13414 13414 3354 3354 6906 6906 0.8208109719737626 [[1999 301] [ 300 754]] precision recall f1-score support 0 0.87 0.87 0.87 2300 1 0.71 0.72 0.72 1054 accuracy 0.82 3354 macro avg 0.79 0.79 0.79 3354 weighted avg 0.82 0.82 0.82 3354 0.8284100781928757 [[4326 673] [ 512 1395]] precision recall f1-score support 0 0.89 0.87 0.88 4999 1 0.67 0.73 0.70 1907 accuracy 0.83 6906 macro avg 0.78 0.80 0.79 6906 weighted avg 0.83 0.83 0.83 6906
Let's plot the improvements in the results:
pretestAcc = (96.36, 79.72, 81.22, 83.10)  # use the last accuracy score as the last value
testAcc = (89.57, 80.49, 81.83, 82.54)  # use the last accuracy score as the last value

ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.25  # the width of the bars

fig, ax = plt.subplots()
# Two bars per feature set, centred on the group position. The listing had
# lost the `ax.bar(ind` prefix on both lines.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
# Tick positions must be fixed before custom tick labels are assigned.
ax.set_xticks(ind)
ax.set_xticklabels(('Benchmark', 'F1-2', 'F1-4', 'F1-5'))
# Show the 'Pretest'/'Test' labels passed to ax.bar().
ax.legend()
Let's add a further linguistic feature – for instance, suffixes, which are already stored in the docs.
Feature type 6 – count selected suffixes: As the number of distinct suffixes will be quite large (though smaller than the number of distinct words), let's set a cutoff and keep only, e.g., the top $40\%$ most frequent suffixes:
import operator
def select_suffixes(cutoff):
    """Return the most frequent token suffixes found in ``train_docs``.

    Args:
        cutoff: Fraction of the distinct suffixes to keep, e.g. ``0.4`` keeps
            the top 40% (the number kept is rounded with ``round``).

    Returns:
        A list of lowercased suffix strings ordered by descending frequency;
        suffixes with equal counts keep first-seen order.
    """
    # NOTE(review): both loop bodies were missing from the listing. The
    # collection step is rebuilt from the printed result (lowercased suffixes
    # of at most three characters) -- confirm that the tokens expose
    # `suffix_` (as spaCy tokens do).
    counts = Counter(
        str(word.suffix_).lower()
        for doc in train_docs.values()
        for word in doc
    )
    # Clamp so that a negative cutoff selects nothing instead of slicing
    # from the end.
    keep = max(0, round(len(counts) * cutoff))
    # most_common() sorts by count, descending, and is stable for ties.
    return [suffix for suffix, _ in counts.most_common()[:keep]]
selected_suffixes = select_suffixes(0.4)
577 [',', '.', 'the', 'and', 'to', 'ing', 'of', 'her', '"', 'a', 'i', 'hat', 'it', 'in', ';', 'was', 'not', 'she', 'you', 'his', 'uld', 'be', 'he', 'ere', 'had', "'", 'as', '--', 'all', 'ion', 'but', 'for', 'ith', 'ery', 'is', 'ave', 'ent', 'ill', 'nce', 'ght', 'ter', 'at', 'my', 'our', 'so', 'him', 'een', '?', 's', 'uch', 'ore', 'ome', 'mr', 'ver', 'are', 'ted', 'one', ':', 'ble', 'ell', 'no', 'on', 'now', 'any', 'ust', 'by', 'me', '!', 'hen', 'hey', 'out', 'ess', 'ich', '-', 'do', 'ure', 'mrs', 'ain', 'rom', 'elf', 'red', 'or', 'ood', 'if', 'mma', 'use', 'aid', 'hem', 'ton', 'ely', 'sed', 'own', 'est', 'man', 'an', 'ost', 'ake', 'ers', 'ear', 'nly', '_', 'we', 'iss', 'ned', 'ugh', '.--', 'lly', 'eir', 'ied', 'end', 'am', 'han', 'ons', 'did', 'ame', 'can', 'tle', 'ite', 'ose', 'ate', 'tly', 'how', 'ant', 'ard', 'ast', 'ong', 'ive', 'ity', 'who', 'ine', 'ved', 'eat', 'ect', 'iet', 'nne', 'ded', 'ink', 'way', 'ord', 'ous', 'ays', 'ime', 'ked', 'ady', 'say', 'age', 'ike', 'old', 'ies', 'eed', 'ngs', 'too', 'd', 'ind', 'les', 'ade', 'der', 'aue', 'see', 'med', 'ort', 'ile', 'day', 'rst', 'ley', 'rth', 'its', 'ace', 'may', 'art', 'ove', 'ody', 'let', 'ice', 'nds', 'ane', 'ful', 'led', 'ish', 'oon', 'has', 'und', 'yes', 'nts', 'ven', 'tes', 'ath', 'pon', 'rds', 'rry', 'hed', 'ged', 'ary', 'ead', 'son', 'ple', 'nto', 'two', 'mes', 'ily', 'go', 'ife', 'ten', 'ung', 'ook', 'ise', 'ner', 'ius', 'ral', 'sir', 'iot', 'dly', 'rly', 'rse', 'ank', 'oor', 'ger', 'rty', 'up', '(', 'ide', ')', 'men', 'ree', 'low', 'ves', 'ced', 'new', 'oth', 'yet', 'fax', 'res', 'wer', 'eal', 'ses', 'oes', 'sse', 'ars', 'ase', 'ces', 'eld', 'ual', 'ppy', 'ope', 'oom', 'ole', 'ond', 'elt', 'hou', 'ken', 'per', 'off', 'bly', 'alk', 'ese', 'lay', 'ick', 'us', 'sit', 'unt', 'thy', 'tus', 'oke', 'why', 'iue', 'ire', 'nor', 'alf', 'saw', 'sar', 'ack', 'nge', 'kes', 'wed', 'int', 'oh', 'eet', 'hee', 'ach', 'get', 'eak', 'ber', 'dge', 'few', 'lls', 'row', 'air', 'aps', 'lfe', 'urs', 'rld', 'wne', 'nes', 
'uth', 'tch', 'ull', 'eve', 'rue', 'arm', 'o', 'ury', 'eel', 'cke', 'lse', 'urn', 'ene', 'set', '`', 'ety', 'dea', 'oue', 'ubt', 'nse', 'try', 'eft', 'put', 'ues', 'uer', 'vs', 'ins', 'hts', 'ude', 'ped', 'eth', 'tty', 'ony', 'isa', 't', 'nst', 'oss', 'mer', 'ept', 'ued', 'ans', 'hom', 'uen', 'ext', 'act', 'far', 'don', 'lar', 'rge', 'sly', 'nke', 'hes', 'ors', 'ret', 'ren', 'bad', 'nch', 'ohn', 'gly', 'lad', 'une', 'rit', 'ory', 'igh', 'tis', 'ior', 'hin', 'pen', 'eep', 'rve', 'nty', 'ild', 'gan', 'wes', 'tay', 'hew', 'tin', 'rke', 'ial', 'ges', 'oft', 'fer', 'rne', 'yed', 'des', 'ote', 'nal', 'ray', 'uty', 'ert', 'ste', 'nde', 'den', 'ets', 'ean', 'ows', 'ncy', 'cle', 'hus', 'hip', 'cts', 'ist', 'lla', 'tta', 'mon', 'got', 'hly', 'ews', 'eye', 'th', 'ize', 'ule', 'vp', 'lth', 'epe', 'ete', 'met', 'ult', 'yme', 'rts', 'lle', 'pes', 'st', 'eem', 'uck', 'fit', 'ute', 'ply', 'nel', 'vil', 'doe', 'eek', 'ier', 'elp', 'gin', 'oks', 'lor', 'ths', 'tie', 'bed', 'asy', 'ens', 'ech', 'val', 'gue', 'ass', 'ale', 'dow', 'acy', 'eme', 'zed', 'rew', 'ark', 'ape', 'irl', 'rer', 'ool', 'uit', 'mpt', 'lty', 'yle', 'irs', 'ece', 'rms', 'uct', 'xed', 'ern', 'iew', 'ago', 'ney', 'oad', 'afe', 'ask', 'rme', 'rch', 'rre', 'ork', 'ief', 'oms', 'xon', 'nay', 'rie', 'cal', 'nks', 'boy', 'joy', 'eks', 'ods', 'sts', 'god', 'l', 'mbe', 'sat', 'als', 'uce', 'bey', 'lue', 'ler', 'lan', 'apa', 'ush', 'lis', 'orm', 'ock', 'sea', 'gle', '&', 'dle', 'ait', 'gth', 're', 'due', 'oin', 'oof', 'sin', 'ims', 'top', 'ems', 'law', 'die', 'uly', 'tio', 'tal', 'eas', 'sad', 'mit', 'ian', 'mad', 'lso', 'fts', 'nth', 'oud', 'run', 'wee', 'gun', 'lts', 'usy', 'sic', 'gry', 'iam', 'ees', 'aes', 'els', 'iod', 'ror', 'esh', 'ume', 'rce', 'tor', 'bid', 'lye', 'eds', 'awn', 'gge', 'ker', 'lke', 'raw', 'ha', 'oot', 'lds', 'ser', 'odd', 'ska', 'rls', 'aim', 'dit', 'mly', 'pay', 'sty', 'erd', 'ilt', 'ede', 'dom', 'ils', 'tic', 'tea', 'lia', 'box', 'bit', 'net', 'cut', 'oat', 'sal', 'ash']
def suffix_counts(text, source_docs, suffix_list):
    """Count how often each selected suffix occurs in one sentence.

    Args:
        text: The sentence as a list of word strings; joined with single
            spaces it is the key under which its parsed document is stored.
        source_docs: Mapping from the space-joined sentence to its parsed
            document (an iterable of tokens).
        suffix_list: Lowercased suffixes to report.

    Returns:
        A dict with one entry per element of ``suffix_list`` (0 when absent).

    Raises:
        TypeError: If ``source_docs`` holds no document for the sentence
            (``.get`` returns ``None``, which cannot be iterated).
    """
    doc = source_docs.get(" ".join(text))
    # NOTE(review): the loop body was missing from the listing. It is rebuilt
    # as the lowercased token suffix, matching the lowercased entries of the
    # selected suffix list -- confirm that the tokens expose `suffix_`.
    counts = Counter(str(word.suffix_).lower() for word in doc)
    # A Counter yields 0 for suffixes that never occurred.
    return {suffix: counts[suffix] for suffix in suffix_list}
def initialize_dataset(source, source_docs):
# Build one feature vector and one 0/1 author label per (sentence, label) pair
# in `source`; `source_docs` is passed on to pos_counts() and suffix_counts().
# NOTE(review): this listing has lost its indentation and several statements
# (flagged below), so it does not run as shown -- restore them from the
# original notebook before use.
all_features = []
targets = []
for (sent, label) in source:
# NOTE(review): `feature_list` is appended to below but is never created in
# the visible code; presumably it is initialised per sentence here -- confirm.
counts = word_counts(sent)
for word in STOP_WORDS:
if word in counts.keys():
# NOTE(review): the body of this `if` (and any `else` branch) is missing.
feature_list.append(proportion_words(sent, STOP_WORDS))
p_counts = pos_counts(sent, source_docs, pos_list)
for pos in p_counts.keys():
# NOTE(review): the body of this loop is missing.
s_counts = suffix_counts(sent, source_docs, selected_suffixes)
for suffix in s_counts.keys():
# NOTE(review): the body of this loop is missing.
# Sentences labelled "austen" get target 0, every other label gets 1.
if label=="austen": targets.append(0)
else: targets.append(1)
# NOTE(review): nothing visible ever appends to `all_features`; the statement
# that stores each sentence's feature list appears to be missing as well.
return all_features, targets
13414 13414 3354 3354 6906 6906 0.9543828264758497 [[2218 82] [ 71 983]] precision recall f1-score support 0 0.97 0.96 0.97 2300 1 0.92 0.93 0.93 1054 accuracy 0.95 3354 macro avg 0.95 0.95 0.95 3354 weighted avg 0.95 0.95 0.95 3354 0.9501882421083117 [[4815 184] [ 160 1747]] precision recall f1-score support 0 0.97 0.96 0.97 4999 1 0.90 0.92 0.91 1907 accuracy 0.95 6906 macro avg 0.94 0.94 0.94 6906 weighted avg 0.95 0.95 0.95 6906
This feature brings the largest improvement: accuracy rises to $0.954$-$0.956$ on the pretest set and $0.952$-$0.954$ on the test set.
pretestAcc = (96.36, 79.72, 81.22, 83.10, 95.47)  # use the last accuracy score as the last value
testAcc = (89.57, 80.49, 81.83, 82.54, 95.34)  # use the last accuracy score as the last value

ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.3  # the width of the bars

fig, ax = plt.subplots()
# Two bars per feature set, centred on the group position. The listing had
# lost the `ax.bar(ind` prefix on both lines.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
# Tick positions must be fixed before custom tick labels are assigned.
ax.set_xticks(ind)
ax.set_xticklabels(('Benchmark', 'F1-2', 'F1-4', 'F1-5', 'F1-6'))
# Show the 'Pretest'/'Test' labels passed to ax.bar().
ax.legend()
Finally, let's collect a specific (non-overlapping) vocabulary for each author.
Feature type 7 – count words that are specific to each author: First collect the set of words that are unique to each author (i.e., the words that only Shakespeare or only Austen uses) and then count their occurrences across the data sets. You can introduce a cutoff as before and only use, e.g., the top $50\%$ most frequent words of each author:
def _exclusive_top_words(own_counts, other_counts, cutoff):
    """Return the top ``cutoff`` share of ``own_counts`` words absent from ``other_counts``."""
    # Clamp so that a negative cutoff selects nothing instead of slicing
    # from the end.
    keep = max(0, round(len(own_counts) * cutoff))
    # most_common() sorts by count, descending, and is stable for ties.
    return [
        word
        for word, _ in own_counts.most_common()[:keep]
        if word not in other_counts
    ]


def unique_vocabulary(label1, label2, cutoff):
    """Collect frequent words used by only one of two authors.

    Reads the ``(sentence, label)`` pairs of ``strat_train_set``; sentences
    carrying any other label are ignored.

    Args:
        label1: Label of the first author.
        label2: Label of the second author.
        cutoff: Fraction of each author's distinct words to consider,
            most frequent first (e.g. ``0.5`` for the top 50%).

    Returns:
        A list of lowercased words: first those found only in ``label1``
        sentences, then those found only in ``label2`` sentences, each group
        ordered by descending frequency.
    """
    # NOTE(review): the loop and `if` bodies were missing from the listing.
    # Lowercasing is rebuilt from the printed vocabulary, which contains
    # only lowercase forms -- confirm against the original notebook.
    counts1 = Counter()
    counts2 = Counter()
    for sent, label in strat_train_set:
        if label == label1:
            counts1.update(word.lower() for word in sent)
        elif label == label2:
            counts2.update(word.lower() for word in sent)
    return (
        _exclusive_top_words(counts1, counts2, cutoff)
        + _exclusive_top_words(counts2, counts1, cutoff)
    )
unique_voc = unique_vocabulary("austen", "shakespeare", 0.5)
4435 ['"', 'have', '--', '."', 'mr', 'mrs', 'emma', 'miss', 'than', '.--', ',"', 'only', 'every', 'never', 'harriet', 'anne', 'herself', 'own', 'weston', 'knightley', 'elton', 'again', 'always', 'soon', '!--', '?"', 'captain', 'jane', 'woodhouse', 'dear', 'elliot', 'ever', 'up', 'just', 'having', 'give', 'himself', 'fairfax', 'over', 'upon', 'seemed', 'wentworth', '!"', 'churchill', 'however', '?--', 'even', 'felt', 'really', 'frank', 'us', 'its', 'room', 'half', 'feelings', 'hartfield', 'certainly', 'charles', 'smith', 'bates', 'russell', 'believe', 'love', 'family', 'evening', 'feel', 'walter', ';--', 'hear', 'looked', 'idea', 'deal', 'acquaintance', 'myself', 'highbury', 'down', 'between', 'musgrove', 'mary', 'hour', 'subject', '`', 'louisa', 'perfectly', 'suppose', 'under', 'general', 'obliged', 'happiness', 'able', 'wanted', 'replied', 'given', 'john', 'talked', 'passed', 'elizabeth', '--"', 'understand', 'nobody', 'leave', 'kind', 'less', 'interest', 'near', 'attention', 'situation', 'randalls', 'martin', 'agreeable', 'walked', 'perry', 'chapter', 'afraid', 'equal', 'extremely', 'gave', 'themselves', 'attachment', 'natural', 'kellynch', 'business', 'henrietta', 'wished', 'usual', 'talking', 'door', 'called', 'minutes', 'conversation', 'used', 'benwick', 'days', 'child', 'degree', 'appeared', 'object', 'uppercross', 'isabella', 'use', 'particularly', 'pleased', 'received', 'harville', 'superior', 'means', 'different', 'lyme', 'admiral', 'goddard', 'took', 'colonel', 'son', 'help', 'returned', 'yourself', 'forward', 'hoped', 'asked', 'giving', 'air', 'cole', 'real', 'added', 'society', 'handsome', 'croft', 'expected', 'believed', 'continued', 'year', 'settled', 'kindness', 'girl', 'four', 'weather', 'anything', 'supposed', 'five', 'appearance', 'entirely', 'engaged', 'anxious', 'fond', 'probably', 'turned', 'campbell', 'engagement', 'london', 'lived', 'hours', 'circumstances', 'taylor', 'delighted', 'pain', 'advantage', 'week', 'warm', 'curiosity', 
'delightful', 'donwell', 'decided', 'comfortable', 'evil', 'dixon', 'loved', 'everything', 'understanding', 'style', 'persuaded', 'conduct', 'sat', 'amiable', 'easy', 'generally', 'news', 'looks', 'completely', 'self', 'understood', 'fair', 'tone', 'knowing', 'dancing', 'mentioned', ",'", 'obliging', 'plan', 'regret', 'favour', 'living', 'serious', ',--', 'wanting', 'fact', 'aware', 'agreed', 'convinced', 'especially', 'disposed', ',)', 'pass', ".'", 'knows', 'influence', 'happened', 'elegant', 'hayter', 'actually', 'papa', 'wallis', 'gratitude', 'extraordinary', 'observed', 'interesting', 'marrying', 'says', 'justice', 'pity', 'merely', 'absolutely', 'cheerful', 'observe', 'attentions', 'distress', 'clever', 'trying', 'compliment', 'rooms', 'abbey', 'judge', 'charming', 'instead', 'spite', 'enscombe', 'join', 'proof', 'waiting', 'summer', 'whenever', 'calling', 'difficulty', 'seems', 'recollect', 'vain', 'certain', '),', 'live', 'staying', 'useful', 'hint', 'claims', 'greatest', 'event', 'invitation', 'weeks', 'service', 'complete', 'musgroves', 'period', 'enjoyment', 'attached', 'tired', 'grove', 'fully', ';"', 'surprize', 'connexion', 'telling', 'entered', 'distance', 'parties', 'music', 'expect', 'consciousness', 'occupied', 'lately', 'cousin', 'itself', 'mere', 'listened', 'odd', 'beautiful', 'maple', 'judgment', 'mistaken', 'hers', '.--"', 'fixed', 'anxiety', 'cottage', 'resolved', 'written', 'shepherd', 'recommend', 'guess', 'ourselves', 'scarcely', 'equally', 'occurred', 'admitted', 'considering', 'miles', 'liked', 'agitation', 'information', 'consideration', 'wishing', 'desirable', 'difficulties', 'robert', 'plain', 'form', 'tea', 'hurry', 'imagined', 'box', 'remained', 'nearly', 'spent', 'thinks', 'ball', 'girls', 'glance', 'whatever', 'circle', 'rain', 'reached', 'getting', 'dalrymple', 'comprehend', 'spend', 'fortnight', ':--', 'opportunity', 'approbation', 'around', 'move', 'education', 'vanity', 'deserve', 'joined', 'uncle', 'confidence', 
'compliments', 'henry', 'favourite', 'breakfast', 'moments', 'dreadful', 'reasonable', 'scheme', 'rational', 'future', 'terms', 'proved', 'opened', 'behaviour', 'neighbourhood', 'joy', 'camden', 'crofts', 'thrown', '_she_', 'severe', 'concern', 'warmth', 'promised', 'smiled', 'returning', 'escape', 'intercourse', 'frederick', 'admired', 'eltons', 'dance', 'confusion', 'ma', 'suspicion', 'busy', 'sick', 'intimacy', 'surprise', 'loss', 'indifference', 'produced', 'chuse', 'amusement', 'encouragement', 'description', 'suspect', 'neighbours', 'charade', 'ideas', 'excepting', 'drew', 'belong', 'totally', 'stopped', 'repeated', 'suit', 'seriously', 'evidently', 'following', 'closed', 'conviction', 'warmly', 'arrived', 'acknowledged', 'observation', 'campbells', 'infinitely', 'differently', 'disappointment', 'inferior', 'elegance', 'sudden', 'exclaimed', 'winter', 'anywhere', 'proposed', 'suffered', 'views', 'boys', 'james', 'somebody', 'respectable', 'intimate', 'required', 'grateful', 'forced', 'seven', 'road', 'disagreeable', 'amused', 'regular', 'prevent', 'clear', 'ways', 'laughing', 'sensations', 'public', 'attending', 'introduced', 'interrupted', 'attend', 'shewed', 'lively', 'receiving', 'deep', 'prepared', 'invited', 'hints', 'explanation', 'chair', 'nonsense', 'servants', 'necessity', 'misery', 'persuade', 'declare', 'avoid', 'decidedly', 'charm', 'comparison', 'hearted', 'alarm', 'dearest', 'possibly', 'favourable', 'properly', 'personal', 'cousins', 'unhappy', 'enjoy', 'considerable', 'gratified', 'feared', 'recovered', 'during', 'leaving', 'book', 'naturally', 'apparent', 'habit', 'arranged', 'likeness', 'improved', 'nurse', 'drawn', 'thoroughly', 'happier', 'six', 'visits', 'dress', 'conscious', 'assistance', 'luck', 'refused', 'solicitude', 'writing', 'christmas', 'prospect', 'settle', 'earnestly', 'motive', 'subjects', 'arrival', 'concerned', 'objection', 'concert', '_her_', 'increased', 'daily', 'evident', 'fancied', 'venture', 'tolerably', 'delicacy', 
'material', 'claim', 'assist', 'indulgence', 'continually', 'acknowledge', 'excessively', 'compassion', 'several', 'gradually', 'importance', 'intelligible', 'undoubtedly', 'eat', 'journey', 'judged', 'confess', 'features', 'contrary', 'assured', 'dislike', 'deserved', 'arrangement', 'recollection', 'astonished', 'private', 'intelligence', 'wedding', 'everybody', 'books', 'pianoforte', 'attentive', 'communication', 'eagerness', 'astonishment', 'occur', 'express', 'unpleasant', 'receive', '"--', 'direction', '.,', 'folly', 'employment', 'particulars', 'separate', 'utmost', 'expression', 'above', 'servant', 'induced', 'questions', 'vicarage', 'lucky', 'exquisite', 'drive', 'laughed', 'mortification', 'coles', 'civility', 'relief', 'grown', 'steady', 'judging', 'advice', 'kindly', 'invite', 'persuasion', 'prove', 'superiority', 'appears', 'possibility', 'families', 'suspense', 'interested', 'lovely', 'fancying', '_you_', 'preferred', 'entering', 'accept', 'ford', 'endeavour', 'doors', 'niece', 'played', 'autumn', 'civil', ',"--', '--(', ')--', 'hawkins', 'pretend', 'apples', 'human', 'churchills', 'weymouth', 'shewn', 'connexions', 'admire', 'anybody', 'connected', 'harm', 'domestic', 'dining', 'distinction', 'quitted', 'felicity', 'painful', 'daughters', 'gallantry', 'interference', 'mile', 'buildings', 'cared', 'listening', 'income', 'sufficient', 'united', 'remarkably', 'becoming', 'harvilles', 'gives', 'altered', 'habits', 'observing', 'navy', 'expressed', 'dark', 'wonderful', 'visited', 'affected', 'sentiments', 'illness', 'inn', 'seldom', 'scruple', 'notions', 'frightened', 'readily', 'musical', 'heir', 'support', 'satisfy', 'miserable', 'suspected', 'larkins', 'probable', 'authority', 'soul', 'removal', 'capable', 'pains', 'convince', 'supposing', 'intention', 'silly', 'introduction', 'lives', 'perceive', 'doubtful', 'keeping', 'interval', 'advantages', 'divided', 'rendered', 'uneasy', 'certainty', 'employed', 'valuable', 'nerves', 'affairs', 'success', 
'conclusion', 'sentiment', 'principal', 'horror', 'melancholy', 'grandmama', 'tells', 'widow', 'secured', 'dared', 'declared', 'parish', 'tolerable', 'expecting', 'ceased', 'independence', 'add', 'card', 'convey', 'sensation', 'tenderness', 'bye', 'comforts', 'perfection', 'disengaged', 'conveyed', 'consequently', 'bless', 'cobb', 'confined', 'earlier', 'quarrel', 'afford', 'communicated', 'addressing', 'stir', 'square', 'desired', 'encouraging', 'suckling', '_that_', 'pleasantly', 'evils', 'cheeks', 'martins', 'moved', 'pork', 'housekeeper', 'sought', 'november', 'proposal', 'agitated', 'expressions', 'removed', 'ventured', 'carteret', 'unless', 'inquiries', 'lodgings', 'shewing', 'emotion', 'suspicions', 'begged', 'sunk', 'blessed', 'politeness', 'reproach', 'composure', 'delicate', 'village', 'reserve', 'hesitation', 'bustle', 'approach', 'watching', 'grieved', 'pretence', 'presume', 'accepted', 'mistress', 'increase', 'moving', 'farm', 'parlour', 'fairly', 'hair', 'goodness', 'nervous', 'dependence', 'fail', 'ordered', 'accomplished', 'failed', 'liberal', 'clock', 'tempered', 'grave', 'civilities', 'cheerfully', 'fears', 'inclined', 'unjust', 'alarming', 'prevented', 'preference', 'unfortunate', 'frequent', 'probability', 'fortitude', 'save', 'ireland', 'tenant', 'distinguished', 'extreme', 'engage', 'wholly', 'unnecessary', 'comfortably', 'richmond', '_i_', 'continue', 'leading', 'occasionally', 'shock', 'result', 'inconvenience', 'involved', 'parcel', 'independent', 'progress', 'later', 'compare', 'nash', 'syllable', 'strongest', 'active', 'excited', 'neighbour', '_very_', 'reserved', 'exertion', 'choice', 'increasing', 'consolation', 'scruples', 'compared', 'beloved', 'succeeded', 'forming', 'concerns', 'apologies', 'cool', 'amuse', '_not_', 'recommended', 'improvement', 'remaining', 'black', 'suddenly', 'precisely', 'peculiarly', 'sufficiently', 'visitor', 'windows', 'fallen', 'feels', 'tuesday', 'various', 'continual', 'address', 'recommendation', 
'partner', 'talents', 'bloom', 'submitted', 'distressing', 'announced', 'uneasiness', 'coldness', 'invitations', 'advise', 'heat', 'selina', 'plans', 'coolly', 'language', 'happiest', 'board', 'speaks', 'hurried', 'rejoice', 'introduce', 'brunswick', 'composed', 'crowd', 'watched', 'symptoms', 'paying', 'merits', 'blind', 'alarmed', 'observations', 'schemes', 'awkward', 'smallridge', 'farmer', 'twelve', 'regrets', 'humoured', "!'", 'resources', 'greatly', 'establishment', 'resolve', 'engaging', 'momentary', 'convenience', 'simplicity', 'nearer', 'occupy', 'trusted', 'expense', 'overcome', 'performance', 'propriety', 'require', 'deceived', 'across', 'excuses', 'anxiously', 'improve', 'brain', 'finished', 'principally', 'excite', 'danced', 'surprised', 'patty', 'accepting', 'mill', 'decision', 'missed', '_him_', 'accordingly', 'mutual', 'unreasonable', 'instance', 'aloud', 'trial', 'dr', 'liking', 'arms', 'positively', 'animated', 'objects', 'amusing', 'encouraged', 'size', 'sink', 'inquire', 'prospects', 'loves', 'suited', 'zeal', 'recovering', 'raised', 'attraction', 'nursery', 'distressed', 'sweetness', 'frequently', '_me_', 'turns', 'affectionate', 'arrive', 'heavy', 'inconvenient', 'displeased', 'applied', 'professed', 'sincerely', 'unwilling', 'guided', 'suggested', 'weak', 'resist', 'distinct', 'roused', 'cheerfulness', 'depended', 'war', 'stopt', 'injury', 'forgive', 'formerly', 'preparing', 'union', 'suspecting', 'correct', 'owed', 'apology', 'ay', 'recollections', 'apparently', 'denying', 'delightfully', 'wondering', 'class', 'fix', 'guessed', 'writes', 'prosperity', 'reconciliation', 'temptation', 'awkwardness', 'broken', 'complaint', 'curate', 'surry', 'fearful', 'mamma', 'female', 'ample', 'approached', 'adjoining', 'cake', 'blunder', 'unfit', 'unfeeling', 'disparity', 'essential', 'previously', 'indignation', 'risk', 'hoping', 'induce', 'september', 'cross', 'employ', 'endured', 'blushed', 'escaped', 'named', 'scrupulous', 'afforded', 'kingston', 
'tete', 'latter', 'furniture', 'restored', 'lower', 'perceived', 'existence', 'fifty', "?'", 'continuing', 'principle', 'shirley', 'theirs', 'sinking', 'calm', 'resentment', 'animation', 'pressed', 'heaven', 'quickness', 'youngest', 'forgiven', 'spread', 'reconciled', 'distinctly', 'hastily', 'shook', 'parents', 'jealousy', 'discovery', 'chosen', 'quitting', 'shocked', 'belonged', 'gardens', 'meetings', 'valued', 'plenty', 'unwelcome', 'moreover', 'distinguish', '.\'"', 'cordiality', 'accounts', 'behaved', 'oppose', 'cox', 'prevailed', 'paused', 'alike', 'previous', 'wingfield', 'disgust', 'unwell', 'governess', 'tranquillity', 'plaister', 'shocking', 'baronet', 'winthrop', 'exert', 'difficult', 'spared', 'sincere', 'gruel', 'sixteen', 'yorkshire', 'exploring', 'included', 'informed', 'belonging', 'doubts', 'attempted', 'wet', 'necessarily', 'operation', 'june', 'unworthy', 'sailors', 'travelling', 'talent', 'strongly', 'confused', 'baked', 'weight', 'decent', 'concealment', 'amiss', 'improper', 'cautious', 'dined', 'saturday', 'wealth', 'devoted', 'unexceptionable', 'bragge', 'thousands', 'careful', 'tears', 'running', 'pounds', 'chaise', 'assisted', 'singing', 'softened', 'anticipated', 'modern', 'acquired', 'recover', 'remark', 'honoured', 'reflections', 'pair', 'declined', 'marked', 'unequal', 'visitors', 'neat', 'warmest', 'deeply', 'selfishness', 'requires', 'bit', 'remembered', 'mentioning', 'troublesome', 'discussion', 'related', 'fever', 'lessen', 'questioned', 'additional', 'powerful', 'spending', 'pursuits', 'visiting', 'approved', 'explained', 'unlike', 'wiser', 'consulted', 'footing', 'scenes', 'partial', 'justified', 'lessened', 'laconia', 'ascertain', 'actual', 'bristol', '.)', 'unfortunately', 'grief', 'calmness', 'crossed', 'gay', 'described', 'matrimony', 'gentleness', 'succeeding', 'mischief', 'sincerity', 'imprudence', 'somewhere', 'intelligent', 'constancy', 'prefer', 'addressed', 'complaisance', 'finish', 'wherever', 'wondered', 'relative', 
'careless', 'tiresome', 'officer', 'established', 'approaching', 'drove', 'midst', 'misfortune', 'enjoyed', 'conclude', 'treated', 'afternoon', 'jealous', 'expressing', 'procure', 'curious', 'accommodation', 'complain', 'appearing', 'strawberries', 'thankful', 'upright', 'struggle', 'finest', 'describe', 'announce', 'damp', 'intentions', 'chiefly', 'invalid', 'suitable', 'professions', 'reception', 'cards', 'arrangements', '_we_', 'hurrying', 'inviting', 'joke', 'entitled', 'directed', 'stairs', 'exultation', 'promises', 'guests', 'praised', 'dwelt', 'implied', 'appearances', 'pen', 'ungrateful', 'acceptable', 'opposing', 'lover', 'leaning', 'surely', 'hereafter', 'learnt', 'disinterested', 'openly', 'acquit', 'embarrassed', 'talks', 'absolute', 'declaration', 'thoughtful', 'remembering', 'bred', 'gratification', 'polite', 'exclaiming', 'cordial', 'contrived', 'seventeen', 'attach', 'finery', 'ruin', 'leg', 'steadiness', 'deficient', 'prudent', 'abominable', 'applications', 'gloves', 'apprehension', 'thick', 'irish', 'kindest', 'sophy', 'lace', 'westons', 'communications', 'vexation', 'provoking', 'attack', 'possessed', 'pressing', 'design', 'perception', 'hospitality', 'dressed', 'situations', 'foreseen', 'uncertain', 'plea', 'frightful', 'blessings', 'westgate', 'cleared', 'grandeur', 'unable', 'presumption', 'relieved', 'refusal', 'deserted', 'likewise', 'chat', 'sanguine', 'declining', 'enquiries', 'reverse', 'admiring', 'retired', 'raising', 'recollected', 'decide', 'consult', 'utter', 'removing', 'dances', 'blindness', 'materially', 'drawback', 'generosity', 'whispered', 'windsor', 'gaiety', 'arriving', 'rapid', 'filled', 'parade', 'conceal', 'freckles', 'deserving', 'oftener', 'regretted', 'blushing', 'wind', 'slowly', 'shade', 'approve', 'entreated', 'recollecting', 'agony', 'born', 'tall', 'blunders', 'walks', 'sufferings', 'offers', 'estimate', 'prompt', 'settling', 'appeal', 'ear', 'nineteen', 'resolutely', 'firm', 'spectacles', 'sympathy', 'talker', 
'principles', 'driving', 'irritation', 'warmer', 'barely', 'knightleys', 'modest', 'harp', 'acquiescence', 'congratulations', 'solicitous', 'february', 'glowing', 'resumed', 'reaching', 'mansion', 'submit', 'provided', 'alive', 'cutting', 'urge', 'abundance', 'natured', 'barouche', 'asp', 'dearer', 'perseverance', 'sensibility', 'alter', 'relations', 'indulge', 'joint', 'conceive', 'motives', 'presumed', 'perpetual', 'bateses', 'belief', 'embarrassment', 'mental', 'exciting', 'positive', 'destroyed', 'accompanied', 'punctually', 'shaken', 'pointed', 'usefulness', 'recommendations', 'clergyman', ';--"', 'explain', 'wholesome', 'extensive', 'gentility', 'sweep', 'example', 'sacrifices', 'consulting', 'rising', 'abruptly', 'quantity', 'slightest', 'destiny', 'model', 'pencil', 'resident', 'coachman', 'deserves', 'relate', '.--`', 'edward', 'unusual', 'pleasanter', 'witnessed', 'portion', 'compassionate', 'convenient', 'judgement', 'gladly', ').', 'group', 'monkford', 'exist', 'display', 'prose', 'unwholesome', 'correspondent', 'gained', 'distinguishing', 'prudence', 'enjoying', 'flow', 'happening', 'marlborough', 'witness', 'wives', 'handed', 'incessant', 'sized', 'cordially', 'welcomed', 'prosperous', 'division', 'entertaining', 'rapidly', 'injustice', 'arise', 'confirmed', 'december', 'taunton', 'plymouth', 'expectations', 'cruel', 'sorrowful', 'volume', 'hopeless', 'sighed', 'surrounded', 'candour', 'tendency', 'breathe', 'attendance', 'promising', 'concealing', 'defer', 'entreaties', 'hesitated', 'exertions', 'unlucky', 'indebted', 'killed', 'twelvemonth', 'bottom', '_one_', 'madness', 'strengthened', 'detained', 'daring', 'obtained', 'renewed', 'contemplation', 'graciously', 'unaffected', 'disturbed', 'calculated', 'introducing', 'shooting', 'w', 'sofa', 'enquired', 'voluntarily', 'forbearance', 'indulged', 'entertained', 'avoiding', 'eligible', 'application', 'safer', 'original', 'deficiency', 'avoided', 'variety', 'comparatively', 'losing', 'spruce', '--`', 
'refrain', 'betrayed', 'confident', 'friendliness', 'suits', 'glimpse', 'game', 'amongst', 'envy', 'degradation', 'degrading', 'learned', 'press', 'joyous', 'insensible', 'richard', 'alloy', 'numerous', 'relation', 'believing', 'inevitable', 'allowances', 'bewitching', 'trick', 'unexpected', 'formal', 'lessening', 'naval', 'apple', 'despair', 'upper', 'solitary', 'collecting', 'indies', 'meanwhile', 'manage', 'persuading', 'dissuade', 'retirement', 'mixed', 'earliest', 'activity', 'coxes', 'bathing', 'constitution', 'elliots', 'bowed', 'grandmother', 'sickness', 'recent', 'uncomfortable', 'weakness', 'fashioned', 'uttered', 'ceaseless', 'bank', 'congratulate', 'accommodations', 'voices', 'speedily', 'communicate', 'healthy', 'penance', 'total', 'rejoiced', 'imprudent', 'treatment', 'experience', 'similar', 'native', 'beds', 'amends', 'granted', 'unreserve', 'injured', 'hears', 'handsomely', 'startled', 'landau', 'imaginary', 'endeavoured', 'younger', 'yards', 'discovered', 'acknowledgment', 'affect', 'travel', 'system', 'gravely', 'yield', 'influenced', 'mild', 'dependent', 'misconduct', 'glancing', 'assisting', '_he_', 'irresistible', 'insufferable', 'recovery', 'taught', 'decisive', 'discoveries', 'plainly', 'proposals', 'supplied', 'discern', 'rooke', 'resolving', 'events', 'uncertainty', 'ostler', 'discerned', 'wright', 'hastened', 'utility', 'calls', 'somehow', 'rousing', 'displeasure', 'listener', 'butcher', 'interchange', 'inferred', 'worn', 'expose', 'beings', 'woodhouses', 'pardoned', 'varieties', 'security', 'unnatural', 'burn', 'quarrelling', 'resignation', 'draught', 'kingdom', 'behave', 'addresses', 'drawings', 'alluded', 'somersetshire', '!--(', 'completed', 'unpersuadable', 'contemptible', 'marries', 'centre', 'elsewhere', 'jemima', 'thirteen', 'papers', 'seas', 'adopt', 'reflected', 'hysterical', 'concealed', 'promoted', 'respectability', 'astley', 'disgrace', 'mystery', 'nodding', 'occasioned', 'butler', 'advising', 'deceive', 'guessing', 
'emotions', 'subdued', 'explanations', 'goodwill', 'enabled', 'steadily', 'maintaining', 'spirited', 'lent', 'contained', 'alertness', 'rapidity', 'affair', 'scattered', 'impressed', 'unseen', 'fashionable', '_them_', 'curtains', 'bench', 'practicable', 'apartment', 'compose', 'qualities', 'strangers', 'boot', 'depressed', 'treachery', 'dick', 'rejected', 'maintained', 'privilege', 'everywhere', 'january', 'permitted', 'unsuspicious', 'fatigued', 'repeatedly', 'released', 'charades', 'intently', 'resemblance', 'agreeably', 'misled', 'refresh', 'chief', 'intending', 'blue', 'unreasonably', 'unnecessarily', 'apprehensive', 'throughout', 'michaelmas', 'allowable', 'pleases', 'remarkable', 'educated', 'analogy', 'checked', 'deference', 'remains', 'uncommon', 'roast', 'pavement', 'precedence', 'evenings', 'approving', 'brilliant', 'holidays', 'fourteen', 'raptures', 'aid', 'astonishing', 'humph', ";'", 'sailor', 'delays', 'reference', 'bestowed', 'unlikely', 'hesitate', 'accompany', 'admirer', 'seclusion', 'knocked', 'provoked', 'devotion', 'exclamation', 'beforehand', 'rumour', 'suggestions', 'welfare', 'excusable', 'replying', 'quarrelled', 'overpowered', 'notion', 'inferiority', 'marking', 'nursed', 'whispering', 'stupid', 'indubitable', 'alacrity', 'pitiful', 'downright', 'hearty', 'proportions', 'stokes', 'disagree', 'graceful', 'effort', 'reminded', 'originally', 'suggest', 'impossibility', 'repetition', 'widower', 'recently', 'commonplace', 'owned', 'allowing', 'stroll', 'county', 'caring', 'gentlemanlike', 'moderately', 'hayters', 'slave', '_just_', 'preserve', 'conjecture', 'opportunities', 'pursuit', 'serve', ';"--', '_my_', 'deplorable', 'refinement', 'justify', 'esq', 'rules', 'fixing', 'applying', 'examined', 'recommending', 'fondly', 'noticing', 'candles', 'boiled', 'averted', 'contrive', 'source', 'comprehending', 'wretchedness', 'crowded', 'considers', 'assembled', 'confessing', 'park', 'fellows', 'reflect', 'abilities', 'qualified', 'benevolent', 
'quarters', 'capital', 'comforted', 'poetry', 'friday', 'irresolute', 'excused', 'rightly', 'punishment', 'announcing', 'inequality', 'lawn', 'delusion', 'disdain', 'dalrymples', 'solemn', 'folding', 'apprehend', 'hodges', 'trembling', 'restraints', 'prevail', 'explore', 'sucklings', 'interview', 'representation', 'favourably', 'pretensions', 'committed', 'freedom', 'reluctance', 'satisfactory', 'custom', 'pays', 'partiality', 'gratifying', 'perplexity', 'airing', 'baldwin', 'row', 'wrapt', 'objections', 'matches', 'handwriting', 'artificial', 'indispensable', 'supposition', 'mortifying', 'protested', 'conceived', 'impressions', 'gain', 'footpath', 'politely', 'started', 'enquiry', 'relationship', 'repeating', 'hospitable', 'accomplishments', 'deaf', 'esteemed', 'respected', 'series', 'collect', 'fruit', 'adding', 'gilbert', 'cheer', 'page', 'tranquil', 'pleasantest', 'letting', 'topic', 'decorum', 'inevitably', 'complaints', 'engrossed', 'connection', 'gown', 'insult', 'concurrence', 'ii', 'observant', 'characters', 'breaking', 'basin', 'limited', 'claimed', 'balls', 'designs', 'pushed', 'dinners', 'separated', 'bearing', 'counsel', 'securing', 'rivet', 'detail', 'selfish', 'topics', 'allusion', 'whoever', 'ridiculous', 'thanked', 'privy', 'contempt', 'subsequent', 'hamilton', 'entertain', 'liberality', 'oblige', 'ushered', 'process', 'asserted', 'flutter', 'curricle', 'comparing', 'shrubberies', 'date', 'reward', 'arrange', 'george', 'openness', 'penetration', 'meadows', 'succeed', 'suppressed', 'jump', 'producing', 'deemed', 'reckoned', 'measures', 'amidst', 'provide', 'ceremonious', 'speculation', 'miserably', 'grieving', 'ungracious', 'livery', 'denial', 'formidable', 'rivers', 'liveliness', 'perturbation', 'negative', 'kitty', 'frozen', 'mutually', 'retrench', 'fairy', 'copied', 'unsuitable', '_the_', 'blank', 'dressing', 'calculations', 'unattended', 'humourist', 'supplying', 'requisite', 'hesitatingly', 'deplore', 'ingenuity', 'unnoticed', 'clearly', 
'_is_', 'extravagant', 'unaccountable', 'chuses', 'witnessing', 'homes', 'unpardonable', 'unkind', 'maintenance', 'bewildered', 'grandpapa', 'freshness', 'sources', 'ardent', 'heroism', 'apothecary', 'coxcomb', 'using', 'shamefully', 'sequel', 'smoothness', 'probabilities', 'declares', 'moral', 'discover', 'abrupt', 'behaving', 'advisable', 'rencontre', 'condolence', 'overheard', 'poverty', 'shilling', 'reappeared', 'youthful', 'airs', 'pert', 'climate', 'ballroom', 'creditable', 'perrys', 'fearless', 'obscurity', 'mortifications', 'device', 'urgent', 'breathed', 'stepping', 'plants', 'discreet', 'pointing', 'attentively', 'withdrawn', 'soothing', 'renewal', 'privileges', 'tenderest', 'augusta', '10', 'accidental', 'dutiful', 'requiring', 'longest', 'nursing', 'shoes', 'firmly', 'floor', 'gratefully', 'neglecting', 'preceding', 'pitiable', 'truths', 'orders', '_your_', '_will_', 'shore', 'bitterly', 'colds', 'procuring', 'welcoming', 'articles', 'unfairly', 'mutton', 'proposition', 'naming', 'assuring', 'loin', 'dropped', 'cleverer', 'longing', 'fetching', 'shrubbery', 'feelingly', 'club', 'wainscot', 'pronounced', 'calmly', ',--"', 'disordered', 'poignant', 'pushing', 'happens', 'judicious', 'seats', 'measles', 'pitied', 'trusting', 'feet', 'reality', 'milsom', 'phoo', 'solitude', 'seemingly', 'partridge', 'irritated', 'inquiring', ':"', 'kinder', ':--"', 'x', 'bought', 'stept', 'beneath', 'descriptions', 'preserved', 'inconstancy', 'sands', 'forest', 'consequences', 'considerably', 'failure', 'entangled', 'viii', 'promote', 'july', 'premature', 'disgraced', 'prime', 'wear', 'examining', 'nephews', 'grows', 'disappointments', 'solicitudes', 'comprehension', 'thoughtless', 'lessons', 'affectionately', 'forlorn', 'minutiae', 'relenting', 'exposed', 'shameful', 'insolent', 'blameless', 'elegantly', 'saved', 'individual', 'acquittal', 'walker', 'occupying', 'loudly', 'overhearing', 'stile', 'horseback', 'ascertained', 'nut', 'outlived', 'insensibility', 'amount', 
'reports', 'gravel', 'serle', 'indisposed', 'proposing', 'universally', 'providence', 'playful', 'infection', 'robinson', 'confinement', 'recall', 'shabby', 'sole', 'nonsensical', 'overpowering', ');', 'deeper', 'usually', 'genteel', 'narration', 'pursued', 'glances', 'vigorously', 'lists', 'newspapers', 'cases', 'dissipated', 'shelter', 'gibraltar', 'residing', 'intimates', 'saving', 'astray', 'affronted', 'boasted', 'unfair', 'delicious', 'stretching', 'situated', 'absenting', 'unqualified', 'earnestness', 'rejoined', 'covered', 'terror', 'cheap', 'conveniently', 'preventing', 'apprehensions', 'females', 'reigns', "'--", 'perpetually', 'hardship', 'grieve', 'unheard', 'bordered', 'amazing', 'unpretending', 'apart', 'counteract', 'driven', 'harmless', 'occurrence', 'intervals', 'iv', 'misunderstandings', 'asparagus', 'gowland', 'prettily', 'creating', 'preferring', 'musician', 'loveliness', 'harmony', 'xi', 'charmingly', 'social', 'nearest', 'voluntary', 'ships', 'destination', 'fastidious', 'detected', 'inducement', 'joyful', 'local', 'confirmation', 'compatible', 'rained', 'shrink', 'explains', 'excessive', 'projected', '_now_', 'riddle', 'ribbon', 'declaring', 'unwillingness', 'hesitating', 'estimation', 'october', 'persevered', 'disclosure', 'teachers', 'moderation', 'tidings', 'describing', 'trivial', 'reverie', 'intimately', 'smaller', 'services', 'august', 'owing', 'sins', 'dowager', 'viscountess', 'dealings', 'modes', 'augur', 'restrictions', 'ruined', 'practised', 'v', 'xiv', 'shawl', 'considerate', 'penetrating', 'staircase', 'finally', 'economy', 'employing', 'persisting', 'correspondence', 'helpless', 'romance', 'storm', 'enquiring', 'maintain', 'piano', 'flower', 'remove', 'efficacy', 'proofs', 'clearing', 'treat', 'divide', 'genuine', 'expenses', 'regretting', 'parent', 'calmer', 'enjoyments', 'spoiled', 'observance', 'examination', 'recur', 'branch', 'vast', 'misunderstood', 'alteration', 'denied', 'suspicious', 'dreaded', 'quivering', 'lip', 
'changing', 'contemplate', 'handsomest', 'hereabouts', 'hedges', 'complacency', 'disappoint', 'principals', 'meal', 'noticed', 'unknown', 'agitations', 'respecting', 'hardened', 'waiter', 'pointedly', 'softness', 'seize', 'shropshire', 'revived', 'reluctant', 'mix', '_his_', 'presumptive', 'resigned', 'requested', 'extravagance', 'bonnet', 'wallises', 'gratify', 'limits', 'inquired', 'sixty', 'impulse', 'likelihood', 'ix', 'grievance', 'quicker', 'animating', 'silenced', 'encumbrance', 'affectedly', 'string', 'sketch', 'clearer', 'umbrellas', 'clearness', 'stupidity', 'dulness', 'airy', 'forms', 'foresaw', 'elevate', 'humouredly', 'curacy', 'readiness', 'advanced', 'doubtingly', 'basil', 'felicities', 'iii', 'sentences', 'heavens', 'delivered', 'incommoded', 'congratulated', 'unite', 'unmanageable', 'naivete', 'eloquent', 'persuadable', 'bows', 'interests', 'energy', 'occasional', 'preparatory', 'jumped', 'suddenness', 'glass', 'convictions', 'unquestionably', 'pondered', 'consoling', 'furnished', 'attachments', 'standard', 'disgusting', 'novelty', 'swisserland', 'associate', 'doubly', 'omission', 'stable', 'xii', 'gifted', 'attractions', 'virtues', 'popularity', 'thanking', 'yard', 'expressive', 'vi', 'retract', 'whims', 'vex', 'accomplishment', 'needless', 'departure', 'xiii', 'seized', 'scholar', 'boarder', 'limbs', 'heightened', 'protection', 'including', 'xvii', 'arch', 'artist', 'compliance', 'suspension', 'tones', 'demure', 'remonstrance', 'venturing', 'charms', 'indignant', 'contrast', 'yielding', 'cured', 'comprehended', 'conversable', 'usage', 'composedly', 'wickedness', 'cow', 'prominent', 'regularly', 'yarmouth', 'absurdity', 'snowing', 'foresee', 'viewed', 'exerting', 'concluding', 'accidentally', 'cooler', 'charmed', 'lame', 'convincing', 'undertakes', 'sharp', 'fireside', 'instinctively', '_must_', 'examples', 'divisions', 'luxurious', 'connect', 'renewing', 'animate', 'luckiest', 'stories', 'proportion', 'soften', 'contradict', 'palpably', 
'visible', 'humiliation', 'owes', 'changes', 'draper', 'involving', 'eternal', 'privations', 'appointed', 'proving', 'quickest', 'pretended', 'vacant', 'lamenting', '_all_', 'contradiction', 'fortunately', 'rode', 'noisy', 'placing', 'attractive', 'hating', 'disorder', 'plays', 'striving', 'awful', 'guidance', 'immense', 'flying', 'laura', 'tolerate', 'destined', '_to_', '_more_', 'partake', 'affords', 'associations', 'authorised', 'coffee', 'depending', 'whist', 'rarely', 'downstairs', 'accord', 'interruption', 'chatty', 'midsummer', 'achieved', 'trunk', '_', '_elton_', 'cart', 'communicating', 'pained', 'dears', 'adieus', 'inconstant', 'indulging', 'speculations', 'frigate', 'understands', 'discipline', 'deduction', 'unsuspected', 'physician', 'atmosphere', 'arisen', 'accompanying', 'foreign', 'warfare', 'newspaper', 'perceptible', 'simpleton', 'untainted', 'languor', 'dispel', 'gloomy', 'extenuation', 'hind', 'freshened', 'easier', 'surrounding', 'tacitly', 'innocently', 'lengths', 'conundrum', 'repulsive', 'attract', 'eleven', 'fling', 'faster', 'clownish', 'joining', 'publications', 'scrape', 'hue', 'represent', 'recalled', 'permanently', 'dignified', 'graciousness', 'fagged', 'acknowledgement', 'schoolfellow', 'reduced', 'monarch', 'infatuation', 'failings', 'upstart', 'pretension', 'prescribed', 'instrumental', 'thankfulness', 'lowering', 'supports', 'involve', 'secrecy', 'connecting', 'parentage', 'revealed', 'puppy', 'fruitless', 'contrivances', 'representing', 'exposing', 'derive', 'deprecated', 'studiously', 'testify', 'varying', 'insinuating', 'afloat', 'stock', 'realised', 'preserves', 'unconsciously', 'attributing', 'summon', 'ridden', 'poorly', 'gout', '000', 'glorious', 'green', 'brown', 'richly', 'sweetly', 'serviceable', 'expedients', 'pleasantness', 'borrowed', 'scissors', 'appropriated', 'powered', 'falsehood', 'yellow', 'admires', 'muffin', 'removals', 'inconsistent', 'ajar', 'knitting', 'overpower', 'preceded', 'durable', 'finances', 
'overtaken', 'huswife', 'apologise', 'pages', 'indignantly', 'disengage', 'elevation', 'desert', 'exchanged', 'estrangement', '_little_', 'honestly', 'embarrassments', 'cruelty', 'impropriety', 'equipped', 'strictly', 'imparted', 'hinting', 'arguments', 'completion', 'tied', 'contentment', 'enable', 'pacing', 'endeavouring', 'balance', 'salted', 'discerning', 'unfelt', 'sour', 'asleep', 'culture', 'earn', 'final', 'unconvinced', 'portraits', 'satin', 'overthrow', 'parishes', 'indescribable', 'incomprehensible', 'strangest', 'brighter', 'shy', 'impediment', 'amuses', 'torment', 'polished', 'ceases', 'playfulness', 'definition', 'orchestra', 'clerks', 'stays', 'nicely', 'fried', 'trimmed', 'seeking', 'abode', 'contriving', 'fearfully', 'intellectual', 'sign', 'clifton', 'closing', 'conjugal', 'states', 'fainted', '_courtship_', 'advised', 'reasoned', 'wittier', 'urged', 'flew', 'forwards', 'disagreement', 'affording', 'purchased', 'kindled', 'fatal', 'charmouth', 'cliffs', 'romantic', 'expedition', 'incumbent', 'sly', 'curtailed', 'expediency', 'grandson', 'regulations', 'reductions', 'disapprobation', 'honourably', 'tunbridge', 'dated', 'writer', 'medium', 'interfering', 'softening', 'lingering', 'courteous', 'undue', 'secondly', 'results', 'reasonably', 'lodged', 'likenesses', 'hetty', 'prized', 'singular', 'speedy', 'brigden', 'examine', 'audible', 'fetched', 'foolishly', 'conjectures', 'abused', 'follies', 'wilful', 'subduing', 'sharing', 'sobering', 'remind', 'contrition', 'flight', 'train', 'expressly', 'materials', 'glowed', 'concession', 'staring', 'extended', 'hero', 'pew', 'favouring', 'witty', 'mischievous', 'construction', 'haue', 'ham', 'caesar', 'brutus', 'bru', 'vs', 'selfe', 'thee', 'loue', 'vpon', 'heere', 'cassi', 'hor', 'hamlet', 'hath', 'giue', 'cassius', 'speake', 'antony', 'ile', 'th', 'vp', 'heare', 'doe', 'thinke', 'qu', 'looke', 'ophe', 'ant', 'feare', 'laer', 'downe', 'againe', 'heauen', 'pol', 'hee', 'leaue', 'rosin', 'owne', 'exeunt', 
'queene', 'euen', 'polon', 'neuer', 'horatio', 'caes', 'hast', 'rome', 'marke', 'gods', 'liue', 'euery', 'beare', 'caesars', 'wee', 'himselfe', 'laertes', 'brut', 'caska', 'cask', 'soule', 'mar', 'deere', 'finde', 'cinna', 'meanes', 'sonne', 'ophelia', 'luc', 'lucius', 'poore', 'ghost', 'sword', 'seene', 'euer', 'selues', 'vse', 'keepe', 'clo', 'octauius', 'titinius', 'messala', 'beleeue', 'cas', 'octa', 'players', 'faire', 'bee', 'messa', 'polonius', 'vertue', 'guild', 'meane', 'sleepe', 'osr', 'worke', 'roman', 'vnto', 'backe', 'lye', 'decius', 'crowne', 'guildensterne', 'farre', 'denmarke', 'capitoll', 'madnesse', 'dye', 'thine', 'goe', 'kill', 'yong', 'lucillius', 'betweene', 'beene', 'honor', 'por', 'sicke', 'winde', 'minde', 'walke', 'sayes', 'fortinbras', 'mee', 'eares', 'romans', 'ayre', 'ouer', 'forme', 'eare', 'seeme', 'wilt', 'lookes', 'caius', 'dost', 'onely', 'loues', 'ho', 'kin', 'graue', 'meete', 'foule', 'reuenge', 'reynol', 'greefe', 'generall', 'helpe', 'newes', 'neere', 'ore', 'heauens', 'ioy', 'kinde', 'cato', 'flye', 'oft', 'metellus', 'businesse', 'drinke', 'ser', 'giuen', 'beares', 'voyce', 'pindarus', 'thrice', 'breake', 'murther', 'verie', 'gertrude', 'tooke', 'vnder', 'cicero', 'bin', 'villaine', 'turne', 'knowne', 'twere', 'giues', 'gaue', 'hell', 'portia', 'certaine', 'rosincrance', 'lesse', 'themselues', 'iudgement', 'seeke', 'dayes', 'sweare', 'marcellus', 'tane', 'beseech', 'cymber', 'guil', 'thankes', 'weepe', 'armes', 'talke', 'receiue', 'publius', 'valiant', 'lou', 'soules', 'gho', 'barnardo', 'ligarius', 'flourish', 'strato', 'trebonius', 'thanke', 'shalt', 'barn', 'poyson', 'begge', 'dreame', 'enterprize', 'weare', 'passe', 'knaue', 'halfe', 'growne', 'dreadfull', 'funerall', 'withall', 'teares', 'army', 'hora', 'cass', 'hearke', 'whil', 'ranke', 'shewes', 'hoe', 'loose', 'fat', 'volumnius', 'maiesty', 'bloody', 'stirre', 'prythee', 'beast', 'custome', 'seemes', 'braue', 'peece', 'fran', 'philippi', 'mou', 'deare', 'appeare', 
'senate', 'aboue', 'ple', 'weake', 'comming', 'musicke', 'speakes', 'deliuer', 'confesse', 'houre', 'arme', 'vile', 'wherein', 'alarum', 'dagger', 'dane', 'doore', 'teare', 'proofe', 'mettle', 'wits', 'diuell', 'senators', 'slaue', 'countrymen', 'yeare', 'seuerall', 'lordship', 'sirs', 'actus', 'perceiue', 'seruice', 'wherefore', 'deci', 'lucil', 'thunder', 'obey', 'durst', 'blacke', 'seruant', 'writ', 'lacke', 'burne', 'leade', 'maiestie', 'saue', 'claudio', 'bones', 'moue', 'sodaine', 'scull', 'audience', 'twas', 'redresse', 'fauour', 'noyse', 'drowne', 'rites', 'buriall', 'obserue', 'quicke', 'norwey', 'preuent', 'iudge', 'anon', 'neede', 'manet', 'behinde', 'beware', 'yea', 'pompeyes', 'thinkes', 'wisedome', 'lepidus', 'resolu', 'fortunes', 'fie', 'wil', 'doo', 'cocke', 'fooles', 'presse', 'dumbe', 'roome', 'palme', 'alexander', 'ne', 'fellowes', 'suite', 'calphurnia', 'greefes', 'y', 'iust', 'thicke', 'dreames', 'soldiers', 'perchance', 'flauius', 'wholsome', 'mothers', 'content', 'strooke', 'tragedie', 'answere', 'sunne', 'france', 'crosse', 'hoa', 'whilst', 'shout', 'starre', 'pyrrhus', 'hamlets', 'honors', 'cursed', 'warre', 'deeds', 'mens', 'damned', 'reade', 'toward', 'knowes', 'kingdome', 'foole', 'asse', 'vowes', 'vnderstand', 'heauy', 'antonio', 'ist', 'yeares', 'monstrous', 'visage', 'nunnery', 'pulpit', 'stole', 'whereto', 'turnes', 'mindes', 'clocke', 'cruell', 'aliue', 'coward', 'ides', 'ene', 'tit', 'statue', 'liuing', 'yee', 'gowne', 'royall', 'prison', 'tent', 'yeeld', 'sleepes', 'natiue', 'freedome', 'proclaime', 'daggers', 'purposes', 'fla', 'fiery', 'walkes', 'aske', 'slaine', 'distracted', 'clit', 'dyes', 'cai', 'mur', 'tend', 'returne', 'fearefull', 'reueng', 'titin', 'naturall', 'iulius', 'bene', 'keepes', 'cob', 'distemper', 'seale', 'amisse', 'deepe', 'flowers', 'honestie', 'loe', 'var', 'shee', 'steele', 'noblest', 'sings', 'bosome', 'serues', 'locke', 'indeede', 'heerein', 'stra', 'gallowes', 'seru', 'vnfold', 'starres', 'haire', 
'marcus', 'foorth', 'cries', 'followes', 'modestie', 'wounds', 'liues', 'gonzago', 'rul', 'beasts', 'gainst', 'vnkle', 'sirra', 'rash', 'spade', 'danish', 'louing', 'falles', 'sooth', 'mans', 'saide', 'feast', 'pompey', 'doores', 'mortall', 'closes', 'goodnight', 'sicknesse', 'therein', 'legions', 'beard', 'hauing', 'soone', 'pit', 'neyther', 'wide', 'recouer', 'dutie', 'clitus', 'motiue', 'traitors', 'publike', 'heeles', 'braine', 'extasie', 'ambassadors', 'valour', 'battaile', 'elsonower', 'successe', 'iephta', 'findes', 'blesse', 'choyce', 'feede', 'repaire', 'trickes', 'plaine', 'mouse', 'osricke', 'alarums', 'rosincrane', 'maine', 'proscription', 'affayres', 'alacke', 'meere', 'peepe', 'weigh', 'subiect', 'lippes', 'grone', 'lep', 'calles', 'nony', 'push', 'runne', 'leysure', 'desperate', 'tearmes', 'traine', 'cheeke', 'aduice', 'dard', 'souldier', 'cals', 'pricke', 'knocke', 'cin', 'contriue', 'pitty', 'breefely', 'glasse', 'seruants', 'powres', 'norman', 'vice', 'bondman', 'ment', 'aboord', 'saile', 'demand', 'lightning', 'deceiu', 'meerely', 'haile', 'humbly', 'flood', 'voyage', 'ee', 'spundge', 'hercules', 'liu', 'fates', 'shooes', 'strew', 'beautie', 'speciall', 'promis', 'cornelius', 'tydings', 'terrible', 'villaines', 'incestuous', 'vnnaturall', 'treb', 'norway', 'vncle', 'heares', 'vnderstanding', 'sence', 'argall', 'rous', 'expresse', 'woo', 'eate', 'olympus', 'bloud', 'tame', 'greene', 'hits', 'prouidence', 'browes', 'stuffe', 'naked', 'cal', 'ayme', 'heereafter', 'shapes', 'hew', 'aduantage', 'humor', 'prophesie', 'ope', 'vtterance', 'limbes', 'strife', 'infants', 'deede', 'groaning', 'taper', 'bap', 'murder', 'braines', 'seuen', 'stones', 'weeping', 'swords', 'lyes', 'attendants', 'wayes', 'kneele', 'fled', 'faithfull', 'dar', 'belike', 'alwayes', 'rests', 'leane', 'reynoldo', 'fret', 'wanton', 'forc', 'acte', 'carpenter', 'bleed', 'ghosts', 'prou', 'closset', 'processe', 'readie', 'varrus', 'laughter', 'signe', 'playes', 'soueraigne', 'priest', 
'huge', 'wing', 'receiu', 'louer', 'angell', 'sate', 'refus', 'coniure', 'kissing', 'knees', 'yonder', 'vnlesse', 'reioyce', 'edge', 'corruption', 'sardis', 'pleas', 'loued', 'doest', 'throwne', 'necke', 'crimes', 'moone', 'generals', 'foote', 'princes', 'priuate', 'clowne', 'driue', 'hecuba', 'seal', 'slay', 'envenom', 'com', 'pate', 'quantitie', 'leaues', 'pluckt', 'maker', 'doomesday', 'philosophy', 'immortall', 'dogge', 'cowards', 'rage', 'harme', 'warlike', 'graues', 'moneths', 'growes', 'enuious', 'begger', 'compell', 'battell', 'pluck', 'wager', 'ceremonies', 'sixe', 'counsell', 'fixt', 'wonderfull', 'vow', 'siluer', 'faine', 'proue', 'neerer', 'childe', 'breed', 'hower', 'canst', 'conspirators', 'pastorall', 'historicall', 'graunt', 'suites', 'toe', 'pesant', 'courtier', 'galls', 'diadem', 'loines', 'trumpets', 'trumpet', 'drinkes', 'darke', 'rapier', 'mightie', 'tokens', 'moreouer', 'sutor', 'incorporate', 'ambitions', 'ladder', 'yeeres', 'honorable', 'firme', 'frighted', 'painted', 'sparkes', 'whereof', 'lt', 'wittingly', 'remaines', 'asleepe', 'cic', 'amaz', 'wouldest', 'loosing', 'greeke', 'dy', 'diuel', 'weaknesse', 'potent', 'gaming', 'mock', 'waxe', 'porch', 'iustly', 'lupercall', 'fits', 'grapple', 'billes', 'rises', 'deseru', 'memorie', 'fulfill', 'wheele', 'corpes', 'tending', 'split', 'vrge', 'feares', 'larded', 'importing', 'axe', 'masse', 'diuinity', 'acts', 'soyle', 'greatnesse', 'carue', 'bloodie', 'warres', 'conspiracie', 'girle', 'amaze', 'crew', 'lucianus', 'lap', 'mocke', 'stabb', 'lome', 'magots', 'pregnant', 'leape', 'schoole', 'hilts', 'cobler', 'driuen', 'morne', 'mantle', 'dew', 'yon', 'perils', 'conditions', 'vntill', 'iigge', 'knee', 'thrift', 'actor', 'remoue', 'tardie', 'greeue', 'seate', 'droppes', 'equall', 'chasticement', 'sworne', 'tents', 'maiden', 'courtiers', 'schollers', 'obseru', 'heauenly', 'canopy', 'appeares', 'pestilent', 'swore', 'streetes', 'vnckle', 'liued', 'somthing', 'pitteous', 'conuert', 'indifferently', 
'weary', 'booke', 'brauery', 'mountaines', 'vilde', 'praying', 'killes', 'battailes', 'muddy', 'rascall', 'damn', 'batchellor', 'bleede', 'iustice', 'dyest', 'spurre', 'deerely', 'fierie', 'scoene', 'vtter', 'imployment', 'wag', 'ape', 'iaw', 'load', 'dishonour', 'feete', 'volt', 'recorder', 'platforme', 'seuenty', 'fiue', 'drachmaes', 'conspirator', 'sinne', 'controuersie', 'tryall', 'sucke', 'necessitie', 'niggard', 'sweete', 'hastie', 'murderer', 'cynna', 'violets', 'torrent', 'garland', 'voltumand', 'prickt', 'signifie', 'choller', 'thrusting', 'element', 'wormes', 'mistris', 'resort', 'messengers', 'bastard', 'nephewes', 'suppresse', 'whale', 'poysoner', 'loath', 'awhile', 'peeuish', 'foe', 'tyrants', 'sham', 'sinke', 'wildenesse', 'wonted', 'winters', 'mountaine', 'blew', 'eternall', 'sparke', 'vndertake', 'ioynt', 'ioyes', 'chanc', 'plebeians', 'kisse', 'shed', 'foyles', 'odde', 'voltemand', 'clownes', 'vnbraced', 'iealous', 'recount', 'plots', 'camell', 'bonds', 'commoners', 'obseruance', 'bodie', 'fierce', 'ciuill', 'italy', 'confines', 'hauocke', 'carrion', 'burnes', 'winne', 'formall', 'bury', 'euill', 'sometime', 'knauish', 'reades', 'soothsayer', 'murellus', 'heate', 'perillous', 'pronounc', 'mutiny', 'enemie', 'lyons', 'offall', 'progresse', 'weapons', 'forehead', 'ordinance', 'heau', 'sourse', 'denmark', 'wake', 'gouerne', 'tweene', 'lyon', 'throate', 'prick', 'torches', 'ioyn', 'comedie', 'whiles', 'wilde', 'octauio', 'armour', 'yond', 'theame', 'naught', 'passions', 'construe', 'blest', 'calp', 'shooke', 'drum', 'venome', 'pind', 'secrecie', 'apparition', 'cloake', 'griefe', 'birds', 'qualitie', 'natures', 'spake', 'cup', 'fye', 'sawcy', 'mistrust', 'forgiue', 'arras', 'liest', 'barke', 'slaues', 'oathes', 'scandall', 'popil', 'aduancement', 'imports', 'strangely', 'arrant', 'twelue', 'passionate', 'ros', 'plac', 'falne', 'shrunke', 'dec', 'moued', 'lust', 'celestiall', 'prey', 'vttered', 'thriue', 'straine', 'hearers', 'mischeefe', 'mars', 
'yesternight', 'doublet', 'tyber', 'shores', 'metel', 'slew', 'lookt', 'vnseene', 'plucke', 'beside', 'maiesties', 'merrie', 'iot', 'cap', 'romane', 'speechlesse', 'quarrell', 'ifaith', 'inobled', 'craft', 'assur', 'scope', 'truely', 'whatsoeuer', 'primus', 'puh', 'wormwood', 'lowe', 'easinesse', 'sorrie', 'cæsar', 'iealousie', 'mess', 'chide', 'whereon', 'fee', 'poleak', 'pole', 'enact', 'replication', '].', 'els', 'sticke', 'sufferance', 'windowes', 'beating', 'rogue', 'drift', 'treacherous', 'sober', 'fals', 'attaine', 'rew', 'kil', 'millions', 'burning', 'cheere', 'discomfort', 'womens', 'actors', 'crowes', 'election', 'discouer', 'swet', 'grownd', 'wauing', 'brands', 'harlot', 'itching', 'mart', 'windes', 'roughly', 'twixt', 'twaine', 'byrlady', 'hoby', 'courtesie', 'tenders', 'shell', 'cryed', 'rend', 'brooke', 'streame', 'liberall', 'weeds', 'spred', 'pul', 'guts', 'farwell', 'pious', 'popillius', 'enuy', 'disclos', 'stretcht', 'gray', 'beards', 'witnesse', 'serue', 'vantage', 'witchcraft', 'grosse', 'charme', 'foure', 'houres', 'crack', 'pith', 'ancestors', 'belou', 'spectacle', 'peeces', 'terme', 'anticke', 'deckt', 'steale', 'cannon', 'hangers', 'moneth', 'yoake', 'couch', 'hounds', 'shal', 'accidentall', 'brest', 'termes', 'pastime', 'wel', 'auoyd', 'whet', 'serpent', 'storme', 'region', 'nightly', 'toyles', 'affrighted', 'clap', 'blasted', 'iudgements', 'forgetfull', 'stab', 'breefe', 'liege', 'shrewdly', 'mutes', 'saies', 'strawes', 'assay', 'gall', 'blowne', 'heauie', 'purging', 'worthinesse', 'vnderneath', 'builds', 'mason', 'shipwright', 'shold', 'ensigne', 'whit', 'distract', 'rapiers', 'bondage', 'visitation', 'flint', 'parchment', 'skinnes', 'tride', 'ripe', 'ioyne', 'flash', 'rotten', 'horrid', 'bleeding', 'butchers', 'priam', 'imperiall', 'blowes', 'vnknowne', 'councell', 'rebellious', 'louers', 'wast', 'kites', 'cerimony', 'fantasie', 'pawse', 'trash', 'scourge', 'speede', 'toy', 'guildenstern', 'stomacke', 'mourn', 'tyrant', 'griefes', 
'pittie', 'comicall', 'tragicall', 'beguile', 'buzze', 'strucke', 'mooue', 'fauours', 'keene', 'interim', 'testament', 'dardanius', 'stoope', 'bribes', 'ophel', 'passeth', 'trappings', 'carde', 'equiuocation', 'vndoe', 'picked', 'kibe', 'threatning', 'bisson', 'rheume', 'clout', 'lanke', 'teamed', 'blanket', 'villany', 'cups', 'kettle', 'cannoneer', 'cannons', 'cauerne', 'maske', 'antike', 'heraulds', 'shelfe', 'coynage', 'bodilesse', 'drownes', 'blench', 'ioyfully', 'lowlynesse', 'climber', 'vpward', 'attaines', 'vpmost', 'scorning', 'ascend', 'til', 'priests', 'stayes', 'physicke', 'prolongs', 'dispos', 'seduc', 'israel', 'greekes', 'madman', 'lyest', 'skies', 'vnnumbred', 'vnknowing', 'circumscrib', 'yeelding', 'bargaine', 'iuggel', 'rant', 'offendendo', 'argues', 'ro', 'rebellion', 'gyant', 'meate', 'earnes', 'southerly', 'hawke', 'handsaw', 'apron', 'shouted', 'trap', 'melancholly', 'abuses', 'damne', 'menace', 'pretors', 'chayre', 'hoorded', 'extorted', 'wombe', 'woodcocke', 'sprindge', 'treacherie', 'kingly', 'beauer', 'cum', 'alijs', 'sect', 'boorded', 'cleare', 'shippe', 'outlarie', 'smelt', 'yorick', 'iest', 'gorge', 'vntrod', 'validitie', 'fruite', 'vnripe', 'stickes', 'vnshaken', 'mellow', 'purpled', 'reeke', 'smoake', 'speedier', 'smels', 'primall', 'marrie', 'bang', 'proceede', 'sutors', 'accoutred', 'recame', 'offends', 'robustious', 'pery', 'wig', 'pated', 'tatters', 'ragges', 'groundlings', 'capeable', 'inexplicable', 'whipt', 'termagant', 'outherod', 'herod', 'petitions', 'cabin', 'scarft', 'grop', 'withdrew', 'vnseale', 'knauery', 'denmarks', 'englands', 'hoo', 'bugges', 'goblins', 'superuize', 'leasure', 'bated', 'grinding', 'foolerie', 'digest', 'venom', 'spleene', 'scholler', 'cautell', 'besmerch', 'vnuallued', 'sanctity', 'iumpe', 'polake', 'arriued', 'affabilitie', 'erebus', 'dimme', 'preuention', 'feauer', 'spaine', 'lustre', 'bookes', 'feeble', 'maiesticke', 'sorrowes', 'spies', 'battalians', 'ladie', 'insupportable', 'losse', 'quake', 
'returneth', 'conuerted', 'stopp', 'beere', 'barrell', 'quintus', 'reueale', 'ouerlook', 'fac', 'raines', 'doue', 'tarry', 'prethee', 'whilest', 'maisters', 'amb', 'cimber', 'preferre', 'calender', 'challenger', 'chidden', 'ferret', 'crost', 'oration', 'russet', 'clad', 'easterne', 'abler', 'oct', 'signall', 'flashes', 'rore', 'baudry', 'barre', 'libertie', 'candied', 'pompe', 'crooke', 'hindges', 'faining', 'rerule', 'goodman', 'deluer', 'salutation', 'beckens', 'rossius', 'fishmonger', 'bait', 'falshood', 'windlesses', 'assaies', 'indirections', 'tyrannie', 'muddied', 'vnwholsome', 'whispers', 'greenly', 'hugger', 'mugger', 'interre', 'vnskilfull', 'iudicious', 'reway', 'theater', 'globe', 'ruddy', 'buffets', 'nunnerie', 'showts', 'clamors', 'maintains', 'nutshell', 'sodainely', 'pub', 'plucking', 'intrailes', 'feele', 'infaith', 'mood', 'scanter', 'entreatments', 'expectansie', 'mould', 'obseruers', 'inuites', 'discouery', 'secricie', 'moult', 'feather', 'forgone', 'sterrill', 'promontory', 'maiesticall', 'roofe', 'fretted', 'golden', 'congregation', 'vapours', 'drawne', 'heape', 'gastly', 'mowes', 'ducates', 'sterne', 'exits', 'saint', 'patricke', 'remaster', 'reform', 'arriu', 'tenure', 'cutpurse', 'empire', 'brau', 'con', 'roate', 'sayst', 'tut', 'bosomes']
def unique_counts(text, unique_voc):
    """Count how often each vocabulary word occurs in a tokenised text.

    Args:
        text: iterable of string tokens (for example, one sentence).
        unique_voc: iterable of vocabulary words to look up.

    Returns:
        A dict mapping every word in ``unique_voc`` to the number of times
        it occurs in ``text``; words that do not occur map to 0.
    """
    # Tokens are lower-cased before counting. This assumes unique_voc holds
    # lower-cased word forms -- TODO confirm against where it is built.
    counts = Counter(word.lower() for word in text)
    # Counter returns 0 for missing keys, so no explicit membership test is
    # needed to give absent vocabulary words a zero count.
    return {word: counts[word] for word in unique_voc}
# Build per-sentence feature lists and numeric author labels.
#
# source: iterable of (sent, label) pairs.
# source_docs: passed through unchanged to pos_counts and suffix_counts.
# Returns the tuple (all_features, targets); the label "austen" is encoded
# as 0 and any other label as 1.
#
# NOTE(review): this listing has no indentation and several loop/branch
# bodies are absent. feature_list is never assigned and nothing is appended
# to all_features in the lines shown, so the function cannot run as written.
# Restore the missing statements from the original source before use.
def initialize_dataset(source, source_docs):
all_features = []
targets = []
for (sent, label) in source:
# Per-word counts for this sentence, looked up for each stop word below.
counts = word_counts(sent)
for word in STOP_WORDS:
# NOTE(review): the body of this membership test is missing.
if word in counts.keys():
# Single feature from proportion_words over the stop-word list.
feature_list.append(proportion_words(sent, STOP_WORDS))
p_counts = pos_counts(sent, source_docs, pos_list)
# NOTE(review): loop body missing -- nothing is done with each pos key.
for pos in p_counts.keys():
s_counts = suffix_counts(sent, source_docs, selected_suffixes)
# NOTE(review): loop body missing -- nothing is done with each suffix key.
for suffix in s_counts.keys():
u_counts = unique_counts(sent, unique_voc)
# NOTE(review): loop body missing -- nothing is done with each vocabulary word.
for word in u_counts.keys():
# Encode the author label as a binary target.
if label=="austen": targets.append(0)
else: targets.append(1)
return all_features, targets
13414 13414 3354 3354 6906 6906 0.9633273703041145 [[2237 63] [ 60 994]] precision recall f1-score support 0 0.97 0.97 0.97 2300 1 0.94 0.94 0.94 1054 accuracy 0.96 3354 macro avg 0.96 0.96 0.96 3354 weighted avg 0.96 0.96 0.96 3354 0.9643788010425717 [[4881 118] [ 128 1779]] precision recall f1-score support 0 0.97 0.98 0.98 4999 1 0.94 0.93 0.94 1907 accuracy 0.96 6906 macro avg 0.96 0.95 0.96 6906 weighted avg 0.96 0.96 0.96 6906
Our final and best result: $0.95$-$0.96$ on the pretest and $0.95$-$0.96$ on the test set – i.e., almost identical! What is more, the performance on both classes, majority as well as minority, is now also almost identical – F1 of $0.97$ and $0.94$ on pretest, and $0.97$ and $0.92$ on the test set.
# Accuracy scores per feature set, one value per tick label below.
# Use the last accuracy score obtained as the last value of each tuple.
pretestAcc = (96.36, 79.72, 81.22, 83.10, 95.47, 96.27)
testAcc = (89.57, 80.49, 81.83, 82.54, 95.34, 95.82)

ind = np.arange(len(pretestAcc))  # the x locations for the groups
width = 0.3  # the width of the bars

fig, ax = plt.subplots()
# Draw the two series side by side: each bar is shifted half a bar width
# to the left/right of its group location.
rects1 = ax.bar(ind - width/2, pretestAcc, width, label='Pretest', color='#61A4F6')
rects2 = ax.bar(ind + width/2, testAcc, width, label='Test', color='#DB025B')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy scores')
ax.set_title('Scores by feature set and data set')
# Fix one tick per group before naming the ticks; set_xticklabels alone
# would attach the labels to whatever ticks the default locator picked.
ax.set_xticks(ind)
ax.set_xticklabels(('Benchmark', 'F1-2', 'F1-4', 'F1-5', 'F1-6', 'F1-7'))