#!/usr/bin/env python # coding: utf-8 # # Classifying Name Ethnicity with a Character level RNN # This is heavily modeled on the Pytorch tutorial: # https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html # # We use fastai libraries extensively to make dataloading and training easier # ### Download the Pytorch tutorial data # This is a list of surnames and their ethnicities # In[1]: #!wget https://download.pytorch.org/tutorial/data.zip # In[2]: #!unzip -o data.zip # ## Load the data # fastai import pandas and all sorts of other goodies # In[1]: from fastai import * from fastai.text import * # In[2]: from unidecode import unidecode import string # Reduce the ouput to 20 rows to prevent it from taking too much of the output. # In[3]: pd.options.display.max_rows = 20 # Read in the data; names for each language is in a separate file # In[5]: path = Path('data/names') # In[6]: get_ipython().system('ls {path}') # In[7]: get_ipython().system('head -n5 {path}/Arabic.txt') # In[8]: names = [] for p in path.glob('*.txt'): lang = p.name[:-4] with open(p) as f: names += [(lang, l.strip()) for l in f] df = pd.DataFrame(names, columns=['cl', 'name']) # ## Check the Data # It's always worth doing some sanity checks on your data (even supposedly clean tutorial data). # # No matter how good your model is: garbage in, garbage out. # In[9]: df.head() # In[10]: len(df) # ### Character Set # What letters outside of ASCII are in the names? # In[11]: foreign_chars = Counter(_ for _ in ''.join(list(df.name)) if _ not in string.ascii_letters) foreign_chars.most_common() # A few of these look suspicious. (Note the use of a regular expression in `contains` to check each of the characters) # In[12]: suss_chars = [':', '/', '\xa0', ',', '1'] df[df.name.str.contains('|'.join(suss_chars))] # Most of these look like legitimate names with extra junk (except 'To The First Page'). # Since it's so few names it's easiest just to drop them. # In[13]: df = df[~df.name.str.contains('|'.join(suss_chars))] # Single quotes and spaces are common # In[14]: df[df.name.str.contains("'| ")] # Since hyphens mainly join multiple last names (and are pretty rare) we won't lose heaps by dropping them. # In[15]: df[df.name.str.contains('-')] # In[16]: df = df[~df.name.str.contains('-')] # ### Normalising non-ASCII Characters # Let's normalise all non-ASCII characters to ASCII equivalents. # # This makes our classification problem harder in practice: any names containing a ß are almost surely German, wheras "ss" could occur in many language. It also reduces the set of characters we need to represent our language. # In[18]: df['ascii_name'] = df.name.apply(unidecode) df[df.name != df.ascii_name] # Let's check case: I expect names to be in CamelCase. # # These seem to be mistakes. # In[20]: df[~df.ascii_name.str.contains("^[A-Z][^A-Z]*(?:[' -][A-Z][^A-Z]*)*$")] # In[21]: df = df[df.ascii_name.str.contains("^[A-Z][^A-Z]*(?:[' -][A-Z][^A-Z]*)*$")] # Let's lowercase the ascii_names # In[22]: df['ascii_name'] = df.ascii_name.str.lower() # Make a check we've normalised correctly. # In[24]: ascii_chars = Counter(''.join(list(df.ascii_name))) ascii_chars.most_common() # ### How many classes does each name have? # # In practice a surname could have multiple ethnicities, but we'd have to be really careful of how we use this in training. # # If we end up with e.g. 'Michel' as French in the training dataset, but German in the validation set our model has no hope of getting it right (and we may discard an actually good model). 
# # We could handle this by: # 1) Allowing multiple class labels # 2) Picking the country that the name most commonly associates to # 3) Dropping ambiguous cases # # Without any information about frequency we can't do (2) and (1) is a harder problem, so we'll stick to (3). # In[25]: name_classes = df.\ groupby('ascii_name').\ nunique().cl.sort_values(ascending=False) name_classes.head(20) # In[26]: df[df.name == 'Michel'] # 1 in 40 of our names have multiple classes (most of them do before normalisation too) # In[27]: len(name_classes), sum(name_classes > 1) / len(name_classes) # While some names like [Abel](https://en.wikipedia.org/wiki/Abel_(surname)) do seem to occur commonly in multiple countries, for example: # - [Adamson](https://en.wikipedia.org/wiki/Adamson_(surname)) is very unlikely to be Russian # - [Wong](https://en.wikipedia.org/wiki/Wong_(surname)) is much more prevalant in Chinese than English # - [Yang](https://en.wikipedia.org/wiki/Yang_(surname)) is very rare in English # # It seems like Korean and Chinese have a lot of overlap, as to English and Scottish. # While this makes some linguistic sense it will make it hard to make a reliable classifier. # # Note that most names only occur once; so we can't pick a "most common" frequency class. # In[35]: with pd.option_context('display.max_rows', 60): print(df[df.ascii_name.isin(name_classes[name_classes > 1].index)].groupby(['ascii_name', 'cl']).count()) # Rather than finding the "right" ethnicity the easy thing to do is to remove all ambiguous cases. # In[36]: df = df[~df.ascii_name.isin(name_classes[name_classes > 1].index)] # ### How often do (class, name) pairs occur? # # We need exactly one row per pair; if separate copies appear in the training and validation set we'll get a higher validation accuracy than is reasonable. # Some names occur very frequently. # In[37]: counts = df.assign(n=1).groupby(['ascii_name', 'cl']).count().sort_values('n', ascending=False) counts.head(n=20) # Let's remove the "To The First Page" junk (probably some artifact of where the data was scraped from) # In[39]: df = df[df.ascii_name != 'to the first page'] # There are no multiples in English, and a lot in Arabic. It seems like a data entry error rather than meaningful. # In[41]: counts.assign(multiple=counts.n > 1, rows=1).groupby('cl').sum().sort_values('n', ascending=False) # It makes sense to drop the duplicates and only have a single row per `ascii_name` and `cl`. # In[42]: df = df.drop_duplicates(['ascii_name', 'cl']) # In[43]: len(df) # ### Length Check # It's worth checking if the shortest and longest names make sense. # # They look reasonable. # In[44]: df.assign(len=df.name.str.len()).sort_values('len') # ### Distribution by Language # The dataset is very unbalanced. # # I doubt there's enough data to tacke Portuguese (which will be close to Spanish) and Scottish (which will be close to English) # In[45]: df.groupby('cl').name.count().sort_values(ascending=False) # In[46]: df[df.cl.isin(['Scottish'])] # Let's remove the rarest classes; we're not likely to have enough data to guess them. # In[47]: df = df[~df.cl.isin(['Scottish', 'Portuguese'])] # Note Russian contains variant transliterations to English like Abaimoff and Abaimov (which both correspond to Абаимов). # # But this doesn't quite explain it's high frequency: it seems a lot more Russian data was extracted. # # (Side note: [Chebyshev](https://en.wikipedia.org/wiki/Pafnuty_Chebyshev) can also be spelt e.g. 
Chebychev, Tchebycheff, Tschebyschef) # In[48]: df[df.cl == 'Russian'] # ### Create Validation and Training Sets # # We want our final model to work well on any language. # # But if we pick our validation set uniformly at random from the data we're likely to get many Russian names and not many Vietnamese names, which isn't a good test of this. # # So instead we'll take our validation set from an equal number from each subclass. # In[50]: df = df.reset_index().drop('index', 1) df # In[52]: counts = df.groupby('cl').name.count().sort_values(ascending=False) counts # In[54]: valid_size = 30 # We'll pick 30 at random from each subclass train_size = 500 # For a balanced training set we'll pick 500 at random with replacement # In[55]: np.random.seed(6011) valid_idx = [] for cl in counts.keys(): # Random sample of size "valid_size" for each class valid_idx += list(df[df.cl == cl].sample(valid_size).index) # In[56]: df['valid'] = False df.loc[valid_idx, 'valid'] = True # Let's also create a balanced training set as an alternative to using everything not in validation # In[57]: np.random.seed(7012) balanced_idx = [] for cl in counts.keys(): # Random sample of size "train_size" for each class from the data outside of the validation set balanced_idx += list(df[(df.cl == cl) & ~df.valid].sample(train_size, replace=True).index) # Note the balanced index contains all 25 (= 55 - 30) Vietnamese names outside of the training set, but only contains 486 of the Russian names (because we sampled randomly with replacement there will be a couple of double ups). # In[58]: df.loc[balanced_idx].groupby('cl').nunique().sort_values('ascii_name', ascending=False) # Let's record our balanced set in the dataframe: this will make it easy to reload at a later point. # In[59]: df['bal'] = 0 for k, v in Counter(balanced_idx).items(): df.loc[k, 'bal'] += v # In[60]: df.head() # We can always retrieve the indexes from the dataframe # In[61]: idx = [] for k, v in zip(df.index, df.bal): idx += [k]*v sorted(balanced_idx) == idx # ### Save the Data # In[62]: df.to_csv('names_clean.csv', index=False) # # Benchmarks # # The first benchmark is random guessing/always guessing the same class. # # The expected return is 1/(number of classes) = 1/16 ~ 6.25% # In[63]: df = pd.read_csv('names_clean.csv') valid_idx = df[df.valid].index train_idx = df[~df.valid].index bal_idx = [] for k, v in zip(df.index, df.bal): bal_idx += [k]*v # ### Sanity check out data # Check training/balanced training data doesn't contain any names in validation set # In[65]: train_intersect_valid = sum(df.iloc[train_idx].ascii_name.isin(df.iloc[valid_idx].ascii_name)) bal_interset_valid = sum(df.iloc[bal_idx].ascii_name.isin(df.iloc[valid_idx].ascii_name)) train_intersect_valid, bal_interset_valid # Make sure the data looks right # In[66]: df.iloc[train_idx].groupby('cl').nunique().sort_values('ascii_name', ascending=False) # In[67]: df.iloc[bal_idx].groupby('cl').nunique().sort_values('ascii_name', ascending=False) # In[68]: df.iloc[valid_idx].groupby('cl').nunique().sort_values('ascii_name', ascending=False) # Picking any one class in validation will give 1/16 = 6.25% # In[69]: (df[df.valid] == 'Korean').cl.sum() / df.valid.sum() # ## n-grams and naive Bayes # # A reasonable way to guess a language is by the frequency of characters and pairs of characters. # # For example 'cz' is very rare in English, but quite common in the slavic languages. 
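# As a quick sanity check of that claim (an illustrative cell, not in the original notebook), we can look at the fraction of names in each class that contain 'cz':

# In[ ]:
(df.assign(has_cz=df.ascii_name.str.contains('cz'))
   .groupby('cl')
   .has_cz
   .mean()
   .sort_values(ascending=False))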
# In[70]: name = 'zozrov' # A function to count the occurances of sequences of one, two or three letters (in general these sequences are called "n-grams" particularly when referring to sequences of words). # In[71]: def ngrams(s,n=1): parts = [s[i:] for i in range(n)] # e.g. ['zozrov', 'ozrov', 'zrov'] return Counter(''.join(_) for _ in zip(*parts)) ngrams(name, 1), ngrams(name, 2), ngrams(name, 3) # In[72]: df = df.assign(letters=df.ascii_name.apply(ngrams)) df = df.assign(bigrams=df.ascii_name.apply(ngrams, n=2)) df = df.assign(trigrams=df.ascii_name.apply(ngrams, n=3)) # In[73]: df.head() # Let's try to guess the name using [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier). # # TL;DR: This is a *really* simple model that works quite well and will give a good benchmark. # # This uses "Bayes Rule" which uses the data to answer questions like: "given the name contains the bigram 'ah' what's the probability it's Korean?". # # The "Naive" part means that that we assume all these probabilities are independent (knowing it contains 'ah' doesn't tell you anything about the fact it contains 'hn'). Even though this definitely isn't true, it's often a reasonable approximation. # # This makes it really fast and simple to fit a model and often works well. # In[82]: from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction import DictVectorizer vd1 = DictVectorizer(sparse=False) vd2 = DictVectorizer(sparse=False) vd3 = DictVectorizer(sparse=False) # In[75]: y = df.cl # In[83]: letters = vd1.fit_transform(df.letters) bigrams = vd2.fit_transform(df.bigrams) trigrams = vd3.fit_transform(df.trigrams) # The letters matrix contains the number of times each of the 28 letters occurs (e.g. number of spaces, number of apostrophes, number of 'a', ...). # In[89]: vd1.get_feature_names()[:10] # In[90]: letters # Similarly bigrams and trigrams contains the number of times each sequence of 2 or 3 letters occurs # In[94]: vd2.get_feature_names()[:5], vd2.get_feature_names()[-5:] # In[95]: letters.shape, bigrams.shape, trigrams.shape, y.shape # How good a model can we get looking at individual letters (e.g. saying 'z' occurs much more frequently in Chinese than in English names). # In[96]: letter_nb = MultinomialNB() letter_nb.fit(letters[train_idx],y[train_idx]) bal_letter_nb = MultinomialNB() bal_letter_nb.fit(letters[bal_idx],y[bal_idx]) # The balanced set does mut better than random; around 33% # In[97]: letter_pred = letter_nb.predict(letters[valid_idx]) bal_letter_pred = bal_letter_nb.predict(letters[valid_idx]) (letter_pred == y[valid_idx]).mean(), (bal_letter_pred == y[valid_idx]).mean() # Let's write a function to test the Naive Bayes on any dataset; fitting on the whole dataset and the balanced dataset separately. # In[98]: def nb(x): model = MultinomialNB() model.fit(x[train_idx], y[train_idx]) preds = model.predict(x[valid_idx]) acc_train = (preds == y[valid_idx]).mean() model = MultinomialNB() model.fit(x[bal_idx], y[bal_idx]) preds = model.predict(x[valid_idx]) acc_bal = (preds == y[valid_idx]).mean() return acc_train, acc_bal # In[99]: nb(letters) # Using bigrams and a balanced training set gives a much better prediction performance 53% (up from the baseline of 6.25%). 
# In[100]:
nb(bigrams)

# Adding letters doesn't make much difference (which isn't surprising).

# In[101]:
nb(np.concatenate((letters, bigrams), axis=1))

# Trigrams alone also perform worse.

# In[102]:
nb(trigrams)

# Let's try every combination with trigrams:

# In[103]:
nb(np.concatenate((letters, trigrams), axis=1))

# In[104]:
nb(np.concatenate((bigrams, trigrams), axis=1))

# In[105]:
nb(np.concatenate((letters, bigrams, trigrams), axis=1))

# None of them significantly outperforms the simple bigram model (which has 623 parameters; we could probably remove some of the uncommon bigrams without losing much).

# ### Examining the Bigram Model
# Let's remove the bigrams that only occur once, as they have practically no value (and there are 100 of them).

# In[195]:
common_bigrams = (bigrams[bal_idx].sum(axis=0)) >= 2
common_bigrams.sum()

# In[196]:
common_bigram_index = [i for i, t in enumerate(common_bigrams) if t]
bigrams_min = bigrams[:, common_bigram_index]
bigrams_min.shape

# In[197]:
bigram_model = MultinomialNB()
bigram_model.fit(bigrams_min[bal_idx], y[bal_idx])

# We get around 53% accuracy.

# In[203]:
bigram_pred = bigram_model.predict(bigrams_min[valid_idx])
(bigram_pred == y[valid_idx]).mean()

# In[213]:
bigram_prob = bigram_model.predict_proba(bigrams_min[valid_idx])
bigram_prob.max(axis=1)

# In[217]:
bigram_preds = (df
                .iloc[valid_idx]
                .assign(pred = bigram_pred)[['name', 'cl', 'pred']]
                .assign(prob = bigram_prob.max(axis=1)))
bigram_preds.sort_values('prob', ascending=False).head(15)

# The names it's least confident about typically seem to be quite short.

# In[218]:
bigram_preds.sort_values('prob', ascending=True).head(15)

# The names it's most confidently wrong about:

# In[222]:
bigram_preds[bigram_preds.cl != bigram_preds.pred].sort_values('prob', ascending=False).head(15)

# Our very simple system does great on Japanese and Russian, but relatively poorly on Vietnamese, where our data is most sparse (though still much better than random).

# In[223]:
(bigram_preds
 .assign(yes=bigram_preds.cl == bigram_preds.pred)
 .groupby('cl')
 .yes
 .mean()
 .sort_values(ascending=False))

# In[227]:
from sklearn.metrics import confusion_matrix

# In[236]:
bigram_pred

# In[246]:
cm = confusion_matrix(y[valid_idx], bigram_pred, labels=y.unique())
cm

# In[201]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Vietnamese is often confused for Chinese (which makes sense) and Irish (which doesn't).
# Korean is often confused for Japanese.
# Spanish is often confused for Italian.

# In[250]:
plt.figure(figsize=(12,12))
plot_confusion_matrix(cm, y.unique())

# In[256]:
bigram_preds[bigram_preds.cl == 'Vietnamese'].sort_values('prob').head(20)

# So our baseline is 53%.
Let's see if we can do better with deep learning # # Deep Learning # ## Build a Fastai Data Loader # Load in the dataframe and extract indexes for training, validation and balanced trainings. # In[4]: df = pd.read_csv('names_clean.csv') valid_idx = df[df.valid].index train_idx = df[~df.valid].index bal_idx = [] for k, v in zip(df.index, df.bal): bal_idx += [k]*v # As of December 2018 Fastai only has Word level tokenizers; we'll have to create our own letter tokenizer. # # The fastai library injects `BOS` markers (`xxbos`) at the start of every string; we'll have to parse them separately. # In[5]: class LetterTokenizer(BaseTokenizer): "Character level tokenizer function." def __init__(self, lang): pass def tokenizer(self, t:str) -> List[str]: out = [] i = 0 while i < len(t): if t[i:].startswith(BOS): out.append(BOS) i += len(BOS) else: out.append(t[i]) i += 1 return out def add_special_cases(self, toks:Collection[str]): pass # We create a vocab of all ASCII letters, and a character tokenizer that doesn't do any specific processing. # In[8]: itos = [UNK, BOS] + list(string.ascii_lowercase + " -'") # In[9]: vocab=Vocab(itos) tokenizer=Tokenizer(LetterTokenizer, pre_rules=[], post_rules=[]) # We can create a data pipeline using the `TextDataBunch.from_df` constructor. # # `mark_fields` puts and extra `xxfld` marker between each field of text. Since we only have 1 field this is unnecessary. # In[10]: train_df = df.iloc[train_idx, [0,2]] valid_df = df.iloc[valid_idx, [0,2]] # In[9]: train_df.head() # In[11]: data = TextClasDataBunch.from_df(path='.', train_df=train_df, valid_df=valid_df, tokenizer=tokenizer, vocab=vocab, mark_fields=False) # In[12]: data.show_batch() # Or we can create it using data block API. # This uses the `processors` to tokenize and numericalize the input. # In[13]: processors = [TokenizeProcessor(tokenizer=tokenizer, mark_fields=False), NumericalizeProcessor(vocab=vocab)] # In[14]: data = (TextList .from_df(df, cols=[2], processor=processors) .split_by_idxs(train_idx=train_idx, valid_idx=valid_idx) .label_from_df(cols=0) .databunch(bs=32)) # In[15]: data.show_batch() # ### Sanity Checking # In[16]: Counter(_.obj for _ in data.valid_ds.y) # In[17]: Counter(_.obj for _ in data.train_ds.y).most_common() # Check no text is both in Validation and Training # In[18]: valid_set = set(_.text for _ in data.valid_ds.x) for _ in data.train_ds.x: assert _.text not in valid_set, _.text # ## Examine a minibatch # In[19]: trainiter = iter(data.train_dl) batch, cl = next(trainiter) batch2, cl2 = next(trainiter) # In[20]: cl, len(cl) # In[21]: batch.shape # The first 22 letters run *down* the batch backpadded by `BOS`; we have 16 names across. # # Somehow it looks like we also have an extra space at the beginning of each name that wasn't in the input data. # # (Note this is different to what the fastai wrappers will give you; they concatenate the data and split it into 16 chunks). # In[26]: pd.options.display.max_columns = 100 (pd .DataFrame([[vocab.itos[y] for y in x] for x in batch]) .T .assign(category=[data.classes[_] for _ in cl]) .T) # In[27]: [vocab.itos[_] for _ in data.train_ds[0][0].data] # In[28]: list(df.iloc[0,1]) # Note the length of strings varies between batches. 
# In[29]: (pd .DataFrame([[vocab.itos[y] for y in x] for x in batch2]) .T .assign(category=[data.classes[_] for _ in cl2]) .T) # In[30]: vocab.textify(batch2[:,0]) # In[31]: data.show_batch(ds_type=DatasetType.Valid) # ## One Hot Encoding # # The torch nn.RNN expects the data to be one hot encoded # In[32]: one_hot = torch.eye(len(vocab.itos)) # In[33]: one_hot[batch][:2] # In[34]: one_hot[batch].shape # Here's how we could do it without storing the one_hot matrix in memory. # In[35]: def one_hot_fly(y, length=len(vocab.itos)): length = len(vocab.itos) shape = list(y.shape) assert len(shape) == 2 tensor = torch.zeros(shape + [length]) for i,row in enumerate(y): for j, val in enumerate(row): tensor[i][j][val] = 1. return tensor # In[36]: (one_hot[batch] == one_hot_fly(batch)).all() # Using matrix operations is ~250 times faster at this size than the double for loop. # In[37]: get_ipython().run_line_magic('timeit', 'one_hot[batch]') get_ipython().run_line_magic('timeit', 'one_hot_fly(batch)') None # ## Fitting a model # In[38]: n_letters = len(vocab.itos) n_hidden = 128 n_output = df.cl.nunique() n_letters, n_output # We use an RNN to take our sequence of letters in and calculate the hidden state # In[39]: rnn = nn.RNN(input_size=n_letters, hidden_size=n_hidden, num_layers=1, nonlinearity='relu', dropout=0.) # In[40]: output, hidden = rnn(one_hot[batch]) output.shape, hidden.shape # In[41]: lo = nn.Linear(n_hidden, n_output) # In[42]: preds = lo(output) # In[43]: preds.shape # In[44]: cl # In[45]: nn.functional.softmax(preds[-1], dim=1).argmax(dim=1) # In[46]: one_hot = torch.eye(len(vocab.itos)) # In[47]: class MyLetterRNN(nn.Module): def __init__(self, dropout=0., n_layers=1, n_input=n_letters, n_hidden=n_hidden, n_output=n_output): super().__init__() self.one_hot = torch.eye(n_letters).cuda() self.rnn = nn.RNN(input_size=n_letters, hidden_size=n_hidden, num_layers=n_layers, nonlinearity='relu', dropout=dropout) self.lo = nn.Linear(n_hidden, n_output) def forward(self, input): rnn, _ = self.rnn(self.one_hot[input]) out = self.lo(rnn) return out[-1] # In[48]: rnn = MyLetterRNN().cuda() # In[49]: out = rnn(batch) out.argmax(dim=1), cl # Fit the model # In[50]: F.cross_entropy(out, cl) # In[51]: learn = Learner(data, rnn, loss_func=F.cross_entropy, metrics=[accuracy]) # In[52]: learn.lr_find() # In[53]: learn.recorder.plot() # In[54]: learn.fit_one_cycle(10, max_lr=3e-2) # In[55]: learn.lr_find() learn.recorder.plot() # In[56]: learn.save('char_rnn_1') # In[57]: learn.fit_one_cycle(5, 3e-3) # In[58]: learn.save('char_rnn_1_final') # This is abysmal; 31% is much worse than 52% from the simple Naive Bayes bigram model. # # Does it improve if we add another layer? # In[59]: learn = Learner(data, MyLetterRNN(n_layers=2), loss_func=F.cross_entropy, metrics=[accuracy]) # In[60]: learn.lr_find() learn.recorder.plot() # In[61]: learn.fit_one_cycle(20, max_lr=1e-2) # It looks like the fit has converged, again at a much worse result than our Naive Bayes bigrams. # # But that was trained using a balanced dataset; maybe that will help with RNNs too. # In[62]: learn.recorder.plot_losses() # In[63]: learn.save('char_rnn_2_p0') # In[76]: prob, targ = learn.get_preds() Counter(data.classes[_.item()] for _ in prob.argmax(dim=1)).most_common() # ## Rebalancing # ### Less is more # # Even though the balanced set is a subset of the training set (and throws away a lot of data), the model performs much better on the balanced validation set with it. 
#
# This is because on the whole training set heuristics like "when in doubt, guess Russian/English" and "it's almost never Vietnamese" are good, but they are terrible on our validation set.

# In[77]:
data = (TextList
        .from_df(df, cols=[2], processor=processors)
        .split_by_idxs(train_idx=bal_idx, valid_idx=valid_idx)
        .label_from_df(cols=0)
        .databunch(bs=1024))

# ### Sanity Checking

# In[78]:
Counter(_.obj for _ in data.valid_ds.y)

# In[79]:
Counter(_.obj for _ in data.train_ds.y).most_common()

# In[80]:
(pd.DataFrame({'x': [_.text for _ in data.train_ds.x],
               'y': [_.obj for _ in data.train_ds.y]})
 .groupby('y')
 .nunique()
 .sort_values('x', ascending=False))

# In[81]:
valid_set = set(_.text for _ in data.valid_ds.x)
for _ in data.train_ds.x:
    assert _.text not in valid_set, _.text

# ### Fitting

# In[82]:
learn = Learner(data, MyLetterRNN(n_layers=2), loss_func=F.cross_entropy, metrics=[accuracy])

# In[83]:
learn.lr_find()
learn.recorder.plot()

# Note that our balanced dataset is about half the size of our full training dataset; keep that in mind when comparing numbers of epochs and runtimes.

# In[85]:
len(train_idx) / len(bal_idx)

# We only get around ~51% accuracy on the balanced validation set (similar to Naive Bayes).

# In[86]:
learn.fit_one_cycle(30, max_lr=3e-2)

# It's starting to overfit, so it could perhaps do with some regularisation.

# In[87]:
learn.recorder.plot_losses()

# This model is a little worse in accuracy than the Naive Bayes bigram model.
#
# But our neural network is much more computationally intensive and has about 4 times as many parameters!

# In[89]:
sum(len(_) for _ in learn.model.parameters())

# ### Regularisation: Dropout
#
# Adding 50% dropout increases our accuracy a little above what we got with Naive Bayes, to 55%.

# In[119]:
learn = Learner(data, MyLetterRNN(n_layers=2, dropout=0.5), loss_func=F.cross_entropy, metrics=[accuracy])

# In[120]:
learn.fit_one_cycle(30, max_lr=3e-2)

# In[92]:
learn.recorder.plot_losses()

# ### Changing the dimension of hidden layers
# Using our default of 128 gets 54%.

# In[123]:
learn = Learner(data, MyLetterRNN(n_layers=2, dropout=0.5, n_hidden=128), loss_func=F.cross_entropy, metrics=[accuracy])
learn.fit_one_cycle(15, max_lr=3e-2)

# Doubling to 256 doesn't change performance.

# In[124]:
learn = Learner(data, MyLetterRNN(n_layers=2, dropout=0.5, n_hidden=256), loss_func=F.cross_entropy, metrics=[accuracy])
learn.fit_one_cycle(15, max_lr=3e-2)

# Halving to 64 definitely does; 128 does seem to be a sweet spot.

# In[125]:
learn = Learner(data, MyLetterRNN(n_layers=2, dropout=0.5, n_hidden=64), loss_func=F.cross_entropy, metrics=[accuracy])
learn.fit_one_cycle(15, max_lr=3e-2)

# Finally, 3 layers also gives a worse result.

# In[129]:
learn = Learner(data, MyLetterRNN(n_layers=3, dropout=0.5), loss_func=F.cross_entropy, metrics=[accuracy])
learn.fit_one_cycle(30, max_lr=3e-2)

# ## RNN From Scratch
# Let's build our own RNN; instead of one-hot encoding we'll use an `nn.Embedding`.
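# A quick illustration of why this is equivalent (a sketch, not part of the original notebook): looking up an index in an `nn.Embedding` gives the same result as multiplying a one-hot vector by the embedding's weight matrix, just without materialising the one-hot tensor.

# In[ ]:
emb = nn.Embedding(n_letters, n_hidden)
idx = torch.tensor([[3, 5, 7]])               # a tiny made-up batch of token indices
one_hot_idx = torch.eye(n_letters)[idx]       # shape: (1, 3, n_letters)
torch.allclose(emb(idx), one_hot_idx @ emb.weight)  # same values either way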
# In[131]: data = (TextList .from_df(df, cols=[2], processor=processors) .split_by_idxs(train_idx=bal_idx, valid_idx=valid_idx) .label_from_df(cols=0) .databunch(bs=1024)) # In[132]: valid_data_set = set(tuple(_[0].data) for _ in data.valid_ds) for datum in data.train_ds: assert tuple(datum[0].data) not in valid_data_set, datum # In[133]: x, y = next(iter(data.train_dl)) x.shape, y.shape # In[134]: x.shape[-1] # In[135]: class Model(nn.Module): def __init__(self, n_input, n_hidden, n_output, bn=False): super().__init__() self.i_h = nn.Embedding(n_input,n_hidden) self.bn = nn.BatchNorm1d(n_hidden) if bn else None self.o_h = nn.Linear(n_hidden, n_output) self.h_h = nn.Linear(n_hidden, n_hidden) self.reset() def forward(self, x): # I'm not quite sure why the batch size seems to change to 720 in validation... if self.h.shape[0] != x.shape[1]: self.reset(x.shape[1]) h = self.h x = self.i_h(x) for xi in x: h += xi h = self.h_h(h) h = F.relu(h) if self.bn: h = self.bn(h) self.h = h.detach() o = self.o_h(h) return o def reset(self, size=None): size = size or 1 self.h = torch.zeros(size, n_hidden).cuda() # In[136]: model = Model(n_letters, n_hidden, n_output).cuda() # In[137]: learn = Learner(data, model, loss_func=F.cross_entropy, metrics=[accuracy]) # In[138]: learn.lr_find() # In[139]: learn.recorder.plot() # This simple RNN seems to work *better* than the one we built using `nn.rnn`, and we're only using one layer and haven't implemented dropout. # # The big difference is that we're using an embedding layer instead of one-hot encoding. This gives us an extra bunch of parameters we can fit. # In[140]: learn.fit_one_cycle(20, 7e-3) # In[141]: learn.save('rnn-bal-1') # In[142]: data.classes # Let's save the data classes; this will be useful if we want to make predictions. # In[143]: with open('data.classes', 'wb') as f: pickle.dump(data.classes, f) # Let's save the model data directly. # In[144]: with open('models/rnn-bal-1.model', 'wb') as f: pickle.dump(model.state_dict(), f) # And read it back in. # In[145]: with open('models/rnn-bal-1.model', 'rb') as f: state = pickle.load(f) model.load_state_dict(state) # ### Batchnorm # In[146]: model = Model(n_letters, n_hidden, n_output, bn=True).cuda() # In[147]: learn = Learner(data, model, loss_func=F.cross_entropy, metrics=[accuracy]) # In[148]: learn.lr_find() # Batch norm makes the learning surface much smoother # In[149]: learn.recorder.plot() # In[150]: learn.fit_one_cycle(20, 3e-2) # In this case we actually get a very similar fit. # In[151]: learn.recorder.plot_losses() # Adding a little regularisation using weight decay seems to help; we get a 59%. # # Maybe dropout could help more. # In[152]: model = Model(n_letters, n_hidden, n_output, bn=True).cuda() learn = Learner(data, model, loss_func=F.cross_entropy, metrics=[accuracy]) learn.fit_one_cycle(20, 1e-2, wd=0.1) # ## fastai Builtin # How does fastai's built in learner compare? # How long are the names? # In[153]: df.ascii_name.str.len().describe() # In[154]: learn = text_classifier_learner(data, bptt=30) # In[156]: learn.lr_find() # In[159]: learn.recorder.plot(skip_end=10) # Note we don't necessarily *expect* this to do great because the parameters are tuned to processing medium sized documents a word at a time. # # However it gets 67% way outperforms our RNN model without *any* parameter tuning. 
# In[160]: learn.fit_one_cycle(20, max_lr=7e-3) # In[161]: learn.recorder.plot_losses() # In[167]: learn.save('fastai_bal') # ## Pretraining the Encoder # # From the IMDB example we know for word level data pretraining the encoder gives much better results (albeit on *much* bigger datasets). Let's see if it improves things here. # In[168]: data_lm = (TextList .from_df(df, cols=[2], processor=processors) .random_split_by_pct(0.1) .label_for_lm() .databunch(bs=32)) # In[169]: data_lm.show_batch() # In[175]: learn = language_model_learner(data_lm, drop_mult=0.5) # In[171]: learn.lr_find() # In[173]: learn.recorder.plot(skip_end=10) # In[176]: learn.fit_one_cycle(4, max_lr=1e-2) # In[177]: learn.save('letter_lang') learn.save_encoder('letter_enc') # In[178]: TEXT = "ho" N_WORDS = 4 N_SENTENCES = 5 # In[179]: print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES))) # In[180]: TEXT = "tr" print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES))) # In[181]: learn = text_classifier_learner(data, bptt=30) learn.load_encoder('letter_enc') # In[182]: learn.lr_find() learn.recorder.plot() # In this case pretraining the encoder gives a *worse* result. # # Maybe it's because the language model was on the entire (unbalanced) dataset? Or wasn't well trained enough? # In[183]: learn.fit_one_cycle(20, max_lr=2e-2) # ## fastai: Hyperparameter Tuning # With a bit of tuning we can make a much smaller model that trains faster and is almost as good # In[187]: learn = text_classifier_learner(data, bptt=30, emb_sz=200, nh=300, nl=2) learn.fit_one_cycle(15, max_lr=1e-2, moms=(0.2, 0.1)) # In[188]: learn.save('fastai_min') # ## Analysing the results # In[189]: learn.load('fastai_min') None # In[190]: prob, target, losses = learn.get_preds(with_loss=True) pred = np.array([data.classes[_] for _ in prob.argmax(dim=1)]) target = np.array([data.classes[_] for _ in target]) # In[191]: x, y = list(learn.data.valid_dl)[0] y = np.array([data.classes[_] for _ in y]) # In[192]: len(y), len(prob) # In[193]: names = np.array([''.join([vocab.itos[x] for x in l if x != 1][1:]) for l in zip(*x)]) # I certainly think we *could* do better, but let's call it good enough. # In[196]: loss_val, idx = losses.topk(10) list(zip(names[idx], pred[idx], target[idx], loss_val)) # In[203]: confuse = sklearn.metrics.confusion_matrix(target, pred, labels=data.classes) # In[204]: def most_confused(n): top = [] for i, row in enumerate(confuse): for j, cell in enumerate(row): if i == j: continue if cell >= n: top.append([data.classes[i],data.classes[j], cell]) return sorted(top, key=lambda x: x[2], reverse=True) # Most of the confusion is between similar language families: # - Vietnamese and Korean and Chinese # - Czech and Polish # - Spanish and Italian # # This is a good sign # In[205]: most_confused(3) # In[210]: plt.figure(figsize=(6,6)) plot_confusion_matrix(confuse, data.classes) # # Predictions # Let's set up everything from scratch so we could set it up in an external app # In[1]: from fastai import * from fastai.text import * # In[2]: from unidecode import unidecode import string # In[4]: with open('data.classes', 'rb') as f: classes = pickle.load(f) classes # In[5]: class LetterTokenizer(BaseTokenizer): "Character level tokenizer function." 
def __init__(self, lang): pass def tokenizer(self, t:str) -> List[str]: t = unidecode(t).lower() ## Decode in tokenizer (ideally would be a separate preprocessor) out = [] i = 0 while i < len(t): if t[i:].startswith(BOS): out.append(BOS) i += len(BOS) else: out.append(t[i]) i += 1 return out def add_special_cases(self, toks:Collection[str]): pass # In[6]: itos = [UNK, BOS] + list(string.ascii_lowercase + " -'") # In[7]: vocab=Vocab(itos) tokenizer=Tokenizer(LetterTokenizer, pre_rules=[], post_rules=[]) # In[8]: empty = pd.DataFrame({'text':'', 'cl':classes}) empty # In[9]: processors = [TokenizeProcessor(tokenizer=tokenizer, mark_fields=False), NumericalizeProcessor(vocab=vocab)] # In[10]: data = TextList.from_df(empty, processor=processors).no_split().label_from_df(cols='cl').databunch(bs=2) # In[11]: learn = text_classifier_learner(data, bptt=30, emb_sz=200, nh=300, nl=2) # In[12]: learn.load('fastai_min') None # Check it's not in the training set # In[12]: get_ipython().system("grep -ir '^Wu' data/names") # In[13]: learn.predict('Wu') # Chinese # In[14]: def predictions(name): return sorted(zip(classes, (_.item() for _ in learn.predict(name)[2])), key=lambda x: x[1], reverse=True) # How does it do in practice? # In[15]: predictions("Wojtyła")[:5] # Polish # In[29]: predictions("Dvořák")[:5] # Czech # In[18]: predictions("Gaddafi")[:5] # Arabic # In[49]: predictions('Goethe')[:5] # German # Sometimes it does bad even if it's in the source data (it may not have ended up in training) # In[51]: get_ipython().system("grep -Er 'Pascal|Pham' data/names") # In[27]: predictions("Pascal")[:5] # French # In[28]: predictions("Pham")[:5] # Vietnamese # But sometimes it gets it right # In[19]: get_ipython().system("grep -ir '^Meijer' data/names") # In[20]: predictions("Meijer")[:5] # Dutch # In[21]: predictions('Wójcik')[:5] # Polish # This model is not bad; but definitely sub-human. # What does it think about our ambiguous "Michel"? # In[31]: predictions('Michel')[:7] # ## Predicting from a pretrained custom model # In[32]: class Model(nn.Module): def __init__(self, n_input, n_hidden, n_output, bn=False, use_cuda=False): super().__init__() self.i_h = nn.Embedding(n_input,n_hidden) self.bn = nn.BatchNorm1d(n_hidden) if bn else None self.o_h = nn.Linear(n_hidden, n_output) self.h_h = nn.Linear(n_hidden, n_hidden) self.use_cuda = use_cuda self.reset() def forward(self, x): # I'm not quite sure why the batch size seems to change to 720 in validation... 
if self.h.shape[0] != x.shape[1]: self.reset(x.shape[1]) h = self.h x = self.i_h(x) for xi in x: h += xi h = self.h_h(h) h = F.relu(h) if self.bn: h = self.bn(h) self.h = h.detach() o = self.o_h(h) return o def reset(self, size=None): size = size or 1 self.h = torch.zeros(size, n_hidden) if self.use_cuda: self.h = self.h.cuda() # In[33]: n_letters = len(vocab.itos) n_hidden = 128 n_output = len(classes) model = Model(n_letters, n_hidden, n_output) # In[34]: with open('models/rnn-bal-1.model', 'rb') as f: state = pickle.load(f) model.load_state_dict(state) model = model.cpu() model = model.eval() # In[35]: for param in model.parameters(): param.requires_grad = False # In[36]: name = 'Wójcik' # Polish # In[37]: decode = BOS + unidecode(name) decode # In[38]: tokens = tokenizer.process_all([decode])[0] tokens # In[39]: nums = vocab.numericalize(tokens) nums # In[40]: x = torch.tensor([nums]).transpose(1,0) x # In[41]: result = model(x).detach() result # In[42]: probs = F.softmax(result[0], dim=0) probs # In[43]: for prob, idx in zip(*probs.topk(3)): print(f'{classes[idx]}: Probability {prob:0.2%}') # In[44]: def get_probs(name): decode = BOS + unidecode(name) tokens = tokenizer.process_all([decode])[0] nums = vocab.numericalize(tokens) x = torch.tensor([nums]).transpose(1,0) model.reset() result = model(x).detach() probs = F.softmax(result[0], dim=0) return probs # In[45]: def print_top_probs(name, n=3): probs = get_probs(name) for prob, idx in zip(*probs.topk(n)): print(f'{classes[idx]}: Probability {prob:0.2%}') # In reality the model doesn't do great by human standards # In[46]: print_top_probs('Goethe') # German # In[47]: print_top_probs('Jinping') # Chinese # In[48]: print_top_probs('Kim') # Korean # In[52]: print_top_probs('Đặng') # Vietnamese # In[53]: print_top_probs('Zahir') # Arabic # It's also possible to use `learn.load` to load in the model, if you make some fake data. # # We need at least 2 rows or it will complain. # In[54]: empty = pd.DataFrame([[' ']]*2) empty # In[55]: processors = [TokenizeProcessor(tokenizer=tokenizer, mark_fields=False), NumericalizeProcessor(vocab=vocab)] # In[56]: data = TextList.from_df(empty, processor=processors).no_split().label_const().databunch(bs=2) # In[57]: model = Model(n_letters, n_hidden, n_output) # In[58]: learn = Learner(data, model) # In[59]: learn = learn.load('rnn-bal-1') # In[60]: learn.model = learn.model.eval().cpu() for param in learn.model.parameters(): param.requires_grad = False # In[61]: x, _ = data.one_item('Dvořák') # Czech # In[62]: learn.model.reset() probs = F.softmax(learn.model(x.cpu())) probs # In[63]: for prob, idx in zip(*probs[0].topk(3)): print(f'{classes[idx]}: Probability {prob:0.2%}') # # Using the model to find similar names # # The idea is to dig into the representation in the 50 dimensional activation and use this to compare names. # # Two names are similar if they are close together in this embedding space. # It's not totally obvious that the RMS distance is appropriate for this, but it's what we'll use. 
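# The cells below use a distance function `d` whose definition isn't shown; here is a minimal sketch consistent with the text (root-mean-square difference between two embedding vectors, assuming `embed` below returns 1-D tensors):

# In[ ]:
def d(a, b):
    "Root-mean-square distance between two embedding vectors."
    return ((a - b) ** 2).mean().sqrt().item()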
# In[113]:
from fastai.callbacks.hooks import *

# In[215]:
df = pd.read_csv('names_clean.csv')
df.head()

# In[216]:
data = TextList.from_df(df, cols='ascii_name', processor=processors).no_split().label_from_df('cl').databunch(bs=1024)

# In[217]:
# model = Model(n_letters, n_hidden, n_output).cuda()
# learn = Learner(data, model)
# learn = learn.load('rnn-bal-1')
learn = text_classifier_learner(data, bptt=30, emb_sz=200, nh=300, nl=2)
learn.load('fastai_min')
None

# Let's look at the structure of our model.

# In[218]:
list(learn.model.named_children())

# Let's capture the output of the 50-dimensional embedding near the end.

# In[221]:
layer = 17

# In[222]:
list(learn.model.modules())[layer]

# In[223]:
def embed(x):
    # with hook_output(list(learn.model.children())[-1]) as hook_a:
    with hook_output(list(learn.model.modules())[layer]) as hook_a:
        preds = learn.predict(x)
    return hook_a.stored[0]

# In[224]:
get_ipython().run_line_magic('time', 'df = df.assign(embed = df.name.apply(embed))')

# In[226]:
df.head()

# In[227]:
def closest(name, n=10):
    e = embed(name)
    dist = [d(e, _) for _ in df.embed]
    for idx in np.argsort(dist)[:n]:
        print(f'{df.name.iloc[idx.item()]} ({df.cl.iloc[idx.item()]}): {dist[idx]}')

# It's not immediately clear in what sense these are similar, but it doesn't seem random to me.

# In[229]:
get_ipython().run_line_magic('time', "closest('Ahn')")

# In[232]:
closest('Ruder')

# In[233]:
closest('Gugger')

# In[234]:
closest('Thomas')

# In[201]:
closest('Ross')

# In[202]:
closest('Wu')

# In[214]:
closest('Chebyshev')
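# Since it's not obvious that RMS distance is the right choice, here's a hedged alternative (an illustrative sketch, not from the original notebook) that ranks names by cosine similarity of their embeddings instead:

# In[ ]:
def closest_cosine(name, n=10):
    e = embed(name)
    sims = [F.cosine_similarity(e, other, dim=0).item() for other in df.embed]
    for idx in np.argsort(sims)[::-1][:n]:   # highest similarity first
        print(f'{df.name.iloc[idx.item()]} ({df.cl.iloc[idx.item()]}): {sims[idx]:.3f}')

closest_cosine('Chebyshev')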