%load_ext autoreload
%autoreload 2
%matplotlib inline
#export
from exp.nb_11a import *
path = datasets.untar_data(datasets.URLs.IMDB)
path.ls()
#export
def read_file(fn):
    with open(fn, 'r', encoding='utf8') as f: return f.read()
class TextList(ItemList):
@classmethod
def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs)
def get(self, i):
if isinstance(i, Path): return read_file(i)
return i
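# Note: the items stay as `Path` objects; `get` only reads a file from disk when an
# individual item is accessed, so the whole corpus is never held in memory at once.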
il = TextList.from_files(path, include=['train', 'test', 'unsup'])
len(il.items)
txt = il[0]
txt
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))
sd
#export
import spacy,html
#export
#special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
def sub_br(t):
"Replaces the
by \n"
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
return re_br.sub("\n", t)
def spec_add_spaces(t):
"Add spaces around / and #"
return re.sub(r'([/#])', r' \1 ', t)
def rm_useless_spaces(t):
"Remove multiple spaces"
return re.sub(' {2,}', ' ', t)
def replace_rep(t):
"Replace repetitions at the character level: cccc -> TK_REP 4 c"
def _replace_rep(m:Collection[str]) -> str:
c,cc = m.groups()
return f' {TK_REP} {len(cc)+1} {c} '
re_rep = re.compile(r'(\S)(\1{3,})')
return re_rep.sub(_replace_rep, t)
def replace_wrep(t):
"Replace word repetitions: word word word -> TK_WREP 3 word"
def _replace_wrep(m:Collection[str]) -> str:
c,cc = m.groups()
return f' {TK_WREP} {len(cc.split())+1} {c} '
re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
return re_wrep.sub(_replace_wrep, t)
def fixup_text(x):
"Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]
replace_rep('cccc')
replace_wrep('word word word word word ')
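# Quick checks of the remaining pre-rules (illustrative inputs made up for this notebook):
sub_br('first line<br />second line')     # -> 'first line\nsecond line'
spec_add_spaces('#hashtag and/or')        # -> ' # hashtag and / or'
rm_useless_spaces('too   many  spaces')   # -> 'too many spaces'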
#export
def replace_all_caps(x):
"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
res = []
for t in x:
if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
else: res.append(t)
return res
def deal_caps(x):
"Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
res = []
for t in x:
if t == '': continue
if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
res.append(t.lower())
return res
def add_eos_bos(x): return [BOS] + x + [EOS]
default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]
replace_all_caps(['I', 'AM', 'SHOUTING'])
deal_caps(['My', 'name', 'is', 'Jeremy'])
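# add_eos_bos simply brackets each document with the BOS/EOS markers:
add_eos_bos(['xxmaj', 'hello'])           # -> ['xxbos', 'xxmaj', 'hello', 'xxeos']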
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor
def parallel(func, arr, max_workers=4):
if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            results = list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
if any([o is not None for o in results]): return results
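# `parallel` enumerates `arr`, so `func` receives `(index, item)` tuples; that is why
# `proc_chunk` below unpacks its argument. A toy check on the single-process path
# (a lambda can't be pickled for the multi-process path):
parallel(lambda args: args[1] * 2, [1, 2, 3], max_workers=1)   # -> [2, 4, 6]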
#export
class TokenizeProcessor(Processor):
def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4):
self.chunksize,self.max_workers = chunksize,max_workers
self.tokenizer = spacy.blank(lang).tokenizer
for w in default_spec_tok:
self.tokenizer.add_special_case(w, [{ORTH: w}])
self.pre_rules = default_pre_rules if pre_rules is None else pre_rules
self.post_rules = default_post_rules if post_rules is None else post_rules
def proc_chunk(self, args):
i,chunk = args
chunk = [compose(t, self.pre_rules) for t in chunk]
docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
docs = [compose(t, self.post_rules) for t in docs]
return docs
def __call__(self, items):
toks = []
if isinstance(items[0], Path): items = [read_file(i) for i in items]
chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
return sum(toks, [])
    def proc1(self, item): return self.proc_chunk((0, [item]))[0]  # proc_chunk expects an (index, chunk) pair
def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
def deproc1(self, tok): return " ".join(tok)
tp = TokenizeProcessor()
txt[:250]
' • '.join(tp(il[:100])[0])[:400]
#export
import collections
class NumericalizeProcessor(Processor):
def __init__(self, vocab=None, max_vocab=60000, min_freq=2):
self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
def __call__(self, items):
#The vocab is defined on the first use.
if self.vocab is None:
freq = Counter(p for o in items for p in o)
self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
for o in reversed(default_spec_tok):
if o in self.vocab: self.vocab.remove(o)
self.vocab.insert(0, o)
if getattr(self, 'otoi', None) is None:
self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)})
return [self.proc1(o) for o in items]
def proc1(self, item): return [self.otoi[o] for o in item]
def deprocess(self, idxs):
assert self.vocab is not None
return [self.deproc1(idx) for idx in idxs]
def deproc1(self, idx): return [self.vocab[i] for i in idx]
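# Hedged toy example on made-up documents: the special tokens are pushed to the front of the
# vocab, and unseen tokens fall back to index 0 (xxunk) thanks to the defaultdict(int) in `otoi`.
toy_num = NumericalizeProcessor(max_vocab=100, min_freq=1)
toy_num([['hello', 'world', 'hello'], ['world', 'again']])
toy_num.vocab[:10]
toy_num.proc1(['never', 'seen'])          # -> [0, 0], both map to xxunk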
proc_tok,proc_num = TokenizeProcessor(max_workers=8),NumericalizeProcessor()
%time ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])
ll.train.x_obj(0)
pickle.dump(ll, open(path/'ld.pkl', 'wb'))
ll = pickle.load(open(path/'ld.pkl', 'rb'))
# These imports are only used for the illustrations below; they aren't needed otherwise.
from IPython.display import display,HTML
import pandas as pd
stream = """
In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface.
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API.
Then we will study how we build a language model and train it.\n
"""
tokens = np.array(tp([stream])[0])
bs,seq_len = 6,15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))
bs,bptt = 6,5
for k in range(3):
d_tokens = np.array([tokens[i*seq_len + k*bptt:i*seq_len + (k+1)*bptt] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))
#export
class LM_PreLoader():
def __init__(self, data, bs=64, bptt=70, shuffle=False):
self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle
total_len = sum([len(t) for t in data.x])
self.n_batch = total_len // bs
self.batchify()
def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
def __getitem__(self, idx):
source = self.batched_data[idx % self.bs]
seq_idx = (idx // self.bs) * self.bptt
return source[seq_idx:seq_idx+self.bptt],source[seq_idx+1:seq_idx+self.bptt+1]
def batchify(self):
texts = self.data.x
if self.shuffle: texts = texts[torch.randperm(len(texts))]
stream = torch.cat([tensor(t) for t in texts])
self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)
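# Hedged sanity check on toy data: `_Toy` is a minimal stand-in for a labeled list that only
# provides the `.x` attribute LM_PreLoader reads. 30 tokens with bs=3 give 3 rows of 10
# contiguous ids; each item is a pair of length-bptt sequences, the target shifted by one token.
class _Toy:
    def __init__(self, x): self.x = x
toy_lm = LM_PreLoader(_Toy([list(range(30))]), bs=3, bptt=4)
toy_lm.batched_data
toy_lm[0]                                 # -> (tensor([0, 1, 2, 3]), tensor([1, 2, 3, 4]))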
dl = DataLoader(LM_PreLoader(ll.valid, shuffle=True), batch_size=64)
iter_dl = iter(dl)
x1,y1 = next(iter_dl)
x2,y2 = next(iter_dl)
x1.size(),y1.size()
vocab = proc_num.vocab
" ".join(vocab[o] for o in x1[0])
" ".join(vocab[o] for o in y1[0])
" ".join(vocab[o] for o in x2[0])
#export
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))
def lm_databunchify(sd, bs, bptt, **kwargs):
return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)
proc_cat = CategoryProcessor()
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat)
pickle.dump(ll, open(path/'ll_clas.pkl', 'wb'))
ll = pickle.load(open(path/'ll_clas.pkl', 'rb'))
[(ll.train.x_obj(i), ll.train.y_obj(i)) for i in [1,12552]]
#export
from torch.utils.data import Sampler
class SortSampler(Sampler):
def __init__(self, data_source, key): self.data_source,self.key = data_source,key
def __len__(self): return len(self.data_source)
def __iter__(self):
return iter(sorted(list(range(len(self.data_source))), key=self.key, reverse=True))
#export
class SortishSampler(Sampler):
def __init__(self, data_source, key, bs):
self.data_source,self.key,self.bs = data_source,key,bs
def __len__(self) -> int: return len(self.data_source)
def __iter__(self):
idxs = torch.randperm(len(self.data_source))
megabatches = [idxs[i:i+self.bs*50] for i in range(0, len(idxs), self.bs*50)]
sorted_idx = torch.cat([tensor(sorted(s, key=self.key, reverse=True)) for s in megabatches])
batches = [sorted_idx[i:i+self.bs] for i in range(0, len(sorted_idx), self.bs)]
max_idx = torch.argmax(tensor([self.key(ck[0]) for ck in batches])) # find the chunk with the largest key,
batches[0],batches[max_idx] = batches[max_idx],batches[0] # then make sure it goes first.
batch_idxs = torch.randperm(len(batches)-2)
sorted_idx = torch.cat([batches[i+1] for i in batch_idxs]) if len(batches) > 1 else LongTensor([])
sorted_idx = torch.cat([batches[0], sorted_idx, batches[-1]])
return iter(sorted_idx)
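# Hedged illustration with 100 fake "texts" whose length is simply their index: within each
# batch of 10 the keys are close together, and the batch with the largest keys comes first.
toy_ss = SortishSampler(list(range(100)), key=lambda i: int(i), bs=10)
[int(i) for i in list(iter(toy_ss))[:10]] # -> [99, 98, ..., 90]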
#export
def pad_collate(samples, pad_idx=1, pad_first=False):
max_len = max([len(s[0]) for s in samples])
res = torch.zeros(len(samples), max_len).long() + pad_idx
for i,s in enumerate(samples):
if pad_first: res[i, -len(s[0]):] = LongTensor(s[0])
else: res[i, :len(s[0]) ] = LongTensor(s[0])
return res, tensor([s[1] for s in samples])
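# Hedged toy check with two fake (tokens, label) samples of different lengths; the shorter one
# is padded on the right with pad_idx=1:
pad_collate([([5, 6, 7], 0), ([8, 9], 1)])   # -> (tensor([[5, 6, 7], [8, 9, 1]]), tensor([0, 1]))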
bs = 64
train_sampler = SortishSampler(ll.train.x, key=lambda t: len(ll.train[int(t)][0]), bs=bs)
train_dl = DataLoader(ll.train, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate)
iter_dl = iter(train_dl)
x,y = next(iter_dl)
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]
x,y = next(iter_dl)
lengths = []
for i in range(x.size(0)): lengths.append(x.size(1) - (x[i]==1).sum().item())
lengths[:5], lengths[-1]
x
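# Within a batch the lengths are close, so little of each tensor is wasted on padding; and since
# the first batch drawn holds the longest texts, any memory problem shows up immediately.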
#export
def get_clas_dls(train_ds, valid_ds, bs, **kwargs):
train_sampler = SortishSampler(train_ds.x, key=lambda t: len(train_ds.x[t]), bs=bs)
valid_sampler = SortSampler(valid_ds.x, key=lambda t: len(valid_ds.x[t]))
return (DataLoader(train_ds, batch_size=bs, sampler=train_sampler, collate_fn=pad_collate, **kwargs),
DataLoader(valid_ds, batch_size=bs*2, sampler=valid_sampler, collate_fn=pad_collate, **kwargs))
def clas_databunchify(sd, bs, **kwargs):
return DataBunch(*get_clas_dls(sd.train, sd.valid, bs, **kwargs))
bs,bptt = 64,70
data = clas_databunchify(ll, bs)
!python notebook2script.py 12_text.ipynb