from fastai.gen_doc.nbdoc import *
from fastai.text import *

show_doc(Tokenizer)

show_doc(Tokenizer.process_text)

show_doc(Tokenizer.process_all)

path = untar_data(URLs.IMDB_SAMPLE)
path

df = pd.read_csv(path/'texts.csv', header=None)
example_text = df.iloc[2][1]; example_text

tokenizer = Tokenizer()
tok = SpacyTokenizer('en')
' '.join(tokenizer.process_text(example_text, tok))

df = pd.read_csv(path/'texts.csv', header=None)
texts = df[1].values
tokenizer = Tokenizer()
tokens = tokenizer.process_all(texts)
' '.join(tokens[2])

show_doc(BaseTokenizer)

show_doc(BaseTokenizer.tokenizer)

show_doc(BaseTokenizer.add_special_cases)

show_doc(SpacyTokenizer)

show_doc(deal_caps, doc_string=False)

show_doc(fix_html, doc_string=False)

fix_html("Some HTML&nbsp;text<br />")
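# deal_caps is documented above but has no usage cell; a minimal sketch, assuming
# it is a post-rule that takes and returns a list of tokens, lowercasing
# capitalized words and inserting the TK_MAJ marker before them. The sample
# tokens below are made up for illustration.
deal_caps(['This', 'movie', 'was', 'Amazing', '.'])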
") show_doc(replace_all_caps) show_doc(replace_rep, doc_string=False) replace_rep("I'm so excited!!!!!!!!") show_doc(replace_wrep, doc_string=False) replace_wrep("I've never ever ever ever ever ever ever ever done this.") show_doc(rm_useless_spaces) rm_useless_spaces("Inconsistent use of spaces.") show_doc(spec_add_spaces) spec_add_spaces('I #like to #put #hashtags #everywhere!') show_doc(Vocab) show_doc(Vocab.create) show_doc(Vocab.numericalize) show_doc(Vocab.textify) vocab = Vocab.create(tokens, max_vocab=1000, min_freq=2) vocab.numericalize(tokens[2])[:10] show_doc(SpacyTokenizer.tokenizer) show_doc(SpacyTokenizer.add_special_cases)