# Script form of the fastai `text.data` documentation notebook.
# Each group below corresponds to one or more notebook cells; statement order
# is the same as in the notebook. Bare expressions (e.g. `path`, `texts`) are
# cell outputs in a notebook and are no-ops when this runs as a script.

from fastai.gen_doc.nbdoc import *
from fastai.text import *
# The notebook repeats this import after `fastai.text`; it is kept so that name
# resolution is exactly what the original cells produce.
from fastai.gen_doc.nbdoc import *

# --- Data bunch classes -----------------------------------------------------
show_doc(TextLMDataBunch, title_level=3)
show_doc(TextLMDataBunch.create)

show_doc(TextClasDataBunch, title_level=3)
show_doc(TextClasDataBunch.create)

show_doc(TextDataBunch, title_level=3)
jekyll_warn("This class can only work directly if all the texts have the same length.")

# --- TextDataBunch factory methods -----------------------------------------
show_doc(TextDataBunch.from_folder)
show_doc(TextDataBunch.from_csv)
show_doc(TextDataBunch.from_df)
show_doc(TextDataBunch.from_tokens)
show_doc(TextDataBunch.from_ids)
show_doc(TextDataBunch.load)
jekyll_warn(
    "This method should only be used to load back `TextDataBunch` saved in "
    "v1.0.43 or before, it is now deprecated."
)

# --- Example: building data bunches from the IMDB sample CSV ----------------
path = untar_data(URLs.IMDB_SAMPLE)
path
pd.read_csv(path/'texts.csv').head()
data_lm = TextLMDataBunch.from_csv(Path(path), 'texts.csv')
data_clas = TextClasDataBunch.from_csv(Path(path), 'texts.csv')

# --- Item and item-list classes ---------------------------------------------
show_doc(Text, title_level=3)
show_doc(TextList, title_level=3)

# Two alternative processor setups shown by the docs; the second assignment
# replaces the first, exactly as in the notebook.
tokenizer = Tokenizer(SpacyTokenizer, 'en')
processor = [
    TokenizeProcessor(tokenizer=tokenizer),
    NumericalizeProcessor(max_vocab=30000),
]
processor = SPProcessor()

show_doc(TextList.label_for_lm)
show_doc(TextList.from_folder)
show_doc(TextList.show_xys)
show_doc(TextList.show_xyzs)

# --- Processors -------------------------------------------------------------
show_doc(OpenFileProcessor, title_level=3)
show_doc(open_text)
show_doc(TokenizeProcessor, title_level=3)
show_doc(NumericalizeProcessor, title_level=3)
show_doc(SPProcessor, title_level=3)

# --- Example: inspecting one language-model batch ---------------------------
path = untar_data(URLs.IMDB_SAMPLE)
data = TextLMDataBunch.from_csv(path, 'texts.csv')
x, y = next(iter(data.train_dl))
# Keep only the first 15 entries along each of the first two axes for display.
example = x[:15, :15].cpu()
# One DataFrame row per row of `example`, split on single spaces.
texts = pd.DataFrame(
    [data.train_ds.vocab.textify(l).split(' ') for l in example]
)
texts
# This literal was split across two physical lines in the collapsed export;
# it is rejoined here on the trailing space that was already present.
jekyll_warn(
    "If you are used to another convention, beware! "
    "fastai always uses batch as a first dimension, even in NLP."
)
show_doc(LanguageModelPreLoader)

# --- Example: inspecting one classifier batch -------------------------------
path = untar_data(URLs.IMDB_SAMPLE)
data = TextClasDataBunch.from_csv(path, 'texts.csv')
iter_dl = iter(data.train_dl)
# The first batch is discarded; the second one is the batch displayed.
_ = next(iter_dl)
x, y = next(iter_dl)
x[-10:, :20]

# --- Samplers and collation -------------------------------------------------
show_doc(SortSampler)
show_doc(SortishSampler)
show_doc(pad_collate)

# --- Remaining methods ------------------------------------------------------
show_doc(TextList.new)
show_doc(TextList.get)
show_doc(TokenizeProcessor.process_one)
show_doc(TokenizeProcessor.process)
show_doc(OpenFileProcessor.process_one)
show_doc(NumericalizeProcessor.process)
show_doc(NumericalizeProcessor.process_one)
show_doc(TextList.reconstruct)
show_doc(LanguageModelPreLoader.on_epoch_begin)
show_doc(LanguageModelPreLoader.on_epoch_end)
show_doc(LMLabelList)
show_doc(LanguageModelPreLoader.allocate_buffers)
show_doc(LanguageModelPreLoader.CircularIndex.shuffle)
show_doc(LanguageModelPreLoader.fill_row)