from fastai.gen_doc.nbdoc import *
from fastai.text import * 
from fastai.gen_doc.nbdoc import *


# Reference entries for the text DataBunch classes. Each class is passed to
# show_doc with title_level=3, followed by entries for its constructors.
for bunch_cls in (TextLMDataBunch, TextClasDataBunch):
    show_doc(bunch_cls, title_level=3)
    show_doc(bunch_cls.create)

show_doc(TextDataBunch, title_level=3)

jekyll_warn("This class can only work directly if all the texts have the same length.")

# TextDataBunch factory methods, listed in the order they are documented.
for factory in (
    TextDataBunch.from_folder,
    TextDataBunch.from_csv,
    TextDataBunch.from_df,
    TextDataBunch.from_tokens,
    TextDataBunch.from_ids,
    TextDataBunch.load,
):
    show_doc(factory)

# The note below applies to the last entry above (TextDataBunch.load).
jekyll_warn("This method should only be used to load back `TextDataBunch` saved in v1.0.43 or before, it is now deprecated.")

# Worked example on the IMDB sample: `untar_data` yields the dataset location,
# which is used below as a path-like object (it supports the `/` operator).
path = untar_data(URLs.IMDB_SAMPLE)
path

# Peek at the first rows of the CSV that holds the texts.
pd.read_csv(path/'texts.csv').head()

# Build a language-model DataBunch and a classifier DataBunch from the same CSV.
# `path` is passed as-is (no extra `Path(...)` wrap), matching the other
# `from_csv` examples in this file.
data_lm = TextLMDataBunch.from_csv(path, 'texts.csv')
data_clas = TextClasDataBunch.from_csv(path, 'texts.csv')

# Reference entries for the item class and its item list.
for item_cls in (Text, TextList):
    show_doc(item_cls, title_level=3)

# Example 1: spell out the processing steps explicitly, i.e. a tokenize step
# using a SpacyTokenizer-based Tokenizer for 'en', then a numericalize step
# with max_vocab=30000.
tokenizer = Tokenizer(SpacyTokenizer, 'en')
processor = [
    TokenizeProcessor(tokenizer=tokenizer),
    NumericalizeProcessor(max_vocab=30000),
]

# Example 2: a single SPProcessor instead. This rebinds `processor`, so the
# list built in example 1 is no longer referenced afterwards.
processor = SPProcessor()

# TextList methods, in the order they are documented.
for text_list_method in (
    TextList.label_for_lm,
    TextList.from_folder,
    TextList.show_xys,
    TextList.show_xyzs,
):
    show_doc(text_list_method)

# OpenFileProcessor is followed directly by the open_text entry.
show_doc(OpenFileProcessor, title_level=3)
show_doc(open_text)

# The remaining processor classes, each passed with title_level=3.
for processor_cls in (TokenizeProcessor, NumericalizeProcessor, SPProcessor):
    show_doc(processor_cls, title_level=3)

# Example: inspect one batch produced by a language-model DataBunch.
path = untar_data(URLs.IMDB_SAMPLE)
data = TextLMDataBunch.from_csv(path, 'texts.csv')

# Take the first batch from the training dataloader and keep its top-left
# 15x15 corner, moved to CPU.
x, y = next(iter(data.train_dl))
example = x[:15, :15].cpu()

# Turn every row of the slice into a list of space-separated pieces via the
# training vocab's `textify` (presumably ids -> text; confirm in fastai docs),
# then tabulate one row per line.
token_rows = []
for id_row in example:
    token_rows.append(data.train_ds.vocab.textify(id_row).split(' '))
texts = pd.DataFrame(token_rows)
texts

jekyll_warn("If you are used to another convention, beware! fastai always uses batch as a first dimension, even in NLP.")

show_doc(LanguageModelPreLoader)

# Example: inspect a batch produced by a classifier DataBunch.
path = untar_data(URLs.IMDB_SAMPLE)
data = TextClasDataBunch.from_csv(path, 'texts.csv')

# Skip the first training batch and look at the second one instead.
iter_dl = iter(data.train_dl)
_ = next(iter_dl)
x, y = next(iter_dl)

# Last 10 rows, first 20 columns of that batch.
x[-10:, :20]

# Reference entries for the two samplers and the collate function.
for batching_helper in (SortSampler, SortishSampler, pad_collate):
    show_doc(batching_helper)

# Remaining reference entries, emitted one by one in the listed order.
for documented in (
    TextList.new,
    TextList.get,
    TokenizeProcessor.process_one,
    TokenizeProcessor.process,
    OpenFileProcessor.process_one,
    NumericalizeProcessor.process,
    NumericalizeProcessor.process_one,
    TextList.reconstruct,
    LanguageModelPreLoader.on_epoch_begin,
    LanguageModelPreLoader.on_epoch_end,
    LMLabelList,
    LanguageModelPreLoader.allocate_buffers,
    LanguageModelPreLoader.CircularIndex.shuffle,
    LanguageModelPreLoader.fill_row,
):
    show_doc(documented)