#!/usr/bin/env python
# coding: utf-8

# In[1]:


# IPython session setup: load the autoreload extension (mode 2) and select
# the inline matplotlib backend.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os

# Set the CUDA device-selection environment variables for this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})


# # Text Extraction
#
# As of v0.28.x, **ktrain** includes the `TextExtractor` class, which allows
# you to easily extract text from various file formats such as PDFs and
# MS Word documents.

# In[2]:


# Download the sample PDF used by every example below.
get_ipython().system('wget https://aclanthology.org/N19-1423.pdf -O /tmp/bert_paper.pdf')


# In[3]:


from ktrain.text import TextExtractor

te = TextExtractor()

# Local path the wget call above wrote the PDF to.
pdf_path = '/tmp/bert_paper.pdf'


# #### Extract text into single string variable:

# In[4]:


rawtext = te.extract(pdf_path)
print(rawtext[:1000])


# #### Extract text and split into sentences:

# In[5]:


sentences = te.extract(pdf_path, return_format='sentences')
print(sentences[:5])


# #### Extract text and split into paragraphs:

# In[6]:


paragraphs = te.extract(pdf_path, return_format='paragraphs')
print("%s paragraphs" % len(paragraphs))
print('Third paragraph from paper is:\n')
print(paragraphs[2])


# You can also feed the `TextExtractor` strings to simply split them into
# lists of sentences or paragraphs:

# In[7]:


two_sentences = 'This is the first sentence. This is the second sentence.'
# Bare expression: a notebook displays the result as the cell output.
te.extract(text=two_sentences, return_format='sentences')


# In[ ]: