#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook setup: turn on IPython's autoreload extension (mode 2) and inline
# matplotlib rendering, then pin the CUDA device selection for this process.
import os

get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# Order devices by PCI bus ID and make only device "0" visible.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})


# # Text Extraction
# 
# As of v0.28.x, **ktrain** includes the `TextExtractor` class, which allows you to easily extract text from various file formats such as PDFs and MS Word documents.

# In[2]:


# Fetch the PDF used in the examples below and save it as /tmp/bert_paper.pdf.
# Relies on the `wget` command-line tool being available on the system.
get_ipython().system('wget https://aclanthology.org/N19-1423.pdf -O /tmp/bert_paper.pdf')


# In[3]:


# Import the extractor class and create one instance with default arguments;
# `te` is reused by every extraction example in the cells that follow.
from ktrain.text import TextExtractor
te = TextExtractor()


# #### Extract text into single string variable:

# In[4]:


# Extract the whole document into one variable (no `return_format` given).
rawtext = te.extract('/tmp/bert_paper.pdf')
# Show only the first 1000 characters to keep the cell output short.
print(rawtext[:1000])


# #### Extract text and split into sentences:

# In[5]:


# Same file, but request the text split into sentences.
sentences = te.extract('/tmp/bert_paper.pdf', return_format='sentences')
# Print just the first five entries as a quick preview.
print(sentences[:5])


# #### Extract text and split into paragraphs:

# In[6]:


# Paragraph-level extraction: report how many paragraphs were returned,
# then print the third one (index 2) as a sample.
paragraphs = te.extract('/tmp/bert_paper.pdf', return_format='paragraphs')
print(f"{len(paragraphs)} paragraphs")
print('Third paragraph from paper is:\n')
print(paragraphs[2])


# You can also pass a string directly to the `TextExtractor` to split it into a list of sentences or paragraphs:

# In[7]:


# Pass raw text through the `text` keyword instead of a file path.
two_sentences = 'This is the first sentence.  This is the second sentence.'
# Left as a bare expression so the notebook displays the returned value.
te.extract(text=two_sentences, return_format='sentences')


# In[ ]: