#!/usr/bin/env python
# coding: utf-8

# # Activity 10 - More on Text Analytics
#
# In this notebook we give a simplified version of how next word prediction can be performed. This is, at its core, how larger and more complex models such as GPT work when trained on highly diverse and rich text datasets.

# In[5]:

### Here are the imports that you will require
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request


def load_data():
    # the data is the Cornell Movie-Dialogs Corpus: one dialogue line per row,
    # with five fields separated by ' +++$+++ '
    file_name = './data/movie_lines.txt'
    # pd.read_csv treats a multi-character separator as a regular expression,
    # and '+' and '$' are regex metacharacters, so we parse the file manually
    #data = pd.read_csv(file_name, sep='+++$+++')
    lines = []
    with open(file_name, 'r', encoding='utf-8', errors="replace") as f:
        for line in f:
            line = line.split(" +++$+++ ")
            line[4] = line[4].rstrip('\n')  # strip the trailing newline from the text field
            lines.append(line)
    data = pd.DataFrame.from_records(lines, columns=['ID1', 'ID2', 'ID3', 'ID4', 'Text'])
    return data


# In[6]:

data = load_data()
data

# In[39]:

# join every dialogue line into one long string, then strip punctuation;
# note that '--' must be replaced before '-', or it would never be matched
large_data_string = ' '.join(data['Text'].values)
large_data_string = large_data_string.replace("--", " ")
large_data_string = large_data_string.replace("-", " ")
large_data_string = large_data_string.replace(".", " ")
large_data_string = large_data_string.replace(",", " ")
large_data_string = large_data_string.replace("!", " ")
large_data_string = large_data_string.replace("?", " ")

# In[40]:

#large_data_string # this will output the full string

# In[41]:

print(len(large_data_string)) # this is the number of characters in the string

# In[42]:

large_data = large_data_string.split(" ") # we split by the space to create an array of words
ld = []
for i in large_data:
    if len(i) > 1:  # drop empty strings and single characters left over from the cleaning
        ld.append(i)
#ld

# Below we process the text into overlapping five-word sequences (a sliding window over the word list), which we will use to train our ML model.

# In[43]:

# slide a window of five words along the text; stopping at len(ld) - 4
# keeps every window full width (no short windows at the end of the list)
val1 = 0
val2 = 5
ld_matrix = []
for i in range(len(ld) - 4):
    ld_matrix.append(ld[val1:val2])
    val1 = val1 + 1
    val2 = val2 + 1
ld_matrix

# In[44]:

ld

# In[46]:

# encode each distinct word as an integer label so the sequences
# can be fed to a machine learning model
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ld_labels = le.fit_transform(ld)

# In[48]:

len(ld_labels)

# In[49]:

len(ld)

# In[54]:

ld_labels

# In[65]:

# the same sliding window as before, but over the integer labels
val1 = 0
val2 = 5
ld_labels = list(ld_labels)
ld_label_matrix = []
for i in range(len(ld_labels) - 4):
    ld_label_matrix.append(np.array(ld_labels[val1:val2]))
    val1 = val1 + 1
    val2 = val2 + 1

# In[67]:

ld_label_matrix = np.array(ld_label_matrix)
ld_label_matrix
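
# The notebook stops once the label matrix is built. Below is a minimal sketch of one way to finish the pipeline; the model choice here is our assumption, not part of the original activity. We treat the first four labels of each window as the features and the fifth as the target, fit a simple classifier, and decode a predicted label back into a word with the LabelEncoder.

# In[ ]:

from sklearn.tree import DecisionTreeClassifier

X = ld_label_matrix[:, :4]  # four context words, as integer labels
y = ld_label_matrix[:, 4]   # the fifth word, which we want to predict

# an illustrative model choice; any classifier mapping X -> y would do here
model = DecisionTreeClassifier()
# fit on a slice so the sketch runs quickly; use the full matrix for real training
model.fit(X[:50000], y[:50000])

# predict the next word for the first context window and decode it back to text
sample = X[0].reshape(1, -1)
predicted_label = model.predict(sample)[0]
print('Context:', ' '.join(le.inverse_transform(X[0])))
print('Predicted next word:', le.inverse_transform([predicted_label])[0])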