#!/usr/bin/env python
# coding: utf-8

# # Activity 10 - More on Text Analytics
#
# In this notebook we give a simplified version of how next word prediction can be performed. This is, at its core, how larger and more complex models such as GPT work when trained on highly diverse and rich text datasets.

# In[5]:

### Here are the imports that you will require
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request


def load_data():
    # the data is the Cornell Movie-Dialogs Corpus: one dialogue line per row,
    # with five fields separated by ' +++$+++ '
    file_name = './data/movie_lines.txt'
    # pd.read_csv treats a multi-character separator as a regular expression,
    # and '+' and '$' are regex metacharacters, so we parse the file manually
    #data = pd.read_csv(file_name, sep='+++$+++')
    lines = []
    with open(file_name, 'r', encoding='utf-8', errors="replace") as f:
        for line in f:
            line = line.split(" +++$+++ ")
            line[4] = line[4].rstrip('\n')  # strip the trailing newline from the text field
            lines.append(line)
    data = pd.DataFrame.from_records(lines, columns=['ID1', 'ID2', 'ID3', 'ID4', 'Text'])
    return data


# In[6]:

data = load_data()
data

# In[39]:

# join every dialogue line into one long string, then strip punctuation;
# note that '--' must be replaced before '-', or it would never be matched
large_data_string = ' '.join(data['Text'].values)
large_data_string = large_data_string.replace("--", " ")
large_data_string = large_data_string.replace("-", " ")
large_data_string = large_data_string.replace(".", " ")
large_data_string = large_data_string.replace(",", " ")
large_data_string = large_data_string.replace("!", " ")
large_data_string = large_data_string.replace("?", " ")

# In[40]:

#large_data_string # this will output the full string

# In[41]:

print(len(large_data_string)) # this is the number of characters in the string

# In[42]:

large_data = large_data_string.split(" ") # we split by the space to create an array of words
ld = []
for i in large_data:
    if len(i) > 1:  # drop empty strings and single characters left over from the cleaning
        ld.append(i)
#ld

# Below we process the text into overlapping five-word sequences (a sliding window over the word list), which we will use to train our ML model.

# In[43]:

# slide a window of five words along the text; stopping at len(ld) - 4
# keeps every window full width (no short windows at the end of the list)
val1 = 0
val2 = 5
ld_matrix = []
for i in range(len(ld) - 4):
    ld_matrix.append(ld[val1:val2])
    val1 = val1 + 1
    val2 = val2 + 1
ld_matrix

# In[44]:

ld

# In[46]:

# encode each distinct word as an integer label so the sequences
# can be fed to a machine learning model
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ld_labels = le.fit_transform(ld)

# In[48]:

len(ld_labels)

# In[49]:

len(ld)

# In[54]:

ld_labels

# In[65]:

# the same sliding window as before, but over the integer labels
val1 = 0
val2 = 5
ld_labels = list(ld_labels)
ld_label_matrix = []
for i in range(len(ld_labels) - 4):
    ld_label_matrix.append(np.array(ld_labels[val1:val2]))
    val1 = val1 + 1
    val2 = val2 + 1

# In[67]:

ld_label_matrix = np.array(ld_label_matrix)
ld_label_matrix
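
# The notebook stops once the label matrix is built. Below is a minimal sketch of one way to finish the pipeline; the model choice here is our assumption, not part of the original activity. We treat the first four labels of each window as the features and the fifth as the target, fit a simple classifier, and decode a predicted label back into a word with the LabelEncoder.

# In[ ]:

from sklearn.tree import DecisionTreeClassifier

X = ld_label_matrix[:, :4]  # four context words, as integer labels
y = ld_label_matrix[:, 4]   # the fifth word, which we want to predict

# an illustrative model choice; any classifier mapping X -> y would do here
model = DecisionTreeClassifier()
# fit on a slice so the sketch runs quickly; use the full matrix for real training
model.fit(X[:50000], y[:50000])

# predict the next word for the first context window and decode it back to text
sample = X[0].reshape(1, -1)
predicted_label = model.predict(sample)[0]
print('Context:', ' '.join(le.inverse_transform(X[0])))
print('Predicted next word:', le.inverse_transform([predicted_label])[0])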