!python -m spacy download es_core_news_sm
! pip install -U symspellpy
import nltk # importar natural language toolkit
nltk.download('punkt') 
nltk.download('stopwords') # modulo para descargar stopwords en diferentes idiomas
nltk.download('wordnet')
from nltk.corpus import stopwords
import pandas as pd
import numpy  as np
import re
import string
import plotly
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer 
import time
import spacy
import es_core_news_sm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.probability import FreqDist
from wordcloud import WordCloud
import pickle
from symspellpy import SymSpell
import pkg_resources
from symspellpy import SymSpell, Verbosity

!python -m spacy download es_core_news_md

import es_core_news_md
# Load the medium Spanish spaCy pipeline once; later code in this file reuses `nlp`.
nlp = es_core_news_md.load()

# Run the pipeline on a sample sentence and print the text of every token.
texto = 'Este es un tutorial acerca de Procesamiento de lenguaje usando Python con spaCy'
doc = nlp(texto)
print([tok.text for tok in doc])

# Sample text with two sentences, used to demonstrate sentence segmentation
# and the attributes available on each token.
text = 'Gus es un desarrollador en Python actualmente trabajando para una compañia Fintech en Londres Inglaterra. Se encuentra interesado en aprender NLP.'
t = nlp(text)

# Sentence segmentation: print how many sentences were found, then each one.
oraciones = list(t.sents)
print(len(oraciones))
for oracion in oraciones:
    print(oracion)

# Every token next to its `idx` attribute.
for token in t:
    print(token, token.idx)

# Every token followed by a wider set of its attributes, in this fixed order.
token_attrs = (
    'idx', 'text_with_ws',
    'is_alpha', 'is_punct', 'is_space',
    'shape_', 'is_stop',
)
for token in t:
    print(token, *(getattr(token, attr) for attr in token_attrs))

import spacy

# Spanish stop-word collection shipped with spaCy: print its size and ten entries.
spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
print(len(spacy_stopwords))
muestra = list(spacy_stopwords)[:10]
for stop_word in muestra:
    print(stop_word)

# Print only the tokens of `t` that are not flagged as stop words.
for token in t:
    if token.is_stop:
        continue
    print(token)

# Same filter, this time collecting the surviving tokens in a list.
documento_sin_stopword = list(filter(lambda tok: not tok.is_stop, t))
print(documento_sin_stopword)

# Each token next to its lemma.
for token in t:
    print(token, '-', token.lemma_)

# Multi-paragraph Spanish news article used as the input for the
# word-frequency and part-of-speech examples further down.
texto= '''
La FIFA responde así a una denuncia interpuesta por la Federación de Chile ante esa Comisión Disciplinaria, en la que presentaba alegaciones sobre la posible falsificación de los documentos que conceden la nacionalidad ecuatoriana Byron Castillo.

La selección de Ecuador se clasificó de forma directa para el Mundial, junto con las de Brasil, Argentina y Uruguay, al contrario que las de Chile y Perú. El combinado peruano, que terminó quinto por detrás del ecuatoriano, disputará una repesca.

El defensa fue alineado por el seleccionador ecuatoriano Gustavo Alfaro para los dos partidos contra Paraguay y Chile y en una ocasión ante Uruguay, Bolivia, Venezuela y Argentina, partidos clave para que el equipo lograse uno de los cupos directos para el Mundial.

"Innumerables pruebas de que nació en Colombia"
La Federación de Chile denunció el pasado día 5 que hay "innumerables pruebas de que el jugador nació en Colombia".

"Las investigaciones realizadas en Ecuador, entre ellas, un informe jurídico de la Dirección Nacional de Registro Civil, declararon la existencia de inconsistencias en el certificado de nacimiento presentado por el jugador", afirmó este organismo, que acusó a la Federación Ecuatoriana de tener "total conocimiento" de las irregularidades.

Una posible sanción de la FIFA podría implicar la resta de puntos a Ecuador por los partidos que Castillo jugó, lo que alteraría la nómina de clasificados.

Un informe técnico jurídico de la dirección nacional del registro civil de Ecuador afirma que la inscripción de nacimiento de Byron Castillo en la ciudad ecuatoriana de Guayas no consta en el tomo, la página y el acta solicitado, según un documento oficial.
'''

type(texto)

import re
# Collapse every run of line breaks into ONE space. Deleting the breaks
# outright glues the last word of a paragraph to the first word of the next
# ("Castillo.La", 'Colombia"La'), which corrupts tokenization and the
# frequency counts computed below.
texto1 = re.sub(r'\n+', ' ', texto).strip()
print(type(texto1))
str(texto1)

doc = nlp(texto1)
# Keep only content tokens: drop stop words, punctuation and whitespace.
words = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]
from collections import Counter
word_freq = Counter(words)
# The five most frequent words together with their counts.
common_words = word_freq.most_common(5)
print(common_words)
# Words that occur exactly once in the text.
unique_words = [word for word, freq in word_freq.items() if freq == 1]
print(unique_words)

# For every token print: the token, its `tag_`, its `pos_`, and
# spaCy's explanation of the tag.
for token in doc:
    fine_tag = token.tag_
    print(token, ' -', fine_tag, ' -', token.pos_, ' -', spacy.explain(fine_tag))

# Split the tokens by part of speech: nouns in one list, adjectives in another.
nouns = [token for token in doc if token.pos_ == 'NOUN']
adjectives = [token for token in doc if token.pos_ == 'ADJ']
print(nouns)
print(adjectives)

from spacy import displacy

# Parse a short sentence and render it with displaCy's 'dep' style in the notebook.
texto = 'el se encuentra interesado en aprender Procesamiento de Lenguaje Natural'
t = nlp(texto)
displacy.render(t, jupyter=True, style='dep')

import os
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

from pylab import rcParams
from wordcloud import WordCloud
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Default figure size applied to every matplotlib figure created from here on.
rcParams['figure.figsize'] = 30, 60
# IPython magic (not valid in a plain .py script): show figures inline in the notebook.
%matplotlib inline

# Mount Google Drive inside the Colab runtime and make it the working
# directory, so the relative data paths used later resolve there.
from google.colab import drive
import os
drive.mount('/content/gdrive')
# Set the access path on Drive
import os
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")

# Load the reviews CSV from the current working directory.
import pandas as pd
amazon_reviews = pd.read_csv('Reviews.csv')
# Optionally keep only the first 10,000 records for faster computation (currently disabled)
#amazon_reviews = amazon_reviews[:10000]
amazon_reviews.head()

# (rows, columns) of the loaded dataset.
amazon_reviews.shape

# Number of space-separated pieces in each review, shown as a 100-bin histogram.
words_per_review = amazon_reviews['Text'].map(lambda review: len(review.split(" ")))
words_per_review.hist(bins=100)
plt.title('Numero de palabras por revision')
plt.xlabel('Palabras')
plt.ylabel('Frecuencia')

# Average review length under the same measure.
words_per_review.mean()

# Number of reviews for each score value.
amazon_reviews['Score'].value_counts()

# The same distribution expressed as a percentage of all rows.
percent_val = amazon_reviews['Score'].value_counts() * 100 / len(amazon_reviews)
percent_val

percent_val.plot(kind='bar')
plt.title('Revisiones por scores')
plt.xlabel('Score')
plt.ylabel('Porcentaje (%)')

# Concatenate all reviews into a single corpus string for the word cloud.
# The separator must be a space: joining with '' fuses the last word of one
# review with the first word of the next, creating words that never occurred.
word_cloud_text = ' '.join(amazon_reviews.Text)
print(len(word_cloud_text))

# Build the cloud from the corpus (at most 100 words, white background).
wordcloud = WordCloud(
    max_font_size=100,
    max_words=100,
    background_color="white",
    scale=10,
    width=800,
    height=400
).generate(word_cloud_text)

# Show the generated image without axes.
plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Binary sentiment label: 1 when the score is above 3, otherwise 0.
amazon_reviews['Sentiment_rating'] = np.where(amazon_reviews.Score > 3, 1, 0)
amazon_reviews['Sentiment_rating'].value_counts()

# Remove neutral reviews (score == 3). The explicit .copy() makes the result
# an independent DataFrame, so the column added at the end of this block is
# not written into a slice of the original frame (SettingWithCopyWarning).
amazon_reviews = amazon_reviews[amazon_reviews.Score != 3].copy()

#rcParams['figure.figsize'] = 8, 5
# Class balance once the neutral reviews are gone.
amazon_reviews.Sentiment_rating.value_counts().plot.bar()
plt.title('Scores luego de estandarizacion')
plt.xlabel('Score')
plt.ylabel('Frecuencia')

# Lower-cased copy of the review text; the later cleaning steps work on it.
amazon_reviews['reviews_text_new'] = amazon_reviews.Text.str.lower()

from nltk import word_tokenize
import nltk

nltk.download('punkt')

# Vocabulary size of the raw review text.
token_lists = [word_tokenize(each) for each in amazon_reviews.Text]
tokens = [item for sublist in token_lists for item in sublist]
unique_tokens_before = len(set(tokens))
print("Numero de tokens unicos antes: ", unique_tokens_before)

# Vocabulary size after lower-casing.
token_lists_lower = [word_tokenize(each) for each in amazon_reviews.reviews_text_new]
tokens_lower = [item for sublist in token_lists_lower for item in sublist]
unique_tokens_after = len(set(tokens_lower))
print("Numero de tokens unicos nuevos: ", unique_tokens_after)

# Relative change in vocabulary size caused by lower-casing. It is computed
# from the counts above rather than from numbers copied out of an earlier
# run, so it stays correct when the dataset changes.
(unique_tokens_after - unique_tokens_before) / unique_tokens_before if unique_tokens_before else 0.0

# For each review, collect the characters that are neither alphanumeric nor a space.
special_chars = amazon_reviews.reviews_text_new.apply(
    lambda review: [ch for ch in review if ch != ' ' and not ch.isalnum()]
)

# Flatten the per-review lists into a single list.
flat_list = []
for chars in special_chars:
    flat_list.extend(chars)

# Distinct special characters found across the dataset.
print(set(flat_list))

import re
# Keep the pre-cleaning text so a review can be compared before and after.
review_backup = amazon_reviews.reviews_text_new.copy()
# Replace every run of characters other than ASCII letters, digits and
# spaces with a single space.
amazon_reviews.reviews_text_new = amazon_reviews.reviews_text_new.apply(
    lambda x: re.sub(r'[^A-Za-z0-9 ]+', ' ', x)
)

# Show the SAME review (7th row by position) before and after cleaning.
# Both sides use positional access: neutral rows were filtered out earlier
# without resetting the index, so the label 6 may be missing or may point to
# a different row than position 6.
print("Review anterior:")
review_backup.values[6]

print("Review nuevo:")
amazon_reviews.reviews_text_new.values[6]

# Vocabulary size of the raw review text.
token_lists = list(map(word_tokenize, amazon_reviews.Text))
tokens = [tok for review_tokens in token_lists for tok in review_tokens]
print("Numero de token unicos antes: ", len(set(tokens)))

# Same measure on the cleaned text; `tokens` ends up holding the cleaned tokens.
token_lists = list(map(word_tokenize, amazon_reviews.reviews_text_new))
tokens = [tok for review_tokens in token_lists for tok in review_tokens]
print("Numero de tokens unicos despues: ", len(set(tokens)))

import nltk
nltk.download('stopwords')

# Start the noise-word list from NLTK's English stop words.
stopwords_corpus = nltk.corpus.stopwords
eng_stop_words = stopwords_corpus.words('english')
noise_words = list(eng_stop_words)
print(len(noise_words))
noise_words

# Count every token once and reuse the counter for both ends of the ranking.
token_counts = Counter(tokens)

# Size of a 1% slice of the vocabulary (number of distinct tokens).
one_percentile = int(len(token_counts) * 0.01)
top_1_percentile = token_counts.most_common(one_percentile)
top_1_percentile[:10]

# Plot only when the slice is non-empty; an empty frame has nothing to draw.
if top_1_percentile:
    pd.DataFrame(top_1_percentile[:10], columns=['Palabras','Frecuencia']).set_index('Palabras').plot(kind='bar')
    plt.title('Percentil 1 de palabras mas frecuentes')
    plt.xlabel('Palabras')
    plt.ylabel('Frecuencia')

# With fewer than 100 distinct tokens one_percentile is 0, and the slice
# [-0:] would return the WHOLE ranking, turning every token into a noise
# word. Use an empty list in that case.
bottom_1_percentile = token_counts.most_common()[-one_percentile:] if one_percentile else []
bottom_1_percentile[:10]

# Treat both the most frequent and the rarest 1% of tokens as noise.
noise_words.extend(word for word, _ in top_1_percentile)
noise_words.extend(word for word, _ in bottom_1_percentile)

from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer

nltk.download('wordnet')

from nltk.corpus import wordnet

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Stem three inflections of the same word with the Lancaster stemmer.
print("Lancaster Stemmer")
for word in ("trouble", "troubling", "troubled"):
    print(lancaster.stem(word))

# Lemmatize the same words, passing the part of speech for each one.
print("WordNet Lemmatizer")
for word, pos in (
    ("trouble", wordnet.NOUN),
    ("troubling", wordnet.VERB),
    ("troubled", wordnet.VERB),
):
    print(lemmatizer.lemmatize(word, pos))

# Preview the text, the original score and the derived sentiment label.
amazon_reviews.loc[:, ['Text', 'Score', 'Sentiment_rating']].head()

# Objects used by the stemming helper defined next: a Porter stemmer and
# the default analyzer built by CountVectorizer.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    """Return a generator with the stem of each token of *doc*.

    The text is split with the module-level ``analyzer`` and each resulting
    token is stemmed with the module-level ``stemmer``.
    """
    doc_tokens = analyzer(doc)
    return (stemmer.stem(token) for token in doc_tokens)

# Bag-of-words vectorizer: NLTK tokenization, unigrams only, noise words removed.
bow_counts = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=noise_words,
    tokenizer=word_tokenize,
)

# 80/20 train/test split with a fixed seed.
reviews_train, reviews_test = train_test_split(
    amazon_reviews,
    random_state=0,
    test_size=0.2,
)

# Fit the vocabulary on the training text only, then transform both sets.
X_train_bow = bow_counts.fit_transform(reviews_train['reviews_text_new'])
X_test_bow = bow_counts.transform(reviews_test['reviews_text_new'])

y_train_bow = reviews_train.Sentiment_rating
y_test_bow = reviews_test.Sentiment_rating

# Share of each class in the test set.
y_test_bow.value_counts() / len(y_test_bow)

# Train a logistic regression on the bag-of-words features.
lr_model_all = LogisticRegression(solver="liblinear", C=1).fit(X_train_bow, y_train_bow)

# Class probabilities and predicted labels for the test set.
test_pred_lr_prob = lr_model_all.predict_proba(X_test_bow)
test_pred_lr_all = lr_model_all.predict(X_test_bow)

print("F1 score: ", f1_score(y_true=y_test_bow, y_pred=test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_true=y_test_bow, y_pred=test_pred_lr_all) * 100)

test_pred_lr_prob

# Second column of predict_proba for every test review (label 1, assuming
# the model's classes are ordered 0, 1).
probabilities = test_pred_lr_prob[:, 1]

# Put the text, the true labels and the model output side by side, keeping
# the test set's index.
predictions = pd.DataFrame(
    {
        'Text': reviews_test['Text'],
        'Actual_Score': reviews_test['Score'],
        'Sentiment_rating': reviews_test['Sentiment_rating'],
        'Predicted_sentiment': test_pred_lr_all,
        'Predicted_probability': probabilities,
    },
    index=reviews_test.index,
)

predictions.head(5)

accuracy_score(predictions['Sentiment_rating'], predictions['Predicted_sentiment'])

# Misclassified reviews, ordered by the predicted probability (lowest first).
misclassified = predictions[
    predictions['Predicted_sentiment'] != predictions['Sentiment_rating']
].sort_values(by=["Predicted_probability"])
misclassified.head(3)

# Inspect the first of those rows in full. The row is picked by position
# instead of a hard-coded index label, which only exists for one particular
# dataset/split and raises KeyError otherwise.
misclassified.head(1).values

# Changes with respect to the previous vectorizer:
# 1. n-grams go from unigrams only to 1-, 2-, 3- and 4-grams
# 2. stop words are kept as bag-of-words features
bow_counts = CountVectorizer(ngram_range=(1, 4), tokenizer=word_tokenize)

X_train_bow = bow_counts.fit_transform(reviews_train['reviews_text_new'])
X_test_bow = bow_counts.transform(reviews_test['reviews_text_new'])

# Note the growth in the number of features now that stop words are included.
X_train_bow

# Changes to the logistic regression:
# - regularization penalty switched from the default l2 to l1
# - cost parameter C set to 0.9
# The penalty is now passed explicitly; previously it was left at the
# default, so the model did not match the change described here.
lr_model_all_new = LogisticRegression(penalty="l1", C=0.9, solver="liblinear")

# Train the model
lr_model_all_new.fit(X_train_bow, y_train_bow)

# Predict on the test set
test_pred_lr_prob = lr_model_all_new.predict_proba(X_test_bow)
test_pred_lr_all = lr_model_all_new.predict(X_test_bow)

print("F1 score: ", f1_score(y_test_bow, test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_test_bow, test_pred_lr_all) * 100)

# Feature names of the fitted vectorizer. get_feature_names() was removed in
# scikit-learn 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that do not have it yet.
bow_feature_names = (
    bow_counts.get_feature_names_out()
    if hasattr(bow_counts, "get_feature_names_out")
    else bow_counts.get_feature_names()
)

# Pair every n-gram with its logistic-regression coefficient.
lr_weights = pd.DataFrame(
    {
        'words': bow_feature_names,
        'weights': lr_model_all_new.coef_[0],
    }
)

# 15 largest coefficients.
lr_weights.sort_values(['weights'],ascending = False)[:15]

# 15 smallest coefficients.
lr_weights.sort_values(['weights'],ascending = False)[-15:]

# Random forest with 100 trees on the same bag-of-words features.
rf_model_all = RandomForestClassifier(n_estimators=100)
rf_model_all.fit(X_train_bow, y_train_bow)

# Test-set output. The `test_pred_lr_*` names are reused from the logistic
# regression code even though they now hold random-forest results.
test_pred_lr_prob = rf_model_all.predict_proba(X_test_bow)
test_pred_lr_all = rf_model_all.predict(X_test_bow)

print("F1 score: ", f1_score(y_true=y_test_bow, y_pred=test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_true=y_test_bow, y_pred=test_pred_lr_all) * 100)

# Importance of each n-gram according to the random forest.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases without it.
feature_importances = pd.DataFrame(
    rf_model_all.feature_importances_,
    index=(
        bow_counts.get_feature_names_out()
        if hasattr(bow_counts, "get_feature_names_out")
        else bow_counts.get_feature_names()
    ),
    columns=['importance']
)

# Ten most important features.
feature_importances.sort_values(['importance'], ascending=False)[:10]

# TF-IDF vectorizer on unigrams. The noise words are still passed in, although
# they matter less here because TF-IDF would give them a low weight anyway.
tfidf_counts = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=noise_words,
    tokenizer=word_tokenize,
)

X_train_tfidf = tfidf_counts.fit_transform(reviews_train['reviews_text_new'])
X_test_tfidf = tfidf_counts.transform(reviews_test['reviews_text_new'])

# Create and train the classifier.
lr_model_tf_idf = LogisticRegression(solver="liblinear").fit(X_train_tfidf, y_train_bow)

# Predict on the test set.
test_pred_lr_prob = lr_model_tf_idf.predict_proba(X_test_tfidf)
test_pred_lr_all = lr_model_tf_idf.predict(X_test_tfidf)

# Evaluate the model.
print("F1 score: ", f1_score(y_true=y_test_bow, y_pred=test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_true=y_test_bow, y_pred=test_pred_lr_all) * 100)

# TF-IDF over 1- to 4-grams, this time without removing any stop words.
tfidf_counts = TfidfVectorizer(ngram_range=(1, 4), tokenizer=word_tokenize)

X_train_tfidf = tfidf_counts.fit_transform(reviews_train['reviews_text_new'])
X_test_tfidf = tfidf_counts.transform(reviews_test['reviews_text_new'])

# L1-regularized logistic regression with C=10, trained on those features.
lr_model_tf_idf_new = LogisticRegression(C=10, penalty='l1', solver="liblinear")
lr_model_tf_idf_new.fit(X_train_tfidf, y_train_bow)

# Predict on the test set.
test_pred_lr_prob = lr_model_tf_idf_new.predict_proba(X_test_tfidf)
test_pred_lr_all = lr_model_tf_idf_new.predict(X_test_tfidf)

# Evaluate the model.
print("F1 score: ", f1_score(y_true=y_test_bow, y_pred=test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_true=y_test_bow, y_pred=test_pred_lr_all) * 100)

# Feature names of the fitted TF-IDF vectorizer. get_feature_names() was
# removed in scikit-learn 1.2; prefer get_feature_names_out() and fall back
# to the old method only on releases that do not have it yet.
tfidf_feature_names = (
    tfidf_counts.get_feature_names_out()
    if hasattr(tfidf_counts, "get_feature_names_out")
    else tfidf_counts.get_feature_names()
)

# Pair every n-gram with its logistic-regression coefficient.
lr_weights = pd.DataFrame(
    {
        'words': tfidf_feature_names,
        'weights': lr_model_tf_idf_new.coef_[0],
    }
)

# 10 largest coefficients.
lr_weights.sort_values(['weights'],ascending = False)[:10]

# 10 smallest coefficients.
lr_weights.sort_values(['weights'],ascending = False)[-10:]

import gensim
# Load a pre-trained GloVe word embedding trained on a Twitter dataset.
# Per the file name, each word is represented by a 200-dimensional vector.
# The file is read in word2vec text format from the current working directory.
model = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(os.getcwd(), 'glove.twitter.27B.200d_out.txt'),
    binary=False,
    unicode_errors='ignore'
)

# Check the vector length for a couple of words and look at the raw vectors.
print("El embedding para food es", len(model['food']), "dimensional")

model['food']

print("El embedding para great es", len(model['great']), "dimensional")

model['great']

def print_similarity(word1, word2, model):
    """Print the cosine similarity of two words' vectors as a rounded percentage.

    Args:
        word1: First word; looked up as ``model[word1]``.
        word2: Second word; looked up as ``model[word2]``.
        model: Mapping from word to vector that supports ``model[word]``.
    """
    vec_a, vec_b = model[word1], model[word2]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    cosine = np.dot(vec_a, vec_b) / norm_product
    print(f"{word1} y {word2} son {round(cosine * 100)}% similar")

# Compare a few word pairs using the loaded embedding model.
for left, right in (
    ("cat", "dog"),
    ("good", "bad"),
    ("great", "good"),
    ("grass", "model"),
):
    print_similarity(left, right, model)

# Set to True to leave noise words out of the per-review embedding average.
remove_noise_words = False
# noise_words is a list; a set gives constant-time membership tests.
noise_word_set = set(noise_words)

review_embeddings = []

# Represent each review by the average of the vectors of its in-vocabulary words.
for each_review in amazon_reviews.reviews_text_new:
    review_average = np.zeros(model.vector_size)
    count_val = 0

    for each_word in word_tokenize(each_review):
        word = each_word.lower()

        if remove_noise_words and word in noise_word_set:
            continue

        if word in model:
            review_average += model[word]
            count_val += 1

    # A review with no in-vocabulary word keeps the all-zero vector. Dividing
    # by zero here would emit a RuntimeWarning and fill the row with NaN.
    if count_val:
        review_average /= count_val

    review_embeddings.append(review_average)

embedding_data = pd.DataFrame(review_embeddings)
# Kept as a safety net; no NaN is produced by the loop above.
embedding_data = embedding_data.fillna(0)

# 80/20 split of the averaged embeddings and their sentiment labels (fixed seed).
X_train_embed, X_test_embed, y_train_embed, y_test_embed = train_test_split(
    embedding_data,
    amazon_reviews['Sentiment_rating'],
    random_state=0,
    test_size=0.2,
)

# L1-regularized logistic regression on the embedding features.
lr_model = LogisticRegression(solver="liblinear", C=10, penalty="l1")
lr_model.fit(X_train_embed, y_train_embed)

test_pred_lr_prob = lr_model.predict_proba(X_test_embed)
test_pred_lr_all = lr_model.predict(X_test_embed)

print("F1 score: ", f1_score(y_true=y_test_embed, y_pred=test_pred_lr_all))
print("Accuracy: ", accuracy_score(y_true=y_test_embed, y_pred=test_pred_lr_all) * 100)