#!/usr/bin/env python
# coding: utf-8

# ```{admonition} Information
# __Section__: Put everything together
# __Goal__: Apply all the methods seen so far together to see how the text is transformed.
# __Time needed__: 10 min
# __Prerequisites__: Chapter 3
# ```

# # Put everything together
# 
# Now that we have detailed the transformations to apply to the text, let's see how the tweets change when we apply all the methods one after the other.

# ## Tweet examples

# In[1]:


# Put the example tweets in a single list
tweet_1 = 'Hospitalizations from COVID-19 have increased nearly 90% and California officials say they could triple by Christmas. https://t.co/hrBnP04HnB'
tweet_2 = 'Something for the afternoon slump / journey home / after school / cooking dinner ... a special 30 minute mix of cool Christmas tunes intercut with Christmas film samples and scratching @BBCSounds https://t.co/rHovIA3u5e'
tweet_3 = 'This happened in Adelaide the other day. #koala #Adelaide https://t.co/vAQFkd5r7q'

list_tweets = [tweet_1, tweet_2, tweet_3]


# ## Result
# 
# Change the value of the tweets to see how any text is transformed by our preprocessing.

# In[4]:


# Function for tweet preprocessing with what we saw in the chapter
def preprocess_tweet(tweet):
    '''
    Takes a tweet as input and returns the list of tokens.
    '''
    import emoji
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    # Initialization
    new_tweet = tweet

    ## Changes on the string
    # Remove URLs
    new_tweet = re.sub(r'https?://[^ ]+', '', new_tweet)
    # Remove usernames
    new_tweet = re.sub(r'@[^ ]+', '', new_tweet)
    # Remove the hashtag symbol (keep the word)
    new_tweet = re.sub(r'#', '', new_tweet)
    # Character normalization: collapse characters repeated three or more times
    new_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', new_tweet)
    # Emoji transformation
    new_tweet = emoji.demojize(new_tweet)
    # Numbers, punctuation and special characters
    new_tweet = re.sub(r' 0 ', ' zero ', new_tweet)
    new_tweet = re.sub(r'[^A-Za-z ]', '', new_tweet)
    # Lower casing
    new_tweet = new_tweet.lower()

    ## Changes on the tokens
    # Tokenization
    tokens = word_tokenize(new_tweet)
    # Stopword removal and stemming: build a new list instead of removing
    # elements while iterating over the list (which silently skips tokens)
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [porter.stem(token) for token in tokens if token not in stop_words]

    return tokens


# In[7]:


# Use the function on our list of tweets
list_tweets2 = []
for tweet in list_tweets:
    print(tweet)
    tokens = preprocess_tweet(tweet)
    print(tokens)
    list_tweets2.append(tokens)


# In[10]:


# Beginner version: cell to hide
import ipywidgets as widgets
from ipywidgets import interact

def preprocess_tweet(tweet):
    '''
    Takes a tweet as input and prints the list of tokens.
    '''
    import emoji
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    new_tweet = tweet
    new_tweet = re.sub(r'https?://[^ ]+', '', new_tweet)
    new_tweet = re.sub(r'@[^ ]+', '', new_tweet)
    new_tweet = re.sub(r'#', '', new_tweet)
    new_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', new_tweet)
    new_tweet = emoji.demojize(new_tweet)
    new_tweet = re.sub(r' 0 ', ' zero ', new_tweet)
    new_tweet = re.sub(r'[^A-Za-z ]', '', new_tweet)
    new_tweet = new_tweet.lower()

    tokens = word_tokenize(new_tweet)
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [porter.stem(token) for token in tokens if token not in stop_words]
    print(tokens)

interact(preprocess_tweet, tweet = widgets.Textarea(
    value = tweet_1,
    description = 'Tweet:',
    disabled = False
))
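# The token lists in `list_tweets2` can now be turned into numbers. The cell below is a minimal, optional sketch (it is not part of the chapter's code) of how a simple bag-of-words matrix could be built with scikit-learn's `CountVectorizer`; it assumes scikit-learn is installed (version 1.0 or later for `get_feature_names_out`).

# In[ ]:


# Optional sketch: bag-of-words representation of the preprocessed tweets
# (assumes scikit-learn >= 1.0 is installed; it is not used elsewhere in this chapter)
from sklearn.feature_extraction.text import CountVectorizer

# Join the tokens back into one string per tweet so CountVectorizer can consume them
documents = [' '.join(tokens) for tokens in list_tweets2]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())  # vocabulary learned from the three tweets
print(bow_matrix.toarray())                # one row per tweet, one column per word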
# ## Conclusion
# 
# This chapter showed some simple text transformations for a machine learning experiment based on text analysis. This was only a simple case (bag-of-words, sketched just above), where each word is treated as an independent entity. Other, more sophisticated, methods also take into account the role a word plays in the sentence (part-of-speech tagging, for example; a small illustration follows the quiz below) and perform more language-based analysis. To go further on the theory, you can have a look at this [good article](https://machinelearningmastery.com/natural-language-processing/) or [this one](https://becominghuman.ai/a-simple-introduction-to-natural-language-processing-ea66a1747b32). For some Python-oriented resources, have a look [here](https://towardsdatascience.com/gentle-start-to-natural-language-processing-using-python-6e46c07addf3) or [there](https://medium.com/towards-artificial-intelligence/natural-language-processing-nlp-with-python-tutorial-for-beginners-1f54e610a1a0).

# In[ ]:


from IPython.display import IFrame
IFrame("https://blog.hoou.de/wp-admin/admin-ajax.php?action=h5p_embed&id=65", "959", "332")
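# As a small illustration of the part-of-speech tagging mentioned in the conclusion, the cell below is a minimal sketch using NLTK's `pos_tag` on one of the raw tweets. It is not part of the chapter's pipeline and assumes the `averaged_perceptron_tagger` resource has been downloaded (e.g. with `nltk.download('averaged_perceptron_tagger')`).

# In[ ]:


# Optional sketch: part-of-speech tagging with NLTK
# (assumes nltk.download('punkt') and nltk.download('averaged_perceptron_tagger') have been run)
from nltk import word_tokenize, pos_tag

tagged = pos_tag(word_tokenize(tweet_3))
print(tagged)  # list of (word, tag) pairs, e.g. ('happened', 'VBD') for a past-tense verb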