#!/usr/bin/env python
# coding: utf-8

# ```{admonition} Information
# __Section__: Put everything together
# __Goal__: Apply all the methods seen so far together to see how the text is transformed.
# __Time needed__: 10 min
# __Prerequisites__: Chapter 3
# ```

# # Put everything together
# 
# Now that we have detailed the transformations to apply to the text, let's see how the tweets change when we apply all the methods one after the other.

# ## Tweet examples

# In[1]:


# Put the example tweets in a single list
tweet_1 = 'Hospitalizations from COVID-19 have increased nearly 90% and California officials say they could triple by Christmas. https://t.co/hrBnP04HnB'
tweet_2 = 'Something for the afternoon slump / journey home / after school / cooking dinner ... a special 30 minute mix of cool Christmas tunes intercut with Christmas film samples and scratching @BBCSounds https://t.co/rHovIA3u5e'
tweet_3 = 'This happened in Adelaide the other day. #koala #Adelaide https://t.co/vAQFkd5r7q'

list_tweets = [tweet_1, tweet_2, tweet_3]


# ## Result
# 
# Change the value of the tweets to see how any text is transformed by our preprocessing.

# In[4]:


# Function for tweet preprocessing with what we saw in the chapter
def preprocess_tweet(tweet):
    '''
    Takes a tweet as input and returns the list of tokens.
    '''
    import emoji
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    # Initialization
    new_tweet = tweet

    ## Changes on the string
    # Remove URLs
    new_tweet = re.sub(r'https?://[^ ]+', '', new_tweet)
    # Remove usernames
    new_tweet = re.sub(r'@[^ ]+', '', new_tweet)
    # Remove the hashtag symbol (keep the word)
    new_tweet = re.sub(r'#', '', new_tweet)
    # Character normalization: collapse characters repeated three or more times
    new_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', new_tweet)
    # Emoji transformation
    new_tweet = emoji.demojize(new_tweet)
    # Numbers, punctuation and special characters
    new_tweet = re.sub(r' 0 ', ' zero ', new_tweet)
    new_tweet = re.sub(r'[^A-Za-z ]', '', new_tweet)
    # Lower casing
    new_tweet = new_tweet.lower()

    ## Changes on the tokens
    # Tokenization
    tokens = word_tokenize(new_tweet)
    # Stopword removal and stemming: build a new list instead of removing
    # elements while iterating over the list (which silently skips tokens)
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [porter.stem(token) for token in tokens if token not in stop_words]

    return tokens


# In[7]:


# Use the function on our list of tweets
list_tweets2 = []
for tweet in list_tweets:
    print(tweet)
    tokens = preprocess_tweet(tweet)
    print(tokens)
    list_tweets2.append(tokens)


# In[10]:


# Beginner version: cell to hide
import ipywidgets as widgets
from ipywidgets import interact

def preprocess_tweet(tweet):
    '''
    Takes a tweet as input and prints the list of tokens.
    '''
    import emoji
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    new_tweet = tweet
    new_tweet = re.sub(r'https?://[^ ]+', '', new_tweet)
    new_tweet = re.sub(r'@[^ ]+', '', new_tweet)
    new_tweet = re.sub(r'#', '', new_tweet)
    new_tweet = re.sub(r'([A-Za-z])\1{2,}', r'\1', new_tweet)
    new_tweet = emoji.demojize(new_tweet)
    new_tweet = re.sub(r' 0 ', ' zero ', new_tweet)
    new_tweet = re.sub(r'[^A-Za-z ]', '', new_tweet)
    new_tweet = new_tweet.lower()

    tokens = word_tokenize(new_tweet)
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [porter.stem(token) for token in tokens if token not in stop_words]
    print(tokens)

interact(preprocess_tweet, tweet = widgets.Textarea(
    value = tweet_1,
    description = 'Tweet:',
    disabled = False
))
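# The token lists in `list_tweets2` can now be turned into numbers. The cell below is a minimal, optional sketch (it is not part of the chapter's code) of how a simple bag-of-words matrix could be built with scikit-learn's `CountVectorizer`; it assumes scikit-learn is installed (version 1.0 or later for `get_feature_names_out`).

# In[ ]:


# Optional sketch: bag-of-words representation of the preprocessed tweets
# (assumes scikit-learn >= 1.0 is installed; it is not used elsewhere in this chapter)
from sklearn.feature_extraction.text import CountVectorizer

# Join the tokens back into one string per tweet so CountVectorizer can consume them
documents = [' '.join(tokens) for tokens in list_tweets2]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())  # vocabulary learned from the three tweets
print(bow_matrix.toarray())                # one row per tweet, one column per word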
# ## Conclusion
# 
# This chapter showed some simple text transformations for a machine learning experiment based on text analysis. This was only a simple case (bag-of-words, sketched just above), where each word is treated as an independent entity. Other, more sophisticated, methods also take into account the role a word plays in the sentence (part-of-speech tagging, for example; a small illustration follows the quiz below) and perform more language-based analysis. To go further on the theory, you can have a look at this [good article](https://machinelearningmastery.com/natural-language-processing/) or [this one](https://becominghuman.ai/a-simple-introduction-to-natural-language-processing-ea66a1747b32). For some Python-oriented resources, have a look [here](https://towardsdatascience.com/gentle-start-to-natural-language-processing-using-python-6e46c07addf3) or [there](https://medium.com/towards-artificial-intelligence/natural-language-processing-nlp-with-python-tutorial-for-beginners-1f54e610a1a0).

# In[ ]:


from IPython.display import IFrame
IFrame("https://blog.hoou.de/wp-admin/admin-ajax.php?action=h5p_embed&id=65", "959", "332")
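# As a small illustration of the part-of-speech tagging mentioned in the conclusion, the cell below is a minimal sketch using NLTK's `pos_tag` on one of the raw tweets. It is not part of the chapter's pipeline and assumes the `averaged_perceptron_tagger` resource has been downloaded (e.g. with `nltk.download('averaged_perceptron_tagger')`).

# In[ ]:


# Optional sketch: part-of-speech tagging with NLTK
# (assumes nltk.download('punkt') and nltk.download('averaged_perceptron_tagger') have been run)
from nltk import word_tokenize, pos_tag

tagged = pos_tag(word_tokenize(tweet_3))
print(tagged)  # list of (word, tag) pairs, e.g. ('happened', 'VBD') for a past-tense verb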