#!/usr/bin/env python
# coding: utf-8

# # Regular Expressions
#
# Regular expressions are a powerful way of building patterns to match text.
# - Using regular expressions, complex operations with string data can be written a lot quicker, which will save you time.
# - Regular expressions are often faster to execute than their manual equivalents.
# - Regular expressions are supported in almost every modern programming language, as well as other places like command line utilities and databases. Understanding regular expressions gives us a powerful tool that we can use wherever we work with data.

# In[1]:


import pandas as pd

hn = pd.read_csv('hacker_news.csv')
hn.head()


# ![image.png](attachment:image.png)

# Python has a built-in module for regular expressions, the re module. One of its useful functions is re.search(), which takes two arguments:
# - the regex pattern
# - the string in which we want to find that pattern

# In[2]:


import re

m = re.search("and", "hand")
m


# The function returns a [Match](https://docs.python.org/3/library/re.html#match-objects) object when there is a match and returns None when the pattern is not matched. Also, the boolean value of a Match object is True, while None is False.

# In[3]:


string_list = [
    "Julie's favorite color is Blue.",
    "Keli's favorite color is Green.",
    "Craig's favorite colors are blue and red.",
]
pattern = "Blue"

# A Match object is truthy and None is falsy, so the search result can drive
# the choice of message directly.
for sentence in string_list:
    print("Match" if re.search(pattern, sentence) else "No Match")


# Now, these are simple operations that we could even perform with the `in` operator. The usefulness of regular expressions comes in when we use character sequences. The first of these we'll learn is called a set. A set allows us to specify two or more characters that can match in a single character's position.
#
# ![image.png](attachment:image.png)
#
# ![image-2.png](attachment:image-2.png)
#
# Now, using the same list as in the cell above, we can test by setting the pattern to a set.
# In[4]:


pattern = '[bB]lue'

for sentence in string_list:
    print("Match" if re.search(pattern, sentence) else "No Match")


# Now we are going to check how many times python is present in the title column of our dataset.

# In[5]:


pattern = '[Pp]ython'
# Keep every title in which the pattern is found.
python_titles = [title for title in hn['title'] if re.search(pattern, title)]
len(python_titles)


# Since we are using pandas, we should try to use more vectorised operations. We will use the [Series.str.contains() method](http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.contains.html) to check whether a Series of strings matches a particular regex pattern.

# In[6]:


# boolean values are returned
pd.Series(string_list).str.contains('[Bb]lue')


# We can use the Series.sum() method to sum all the values in the boolean mask, with each True value counting as 1, and each False as 0. This means that we can easily count the number of values in the original series that matched our pattern.

# In[7]:


pattern_bool = pd.Series(string_list).str.contains('[Bb]lue')
pattern_bool.sum()


# In[8]:


# for python in title
python_titles = hn['title'].str.contains('[Pp]ython').sum()
python_titles


# In[9]:


# to select rows containing python in title
hn.loc[hn['title'].str.contains('[Pp]ython')]


# In[10]:


# for ruby in title
ruby_titles = hn['title'].str.contains('[Rr]uby').sum()
ruby_titles


# If we want to specify that a character repeats, we can use '{}'.
# ![image.png](attachment:image.png)

# The name for this type of regular expression syntax is a quantifier. Quantifiers specify how many of the previous character our pattern requires, which can help us when we want to match substrings of specific lengths.

# Different types of 'numeric quantifiers'
# ![image-2.png](attachment:image-2.png)
#
# ![image-3.png](attachment:image-3.png)
#
# Suppose we want to look for the titles which contain e-mail or email; we will need to use ?, the optional quantifier, to include '-' as an option in our pattern.
# In[11]:


pattern = 'e-?mail'
email_bool = hn['title'].str.contains(pattern)
email_count = email_bool.sum()
# Select the title values of the rows flagged by the boolean mask.
email_titles = hn.loc[email_bool, 'title']
email_titles


# In[12]:


titles = hn['title']


# Some titles contain a tag such as [pdf] or [video], for example:
#
# [video] Google Self-Driving SUV Sideswipes Bus
# New Directions in Cryptography by Diffie and Hellman (1976) [pdf]
# Wallace and Gromit The Great Train Chase (1993) [video]
#
# So our next task is to filter out the titles which contain the tags. Since square brackets denote a set in regular expressions, the pattern [pdf] would match a single character p, d or f rather than the literal text '[pdf]'. To escape both the open and closed brackets we can add a backslash '\' before each one of them.
# ![image.png](attachment:image.png)
#
# One more challenge we have to solve is to make the pattern recognise unknown characters, like pdf or video. We will use character classes.
#
# ![image-2.png](attachment:image-2.png)
#
# Two points to observe:
# - Ranges can be used for letters as well as numbers.
# - Sets and ranges can be combined.
#
# These are some common character classes that we will be using.
# ![image-3.png](attachment:image-3.png)
#
# The one that we'll be using to match characters in tags is \w, which represents any letter or digit (and the underscore). Each character class represents a single character, so to match multiple characters (e.g. words like video and pdf), we'll need to combine them with quantifiers.
#
# In order to match word characters between our brackets, we can combine the word character class (\w) with the 'one or more' quantifier (+), giving us a combined pattern of \w+.
#
# Also, these will only match tags without special characters. To match other tags we can use .+
#
# CELL RECAP:
#
# - We can use a backslash to escape characters that have special meaning in regular expressions (e.g. \[ will match an open bracket character).
#
# - Character classes let us match certain groups of characters (e.g. \w will match any word character).
# - Character classes can be combined with quantifiers when we want to match different numbers of characters.

# In[13]:


# Raw string: \[ and \w are not valid Python string escapes, so a plain
# literal only works by accident and triggers an invalid-escape warning on
# recent Python versions. The raw form has exactly the same value.
pattern = r'\[\w+\]'
tag_titles = titles.str.contains(pattern)
tag_titles.sum()


# Backslashes are used to escape many other characters in regular expressions, as well as to denote some special character sequences (like character classes).

# Generally in Python, backslashes are used for escape sequences. An escape sequence is a sequence of characters that does not represent itself when used inside a character or string literal, but is translated into another character or a sequence of characters that may be difficult or impossible to represent directly. For example, \n is used to represent a new line. Now, while using regular expressions there can be some conflict. We have two methods to solve this:
#
#
# 1. add an extra backslash

# In[14]:


# Here \b is interpreted by Python as the backspace escape sequence.
print('hello\b world')
# this will not activate the escape sequence
print('hello\\b world')


# 2. use [raw strings](https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals), which we denote by prefixing our string with the r character.

# In[15]:


print(r'hello\b world')


# Until now, we have only determined whether a particular string contains our pattern or not, using the Boolean datatype. Next, we will use the [Series.str.extract() method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html) to extract the actual data we were finding. In order to do this, we'll need to use capture groups. Capture groups allow us to specify one or more groups within our match that we can access separately. For now, we will only create a single capture group for our regular expression. We specify capture groups using parentheses.
#
# For context:
#
# ![image.png](attachment:image.png)
#
#

# In[16]:


pattern = r"(\[\w+\])"
tag_titles_text = titles.str.extract(pattern)
# the column '0' is the default column name
# we can put the parentheses just around \w+ to get only the tag text
tag_titles_text.dropna()


# In[17]:


type(tag_titles_text)


# In[18]:


# using expand = False, we get a Series
tag_titles_text = titles.str.extract(pattern, expand=False)
type(tag_titles_text)


# In[19]:


pattern = r"\[(\w+)\]"
tag_titles_freq = titles.str.extract(pattern, expand=False).value_counts()
tag_titles_freq


# While using regular expressions, we can come across some bad instances that have been included due to our pattern. Since we need to exclude them, we mostly iterate to find those.
#
# We will create a function that returns our first ten matches, for us to exclude unwanted instances.

# In[20]:


def first_10_matches(pattern):
    """Return the first ten entries of ``titles`` that contain *pattern*.

    *pattern* is passed to ``titles.str.contains``; ``titles`` is the
    module-level Series of titles defined earlier in the notebook.
    """
    return titles[titles.str.contains(pattern)].head(10)


# In[21]:


# similar to python_titles
first_10_matches(r"[Jj]ava")


# We can see that there are a number of matches that contain Java as part of the word JavaScript. We want to exclude these titles from matching so we get an accurate count. One way to do this is by using negative character classes. Negative character classes match every character except those listed in the class.
#
# ![image.png](attachment:image.png)

# In[22]:


# pattern is defined in accordance with the table above. We exclude matches where Java is directly followed by 's' or 'S'
pattern = r"[Jj]ava[^Ss]"
java_titles = titles.loc[titles.str.contains(pattern)]
java_titles.head()


# While the negative set was effective in removing any bad matches that mention JavaScript, it also had the side-effect of removing any titles where Java occurs at the end of the string.
# This is because the negative set [^Ss] must match one character. Instances at the end of a string aren't followed by any characters, so there is no match.
#
# A different approach to take in cases like these is to use the word boundary anchor, specified using the syntax \b. A word boundary matches the position between a word character and a non-word character, or a word character and the start/end of a string.
#
#
#

# In[23]:


# note that if we had a full stop at the end of the title, we would get a Match object. The example below does
# not have a full stop.
print(re.search(pattern, 'Sometimes people confuse JavaScript with Java'))


# The regular expression returns None, because there is no substring that contains Java followed by a character that isn't S.

# In[24]:


print(re.search(r'[Jj]ava', 'Sometimes people confuse JavaScript with Java'))
re.findall(r'[Jj]ava', 'Sometimes people confuse JavaScript with Java')


# In[25]:


pattern_2 = r"\bJava\b"
# check the span in the output object
print(re.search(pattern_2, "Sometimes people Java confuse JavaScript with Java"))
re.findall(pattern_2, 'Sometimes people Java confuse JavaScript with Java')


# In[26]:


re.findall(r'[Jj]ava', 'Sometimes people Java confuse JavaScript with Java')


# In[27]:


pattern = r'\b[Jj]ava\b'
java_titles = titles[titles.str.contains(pattern)]
java_titles


# Now that we have had a glimpse of the word boundary anchor, we will check out the beginning anchor and the end anchor
#
# ![image.png](attachment:image.png)
#
# Note that the ^ character is used both as a beginning anchor and to indicate a negative set, depending on whether the character preceding it is a [ or not.

# In[28]:


test_cases = pd.Series([
    "Red Nose Day is a well-known fundraising event",
    "My favorite color is Red",
    "My Red Car was purchased three years ago",
])
test_cases.str.contains(r"^[Rr]ed")


# In[29]:


# using these anchors to determine tags at the start and the end
# No capture group here: str.contains only tests for a match, and pandas
# emits a UserWarning when the pattern contains match groups.
titles.str.contains(r'^\[\w+\]').sum()


# In[30]:


titles.str.contains(r'\[\w+\]$').sum()


# Until now we have been using [Jj] to check for capitalisation.
# This works well when we only have to check for a single character.

# In[31]:


email_tests = pd.Series([
    'email', 'Email', 'e Mail', 'e mail',
    'E-mail', 'e-mail', 'eMail', 'E-Mail',
    'EMAIL', 'emails', 'Emails', 'E-Mails',
])


# We can use flags to specify that our regular expression should ignore case. Both re.search() and the pandas regular expression methods accept an optional flags argument. This argument accepts one or more flags, which are special variables in the re module that modify the behavior of the regex interpreter.
#
# The most common and useful one is the re.IGNORECASE flag, which for convenience can be used as re.I.
#
#

# In[32]:


email_tests.str.contains(r'email')


# In[33]:


email_tests.str.contains(r'email', flags=re.I)


# In[34]:


def email_count(val):
    """Return how many times 'email' occurs in *val*, ignoring case."""
    occurrences = re.findall(r'email', val, flags=re.I)
    return len(occurrences)


email_titles = titles.loc[titles.str.contains(r'email', flags=re.I)]
email_titles


# In[35]:


# Per-title occurrence counts, then the total across all matching titles.
count = email_titles.apply(email_count)
count.sum()


# In[43]:


titles.str.contains(r'email', flags=re.I).sum()


# In[44]:


titles.str.contains(r"\be[\-\s]?mails?\b", flags=re.I).sum()


# In[ ]: