Regex (*regular ex*pression) is a tool to find the specific pattern of string within a string
If we want to extract the leading number in a address string: 10 AURORA ST
a = '10000 AURORA ST'
a[0:2]
'10'
a.split()[0]
'100'
import re
pattern = re.compile(r'\d')
match = pattern.findall(a)
match
['1', '0', '0']
pattern = re.compile(r'\d{3}')
match = pattern.findall(a)
match
['100']
pattern = re.compile(r'\d+') # one of more element
match = pattern.findall(a)
match
['10000']
a = '1161 21ST AVE sS'
pattern = re.compile(r'\d+')
match = pattern.findall(a)
match
['1161', '21']
pattern = re.compile(r'\d+\s')
match = pattern.findall(a)
match
['1161 ']
pattern = re.compile(r'^\d+')
match = pattern.findall(a)
match
['1161']
pattern = re.compile(r'[A-Z]$')
match = pattern.findall(a)
match
[]
a = '407 S ORCHARD ST'
b = '131 E CHELTEN AVE'
c = '211 S 9TH ST'
d = '211 S 9TH 99'
pattern = re.compile(r'(\d+)\s([A-Z])\s([a-zA-Z0-9]+)\s([a-zA-Z]+)')
sub_address_a = pattern.sub(r'\3',a)
sub_address_b = pattern.sub(r'\4',b)
sub_address_c = pattern.sub(r'\3',c)
sub_address_d = pattern.sub(r'\4',d)
sub_address_c
'9TH'
sub_address_a = pattern.sub(r'\3',a)+ '-' + pattern.sub(r'\4',a) + '-' + pattern.sub(r'\2',a) + '-' + pattern.sub(r'\1',a)
sub_address_a
'ORCHARD-ST-S-407'
date = '1/2/2022'
pattern = re.compile((r'^\d{1,2})\/(\d{1,2})\/(\d{4})\n')
pattern = re.compile(^(10|11|12|[1-9])\/(\d{1,2})\/(\d{4})\n)
1/2/2022
10/25/2022
12/23/2022
9/21/2022
9/21/2022
2022-9-1
sub_address_d
'211 S 9TH 99'
import re
regex = r"(\d+)\s([A-Z])\s([a-zA-Z0-9]+)\s([a-zA-Z]+)"
regex = r"(\d+)\s(E|S|W|N)\s([a-zA-Z0-9]+)\s([a-zA-Z]+)"
# test_str = ("131 E CHELTEN AVE"
# "211 S 9TH ST"
# "407 S ORCHARD ST"
# "211 S 9TH 99")
test_str = '''
131 E CHELTEN AVE
211 S 9TH ST
211 S 9TH 99
Null
234S XX st
407 S ORCHARD ST
211 A 9TH ST'''
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
Match 1 was found at 1-18: 131 E CHELTEN AVE Group 1 found at 1-4: 131 Group 2 found at 5-6: E Group 3 found at 7-14: CHELTEN Group 4 found at 15-18: AVE Match 2 was found at 19-31: 211 S 9TH ST Group 1 found at 19-22: 211 Group 2 found at 23-24: S Group 3 found at 25-28: 9TH Group 4 found at 29-31: ST Match 3 was found at 61-77: 407 S ORCHARD ST Group 1 found at 61-64: 407 Group 2 found at 65-66: S Group 3 found at 67-74: ORCHARD Group 4 found at 75-77: ST
For the purposes of this assignment, a valid address begins with a positive integer and is followed by one or more words or abbreviations. The words or abbreviations must consist of only letters from the English alphabet, may only contain a capital letter as the first letter, and may or may not end in a period.
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"^\d+( [A-Z]?[a-z.]*)+\n"
test_str = '''1189 Beall Avenue
123 S. Main St.
456 elm
Eight fifty two North Washington
10 10 Springfield Lane
14.5 S Main
12 S. Main
'''
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
Match 1 was found at 0-18: 1189 Beall Avenue Group 1 found at 10-17: Avenue Match 2 was found at 18-34: 123 S. Main St. Group 1 found at 29-33: St. Match 3 was found at 34-42: 456 elm Group 1 found at 37-41: elm Match 4 was found at 110-121: 12 S. Main Group 1 found at 115-120: Main
Here a valid price is a numeric value using the US dollar sign. The number of cents is optional, but there must be two digits if the cents are shown. For prices above $999.99, there may optionally be a comma separating thousands, millions, etc.
import re
regex = r"\$((\d+))(\.\d{2}|\n)"
regex = r"\$((\d{1,3},)(\d{3},?)*|(\d+))(\.\d{2}|\n)"
regex = r"\$((\d{1,3},)(\d{3},)*(\d{3})|(\d+))(\.\d{2}|\s)"
test_str = '''
$1
$20
$1.99
$10.00
$1500.50
$150000000000000000.50
$200,000.99
$1,234,567.89
$1.9
$10,23.4
$1,02,567.89
$000,300,400.39
'''
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
Match 1 was found at 1-4: $1 Group 1 found at 2-3: 1 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 2-3: 1 Group 5 found at 3-4: Match 2 was found at 4-8: $20 Group 1 found at 5-7: 20 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 5-7: 20 Group 5 found at 7-8: Match 3 was found at 8-13: $1.99 Group 1 found at 9-10: 1 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 9-10: 1 Group 5 found at 10-13: .99 Match 4 was found at 14-20: $10.00 Group 1 found at 15-17: 10 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 15-17: 10 Group 5 found at 17-20: .00 Match 5 was found at 21-29: $1500.50 Group 1 found at 22-26: 1500 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 22-26: 1500 Group 5 found at 26-29: .50 Match 6 was found at 30-52: $150000000000000000.50 Group 1 found at 31-49: 150000000000000000 Group 2 found at -1--1: None Group 3 found at -1--1: None Group 4 found at 31-49: 150000000000000000 Group 5 found at 49-52: .50 Match 7 was found at 53-62: $2,000.99 Group 1 found at 54-59: 2,000 Group 2 found at 54-56: 2, Group 3 found at 56-59: 000 Group 4 found at -1--1: None Group 5 found at 59-62: .99 Match 8 was found at 63-76: $1,234,567.89 Group 1 found at 64-73: 1,234,567 Group 2 found at 64-66: 1, Group 3 found at 70-73: 567 Group 4 found at -1--1: None Group 5 found at 73-76: .89 Match 9 was found at 104-119: $000,300,400.39 Group 1 found at 105-116: 000,300,400 Group 2 found at 105-109: 000, Group 3 found at 113-116: 400 Group 4 found at -1--1: None Group 5 found at 116-119: .39
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"(^[a-zA-Z0-9_.+-]+)@([a-zA-Z0-9-]+\.)([a-zA-Z0-9]+\.)*([a-zA-Z0-9]+)$"
test_str = '''
nsommer@wooster.edu
n.sommer@cs.wooster.edu
yippee_skippy@yee-haw.wheeeee
fun-times@Taylor.hall.wooster.edu
n@sommer@wooster.edu
n sommer@wooster.edu
nsommer@wooster..edu
nsommer@wooster.edu-org'''
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
Match 1 was found at 1-20: nsommer@wooster.edu Group 1 found at 1-8: nsommer Group 2 found at 9-17: wooster. Group 3 found at -1--1: None Group 4 found at 17-20: edu Match 2 was found at 21-44: n.sommer@cs.wooster.edu Group 1 found at 21-29: n.sommer Group 2 found at 30-33: cs. Group 3 found at 33-41: wooster. Group 4 found at 41-44: edu Match 3 was found at 45-74: yippee_skippy@yee-haw.wheeeee Group 1 found at 45-58: yippee_skippy Group 2 found at 59-67: yee-haw. Group 3 found at -1--1: None Group 4 found at 67-74: wheeeee Match 4 was found at 75-108: fun-times@Taylor.hall.wooster.edu Group 1 found at 75-84: fun-times Group 2 found at 85-92: Taylor. Group 3 found at 97-105: wooster. Group 4 found at 105-108: edu
match(): match regex only at the beginning of the string.
search(): find the first match string
findall(): find all matched string and return a list
finditer(): find all matched string and return an iterator
Ex: padexposur attribute has string value like '0 ', '1 ', '3 ', '4 ', '5+ '.
However, I only want to keep rows whose paderposur value are not '5+ '
The equivalent query will be:
select distinct padexposur from prod_us9_516047.TEMP_B22161_PHR_NURTEC_RAWDATA where padexposur regexp '^[0-9][[:blank:]]'
The unique attribute value are too little which make it possible to list all desire value
What if unique attribute values are too much to list? Regex is the only solution