Data Cleaning [04]: Regular Expression Examples
Published:
Some examples from https://docs.python.org/3/library/re.html#regular-expression-examples.
Checking for a Pair
Simulate a poker program where a player’s hand is represented as a 5-character string with each character representing a card, “a” for ace, “k” for king, “q” for queen, “j” for jack, “t” for 10, and “2” through “9” representing the card with that value.
# A valid hand is exactly five characters, each one of: a, 2-9, t, j, q, k.
valid = re.compile(r"^[a2-9tjqk]{5}$")
# Capture any single character in group 1 and require the same character to
# appear again later (backreference \1), i.e. the hand holds a repeated card.
pair = re.compile(r".*(.).*\1")
hand1 = "akt5q"  # valid, every card different
hand2 = "akt5e"  # invalid: "e" is not an allowed card character
hand3 = "718ak"  # invalid: "1" is not an allowed card character
hand4 = "354aa"  # valid, contains a pair of aces
# check whether the hand is valid
valid.match(hand1)
Out[345]: <re.Match object; span=(0, 5), match='akt5q'>
valid.match(hand2)
# no output: match() returned None, so the hand is not valid
valid.match(hand3)
# not valid
valid.match(hand4)
Out[348]: <re.Match object; span=(0, 5), match='354aa'>
# check whether the valid hand has a pair
pair.match(hand1)
# no output: match() returned None, so the hand has no pair
pair.match(hand4)
Out[350]: <re.Match object; span=(0, 5), match='354aa'>
# display the pair
pair.match(hand4).group()
Out[351]: '354aa'
pair.match(hand4).group(1)
Out[352]: 'a'
scanf() equivalence
scanf() | Regular Expression |
---|---|
%c | . |
%5c | .{5} |
%d | [-+]?\d+ |
%e , %E , %f , %g | [-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)? |
%i | [-+]?(0[xX][\dA-Fa-f]+|0[0-7]*|\d+) |
%o | [-+]?[0-7]+ |
%s | \S+ |
%u | \d+ |
%x , %X | [-+]?(0[xX])?[\dA-Fa-f]+ |
Writing a Tokenizer
A tokenizer or scanner analyzes a string to categorize groups of characters.
import re
from typing import Iterator, NamedTuple, Union
class Token(NamedTuple):
    """One lexical token produced by ``tokenize``."""

    # Rule name from the token specification, or the keyword text itself
    # when an identifier turns out to be a keyword.
    type: str
    # Matched text; NUMBER tokens carry the converted int or float instead.
    value: Union[str, int, float]
    # 1-based line number on which the token starts.
    line: int
    # 0-based offset of the token from the start of its line.
    column: int


def tokenize(code: str) -> Iterator[Token]:
    """Lazily split *code* into ``Token`` objects.

    The rules in ``token_specification`` are combined into one alternation
    of named groups, so the name of the group that matched identifies the
    rule that applied.  Rules are tried in the order listed.

    Args:
        code: Source text to scan.

    Yields:
        A ``Token`` for every number, assignment, terminator, identifier,
        keyword and operator.  Newlines, spaces and tabs produce no token.

    Raises:
        RuntimeError: When a character matches none of the rules.  Because
            this is a generator, the error surfaces during iteration.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',   r'\d+(\.\d*)?'),  # Integer or decimal number
        ('ASSIGN',   r':='),           # Assignment operator
        ('END',      r';'),            # Statement terminator
        ('ID',       r'[A-Za-z]+'),    # Identifiers
        ('OP',       r'[+\-*/]'),      # Arithmetic operators
        ('NEWLINE',  r'\n'),           # Line endings
        ('SKIP',     r'[ \t]+'),       # Skip over spaces and tabs
        ('MISMATCH', r'.'),            # Any other character
    ]
    tok_regex = '|'.join(
        f'(?P<{name}>{pattern})' for name, pattern in token_specification
    )
    line_num = 1
    line_start = 0
    for mo in re.finditer(tok_regex, code):
        kind = mo.lastgroup  # name of the last matched group (the rule applied)
        value = mo.group()
        column = mo.start() - line_start
        if kind == 'NUMBER':
            # Keep integers as int; anything with a decimal point becomes float.
            value = float(value) if '.' in value else int(value)
        elif kind == 'ID' and value in keywords:
            # Keywords are reported under their own name rather than as 'ID'.
            kind = value
        elif kind == 'NEWLINE':
            # Columns are measured from the character after the newline.
            line_start = mo.end()
            line_num += 1
            continue
        elif kind == 'SKIP':
            continue
        elif kind == 'MISMATCH':
            raise RuntimeError(f'{value!r} unexpected on line {line_num}')
        yield Token(kind, value, line_num, column)
#
tok_regex
Out[363]: '(?P<NUMBER>\\d+(\\.\\d*)?)|(?P<ASSIGN>:=)|(?P<END>;)|(?P<ID>[A-Za-z]+)|(?P<OP>[+\\-*/])|(?P<NEWLINE>\\n)|(?P<SKIP>[ \\t]+)|(?P<MISMATCH>.)'
statements = '''
IF quantity THEN
total := total + price * quantity;
tax := price * 0.05;
ENDIF;
'''
for token in tokenize(statements):
print(token)
Token(type='IF', value='IF', line=2, column=4)
Token(type='ID', value='quantity', line=2, column=7)
Token(type='THEN', value='THEN', line=2, column=16)
Token(type='ID', value='total', line=3, column=8)
Token(type='ASSIGN', value=':=', line=3, column=14)
Token(type='ID', value='total', line=3, column=17)
Token(type='OP', value='+', line=3, column=23)
Token(type='ID', value='price', line=3, column=25)
Token(type='OP', value='*', line=3, column=31)
Token(type='ID', value='quantity', line=3, column=33)
Token(type='END', value=';', line=3, column=41)
Token(type='ID', value='tax', line=4, column=8)
Token(type='ASSIGN', value=':=', line=4, column=12)
Token(type='ID', value='price', line=4, column=15)
Token(type='OP', value='*', line=4, column=21)
Token(type='NUMBER', value=0.05, line=4, column=23)
Token(type='END', value=';', line=4, column=27)
Token(type='ENDIF', value='ENDIF', line=5, column=4)
Token(type='END', value=';', line=5, column=9)
split() Example
text = """Ross McFluff: 834.345.1254 155 Elm Street
Ronald Heathmore: 892.345.3428 436 Finley Avenue
Frank Burger: 925.541.7625 662 South Dogwood Way
Heather Albrecht: 548.326.4584 919 Park Place"""
# split on one or more consecutive newlines
entries = re.split("\n+", text)
entries
Out[397]:
['Ross McFluff: 834.345.1254 155 Elm Street',
'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
'Frank Burger: 925.541.7625 662 South Dogwood Way',
'Heather Albrecht: 548.326.4584 919 Park Place']
# split on a space optionally preceded by ':', performing at most 3 splits
[re.split(":? ", entry, 3) for entry in entries]
Out[398]:
[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]
sub() Example
def repl(m):
    """Return the matched word with its interior letters shuffled.

    Groups 1 and 3 of the match (the first and last characters) stay in
    place; the characters of group 2 are reordered by ``random.shuffle``.
    """
    head, middle, tail = m.group(1, 2, 3)
    letters = [*middle]
    random.shuffle(letters)
    return "".join((head, *letters, tail))
text = "Professor Abdolmalek, please report your absences promptly."
re.sub(r"(\w)(\w+)(\w)", repl, text)
Out[401]: 'Pfooessrr Abmedoallk, paelse rperot your asnecbes ptpormly.'
def repl(m):
    """Return the matched word with everything between its first and last
    characters replaced by the fixed text ``Chao``."""
    first, last = m.group(1, 3)
    return f"{first}Chao{last}"
re.sub(r"(\w)(\w+)(\w)", repl, text)
Out[402]: 'PChaor AChaok, pChaoe rChaot yChaor aChaos pChaoy.'
findall() & finditer() Example
text = "He was carefully disguised but captured quickly by police."
re.findall(r"\w+ly\b", text)
Out[404]: ['carefully', 'quickly']
re.finditer(r'\w+ly\b',text)
Out[405]: <callable_iterator at 0x16186f2fb08>
# Print each word ending in "ly" together with the span it occupies in text.
for match in re.finditer(r"\w+ly\b", text):
    begin, end = match.span()
    print('%02d-%02d: %s' % (begin, end, match.group(0)))
07-16: carefully
40-47: quickly
Comments