Data Cleaning [04]: Regular Expression Examples

4 minute read

Published:

Some examples from https://docs.python.org/3/library/re.html#regular-expression-examples.


Checking for a Pair

Simulate a poker program where a player’s hand is represented as a 5-character string with each character representing a card, “a” for ace, “k” for king, “q” for queen, “j” for jack, “t” for 10, and “2” through “9” representing the card with that value.

import re  # required by the re.compile() calls below

# A valid hand is exactly five characters, each one of: a, 2-9, t, j, q, k.
valid = re.compile(r"^[a2-9tjqk]{5}$")
# A hand has a pair when some character occurs again later in the string;
# group 1 captures that repeated card via the \1 backreference.
pair = re.compile(r".*(.).*\1")

hand1 = "akt5q"  # valid, no repeated card
hand2 = "akt5e"  # invalid: "e" is not a card character
hand3 = "718ak"  # invalid: "1" is not a card character
hand4 = "354aa"  # valid, contains a pair of aces

# check whether the hand is valid
valid.match(hand1)
Out[345]: <re.Match object; span=(0, 5), match='akt5q'>

valid.match(hand2)
# not valid

valid.match(hand3)
# not valid

valid.match(hand4)
Out[348]: <re.Match object; span=(0, 5), match='354aa'>

# check whether the valid hand has a pair
pair.match(hand1)
# no pair

pair.match(hand4)
Out[350]: <re.Match object; span=(0, 5), match='354aa'>

# display the whole match, then the paired card (captured in group 1)
pair.match(hand4).group()
Out[351]: '354aa'

pair.match(hand4).group(1)
Out[352]: 'a'

scanf() equivalence

scanf() Token       Regular Expression
%c                  .
%5c                 .{5}
%d                  [-+]?\d+
%e, %E, %f, %g      [-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?
%i                  [-+]?(0[xX][\dA-Fa-f]+|0[0-7]*|\d+)
%o                  [-+]?[0-7]+
%s                  \S+
%u                  \d+
%x, %X              [-+]?(0[xX])?[\dA-Fa-f]+

Writing a Tokenizer

A tokenizer or scanner analyzes a string to categorize groups of characters.

from typing import NamedTuple
import re

# Immutable record describing one token: what it is and where it was found.
Token = NamedTuple('Token', [
    ('type', str),    # token category name
    ('value', str),   # token payload
    ('line', int),    # line on which the token was found
    ('column', int),  # position of the token within that line
])
def tokenize(code):
    """Generate ``Token`` records for the lexical elements of *code*.

    The input is scanned left to right with one combined regular
    expression made of named alternatives, tried in the order listed in
    ``token_specification``.

    * Numbers are converted to ``float`` when the text contains a ``.``,
      otherwise to ``int``.
    * An identifier that is one of the reserved words is reported with the
      word itself as its token type.
    * Spaces, tabs and newlines produce no tokens; newlines only advance
      the line counter (which starts at 1) and reset the column origin.

    Raises:
        RuntimeError: when a character matches none of the real rules.
    """
    keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
    token_specification = [
        ('NUMBER',   r'\d+(\.\d*)?'),  # Integer or decimal number
        ('ASSIGN',   r':='),           # Assignment operator
        ('END',      r';'),            # Statement terminator
        ('ID',       r'[A-Za-z]+'),    # Identifiers
        ('OP',       r'[+\-*/]'),      # Arithmetic operators
        ('NEWLINE',  r'\n'),           # Line endings
        ('SKIP',     r'[ \t]+'),       # Skip over spaces and tabs
        ('MISMATCH', r'.'),            # Any other character
    ]
    # One named group per rule; ``lastgroup`` then tells us which rule fired.
    tok_regex = '|'.join('(?P<%s>%s)' % spec for spec in token_specification)
    scanner = re.compile(tok_regex)

    line_num, line_start = 1, 0
    for mo in scanner.finditer(code):
        kind, value = mo.lastgroup, mo.group()

        # Handle the rules that never produce a token first.
        if kind == 'SKIP':
            continue
        if kind == 'NEWLINE':
            # Columns on the next line are measured from just past the "\n".
            line_start = mo.end()
            line_num += 1
            continue
        if kind == 'MISMATCH':
            raise RuntimeError(f'{value!r} unexpected on line {line_num}')

        if kind == 'NUMBER':
            value = float(value) if '.' in value else int(value)
        elif kind == 'ID' and value in keywords:
            # Reserved words are reported under their own name.
            kind = value

        yield Token(kind, value, line_num, mo.start() - line_start)

# tok_regex is a local variable inside tokenize(); its value is shown here for reference
tok_regex
Out[363]: '(?P<NUMBER>\\d+(\\.\\d*)?)|(?P<ASSIGN>:=)|(?P<END>;)|(?P<ID>[A-Za-z]+)|(?P<OP>[+\\-*/])|(?P<NEWLINE>\\n)|(?P<SKIP>[ \\t]+)|(?P<MISMATCH>.)'
statements = '''
    IF quantity THEN
        total := total + price * quantity;
        tax := price * 0.05;
    ENDIF;
'''

for token in tokenize(statements):
    print(token)

Token(type='IF', value='IF', line=2, column=4)
Token(type='ID', value='quantity', line=2, column=7)
Token(type='THEN', value='THEN', line=2, column=16)
Token(type='ID', value='total', line=3, column=8)
Token(type='ASSIGN', value=':=', line=3, column=14)
Token(type='ID', value='total', line=3, column=17)
Token(type='OP', value='+', line=3, column=23)
Token(type='ID', value='price', line=3, column=25)
Token(type='OP', value='*', line=3, column=31)
Token(type='ID', value='quantity', line=3, column=33)
Token(type='END', value=';', line=3, column=41)
Token(type='ID', value='tax', line=4, column=8)
Token(type='ASSIGN', value=':=', line=4, column=12)
Token(type='ID', value='price', line=4, column=15)
Token(type='OP', value='*', line=4, column=21)
Token(type='NUMBER', value=0.05, line=4, column=23)
Token(type='END', value=';', line=4, column=27)
Token(type='ENDIF', value='ENDIF', line=5, column=4)
Token(type='END', value=';', line=5, column=9)

split() Example

text = """Ross McFluff: 834.345.1254 155 Elm Street
Ronald Heathmore: 892.345.3428 436 Finley Avenue
Frank Burger: 925.541.7625 662 South Dogwood Way
Heather Albrecht: 548.326.4584 919 Park Place"""

# split on one or more newlines
entries = re.split("\n+", text)
entries
Out[397]: 
['Ross McFluff: 834.345.1254 155 Elm Street',
 'Ronald Heathmore: 892.345.3428 436 Finley Avenue',
 'Frank Burger: 925.541.7625 662 South Dogwood Way',
 'Heather Albrecht: 548.326.4584 919 Park Place']
 
# split on a space that is optionally preceded by a colon, at most 3 splits
[re.split(":? ", entry, 3) for entry in entries]
Out[398]: 
[['Ross', 'McFluff', '834.345.1254', '155 Elm Street'],
 ['Ronald', 'Heathmore', '892.345.3428', '436 Finley Avenue'],
 ['Frank', 'Burger', '925.541.7625', '662 South Dogwood Way'],
 ['Heather', 'Albrecht', '548.326.4584', '919 Park Place']]

sub() Example

def repl(m):
    """Return the matched word with its interior letters shuffled.

    ``m`` is a match object with three groups: group 1 is the first
    character, group 2 the interior characters and group 3 the last
    character. Only group 2 is reordered; the outer characters stay put.
    """
    # Imported here so the function works on its own: no import of
    # ``random`` appears before this point.
    import random

    inner_word = list(m.group(2))
    random.shuffle(inner_word)  # shuffles the list in place
    return m.group(1) + "".join(inner_word) + m.group(3)
    
text = "Professor Abdolmalek, please report your absences promptly."

re.sub(r"(\w)(\w+)(\w)", repl, text)
Out[401]: 'Pfooessrr Abmedoallk, paelse rperot your asnecbes ptpormly.'

def repl(m):
    """Replace the interior of the matched word with the text ``Chao``.

    Group 1 (first character) and group 3 (last character) of ``m`` are
    kept; whatever group 2 matched is discarded.
    """
    first_char, last_char = m.group(1, 3)
    return first_char + 'Chao' + last_char
    
re.sub(r"(\w)(\w+)(\w)", repl, text)
Out[402]: 'PChaor AChaok, pChaoe rChaot yChaor aChaos pChaoy.'

findall() & finditer() Example

text = "He was carefully disguised but captured quickly by police."

re.findall(r"\w+ly\b", text)
Out[404]: ['carefully', 'quickly']

re.finditer(r'\w+ly\b',text)
Out[405]: <callable_iterator at 0x16186f2fb08>

for m in re.finditer(r"\w+ly\b", text):
    print('%02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
07-16: carefully
40-47: quickly

Comments