The re.sub function

Documentation

  1. The re.sub function in module re.
  2. Search and Replace in the Regular Expression HOWTO.
  3. re.sub example in String Pattern Matching in the Python Tutorial.

Parenthesize every negative number

If the file outfile.txt does not exist, the following program will create it. If the file outfile.txt already exists, the program will overwrite the contents of the file, if the program has permission to do so.

"""
Output a copy of the input file,
with every negative integer parenthesized instead of with a negative sign.
"""

import sys
import re   #regular expressions

infilename = "infile.txt"
outfilename = "outfile.txt"

#Open the input file before the output file, so that a missing or unreadable
#input file is reported before the output file is created or overwritten.

try:
    infile = open(infilename, "r")   #read (the "r" is unnecessary)
except FileNotFoundError:
    print(f"Sorry, could not find input file \"{infilename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open input file \"{infilename}\".")
    sys.exit(1)

#Each "with" statement closes its file when its block is left for any reason:
#normal completion, sys.exit, or an exception raised while reading or writing.

with infile:
    try:
        outfile = open(outfilename, "w")   #write (create the file or overwrite it)
    except PermissionError:
        print(f"Sorry, no permission to open output file \"{outfilename}\".")
        sys.exit(1)

    with outfile:
        for line in infile:
            line = line.rstrip("\n")   #Remove the trailing newline; print supplies a new one.
            #Replace a minus sign followed by one or more digits
            #with the same digits enclosed in parentheses.
            #NOTE: the pattern matches any hyphen that is followed by a digit,
            #so "3-4" becomes "3(4)" and "-3.5" becomes "(3).5".
            line = re.sub(r"-(\d+)", r"(\1)", line)
            print(line, file = outfile)

sys.exit(0)

Warm up:
what has four eyes and can’t see?

import sys
import re   #regular expressions

filename = "/usr/share/dict/words"

try:
    lines = open(filename)
except FileNotFoundError:
    print(f"Sorry, could not find file \"{filename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open file \"{filename}\".")
    sys.exit(1)

#The "with" statement closes the file however the loop ends,
#including when reading a line raises an exception.

with lines:
    for line in lines:
        line = line.rstrip("\n")   #Remove the trailing newline.
        #Print every line that contains at least four i's (upper or lower case).
        #The i's do not have to be adjacent, so the pattern is not "iiii".
        if re.search(r"i.*i.*i.*i", line, flags = re.IGNORECASE):
            print(line)

sys.exit(0)
antidisciplinarian
Mississippi
multimillionaire
    #every line that has exactly 4 i's, no more and no less

    if re.search("^[^i]*i[^i]*i[^i]*i[^i]*i[^i]*$", line, flags = re.IGNORECASE):
    #every line that has exactly 4 i's, no more and no less

    if re.search("^[^i]*(i[^i]*){4}$", line, flags = re.IGNORECASE):

The regular expression example that made me a true believer

/* part of a C program */

typedef struct {
	char   field1;
	int    field2;
	double field3;
	char * field4;
} stooge_t;

stooge_t stooge[] = {
	1.11, 1, 'a', "moe",
	2.22, 2, 'b', "larry",
	3.33, 3, 'c', "curly",
	4.44, 4, 'd', "shemp",
	5.55, 5, 'e', "Buster Keaton",
};

I typed the four columns in the wrong order.

"""
Output a copy of the input file,
with the columns in the correct order.
"""

import sys
import re   #regular expressions

infilename = "infile.c"
outfilename = "outfile.c"

try:
    infile = open(infilename)
except FileNotFoundError:
    print(f"Sorry, could not find input file \"{infilename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open input file \"{infilename}\".")
    sys.exit(1)

#Each "with" statement closes its file when its block is left for any reason:
#normal completion, sys.exit, or an exception raised while reading or writing.

with infile:
    try:
        outfile = open(outfilename, "w")
    except PermissionError:
        print(f"Sorry, no permission to open output file \"{outfilename}\".")
        sys.exit(1)

    with outfile:
        for i, line in enumerate(infile, start = 1):   #i is the 1-based line number.
            line = line.rstrip("\n")   #Remove the trailing newline.
            if 11 <= i <= 15:          #Rewrite only lines 11 through 15; copy the others unchanged.
                #Groups 1, 2, 3 are the first three comma-terminated fields;
                #group 4 is the double-quoted string, which may contain blanks.
                #Output them in the order 3, 2, 1, 4 after a leading tab.
                #The pattern consumes the comma after the string,
                #so the replacement must put that comma back.
                line = re.sub(r'\s*(\S+),\s*(\S+),\s*(\S+),\s*(".*"),', r"\t\3, \2, \1, \4,", line)
            print(line, file = outfile)

sys.exit(0)

Normalize social security numbers

123-45-6789           123-45-6789
123 45 6789           123-45-6789
	123456789     123-45-6789
SS Num 123-45-6789    123-45-6789
123-45-678            ???-??-????
000-00-0oOl           ???-??-????
"""
Normalize social security numbers.
"""

import sys
import re   #regular expressions

infilename = "infile.txt"

try:
    infile = open(infilename)
except FileNotFoundError:
    print(f"Sorry, could not find input file \"{infilename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open input file \"{infilename}\".")
    sys.exit(1)

#The "with" statement closes the file however the loop ends,
#including when reading a line raises an exception.

with infile:
    for line in infile:
        line = line.rstrip("\n")       #Remove the trailing newline.
        line = re.sub(r"\D", "", line) #Remove every non-digit.
        #The pattern must be a raw string: in a plain string literal,
        #"\d" is an invalid escape sequence and Python issues a warning.
        if not re.search(r"^\d{9}$", line):
            line = 9 * "?"             #Not exactly nine digits: use nine placeholders.
        #Insert the dashes after the third and fifth characters.
        line = re.sub("(...)(..)(....)", r"\1-\2-\3", line)   #or "(.{3})(.{2})(.{4})"
        print(line)

sys.exit(0)

Normalize phone numbers

(201) 200-1800    (201) 200-1800
212-639-5555      (212) 639-5555
2013433434        (201) 343-3434
1-800-343-3434    (800) 343-3434
555-1212          (???) 555-1212
012-345-6789      (???) 345-6789
122-345-6789      (???) 345-6789
113-045-6789      (113) ???-????
123-045-6789      (???) ???-????
  1. Remove all the non-digits.
  2. If the line consists of exactly seven characters, add three leading question marks to hold the place of the missing area code.
  3. If the line consists of a 1 followed by exactly ten characters, remove the leading 1.
  4. If the line now does not consist of exactly ten characters, change it to ten question marks.
  5. If the first digit of the area code is a 0, or the middle digit of the area code is neither a 1 nor a 0, change the area code to three question marks.
  6. If the first digit of the seven-digit local number (the part after the area code) is a 0 or a 1, change the local number to seven question marks.
  7. Insert the parentheses, blank, and dash.

Translate English to Pig Latin

"""
Translate a string (a word or sentence) from English into Pig Latin.
Example:
Read my open lips: "No new taxes!"
Eadray myay openway ipslay: "Onay ewnay axestay!"
"""

import sys
import re   #regular expressions

#Rearrange the matched word into Pig Latin.

def rearrange(m):
    """
    Return the Pig Latin form of the word matched by m.
    m is a re.Match with two groups: group 1 is the (possibly empty) run of
    leading consonants, and group 2 is the rest of the word.
    The leading consonants are moved to the end of the word and "ay" is added.
    A word that begins with a vowel gets "w" in place of the leading consonants.
    If the original word began with an upper case letter, the result is
    capitalized: first letter upper case, all the others lower case.
    """
    assert isinstance(m, re.Match)
    word = m.group()
    leadingConsonants = m.group(1)
    rest = m.group(2)   #the rest of the word
    isupper = word[:1].isupper()   #False for an empty word; always a bool
    if re.match(r"[aeiou]", word, flags = re.IGNORECASE):
        leadingConsonants = "w"    #"open" becomes "openway"
    pigLatinWord = rest + leadingConsonants + "ay"
    if isupper:
        pigLatinWord = pigLatinWord.capitalize()   #"eadRay" becomes "Eadray"
    return pigLatinWord

#Divide the matched word into the leading consonants and the rest of the word.
#Pass these two groups of characters to rearrange.

def divide(m):
    """
    Return the Pig Latin form of the word matched by m.
    m is a re.Match whose whole match is one word.
    The word is split into group 1 (zero or more leading consonants,
    upper or lower case) and group 2 (the following word characters),
    and the resulting match is passed to rearrange by re.sub.
    """
    assert isinstance(m, re.Match)
    word = m.group()
    #The "^" anchors the pattern, so it can match only at the start of the word.
    return re.sub(r"^([bcdfghjklmnpqrstvwxyz]*)(\w*)",
        rearrange, word, flags = re.IGNORECASE)

#Pass each non-empty word in the user's string to divide.

#Read the user's text, translate each run of word characters, and print the result.
text = input("Please type one or more words: ")
print(re.sub(r"\w+", divide, text))
sys.exit(0)
Please type one or more words: Read my open lips: "No new taxes!"
Eadray myay openway ipslay: "Onay ewnay axestay!"

Things to Try

  1. In Flanders Fields.

    import sys
    import re   #regular expressions
    
    #The field: one string per row.
    rows = [
        ".....+...",
        "+......+.",
        "....+.+..",
        "+.+...+..",
        "...++.+.."
    ]

    #Return the matched text with every "." changed to "*".
    def fill(m):
        return m.group().replace(".", "*")

    #First pass: fill between the first and last "+" of each row, then transpose.
    #Second pass: the same for each column, then transpose back again.
    for _ in range(2):
        filled = []
        for row in rows:
            filled.append(re.sub(r"\+.*\+", fill, row))
        rows = list(map("".join, zip(*filled)))   #Transpose the rows and columns.

    print(*rows, sep = "\n")
    
    sys.exit(0)
    
    .....+...
    +******+.
    *...+*+..
    +*+***+..
    ...++*+..