Regular Expressions

Documentation

  1. Regular Expression HOWTO in the Python HOWTOs
  2. String Pattern Matching in the Python Tutorial
  3. re module.
  4. Raw strings such as the two-character string r"\n" instead of "\\n". The Backslash Plague.

Backslash in a string literal

print("\thello")   #one tab character
print("\\thello")  #a backslash and a lowercase t

print("The \\ is a backslash.")
print(r"The \ is a backslash.")

print("The \\\\ is two backslashes.")
print(r"The \\ is two backslashes.")
	hello
\thello
The \ is a backslash.
The \ is a backslash.
The \\ is two backslashes.
The \\ is two backslashes.

grep -i string infile.txt

import sys
import re   #regular expressions

filename = "infile.txt"   #e.g., macOS "/usr/share/dict/words"

#Print every line of the file that contains the pattern, like grep.
#The exit status is 1 if the file could not be opened, 0 otherwise.

try:
    lines = open(filename)
except FileNotFoundError:
    print(f"Sorry, could not find file \"{filename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open file \"{filename}\".")
    sys.exit(1)

#The with statement closes the file even if the loop raises an exception.
with lines:
    for line in lines:
        line = line.rstrip("\n")   #Remove the trailing newline.
        if re.search("hello, there", line):   #or if not
            print(line)

sys.exit(0)

You can combine multiple flags with the bitwise | operator.

    if re.search("hello, there", line, flags = re.IGNORECASE):

^ means start of string

    if re.search("^anti", line, flags = re.IGNORECASE):

$ means end of string

    if re.search("phobia$", line, flags = re.IGNORECASE):
    if re.search("^$", line):   #Search for empty lines.
    if re.search("\\$100", line):   #Search for one hundred dollars.
    if re.search(r"\$100", line):   #Search for one hundred dollars.
    if re.search("\\^", line):   #Search for a caret.
    if re.search(r"\^", line):   #Search for a caret.
    if re.search(r"^\^", line):   #Search for lines that begin with a caret.

. is a wildcard

    if re.search("sep.rate", line, flags = re.IGNORECASE):   #Search for separate, seperate, etc.
    if re.search("^...u.$", line, flags = re.IGNORECASE):   #with the anchors
    if re.search("...u.", line, flags = re.IGNORECASE):   #without the anchors
    if re.search("^b.g$", line, flags = re.IGNORECASE):
    if re.search("^p.t$", line, flags = re.IGNORECASE):
    if re.search("^.....$", line):   #all lines of exactly 5 characters, no more and no less
    if re.search(".....", line):   #all lines of 5 or more characters
    if re.search("^.{5}$", line):   #all lines of exactly 5 characters, no more and no less
    if re.search("...ism$", line, flags = re.IGNORECASE):   #ideological movement

Wildcards with []

    if re.search(".", line):   #lines containing a character (or maybe more)
    if re.search("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", line):   #lines containing an uppercase letter
    if re.search("[A-Z]", line):   #no space around dash, can't say [Z-A]
    if re.search("^[A-K]", line, flags = re.IGNORECASE):   #lines starting with letter in 1st half of alphabet
    if re.search("^[L-Z]", line, flags = re.IGNORECASE):   #lines starting with letter in 2nd half of alphabet
    if re.search("[a-z]", line):   #any lowercase letter
    if re.search("[0-9]", line):   #any decimal digit
    if re.search("19[0-9][0-9]", line):   #any year in the 1900's
    if re.search("[CcTt][sz]ar", line):   #in search of Russia's imperial past
    if re.search("[0-7]", line):   #any octal digit
    if re.search("[0-9A-Fa-f]", line):   #any hexadecimal digit
    if re.search("[0-9][0-9][0-9][0-9][0-9]", line):   #any zip code

\d wildcard

See also \s, etc.

    if re.search(r"^\d\d\d\d\d$", line):   #any zip code
    if re.search(r"^\d{5}$", line):   #any zip code
"^[A-Z][0-9][A-Z] [0-9][A-Z][0-9]$"   #Canadian postal code, e.g. A2B 3C4

Can’t use the letters D, F, I, O, Q or U; first letter can’t be W or Z.

"^[A-CEGHJ-NPR-TVXY][0-9][A-CEGHJ-NPR-TV-Z] [0-9][A-CEGHJ-NPR-TV-Z][0-9]$"
r"^[A-CEGHJ-NPR-TVXY]\d[A-CEGHJ-NPR-TV-Z] \d[A-CEGHJ-NPR-TV-Z]\d$"
"^[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]$"   #social security number
r"^\d\d\d-\d\d-\d\d\d\d$"
r"^\d{3}-\d{2}-\d{4}$"

1-800-737-3783 spells “reserve”.

"^[pqrs][def][pqrs][def][pqrs][tuv][def]$"
"[pqrs][def][pqrs][def][pqrs][tuv][def]"

How many Thursdays in June, 2019?

cal -h 6 2019
     June 2019        
Su Mo Tu We Th Fr Sa  
                   1  
 2  3  4  5  6  7  8  
 9 10 11 12 13 14 15  
16 17 18 19 20 21 22  
23 24 25 26 27 28 29  
30

cal -h 6 2019 | tr ' ' .
.....June.2019........
Su.Mo.Tu.We.Th.Fr.Sa..
...................1..
.2..3..4..5..6..7..8..
.9.10.11.12.13.14.15..
16.17.18.19.20.21.22..
23.24.25.26.27.28.29..
30....................

cal -h 6 2019 | tail -n +3 | tr ' ' .
...................1..
.2..3..4..5..6..7..8..
.9.10.11.12.13.14.15..
16.17.18.19.20.21.22..
23.24.25.26.27.28.29..
30....................
import re   #regular expression
import os   #operating system

command = "cal -h 6 2019 | tail -n +3"   #tail -n +3 starts the output at the third line
count = 0

#Count the calendar rows that have a digit in the Thursday column.
#The with statement closes the pipe even if the loop raises an exception.
with os.popen(command) as lines:   #pipe open
    for line in lines:
        line = line.rstrip("\n")   #Remove the trailing newline.
        #Skip 13 characters; the 14th is the units digit of the Thursday date.
        if re.search(r"^.{13}\d", line):   #or if re.search(r"\d.{8}$", line):
            count += 1

print(f"June 2019 countains {count} Thursdays.")
June 2019 countains 4 Thursdays.

[^] wildcard

"[A-Z]"   #any uppercase letter

#any character except an uppercase letter
r"[] !\"#$%&'()*+,./0123456789:;<=>?@[\^_`abcdefghijklmnopqrstuvwxyz{|}~-]"
"[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]"
"[^A-Z]"
"[^A]"   #any character except uppercase A

q not followed by u?

Does /usr/share/dict/words have any line with q not followed by u?

    if re.search("q[^u]", line, flags = re.IGNORECASE):
    if re.search("q$", line, flags = re.IGNORECASE):
    if re.search("q[^u]|q$", line, flags = re.IGNORECASE):   #The vertical bar means "or".
    if re.search("q([^u]|$)", line, flags = re.IGNORECASE):

What’s the difference between …

    if re.search("[^A]", line, flags = re.IGNORECASE):
    if not re.search("A", line, flags = re.IGNORECASE):

Search for a number that is not part of a longer number

      "100"
"[^0-9]100[^0-9]"
   r"\D100\D"
    r"^100\D"
   r"\D100$"
     "^100$"
r"(^|\D)100(\D|$)"

Search for a word that is not part of a longer word

             "max"
"[^A-Za-z0-9_]max[^A-Za-z0-9_]"
          r"\Wmax\W"
           r"^max\W"
          r"\Wmax$"
            "^max$"
r"(^|\W)max(\W|$)"

Search for a non-printable character

"[ -~]"    #any printable ASCII character except tab
"[ -~\t]"  #any printable ASCII character
"[^ -~\t]" #any nonprintable ASCII character

Search for a dash

"[A-C]"   #search for any of the three characters A, B, C
"[AC-]"   #search for any of the three characters A, C, dash
"[^BC]"   #search for any character except B or C
"[B^C]"   #search for any of the three characters B, caret, C
"[BC]]"   #search for B or C, followed by ]
"[]BC]"   #search for any of the three characters right bracket, B, C
"[]^-]"   #search for any of the three characters ], ^, -

* in a regular expression

"^Manhattan"
"^ Manhattan"
"^  Manhattan"
"^   Manhattan"
"^ *Manhattan"
"21210040"   #area code, zip code
"212.10040"
"212..10040"
"212...10040"
"212.*10040"
    if re.search("^anti.*ism$", line, flags = re.IGNORECASE):   #in /usr/share/dict/words
    if re.search("^[^aeiou]*a[^aeiou]*e[^aeiou]*i[^aeiou]*o[^aeiou]*u[^aeiou]*$", line, flags = re.IGNORECASE):   #in /usr/share/dict/words

Two ways to do the same thing. The second one is better because it’s simpler.

    if re.search("^anti.*", line, flags = re.IGNORECASE):   #in /usr/share/dict/words
    if re.search("^anti", line, flags = re.IGNORECASE):   #in /usr/share/dict/words

Or

"prochoice|prolife"
"pro(choice|life)"                  #a*b+a*c = a*(b+c).  What would go wrong without the parentheses?
"^(anti|pro)(choice|life|abortion)" #six possibilities

Question mark means “zero or one”

"colou?r"      #American or British spelling
"dialog(ue)?s" #What would go wrong without the parentheses?
"^..*$"  #lines consisting of one or more characters
"^.+$"   #lines consisting of one or more characters

Minimum and maximum

"^.....$"  #lines consisting of exactly 5 characters
"^.{5}$"   #lines consisting of exactly 5 characters

"^.{5,7}$" #lines consisting of a minimum of 5 and a maximum of 7 characters
"^.{,7}$"  #lines consisting of a minimum of 0 and a maximum of 7 characters
"^.{5,}$"  #lines consisting of a minimum of 5 characters

#More complicated way to find lines consisting of a minimum of 5 and a maximum of 7 characters.
"^(.{5}|.{6}|.{7})$"

Domain name, e.g., www.bmcc.cuny.edu

A domain name must consist of one or more dot-separated labels, each starting with a letter. If the label contains additional characters, the last character must be a letter or digit. The characters in the middle of the label could be letters, digits, or hyphens. A label must be less than 64 characters long.

#Search for a line consisting of one label.
"^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$"
#Search for a line consisting of a domain name
#of one or more dot-separated labels.
r"^[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$"

Email address

An email address consists of two parts: firstPart@secondPart. Let’s say that the first part can contain the characters A-Za-z0-9!#$%&'*+/=?^_`{|}~.- with two restrictions:

  1. The first and last characters cannot be dots.
  2. There cannot be two or more consecutive dots.
#Search for a line consisting of one email address.

if re.search(r"^[a-z0-9!#$%&'*+/=?^_`{|}~-]([a-z0-9!#$%&'*+/=?^_`{|}~.-]*[a-z0-9!#$%&'*+/=?^_`{|}~-])?@[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$", line, flags = re.IGNORECASE) \
    and not re.search(r"\.\..*@", line):