re
module.
r"\n"
instead of
"\\n"
.
The
Backslash Plague.
#Each literal below is printed on its own line.  The last four form two
#pairs: an ordinary string with doubled backslashes, then the raw string
#that prints the same text.
samples = [
    "\thello",                        #one tab character
    "\\thello",                       #a backslash and a lowercase t
    "The \\ is a backslash.",
    r"The \ is a backslash.",
    "The \\\\ is two backslashes.",
    r"The \\ is two backslashes.",
]

for text in samples:
    print(text)
hello \thello The \ is a backslash. The \ is a backslash. The \\ is two backslashes. The \\ is two backslashes.
import sys
import re   #regular expressions

#Print every line of the file that matches the regular expression.
filename = "infile.txt"   #e.g., macOS "/usr/share/dict/words"

#Keep the try body down to the one call that can raise these exceptions.
try:
    lines = open(filename)
except FileNotFoundError:
    print(f"Sorry, could not find file \"{filename}\".")
    sys.exit(1)
except PermissionError:
    print(f"Sorry, no permission to open file \"{filename}\".")
    sys.exit(1)

#The with statement closes the file even if the loop raises an exception.
with lines:
    for line in lines:
        line = line.rstrip("\n")   #Remove the trailing newline.
        if re.search("hello, there", line):   #or if not
            print(line)

sys.exit(0)
You can combine multiple flags with the
bitwise
|
operator.
if re.search("hello, there", line, flags = re.IGNORECASE):
if re.search("^anti", line, flags = re.IGNORECASE):
if re.search("phobia$", line, flags = re.IGNORECASE):
if re.search("^$", line): #Search for empty lines.
if re.search("\\$100", line): #Search for one hundred dollars.
if re.search(r"\$100", line): #Search for one hundred dollars.
if re.search("\\^", line): #Search for a caret.
if re.search(r"\^", line): #Search for a caret.
if re.search(r"^\^", line): #Search for lines that begin with a caret.
if re.search("sep.rate", line, flags = re.IGNORECASE): #Search for separate, seperate, etc.
if re.search("^...u.$", line, flags = re.IGNORECASE): #with the anchors
if re.search("...u.", line, flags = re.IGNORECASE): #without the anchors
if re.search("^b.g$", line, flags = re.IGNORECASE):
if re.search("^p.t$", line, flags = re.IGNORECASE):
if re.search("^.....$", line): #all lines of exactly 5 characters, no more and no less
if re.search(".....", line): #all lines of 5 or more characters
if re.search("^.{5}$", line): #all lines of exactly 5 characters, no more and no less
if re.search("...ism$", line, flags = re.IGNORECASE): #ideological movement
if re.search(".", line): #lines containing a character (or maybe more)
if re.search("[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", line): #lines containing an uppercase letter
if re.search("[A-Z]", line): #no space around dash, can't say [Z-A]
if re.search("^[A-K]", line, flags = re.IGNORECASE): #lines starting with letter in 1st half of alphabet
if re.search("^[L-Z]", line, flags = re.IGNORECASE): #lines starting with letter in 2nd half of alphabet
if re.search("[a-z]", line): #any lowercase letter
if re.search("[0-9]", line): #any decimal digit
if re.search("19[0-9][0-9]", line): #any year in the 1900's
if re.search("[CcTt][sz]ar", line): #in search of Russia's imperial past
if re.search("[0-7]", line): #any octal digit
if re.search("[0-9A-Fa-f]", line): #any hexadecimal digit
if re.search("[0-9][0-9][0-9][0-9][0-9]", line): #any zip code
See also
\s
,
etc.
if re.search(r"^\d\d\d\d\d$", line): #any zip code
if re.search(r"^\d{5}$", line): #any zip code
"^[A-Z][0-9][A-Z] [0-9][A-Z][0-9]$" #Canadian postal code, e.g. A2B 3C4
Can’t use the letters D, F, I, O, Q or U; first letter can’t be W or Z.
"^[A-CEGHJ-NPR-TVXY][0-9][A-CEGHJ-NPR-TV-Z] [0-9][A-CEGHJ-NPR-TV-Z][0-9]$" r"^[A-CEGHJ-NPR-TVXY]\d[A-CEGHJ-NPR-TV-Z] \d[A-CEGHJ-NPR-TV-Z]\d$"
"^[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]$" #social security number r"^\d\d\d-\d\d-\d\d\d\d$" r"^\d{3}-\d{2}-\d{4}$"
1-800-737-3783 spells “reserve”.
"^[pqrs][def][pqrs][def][pqrs][tuv][def]$" "[pqrs][def][pqrs][def][pqrs][tuv][def]"
cal -h 6 2019 June 2019 Su Mo Tu We Th Fr Sa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 cal -h 6 2019 | tr ' ' . .....June.2019........ Su.Mo.Tu.We.Th.Fr.Sa.. ...................1.. .2..3..4..5..6..7..8.. .9.10.11.12.13.14.15.. 16.17.18.19.20.21.22.. 23.24.25.26.27.28.29.. 30.................... cal -h 6 2019 | tail -n +3 | tr ' ' . ...................1.. .2..3..4..5..6..7..8.. .9.10.11.12.13.14.15.. 16.17.18.19.20.21.22.. 23.24.25.26.27.28.29.. 30....................
import re   #regular expression
import os   #operating system

#Count the Thursdays in June 2019 by reading the output of cal.
#tail -n +3 drops the first two lines of cal's output.
command = "cal -h 6 2019 | tail -n +3"

count = 0

#The with statement closes the pipe even if the loop raises an exception.
with os.popen(command) as lines:   #pipe open
    for line in lines:
        line = line.rstrip("\n")   #Remove the trailing newline.
        #Count the line if the character at index 13 is a digit.
        if re.search(r"^.{13}\d", line):   #or if re.search(r"\d.{8}$", line):
            count += 1

print(f"June 2019 countains {count} Thursdays.")
June 2019 countains 4 Thursdays.
"[A-Z]" #any uppercase letter #any character except an uppercase letter r"[] !\"#$%&'()*+,./0123456789:;<=>?@[\^_`abcdefghijklmnopqrstuvwxyz{|}~-]" "[^ABCDEFGHIJKLMNOPQRSTUVWXYZ]" "[^A-Z]" "[^A]" #any character except uppercase A
Does
/usr/share/dict/words
have any line with q not followed by u?
if re.search("q[^u]", line, flags = re.IGNORECASE):
if re.search("q$", line, flags = re.IGNORECASE):
if re.search("q[^u]|q$", line, flags = re.IGNORECASE): #The vertical bar means "or".
if re.search("q([^u]|$)", line, flags = re.IGNORECASE):
if re.search("[^A]", line, flags = re.IGNORECASE):
if not re.search("A", line, flags = re.IGNORECASE):
"100" "[^0-9]100[^0-9]" r"\D100\D" r"^100\D" r"\D100$" "^100$"
r"(^|\D)100(\D|$)"
"max" "[^A-Za-z0-9_]max[^A-Za-z0-9_]" r"\Wmax\W" r"^max\W" r"\Wmax$" "^max$"
r"(^|\W)max(\W|$)"
"[ -~]" #any printable ASCII character except tab "[ -~\t]" #any printable ASCII character "[^ -~\t]" #any nonprintable ASCII character
"[A-C]" #search for any of the three characters A, B, C "[AC-]" #search for any of the three characters A, C, dash
"[^BC]" #search for any character except B or C "[B^C]" #search for any of the three characters B, caret, C
"[BC]]" #search for B or C, followed by ] "[]BC]" #search for any of the three characters right bracket, B, C
"[]^-]" #search for any of the three characters ], ^, -
"^Manhattan" "^ Manhattan" "^  Manhattan" "^   Manhattan" "^ *Manhattan"
"21210040" #area code, zip code "212.10040" "212..10040" "212...10040" "212.*10040"
if re.search("^anti.*ism$", line, flags = re.IGNORECASE): #in /usr/share/dict/words
if re.search("^[^aeiou]*a[^aeiou]*e[^aeiou]*i[^aeiou]*o[^aeiou]*u[^aeiou]*$", line, flags = re.IGNORECASE): #in /usr/share/dict/words
Two ways to do the same thing. The second one is better because it’s simpler.
if re.search("^anti.*", line, flags = re.IGNORECASE): #in /usr/share/dict/words
if re.search("^anti", line, flags = re.IGNORECASE): #in /usr/share/dict/words
"prochoice|prolife" "pro(choice|life)" #a*b+a*c = a*(b+c). What would go wrong without the parentheses? "^(anti|pro)(choice|life|abortion)" #six possibilities
"colou?r" #American or British spelling "dialog(ue)?s" #What would go wrong without the parentheses?
"^..*$" #lines consisting of one or more characters "^.+$" #lines consisting of one or more characters
"^.....$" #lines consisting of exactly 5 characters "^.{5}$" #lines consisting of exactly 5 characters "^.{5,7}$" #lines consisting of a minimum of 5 and a maximum of 7 characters "^.{,7}$" #lines consisting of a minimum of 0 and a maximum of 7 characters "^.{5,}$" #lines consisting of a minimum of 5 characters #More complicated way to find lines consisting of a minimum of 5 and a maximum of 7 characters. "^(.{5}|.{6}|.{7})$"
A domain name must consist of one or more dot-separated labels, each starting with a letter. If the label contains additional characters, the last character must be a letter or digit. The characters in the middle of the label could be letters, digits, or hyphens. A label must be less than 64 characters long.
#Search for a line consisting of one label. "^[a-z]([a-z0-9-]{0,61}[a-z0-9])?$"
#Search for a line consisting of a domain name #of one or more dot-separated labels. r"^[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$"
An email address consists of two parts:
firstPart@secondPart
.
Let’s say that the first part can contain the characters
A-Za-z0-9!#$%&'*+/=?^_`{|}~.-
with two restrictions: it cannot begin or end with a dot, and it cannot contain two consecutive dots.
#Search for a line consisting of one email address.
if re.search(r"^[a-z0-9!#$%&'*+/=?^_`{|}~-]([a-z0-9!#$%&'*+/=?^_`{|}~.-]*[a-z0-9!#$%&'*+/=?^_`{|}~-])?@[a-z]([a-z0-9-]{0,61}[a-z0-9])?(\.[a-z]([a-z0-9-]{0,61}[a-z0-9])?)*$", line, flags = re.IGNORECASE) \
    and not re.search(r"\.\..*@", line):