Class collections.Counter

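Compare the Unix command uniq -c, which prints each distinct line of its (sorted) input preceded by the number of times that line occurs.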

A dictionary that counts items

"""
Count how many times each score appears.
"""

import sys

scores = [10, 9, 10, 8, 11, 10, 12, 8, 11]   #The most frequent score is 10.

counter = {}  #Maps each score to the number of times it has been seen so far.

for s in scores:
    counter[s] = counter.get(s, 0) + 1   #A score that is not yet a key counts as 0.

for s, count in sorted(counter.items()):   #The keys are unique, so this is in order of increasing score.
    print(f"{s:2} {count}")

sys.exit(0)
 8 2
 9 1
10 3
11 2
12 1

A dictionary that you only put keys into

Usually you put keys and values into a dictionary. But there is a special type of dictionary called a collections.Counter. (In the language of Object-Oriented Programming, we say that class collections.Counter is a subclass of class dict.) You put keys into a collections.Counter, and the collections.Counter provides the values for you.

"""
Count how many times each score appears.
"""

import sys
import collections

scores = [10, 9, 10, 8, 11, 10, 12, 8, 11]   #The most frequent score is 10.
counter = collections.Counter(scores)        #Each key is a score, each value is its number of occurrences.

print("In original key order:")
for key in counter:   #Looping through a Counter yields its keys in the order they were first inserted.
    print(f"{key:2} {counter[key]}")

print()

print("In order of decreasing frequency:")
for pair in counter.most_common():   #Also try for pair in reversed(counter.most_common()):
    key, value = pair                #Each pair is a tuple containing a score and its number of occurrences.
    print(f"{key:2} {value}")

print()

print("In order of increasing score:")
for key, value in sorted(counter.items()):   #Also try for key, value in sorted(counter.items(), reverse = True):
    print(f"{key:2} {value}")

sys.exit(0)
In original key order:
10 3
 9 1
 8 2
11 2
12 1

In order of decreasing frequency:
10 3
 8 2
11 2
 9 1
12 1

In order of increasing score:
 8 2
 9 1
10 3
11 2
12 1

Class collections.Counter is a subclass of class dict.

import collections

#Print each type in the method resolution order on a line of its own, starting with Counter itself.
print(*collections.Counter.mro(), sep = "\n")   #mro returns a list of types.
<class 'collections.Counter'>
<class 'dict'>
<class 'object'>

Count the characters in a string instead of the ints in a list

The following program produces exit status 0 if every letter is present, exit status 1 otherwise. See Intersect for another way to find the missing letters.

"""
Count how many times each letter appears.  Is every letter present?
"""

import sys
import string
import collections

s = "Pack my box with five dozen liquor jugs." #pangram

#Prep the patient for surgery: keep only the letters, and make them all lowercase.
s = "".join(filter(str.isalpha, s)).lower()    #s is a string

counter = collections.Counter(s)               #one key for each distinct letter in s

for letter, count in sorted(counter.items()):  #List the lowercase letters in alphabetical order.
    print(letter, count)

#Looking up a key that is not in a Counter yields 0 instead of raising KeyError.
missing = [c for c in string.ascii_lowercase if counter[c] == 0]

for c in missing:
    print(f'"{c}" is missing!')

allPresent = not missing   #True if the list of missing letters is empty.

sys.exit(0 if allPresent else 1)
a 1
b 1
c 1
d 1
e 2
f 1
g 1
h 1
i 3
j 1
k 1
l 1
m 1
n 1
o 3
p 1
q 1
r 1
s 1
t 1
u 2
v 1
w 1
x 1
y 1
z 1

Count manually.

"""
Count how many times each state was entered.
The counts automatically start at 0.
"""

import sys
import collections

counter = collections.Counter() #Create an empty Counter.

#Enter the states one at a time.  No key has to be created in advance,
#because the count for a key that is not yet present automatically starts at 0.
for entered in ("NY", "NY", "NJ", "CT", "NJ", "NY"):
    counter[entered] += 1

for state, count in sorted(counter.items()):   #alphabetical order
    print(state, count)

sys.exit(0)
CT 1
NJ 2
NY 3

Who has the most mouse violations?

An example of counting manually.

"""
List the CAMIS number, name, and number of mouse violations
of the 10 restaurants with the largest number of mouse violations.
"""

import sys
import csv  #Comma-Separated Values
import urllib.request
import collections

#Database is at
#https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j
url = "https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv"

#Download the whole file.  The with statement closes the connection even if read
#raises an exception.  OSError is the base class of urllib.error.URLError, so this
#catches a failure to connect as well as a connection that breaks during the download.
try:
    with urllib.request.urlopen(url) as fileFromUrl:
        sequenceOfBytes = fileFromUrl.read() #Slurp whole file into one big sequenceOfBytes.
except OSError as error:
    print(error, file = sys.stderr)
    sys.exit(1)

try:
    s = sequenceOfBytes.decode("utf-8")    #s is a string
except UnicodeError as error:
    print(error, file = sys.stderr)
    sys.exit(1)

lines = csv.reader(s.splitlines())   #lines is an iterator that yields one list of strings per row.

#The text to search for in column 11 of each row.
mouseViolation = "Evidence of mice or live mice present in facility's food and/or non-food areas."

#Two dictionaries that let you look up a CAMIS number and find the corresponding ...
dba = {}                                   #name of the restaurant
numberOfViolations = collections.Counter() #number of mouse violations for that restaurant

for line in lines:                     #line is a list of strings, one per column
    if len(line) <= 11:                #Skip a blank or truncated row instead of raising IndexError.
        continue
    if mouseViolation in line[11]:
        camis = int(line[0])           #the id number of the restaurant
        dba[camis] = line[1]           #Record the name of this restaurant.
        numberOfViolations[camis] += 1 #Tally an additional violation.  Automatically starts at 0.

for camis, n in numberOfViolations.most_common(10): #the 10 worst offenders, starting with the worst
    print(f"{camis:8} {n:2} {dba[camis]}")

sys.exit(0)
50016943 13 EL NUEVO ROBLE BILLIARDS
50046623 12 COLD STONE CREAMERY
40423819 12 ALFONSO'S PASTRY SHOPPE
50015263 12 LA POSADA MEXICAN FOOD
41259444 12 COCO ROCO RESTAURANT
50058969 11 CAFE CREOLE
41642251 11 LITTLE CAESARS
50035603 11 AUTHENTIC SZECHUAN
41407999 11 BERMUDEZ BAKERY
50038412 11 TWIN SISTER PAN

Things to try

  1. After creating the variable lines, change the rest of the above program to the following. What are the tradeoffs?
    mouseViolation = "Evidence of mice or live mice present in facility's food and/or non-food areas."
    
    #Each key is a tuple containing the CAMIS number and name of a restaurant.
    #Each value is the number of rows in which that tuple was seen.
    violations = collections.Counter(
        (int(row[0]), row[1])
        for row in lines
        if mouseViolation in row[11]
    )
    
    for restaurant, n in violations.most_common(10): #Start with worst offender.
        camis, dba = restaurant
        print(f"{camis:8} {n:2} {dba}")
    
    sys.exit(0)
    
    50016943 13 EL NUEVO ROBLE BILLIARDS
    50046623 12 COLD STONE CREAMERY
    40423819 12 ALFONSO'S PASTRY SHOPPE
    50015263 12 LA POSADA MEXICAN FOOD
    41259444 12 COCO ROCO RESTAURANT
    50058969 11 CAFE CREOLE
    41642251 11 LITTLE CAESARS
    50035603 11 AUTHENTIC SZECHUAN
    41407999 11 BERMUDEZ BAKERY
    50038412 11 TWIN SISTER PAN
    
    What I really want to create is a dictionary in which each key is a CAMIS number, and each value is a tuple containing the restaurant’s name and number of mouse violations. Then I’d like to treat this dictionary as a collections.Counter based on the number of mouse violations (and ignoring the name of the restaurant). Is this possible?
  2. The first example in pandas:
    import sys
    import pandas as pd
    
    scores = [10, 9, 10, 8, 11, 10, 12, 8, 11]   #The most frequent score is 10.
    series = pd.Series(scores)                   #series is a pandas Series.
    
    print("The original series:")
    print(series)
    print()
    
    #counts has one row per distinct score.  The index holds the scores, the values
    #hold the numbers of occurrences, and the most frequent score comes first.
    counts = series.value_counts(sort = True)
    
    print("In order of decreasing frequency:")
    print(counts) #also try print(series.value_counts(sort = True, ascending = True))
    print()
    
    #value_counts(sort = False) does not put the rows in order of score (it may leave
    #them in the order in which the scores first appear), so sort by the index explicitly.
    countsByScore = series.value_counts(sort = False).sort_index()
    
    print("In order of increasing score:")
    print(countsByScore) #also try print(series.value_counts(sort = False).sort_index(ascending = False))
    print()
    
    mostFrequentScore = counts.index[0]   #the label of the first row
    numberOfTimes = counts.iloc[0]        #the value in the first row; iloc selects by position, not by label.
    print(f"The most frequently occurring score is {mostFrequentScore}.")
    print(f"It occurs {numberOfTimes} times.")
    sys.exit(0)
    
    The original series:
    0    10
    1     9
    2    10
    3     8
    4    11
    5    10
    6    12
    7     8
    8    11
    dtype: int64
    
    In order of decreasing frequency:
    10    3
    11    2
    8     2
    12    1
    9     1
    dtype: int64
    
    In order of increasing score:
    8     2
    9     1
    10    3
    11    2
    12    1
    dtype: int64
    
    The most frequently occurring score is 10.
    It occurs 3 times.