Beautiful Soup

Documentation

  1. Website.
  2. Documentation.
  3. Wikipedia.

Installation

pip3 search beautifulsoup4
pip3 install beautifulsoup4

pip3 list | awk 'NR <= 2 || $1 == "beautifulsoup4"'
Package        Version   
-------------- ----------
beautifulsoup4 4.8.2 

pip3 show beautifulsoup4
Name: beautifulsoup4
Version: 4.8.2
Summary: Screen-scraping library
Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/
Author: Leonard Richardson
Author-email: leonardr@segfault.org
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages
Requires: soupsieve

pip3 show --files beautifulsoup4 | cat -n

Elements and their children

A web browser would render this page of html as page.html.

htmlElement.contents is a sequence (in fact, it’s a list), so it can be the argument of the len function. On the other hand, htmlElement.children is an iterator (in fact, it’s a list_iterator), so we can give it to a for loop but not to len.

"""
Parse a page of html.
Print the children of the html element and the children of the body element.
"""

import sys
import bs4   #beautiful soup

page = """\
<HTML>
<HEAD>
<TITLE>The Title</TITLE>
</HEAD>

<BODY>
<H1>The Header</H1>

<P>
Paragraph 1
</P>

<P>
Paragraph 2
</P>

<P>
Paragraph 3
</P>
</BODY>
</HTML>"""

soup = bs4.BeautifulSoup(page, "html.parser")   #Try it without the "html.parser".

htmlElement = soup.html   #the first html element in the page
print(f"The {len(htmlElement.contents)} children of the html element are:")

for i, child in enumerate(htmlElement.children, start = 1):
    print(i, child.name, type(child))
print()

bodyElement = soup.body   #the first body element in the page
print(f"The {len(bodyElement.contents)} children of the body element are:")

for i, child in enumerate(bodyElement.children, start = 1):
    print(i, child.name, type(child))

sys.exit(0)

An HTML element has a name, but the name of a NavigableString is None:

The 5 children of the html element are:
1 None <class 'bs4.element.NavigableString'>
2 head <class 'bs4.element.Tag'>
3 None <class 'bs4.element.NavigableString'>
4 body <class 'bs4.element.Tag'>
5 None <class 'bs4.element.NavigableString'>

The 9 children of the body element are:
1 None <class 'bs4.element.NavigableString'>
2 h1 <class 'bs4.element.Tag'>
3 None <class 'bs4.element.NavigableString'>
4 p <class 'bs4.element.Tag'>
5 None <class 'bs4.element.NavigableString'>
6 p <class 'bs4.element.Tag'>
7 None <class 'bs4.element.NavigableString'>
8 p <class 'bs4.element.Tag'>
9 None <class 'bs4.element.NavigableString'>

To examine the three NavigableStrings in the HTML element, insert the following code after the print in the first for loop. They're not very interesting.

    if isinstance(child, bs4.element.NavigableString):
        print(f'"{child}"')

More interesting are the HTML elements that contain content, such as H1 and P.

bodyElement = soup.body   #the first body element in the page
#Keep only the children that are real tags: a NavigableString's name is None.
children = list(filter(lambda child: child.name, bodyElement.children))
print(f"The {len(children)} non-bs4.element.NavigableString children of the body element are:")

number = 1
for child in children:
    print(f'{number} {child.name} {type(child)} "{child.string}"')
    number += 1
The 4 non-bs4.element.NavigableString children of the body element are:
1 h1 <class 'bs4.element.Tag'> "The Header"
2 p <class 'bs4.element.Tag'> "
Paragraph 1
"
3 p <class 'bs4.element.Tag'> "
Paragraph 2
"
4 p <class 'bs4.element.Tag'> "
Paragraph 3
"

Print the total number of elements in a page of HTML

The front page of the New York Times often contains more than a thousand elements.

#Exclude the NavigableStrings: only real tags have a name.
elements = [node for node in soup.descendants if node.name]
print(f"The page contains {len(elements)} elements.")

To find the total number of elements in the body element,

#Same count as above, restricted to the descendants of the body element.
elements = [node for node in soup.body.descendants if node.name]
print(f"The body element contains {len(elements)} elements.")

The find_all method

"""
Print the name and content of every h2 element in the front page of the New York Times.
"""

import sys
import urllib.request
import bs4   #beautiful soup

try:
    response = urllib.request.urlopen("http://nytimes.com/")
except urllib.error.URLError as error:
    print(f"urllib.error.URLError: {error}", file = sys.stderr)
    sys.exit(1)

soup = bs4.BeautifulSoup(response, "html.parser")
response.close()

for tag in soup.find_all("h2"):   #Must be lowercase.
    print(tag.name, tag.string)

sys.exit(0)
h2 Your Thursday Briefing
h2 Listen to ‘The Daily’
h2 In the ‘In Her Words’ Newsletter
h2 After Stone Case, Prosecutors Say They Fear Pressure From Trump
h2 As a Post-Impeachment Trump Pushes the Limits, Republicans Say Little
etc.
h2 Diary of a Song: ‘Dance Monkey’
h2 He Was Fearless on a Football Field. It Was the Future That Scared Him.
h2 The Chaos at Condé Nast
h2 Site Index
h2 Site Information Navigation
#Alternative: pass a list of names to find_all to match any of several tags.
for tag in soup.find_all(["title", "h1", "h2", "h3", "p"]):
#Alternative: pass a compiled regular expression to match h1 through h9.
import re
for tag in soup.find_all(re.compile(r"^h\d$")):

Recursively print the whole tree

"""
Print each element, one per line, indented to show its level of nesting.
If the element contains text, print the text.
"""

import sys
import urllib.request
import bs4   #beautiful soup

#Print one copy of the text contained by these tags.
#Tags absent from this list (script, meta, etc.) are printed without their text.
containers = [
    "h1",   #headers
    "h2",
    "h3",
    "li",   #list item
    "title",
    "a",    #anchor
    "p",    #paragraph
    "span",
    "button",
    "section",
    "div",  #division
    "header"
]

def printtag(tag, depth):
    """
    Recursively print tag and its descendants, one per line,
    indented with one dot per level of nesting.

    tag   -- a bs4 element; a NavigableString (whose name is None) is skipped
    depth -- the current nesting level, used for the dot indentation
    """
    if tag.name:
        print(f'{depth * "."}{tag.name}', end = "")
        #Print the text only once: if tag.string equals the parent's string,
        #the same text was already printed at the parent's level.
        #Use "is not None" (identity), not "!= None": NavigableString overrides
        #equality to compare string content.
        if tag.name in containers and tag.string is not None and tag.string != tag.parent.string:
            print(f" {tag.string}", end = "")
        print()

        if hasattr(tag, "children"):   #LBYL: a NavigableString has no children.
            for child in tag.children:
                printtag(child, depth + 1)

#Fetch the page; handle a network failure the same way the other scripts do.
try:
    response = urllib.request.urlopen("http://nytimes.com/")
except urllib.error.URLError as error:
    print(f"urllib.error.URLError: {error}", file = sys.stderr)
    sys.exit(1)

#The with statement guarantees the socket is closed, even if parsing raises.
with response:
    soup = bs4.BeautifulSoup(response, "html.parser")
printtag(soup.html, 0)
sys.exit(0)
html
.head
..title The New York Times - Breaking News, World News & Multimedia
..meta
..meta
..meta
..meta
..meta
..meta
..meta
etc.
..script
..script
..script
..script
..script
..script
..noscript
...iframe
..div
...script

Upgrade to Python 4

"""
Output a copy of the home page for this course, with all the Python 3 links changed to Python 4.
"""

import sys
import re
import urllib.request
import bs4   #beautiful soup

url = "http://oit2.scps.nyu.edu/~meretzkm/python/"
try:
    response = urllib.request.urlopen(url)
except urllib.error.URLError as error:
    print(f"urllib.error.URLError: {error}", file = sys.stderr)
    sys.exit(1)

soup = bs4.BeautifulSoup(response, "html.parser")
response.close()

for a in soup.find_all("a", href = lambda url: re.search(r"^https://docs\.python\.org/3/", str(url))):
    a["href"] = re.sub(r"^https://docs\.python\.org/3/", "https://docs.python.org/4/", a["href"])

print(soup.prettify())
sys.exit(0)