pip3 search beautifulsoup4
(Note: PyPI has since disabled the pip search command; search for packages at https://pypi.org instead.)
pip3 install beautifulsoup4

pip3 list | awk 'NR <= 2 || $1 == "beautifulsoup4"'
Package        Version
-------------- ----------
beautifulsoup4 4.8.2

pip3 show beautifulsoup4
Name: beautifulsoup4
Version: 4.8.2
Summary: Screen-scraping library
Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/
Author: Leonard Richardson
Author-email: leonardr@segfault.org
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages
Requires: soupsieve

pip3 show --files beautifulsoup4 | cat -n
A web browser would render this page of html as
page.html
.
htmlElement.contents
is a
sequence
(in fact, it’s a
list
),
so it can be the argument of the
len
function.
On the other hand,
htmlElement.children
is an
iterator
(in fact, it’s a
list_iterator
),
so we can give it to a
for
loop but not to
len
.
""" Parse a page of html. Print the children of the html element and the children of the body element. """ import sys import bs4 #beautiful soup page = """\ <HTML> <HEAD> <TITLE>The Title</TITLE> </HEAD> <BODY> <H1>The Header</H1> <P> Paragraph 1 </P> <P> Paragraph 2 </P> <P> Paragraph 3 </P> </BODY> </HTML>""" soup = bs4.BeautifulSoup(page, "html.parser") #Try it without the "html.parser". htmlElement = soup.html #the first html element in the page print(f"The {len(htmlElement.contents)} children of the html element are:") for i, child in enumerate(htmlElement.children, start = 1): print(i, child.name, type(child)) print() bodyElement = soup.body #the first body element in the page print(f"The {len(bodyElement.contents)} children of the body element are:") for i, child in enumerate(bodyElement.children, start = 1): print(i, child.name, type(child)) sys.exit(0)
An HTML element has a
name
,
but the
name
of a
NavigableString
is
None
:
The 5 children of the html element are: 1 None <class 'bs4.element.NavigableString'> 2 head <class 'bs4.element.Tag'> 3 None <class 'bs4.element.NavigableString'> 4 body <class 'bs4.element.Tag'> 5 None <class 'bs4.element.NavigableString'> The 9 children of the body element are: 1 None <class 'bs4.element.NavigableString'> 2 h1 <class 'bs4.element.Tag'> 3 None <class 'bs4.element.NavigableString'> 4 p <class 'bs4.element.Tag'> 5 None <class 'bs4.element.NavigableString'> 6 p <class 'bs4.element.Tag'> 7 None <class 'bs4.element.NavigableString'> 8 p <class 'bs4.element.Tag'> 9 None <class 'bs4.element.NavigableString'>
To examine the three
NavigableString
s
in the
HTML
element,
insert the following code after the
print
in the first
for
loop.
They’s not very interesting.
#Show each NavigableString child quoted, so its surrounding
#whitespace (the newlines between tags) is visible.
if isinstance(child, bs4.element.NavigableString):
    print(f'"{child}"')
More interesting are the HTML elements that contain content,
such as
H1
and
P
.
#The first body element in the page.
body = soup.body

#Keep only the children that are real tags:
#a NavigableString's name is None, so the filter drops it.
tags = [node for node in body.children if node.name]

print(f"The {len(tags)} non-bs4.element.NavigableString children of the body element are:")
for number, tag in enumerate(tags, start = 1):
    print(f'{number} {tag.name} {type(tag)} "{tag.string}"')
The 4 non-bs4.element.NavigableString children of the body element are: 1 h1 <class 'bs4.element.Tag'> "The Header" 2 p <class 'bs4.element.Tag'> " Paragraph 1 " 3 p <class 'bs4.element.Tag'> " Paragraph 2 " 4 p <class 'bs4.element.Tag'> " Paragraph 3 "
The front page of the New York Times often contains more than a thousand elements.
#Walk every descendant of the whole document, excluding the
#NavigableStrings (whose name is None) so that only tags are counted.
elements = [descendant for descendant in soup.descendants if descendant.name]
print(f"The page contains {len(elements)} elements.")
To find the total number of elements in the
body
element,
#The same census, but rooted at the body element instead of the whole page.
#The name test excludes the NavigableStrings.
elements = [descendant for descendant in soup.body.descendants if descendant.name]
print(f"The body element contains {len(elements)} elements.")
""" Print the name and content of every h2 element in the front page of the New York Times. """ import sys import urllib.request import bs4 #beautiful soup try: response = urllib.request.urlopen("http://nytimes.com/") except urllib.error.URLError as error: print(f"urllib.error.URLError: {error}", file = sys.stderr) sys.exit(1) soup = bs4.BeautifulSoup(response, "html.parser") response.close() for tag in soup.find_all("h2"): #Must be lowercase. print(tag.name, tag.string) sys.exit(0)
h2 Your Thursday Briefing h2 Listen to ‘The Daily’ h2 In the ‘In Her Words’ Newsletter h2 After Stone Case, Prosecutors Say They Fear Pressure From Trump h2 As a Post-Impeachment Trump Pushes the Limits, Republicans Say Little etc. h2 Diary of a Song: ‘Dance Monkey’ h2 He Was Fearless on a Football Field. It Was the Future That Scared Him. h2 The Chaos at Condé Nast h2 Site Index h2 Site Information Navigation
#find_all also accepts a list of names: match an element with any of them.
for tag in soup.find_all(["title", "h1", "h2", "h3", "p"]):
import re
#find_all also accepts a compiled regular expression:
#match any element whose whole name is h followed by one digit (h1 through h9).
for tag in soup.find_all(re.compile(r"^h\d$")):
""" Print each element, one per line, indented to show its level of nesting. If the element contains text, print the text. """ import sys import urllib.request import bs4 #beautiful soup #Print one copy of the text contained by these tags. containers = [ "h1", #headers "h2", "h3", "li", #list "title", "a", #anchor "p", #paragraph "span", "button", "section", "div", #division "header" ] def printtag(tag, depth): if tag.name: print(f'{depth * "."}{tag.name}', end = "") if tag.name in containers and tag.string != None and tag.string != tag.parent.string: print(f" {tag.string}", end = "") print() if hasattr(tag, "children"): #LBYL for child in tag.children: printtag(child, depth + 1) response = urllib.request.urlopen("http://nytimes.com/") soup = bs4.BeautifulSoup(response, "html.parser") response.close() printtag(soup.html, 0) sys.exit(0)
html .head ..title The New York Times - Breaking News, World News & Multimedia ..meta ..meta ..meta ..meta ..meta ..meta ..meta etc. ..script ..script ..script ..script ..script ..script ..noscript ...iframe ..div ...script
""" Output a copy of the home page for this course, with all the Python 3 links changed to Python 4. """ import sys import re import urllib.request import bs4 #beautiful soup url = "http://oit2.scps.nyu.edu/~meretzkm/python/" try: response = urllib.request.urlopen(url) except urllib.error.URLError as error: print(f"urllib.error.URLError: {error}", file = sys.stderr) sys.exit(1) soup = bs4.BeautifulSoup(response, "html.parser") response.close() for a in soup.find_all("a", href = lambda url: re.search(r"^https://docs\.python\.org/3/", str(url))): a["href"] = re.sub(r"^https://docs\.python\.org/3/", "https://docs.python.org/4/", a["href"]) print(soup.prettify()) sys.exit(0)