from bs4 import BeautifulSoup

# Choose a parser when building the soup:
soup = BeautifulSoup(html, "lxml")         # recommended: fast and lenient (requires lxml)
soup = BeautifulSoup(html, "html.parser")  # pure stdlib, no extra dependencies
soup = BeautifulSoup(xml, "lxml-xml")      # XML documents
Core Objects
Type
Description
BeautifulSoup
Root document object, subclass of Tag
Tag
An HTML/XML tag and its contents
NavigableString
Text inside a tag
Comment
HTML comment (subclass of NavigableString)
Finding Elements
Primary methods — work on both BeautifulSoup and Tag objects:
# find — first match, or None when nothing matches
soup.find("div")
soup.find("a", class_="link")
soup.find("input", {"type": "text"})

# find_all — list of every match (legacy alias: findAll)
soup.find_all("p")
soup.find_all(["h1", "h2", "h3"])   # any of several tag names
soup.find_all("a", limit=5)         # stop after five hits
soup.find_all(True)                 # every tag in the document

# CSS selectors
soup.select("div.container > p")
soup.select_one("a[href]")
soup.select("ul li:nth-of-type(2)")

# Attribute filters
soup.find_all(id="main")
soup.find_all(class_="active")              # trailing underscore dodges the `class` keyword
soup.find_all(attrs={"data-id": "42"})      # attrs dict for names that aren't identifiers
Regex and functions as filters:
import re

# Compiled patterns match against attribute values or tag names.
soup.find_all("a", href=re.compile(r"^https"))
soup.find_all(re.compile(r"^h[1-6]$"))   # every heading level

# A callable filter receives each Tag and keeps it when it returns True.
soup.find_all(lambda tag: tag.has_attr("data-custom") and tag.name == "div")
Navigating the Tree
tag = soup.find("div")

# Children and descendants
tag.children                 # direct children (iterator)
tag.descendants              # all nested nodes (iterator)
tag.contents                 # direct children (list)
list(tag.strings)            # every nested text node
list(tag.stripped_strings)   # same, whitespace-stripped and empties dropped

# Parents
tag.parent
list(tag.parents)            # ancestors, all the way to the document

# Siblings
tag.next_sibling
tag.previous_sibling
list(tag.next_siblings)
list(tag.previous_siblings)

# Parse-tree order (element-by-element traversal)
tag.next_element
tag.previous_element
Accessing Attributes & Text
tag = soup.find("a")

# Attribute access
tag["href"]           # raises KeyError when the attribute is absent
tag.get("href")       # returns None when absent
tag.get("class", [])  # "class" is multi-valued: always a list
tag.attrs             # every attribute as a plain dict
tag.has_attr("id")

# Text access
tag.string            # NavigableString when exactly one text child, else None
tag.get_text()        # all nested text joined together
tag.get_text(separator=" ", strip=True)
Modifying the Tree
# Set / delete attributes
tag["class"] = "new-class"
tag["data-x"] = "value"
del tag["id"]

# Rewrite and insert content
tag.string.replace_with("new text")
tag.append(new_tag)
tag.insert(0, new_tag)
tag.insert_before(sibling)
tag.insert_after(sibling)

# Remove nodes
tag.decompose()              # detach tag + contents and destroy them
tag.extract()                # detach and return the tag for reuse
tag.replace_with(other_tag)

# Build brand-new nodes
new_tag = soup.new_tag("a", href="https://example.com")
new_tag.string = "Click here"
soup.body.append(new_tag)
new_string = soup.new_string("some text")
Output
str(soup)             # whole document as an HTML string
soup.prettify()       # indented, unicode rendering
soup.encode("utf-8")  # document as bytes
tag.decode()          # a single tag as str
tag.encode()          # a single tag as bytes
Common Patterns
Scraping links:
# href=True keeps only anchors that actually carry an href attribute.
for a in soup.find_all("a", href=True):
    print(a["href"], a.get_text(strip=True))
Scraping a table:
# One inner list of stripped cell texts per body row.
rows = soup.select("table tbody tr")
data = [[td.get_text(strip=True) for td in row.find_all("td")] for row in rows]
# Example 1: Basic HTML parsing
html_string = """<html>
  <body>
    <div class="container">
      <h1>Welcome to BeautifulSoup</h1>
      <p class="intro">This is an introduction paragraph.</p>
      <ul>
        <li><a href="https://example.com">Example Link</a></li>
        <li><a href="https://python.org">Python</a></li>
        <li><a href="https://github.com">GitHub</a></li>
      </ul>
    </div>
  </body>
</html>"""

soup = BeautifulSoup(html_string, "lxml")
print("Parsed HTML document:")
# Show just the first 300 characters of the pretty-printed tree.
print(soup.prettify()[:300] + "...\n")
Parsed HTML document:
<html>
<body>
<div class="container">
<h1>
Welcome to BeautifulSoup
</h1>
<p class="intro">
This is an introduction paragraph.
</p>
<ul>
<li>
<a href="https://example.com">
Example Link
</a>
</li>
<li>
<a href="https://python.org">
Pyt...
# Example 2: Finding elements
print("=== Finding Elements ===\n")

# find() yields the first match (or None)
h1 = soup.find("h1")
print(f"First h1: {h1}")

# find_all() yields a list of matches
links = soup.find_all("a")
print(f"All links found: {len(links)}")
for link in links:
    print(f" - {link.get_text()}: {link.get('href')}")

# CSS selector, single result
container = soup.select_one("div.container")
print(f"Container found: {container.name if container else 'None'}")

# Filter by attribute
paragraphs = soup.find_all("p", class_="intro")
print(f"Paragraphs with class 'intro': {len(paragraphs)}")
=== Finding Elements ===
First h1: <h1>Welcome to BeautifulSoup</h1>
All links found: 3
- Example Link: https://example.com
- Python: https://python.org
- GitHub: https://github.com
Container found: div
Paragraphs with class 'intro': 1
# Example 3: Navigating the tree
print("\n=== Navigating the Tree ===\n")

first_link = soup.find("a")
print(f"First link: {first_link.get_text()}")

# Upward: the enclosing tag
parent = first_link.parent
print(f"Parent tag: {parent.name}")  # li

# Sideways: siblings (next_sibling may be a text node or None)
next_sibling = first_link.next_sibling
print(f"Next sibling (text): {repr(next_sibling)}")
next_tag_sibling = first_link.find_next_sibling("a")
print(f"Next <a> sibling: {next_tag_sibling.get_text() if next_tag_sibling else 'None'}")

# Downward: all stripped text under the container
container = soup.find("div", class_="container")
all_text = list(container.stripped_strings)
print(f"All text in container: {all_text[:5]}...")
=== Navigating the Tree ===
First link: Example Link
Parent tag: li
Next sibling (text): None
Next <a> sibling: None
All text in container: ['Welcome to BeautifulSoup', 'This is an introduction paragraph.', 'Example Link', 'Python', 'GitHub']...
# Example 4: Accessing attributes & text
print("\n=== Accessing Attributes & Text ===\n")

link = soup.find("a")
print(f"href attribute: {link['href']}")                          # KeyError if missing
print(f"href (safe get): {link.get('href')}")                     # None if missing
print(f"data-custom (missing): {link.get('data-custom', 'N/A')}") # with a default
print(f"All attributes: {link.attrs}")
print(f"Has 'href'?: {link.has_attr('href')}")

# Text content
h1 = soup.find("h1")
print(f"Text content: {h1.get_text()}")
print(f"Text with strip: {h1.get_text(strip=True)}")
=== Accessing Attributes & Text ===
href attribute: https://example.com
href (safe get): https://example.com
data-custom (missing): N/A
All attributes: {'href': 'https://example.com'}
Has 'href'?: True
Text content: Welcome to BeautifulSoup
Text with strip: Welcome to BeautifulSoup
# Example 5: Modifying the tree
print("\n=== Modifying the Tree ===\n")

# Work on a re-parsed copy so the original soup stays untouched.
soup_copy = BeautifulSoup(str(soup), "lxml")

# Rewrite attributes in place
link = soup_copy.find("a")
link["data-modified"] = "true"
link["class"] = "external-link"
print(f"Modified link: {link}")

# Build a fresh tag and attach it
new_tag = soup_copy.new_tag("p", attrs={"class": "note"})
new_tag.string = "This is a new paragraph!"
container = soup_copy.find("div", class_="container")
container.append(new_tag)
print(f"New paragraph added: {new_tag}")

# Detach a tag while keeping it usable
extracted = soup_copy.find("h1").extract()
print(f"Extracted h1 from tree: {extracted}")
=== Modifying the Tree ===
Modified link: <a class="external-link" data-modified="true" href="https://example.com">Example Link</a>
New paragraph added: <p class="note">This is a new paragraph!</p>
Extracted h1 from tree: <h1>Welcome to BeautifulSoup</h1>
# Example 6: Scraping a table
print("\n=== Scraping a Table ===\n")

table_html = """<table>
  <thead>
    <tr><th>Name</th><th>Age</th><th>City</th></tr>
  </thead>
  <tbody>
    <tr><td>Alice</td><td>28</td><td>New York</td></tr>
    <tr><td>Bob</td><td>35</td><td>San Francisco</td></tr>
    <tr><td>Charlie</td><td>31</td><td>Boston</td></tr>
  </tbody>
</table>"""

table_soup = BeautifulSoup(table_html, "lxml")

# Header cells
headers = [th.get_text(strip=True) for th in table_soup.find_all("th")]
print(f"Headers: {headers}")

# Body rows: slice off the first <tr> (the header row)
rows = table_soup.find_all("tr")[1:]
data = []
for row in rows:
    cols = [td.get_text(strip=True) for td in row.find_all("td")]
    data.append(cols)
    print(f" {cols}")

print(f"\nTotal rows scraped: {len(data)}")