-
-
Save alexmill/1dbd8b865353994bd5621ad1e884c491 to your computer and use it in GitHub Desktop.
Visualize HTML structure of a BeautifulSoup object
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize HTML structure of a BeautifulSoup object with:
# - vertical connecting lines
# - option to remove attributes
# Forked from Dan Mattera's gist: https://gist.github.com/danmattera/ef11cb37c31d732f9e5d2347eea876c2
# By Alex Miller https://alex.miller.im
from bs4 import BeautifulSoup as BS | |
def BeautifulSoup(X):
    """Parse ``X`` with bs4, always using the "html.parser" backend.

    Fixing the parser here sets the default for every call in this file,
    so the parsed result doesn't always get <html><body> tags added
    by default.
    """
    parser_name = "html.parser"
    return BS(X, parser_name)
def soup_viz(soup, spacing=1, with_attrs=False, output='print'):
    """Render the structure of a soup object as a text tree.

    The soup is prettified, and every resulting line is passed to
    ``write_new_line`` together with the indentation level computed here.

    Args:
        soup: Object exposing ``prettify()`` (e.g. a BeautifulSoup document).
        spacing: Forwarded to ``write_new_line``; width of one indent level.
        with_attrs: Forwarded to ``write_new_line``; keep tag attributes.
        output: ``'print'`` prints the tree and returns None; any other
            value returns the tree as a string instead.

    Returns:
        None when ``output == 'print'``, otherwise the rendered string.
    """
    rendered_lines = []
    last_tag_indent = 0
    for line in soup.prettify().split("\n"):
        # Index of the first '<'; for a tag line this is also the number of
        # leading spaces prettify() used to indent it.
        indent = line.find("<")
        if indent == -1 or indent > last_tag_indent + 2:
            # No '<' at all, or one far to the right of where a tag could
            # start: treat the line as text/script content and hang it one
            # level below the most recent tag line. last_tag_indent is
            # deliberately NOT advanced here, so several consecutive text
            # lines stay on the same level instead of drifting deeper.
            indent = last_tag_indent + 1
        else:
            last_tag_indent = indent
        rendered_lines.append(
            write_new_line(line, indent, spacing=spacing, with_attrs=with_attrs)
        )

    tree = "".join(rendered_lines)
    if output == 'print':
        print(tree)
        return None
    return tree
def _opening_tag_name(stripped):
    """Return the element name of an opening-tag line, or None if it has none.

    ``stripped`` is a whitespace-stripped line starting with '<'. Lines whose
    second character is not a letter (doctype, comment, processing
    instruction, a bare '<') do not name an element and yield None.
    """
    if not stripped[1:2].isalpha():
        return None
    # Everything before the first '>' is "name attr=... /"; the name is the
    # first whitespace-separated token, minus a self-closing slash.
    head = stripped[1:].split(">", 1)[0].split()
    return head[0].rstrip("/") or None


def write_new_line(line, current_indent, spacing=1, with_attrs=False):
    """Format one prettified line as a row of the text tree.

    Args:
        line: A single line of prettified markup (tag or text).
        current_indent: Number of vertical guide columns to draw before it.
        spacing: Width of each guide column ('│' plus ``spacing - 1`` spaces).
        with_attrs: When True, opening tags are shown verbatim; when False
            they are reduced to ``<name>``.

    Returns:
        The formatted row, terminated by a newline.
    """
    stripped = str(line).strip()
    # One guide column per indent level; a non-positive level draws none.
    gutter = ("│" + " " * (spacing - 1)) * current_indent

    if not stripped.startswith('<'):
        # Plain text line: pad instead of drawing a connector.
        connector = " " * (spacing - 1)
        line_content = stripped
    elif stripped.startswith('</'):
        # Closing tag line.
        connector = '└'
        line_content = stripped
    else:
        # Opening tag line.
        connector = '┌'
        tag_name = None if with_attrs else _opening_tag_name(stripped)
        # Lines without an element name (e.g. a doctype or comment) are kept
        # verbatim rather than raising while trying to read a tag name.
        if tag_name is None:
            line_content = stripped
        else:
            line_content = '<{}>'.format(tag_name)

    return gutter + connector + line_content + "\n"
# Example: render a small nested document with attributes shown.
soup = BeautifulSoup("""<div><div><span>a</span><span>b</span>
<a>link</a></div><a>link1</a><a>link2</a></div>""")
# Ask for the string back: with the default output='print', soup_viz prints
# the tree itself and returns None, so print(viz) would emit a stray "None".
viz = soup_viz(soup, spacing=3, with_attrs=True, output='return')
print(viz)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output of script above should look like: