alexmill · October 2, 2019 01:33 · alexmill · Sep 30, 2019
diff --git a/soup_viz.py b/soup_viz.py
 # Visualize HTML structure of a BeautifulSoup object with:
 #	- vertical connecting lines
 #	- option to remove attributes
 # Forked from Dan Mattera's gist: https://gist.github.com/danmattera/ef11cb37c31d732f9e5d2347eea876c2
 # By Alex Miller https://alex.miller.im
 from bs4 import BeautifulSoup as BS

 def BeautifulSoup(X):
    """Parse ``X`` with bs4 using the "html.parser" backend.

    Pinning the parser here means the result is not always wrapped in
    <html><body> tags by default.
    """
    parsed = BS(X, "html.parser")
    return parsed

 def soup_viz(soup, spacing=1, with_attrs=False, output='print'):
    """Render the tag hierarchy of a soup object as a text tree.

    Every line of ``soup.prettify()`` is handed to ``write_new_line``
    together with a nesting depth taken from the column of its first '<'.

    Args:
        soup: Object whose ``prettify()`` method returns the indented markup.
        spacing: Forwarded to ``write_new_line`` (width of one guide column).
        with_attrs: Forwarded to ``write_new_line`` (keep tag attributes).
        output: ``'print'`` prints the tree and returns None; any other
            value returns the tree as a string instead of printing it.

    Returns:
        The rendered tree when ``output`` is not ``'print'``, else None.
    """
    pretty_lines = soup.prettify().split("\n")
    # If the prettified markup ends with a newline, split() leaves an empty
    # final element; drawing it would add a dangling guide row at the bottom.
    if not pretty_lines[-1]:
        pretty_lines.pop()

    rendered = []
    previous_indent = 0
    for line in pretty_lines:
        # Column of the first '<', used as the nesting depth of the line.
        current_indent = str(line).find("<")
        # -1 means the line has no '<' (text or script content); such a line,
        # or one whose '<' sits far to the right of the previous depth, is
        # treated as a child of the previous line.
        if current_indent == -1 or current_indent > previous_indent + 2:
            current_indent = previous_indent + 1
        previous_indent = current_indent
        rendered.append(
            write_new_line(line, current_indent, spacing=spacing, with_attrs=with_attrs)
        )

    pretty_soup = "".join(rendered)
    if output == 'print':
        print(pretty_soup)
        return None
    return pretty_soup
        
        
 def write_new_line(line, current_indent, spacing=1, with_attrs=False):
    """Format one line of prettified markup as a row of the tree drawing.

    Args:
        line: A single markup line; surrounding whitespace is ignored.
        current_indent: Nesting depth; one "│" guide column is drawn per level.
        spacing: Width in characters of each guide column.
        with_attrs: When true, opening tags are kept verbatim; otherwise
            they are reduced to ``<name>``.

    Returns:
        The formatted row, terminated by a newline.
    """
    stripped = str(line).strip()
    # One vertical guide, padded to `spacing` columns, per nesting level.
    new_line = ("│" + " " * (spacing - 1)) * current_indent

    if not stripped.startswith('<'):
        # Plain text line: no connector glyph, only padding.
        connector = " " * (spacing - 1)
        line_content = stripped
    elif stripped.startswith('</'):
        # Closing tag line.
        connector = '└'
        line_content = stripped
    else:
        # Opening tag line (or another '<...' construct).
        connector = '┌'
        line_content = stripped
        # Only '<' followed by a letter starts a real tag. Comments, doctypes
        # and similar lines have no tag name, so they are kept verbatim rather
        # than failing on a missing name.
        if not with_attrs and stripped[1:2].isalpha():
            # The name ends at the first whitespace, '/' or '>'.
            tag_name = stripped[1:].replace('/', ' ').replace('>', ' ').split(None, 1)[0]
            line_content = '<{}>'.format(tag_name)

    return new_line + connector + line_content + "\n"



 # Example: render a small nested document and print the resulting tree.
 soup = BeautifulSoup("""<div><div><span>a</span><span>b</span>
 <a>link</a></div><a>link1</a><a>link2</a></div>""")
 # Ask for the string back (any output value other than 'print'). With the
 # default output='print', soup_viz returns None and print(viz) shows "None".
 viz = soup_viz(soup, spacing=3, with_attrs=True, output='return')
 print(viz)
	# Visualize HTML structure of a BeautifulSoup object with:
	# - vertical connecting lines
	# - option to remove attributes
	# Forked from Dan Mattera's gist: https://gist.github.com/danmattera/ef11cb37c31d732f9e5d2347eea876c2
	# By Alex Miller https://alex.miller.im
	from bs4 import BeautifulSoup as BS

	def BeautifulSoup(X):
	    """Parse ``X`` with bs4 using the "html.parser" backend.

	    Pinning the parser here means the result is not always wrapped in
	    <html><body> tags by default.
	    """
	    return BS(X, "html.parser")

	def soup_viz(soup, spacing=1, with_attrs=False, output='print'):
	    """Render the tag hierarchy of a soup object as a text tree.

	    Every line of ``soup.prettify()`` is handed to ``write_new_line``
	    together with a nesting depth taken from the column of its first '<'.

	    Args:
	        soup: Object whose ``prettify()`` method returns the indented markup.
	        spacing: Forwarded to ``write_new_line`` (width of one guide column).
	        with_attrs: Forwarded to ``write_new_line`` (keep tag attributes).
	        output: ``'print'`` prints the tree and returns None; any other
	            value returns the tree as a string instead of printing it.

	    Returns:
	        The rendered tree when ``output`` is not ``'print'``, else None.
	    """
	    pretty_lines = soup.prettify().split("\n")
	    # If the prettified markup ends with a newline, split() leaves an empty
	    # final element; drawing it would add a dangling guide row at the bottom.
	    if not pretty_lines[-1]:
	        pretty_lines.pop()

	    rendered = []
	    previous_indent = 0
	    for line in pretty_lines:
	        # Column of the first '<', used as the nesting depth of the line.
	        current_indent = str(line).find("<")
	        # -1 means the line has no '<' (text or script content); such a line,
	        # or one whose '<' sits far to the right of the previous depth, is
	        # treated as a child of the previous line.
	        if current_indent == -1 or current_indent > previous_indent + 2:
	            current_indent = previous_indent + 1
	        previous_indent = current_indent
	        rendered.append(
	            write_new_line(line, current_indent, spacing=spacing, with_attrs=with_attrs)
	        )

	    pretty_soup = "".join(rendered)
	    if output == 'print':
	        print(pretty_soup)
	        return None
	    return pretty_soup


	def write_new_line(line, current_indent, spacing=1, with_attrs=False):
	    """Format one line of prettified markup as a row of the tree drawing.

	    Args:
	        line: A single markup line; surrounding whitespace is ignored.
	        current_indent: Nesting depth; one "│" guide column is drawn per level.
	        spacing: Width in characters of each guide column.
	        with_attrs: When true, opening tags are kept verbatim; otherwise
	            they are reduced to ``<name>``.

	    Returns:
	        The formatted row, terminated by a newline.
	    """
	    stripped = str(line).strip()
	    # One vertical guide, padded to `spacing` columns, per nesting level.
	    new_line = ("│" + " " * (spacing - 1)) * current_indent

	    if not stripped.startswith('<'):
	        # Plain text line: no connector glyph, only padding.
	        connector = " " * (spacing - 1)
	        line_content = stripped
	    elif stripped.startswith('</'):
	        # Closing tag line.
	        connector = '└'
	        line_content = stripped
	    else:
	        # Opening tag line (or another '<...' construct).
	        connector = '┌'
	        line_content = stripped
	        # Only '<' followed by a letter starts a real tag. Comments, doctypes
	        # and similar lines have no tag name, so they are kept verbatim rather
	        # than failing on a missing name.
	        if not with_attrs and stripped[1:2].isalpha():
	            # The name ends at the first whitespace, '/' or '>'.
	            tag_name = stripped[1:].replace('/', ' ').replace('>', ' ').split(None, 1)[0]
	            line_content = '<{}>'.format(tag_name)

	    return new_line + connector + line_content + "\n"



	# Example: render a small nested document and print the resulting tree.
	soup = BeautifulSoup("""<div><div><span>a</span><span>b</span>
	<a>link</a></div><a>link1</a><a>link2</a></div>""")
	# Ask for the string back (any output value other than 'print'). With the
	# default output='print', soup_viz returns None and print(viz) shows "None".
	viz = soup_viz(soup, spacing=3, with_attrs=True, output='return')
	print(viz)