brandonmburroughs · April 30, 2017 22:35
diff --git a/broken_link_finder.py b/broken_link_finder.py
 """
 Name:  Broken Link Finder
 Author:  Brandon M. Burroughs
 Description:  Given the URL of a page, find all broken links on that page.
    Currently, a link is treated as "broken" when the page it points to
    comes back as "page not found".
 """


 import requests
 from bs4 import BeautifulSoup

 # Example page to check for broken links; the __main__ block below assigns
 # this same URL again before passing it to find_broken_links.
 frac_webpage = 'http://frac.org/federal-foodnutrition-programs/child-and-adult-care-program/child-care-wellness-plans-and-policies/'


 def find_broken_links(webpage, timeout=None):
    """
    Find the broken links on a webpage.

    Every ``<a>`` element with a non-empty ``href`` that is not a same-page
    anchor (``#...``) is requested in turn.  A link is reported as broken
    when the request raises a ``requests`` exception, when the response has
    an error status code, or when the response body contains the text
    "page not found" (in any capitalization).

    Parameters
    ----------
    webpage : str
        The URL for the webpage to search
    timeout : float or None, optional
        Seconds to wait for each request; passed straight through to
        ``requests.get``.  The default (None) sets no time limit.

    Returns
    -------
    broken_links : list of str
        The ``href`` values of the links found to be broken, in page order
    """
    # Get the page
    r = requests.get(webpage, timeout=timeout)

    # Soup the page text
    soup = BeautifulSoup(r.text, 'html.parser')

    # Collect the href of every a element that has one, skipping empty
    # values and same-page anchors
    hrefs = (a_tag.get('href') for a_tag in soup.find_all('a', href=True))
    links = [href for href in hrefs if href and not href.startswith('#')]

    broken_links = []

    n = len(links)
    # Loop through all of the links, counting from 1 for the progress report
    for i, link in enumerate(links, 1):
        try:
            # Request the linked page
            r_link = requests.get(link, timeout=timeout)
        except requests.exceptions.RequestException:
            # Connection failures, timeouts and URLs that requests cannot
            # fetch (e.g. relative or mailto: links) all count as broken.
            # Only requests' own errors are caught so that Ctrl-C and
            # programming errors still propagate.
            broken_links.append(link)
        else:
            # Broken if the linked page itself returned an error status, or
            # if it answered successfully but with a "page not found" body
            if not r_link.ok or 'page not found' in r_link.text.lower():
                broken_links.append(link)

        # Check progress; the call form of print works on Python 2 and 3
        if i % 10 == 0:
            print("%s%% finished!" % (round(float(i) / n * 100, 2)))

    return broken_links


 if __name__ == '__main__':
    # Page to scan when this file is run as a script
    frac_webpage = 'http://frac.org/federal-foodnutrition-programs/child-and-adult-care-program/child-care-wellness-plans-and-policies/'

    broken_links = find_broken_links(frac_webpage)

    # Report one broken link per line; the call form of print behaves the
    # same on Python 2 and is valid syntax on Python 3
    for link in broken_links:
        print(link)
	"""
	Name: Broken Link Finder
	Author: Brandon M. Burroughs
	Description: Given the URL of a page, find all broken links on that page.
	Currently, a link is treated as "broken" when the page it points to
	comes back as "page not found".
	"""


	import requests
	from bs4 import BeautifulSoup

	# Example page to check for broken links; the __main__ block below assigns
	# this same URL again before passing it to find_broken_links.
	frac_webpage = 'http://frac.org/federal-foodnutrition-programs/child-and-adult-care-program/child-care-wellness-plans-and-policies/'


	def find_broken_links(webpage, timeout=None):
	    """
	    Find the broken links on a webpage.

	    Every ``<a>`` element with a non-empty ``href`` that is not a same-page
	    anchor (``#...``) is requested in turn.  A link is reported as broken
	    when the request raises a ``requests`` exception, when the response has
	    an error status code, or when the response body contains the text
	    "page not found" (in any capitalization).

	    Parameters
	    ----------
	    webpage : str
	        The URL for the webpage to search
	    timeout : float or None, optional
	        Seconds to wait for each request; passed straight through to
	        ``requests.get``.  The default (None) sets no time limit.

	    Returns
	    -------
	    broken_links : list of str
	        The ``href`` values of the links found to be broken, in page order
	    """
	    # Get the page
	    r = requests.get(webpage, timeout=timeout)

	    # Soup the page text
	    soup = BeautifulSoup(r.text, 'html.parser')

	    # Collect the href of every a element that has one, skipping empty
	    # values and same-page anchors
	    hrefs = (a_tag.get('href') for a_tag in soup.find_all('a', href=True))
	    links = [href for href in hrefs if href and not href.startswith('#')]

	    broken_links = []

	    n = len(links)
	    # Loop through all of the links, counting from 1 for the progress report
	    for i, link in enumerate(links, 1):
	        try:
	            # Request the linked page
	            r_link = requests.get(link, timeout=timeout)
	        except requests.exceptions.RequestException:
	            # Connection failures, timeouts and URLs that requests cannot
	            # fetch (e.g. relative or mailto: links) all count as broken.
	            # Only requests' own errors are caught so that Ctrl-C and
	            # programming errors still propagate.
	            broken_links.append(link)
	        else:
	            # Broken if the linked page itself returned an error status, or
	            # if it answered successfully but with a "page not found" body
	            if not r_link.ok or 'page not found' in r_link.text.lower():
	                broken_links.append(link)

	        # Check progress; the call form of print works on Python 2 and 3
	        if i % 10 == 0:
	            print("%s%% finished!" % (round(float(i) / n * 100, 2)))

	    return broken_links


	if __name__ == '__main__':
	    # Page to scan when this file is run as a script
	    frac_webpage = 'http://frac.org/federal-foodnutrition-programs/child-and-adult-care-program/child-care-wellness-plans-and-policies/'

	    broken_links = find_broken_links(frac_webpage)

	    # Report one broken link per line; the call form of print behaves the
	    # same on Python 2 and is valid syntax on Python 3
	    for link in broken_links:
	        print(link)