Created
April 30, 2017 22:35
-
-
Save brandonmburroughs/21e0b9cdb9298215b9c899706de07116 to your computer and use it in GitHub Desktop.
Find broken links on a webpage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Name: Broken Link Finder
Author: Brandon M. Burroughs
Description: Given the URL of a page, find all of the broken links on that
page. Currently, "broken" means the linked page reports "page not found".
""" | |
import requests | |
from bs4 import BeautifulSoup | |
frac_webpage = 'http://frac.org/federal-foodnutrition-programs/child-and-adult-care-program/child-care-wellness-plans-and-policies/' | |
def find_broken_links(webpage, timeout=30):
    """
    Find the broken links on a webpage.

    A link is reported as broken when requesting it raises a ``requests``
    error, when its response carries an error status code, or when the
    response body contains "Page Not Found" / "page not found".

    Parameters
    ----------
    webpage : str
        The URL of the webpage to search.
    timeout : float or None, optional
        Seconds to wait on each HTTP request (passed to ``requests.get``).
        Defaults to 30; pass ``None`` to wait indefinitely.

    Returns
    -------
    broken_links : list of str
        The ``href`` values judged broken, in the order they appear on the
        page.

    Raises
    ------
    requests.RequestException
        If ``webpage`` itself cannot be fetched.
    """
    # Get the page
    response = requests.get(webpage, timeout=timeout)

    # Soup the page text
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the links out of the a elements, skipping anchors with a missing or
    # empty href and same-page "#fragment" links.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    links = [href for href in hrefs if href and not href.startswith('#')]

    broken_links = []
    total = len(links)

    # Loop through all of the links
    for count, link in enumerate(links, 1):
        try:
            link_response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Connection failures, timeouts and hrefs that requests cannot
            # fetch at all (e.g. relative paths, mailto:) count as broken.
            # Catching only RequestException lets Ctrl-C still interrupt.
            broken_links.append(link)
        else:
            if not link_response.ok:
                # Error status code for this link
                broken_links.append(link)
            elif ('Page Not Found' in link_response.text
                    or 'page not found' in link_response.text):
                # Successful status, but the body is a "not found" page
                broken_links.append(link)

        # Check progress every 10 links
        if count % 10 == 0:
            print("%s%% finished!" % round(float(count) / total * 100, 2))

    return broken_links
if __name__ == '__main__':
    # Scan the example page defined at module level (frac_webpage) and print
    # each broken link on its own line.  print(...) with a single argument
    # behaves the same on Python 2 and Python 3.
    for link in find_broken_links(frac_webpage):
        print(link)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment