To run this script:

- Add `crawl_broken_urls.rb` to your directory.
- At the same directory level, add a `broken_urls.yml` (the format is sketched below).
- Then run `ruby crawl_broken_urls.rb`.
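The script expects `broken_urls.yml` to be a flat YAML list of URL strings, since it iterates the loaded file directly with `broken_urls.each`. A minimal sketch (the URLs are placeholders, not from the original post):

```yaml
# broken_urls.yml — a plain YAML array of URLs to check.
# Placeholder entries for illustration only.
- http://example.com/old-page
- http://example.com/moved-elsewhere
- http://example.com/definitely-gone
```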
```ruby
require 'yaml'
require 'net/http'
require 'uri'
require 'timeout'

broken_urls = YAML.load_file('./broken_urls.yml')

# Fetch a URL, following redirects up to `limit` hops deep.
def fetch(uri_str, limit = 10)
  default_error = 'HTTPError'
  raise ArgumentError, 'HTTP redirect too deep' if limit == 0

  url = URI.parse(uri_str)
  req = Net::HTTP::Get.new(url.request_uri)

  response = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == 'https') do |http|
    begin
      # Give each request three seconds before giving up on it.
      Timeout.timeout(3) { http.request(req) }
    rescue Timeout::Error
      puts 'That took too long, moving on...'
      nil
    end
  end

  begin
    case response
    when Net::HTTPSuccess then response
    when Net::HTTPRedirection
      # Follow the redirect unless it points back at the same URL or is a 302;
      # each hop spends one unit of the redirect limit.
      if (uri_str != response['location']) && (response.code != '302')
        fetch(response['location'], limit - 1)
      else
        response
      end
    else
      default_error
    end
  rescue
    'TimeoutError'
  end
end

broken_urls.each do |link|
  puts "#{link}: #{fetch(link)}"
end
```
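To spot-check a single URL, you can call `fetch` directly, for example from `irb` after loading the script, or by appending a few lines to it. A minimal sketch; the URL is a placeholder:

```ruby
# fetch returns a Net::HTTPResponse on success (or for a redirect it declines
# to follow), and the string 'HTTPError' or 'TimeoutError' otherwise.
# 'http://example.com/old-page' is a placeholder, not from the original post.
result = fetch('http://example.com/old-page')

if result.is_a?(Net::HTTPResponse)
  puts "#{result.code} #{result.message}"
else
  puts result
end
```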