Created
June 18, 2013 15:56
Revisions
-
joemiller created this gist
Jun 18, 2013 .There are no files selected for viewing
#!/usr/bin/env ruby
#
# this is a special meta-check. It runs ping checks against all hosts in
# the /endpoints API and sends individual results directly to sensu-client via
# the udp/3030 client socket. this is different from the normal sensu check model
# where individual scripts run and their exit status and output is used to create
# a single event.
#
# the reason for this check is to be able to dynamically ping a list of hosts
# without the race conditions and timing issues involved with creating individual
# sensu check definitions using chef.
#

require 'rubygems'
require 'json'
require 'openssl'
require 'socket'
require 'popen4'
require 'forkmanager' # gem install parallel-forkmanager
require 'rest-client'
require 'sensu-plugin/check/cli'

# Pings every endpoint listed by the Pantheon API with `fping` and reports one
# result per endpoint (and per address) to the local sensu-client socket.
# Endpoints are processed in forked child processes, at most `--procs` at a time.
class PantheonCheckPingEndpoints < Sensu::Plugin::Check::CLI
  # Where per-host results are delivered (the local sensu-client UDP socket).
  SENSU_CLIENT_HOST = '127.0.0.1'.freeze
  SENSU_CLIENT_PORT = 3030

  # Lines of the `fping -s` summary that the thresholds are computed from.
  MAX_RTT_PATTERN  = /(\d+(?:\.\d+)?) ms \(max round trip time\)/
  SENT_PATTERN     = /(\d+) ICMP Echos sent/
  RECEIVED_PATTERN = /(\d+) ICMP Echo Replies received/

  # The original file carried a commented-out `--host` option here:
  # option :host, :short => '-h HOST', :long => "--host HOST", :required => true

  option :critical_rtt,
         :short => '-c MS',
         :long => "--critical-rtt MS",
         :default => nil,
         :proc => Proc.new { |c| c.to_f }

  option :warning_rtt,
         :short => '-w MS',
         :long => "--warning-rtt MS",
         :default => nil,
         :proc => Proc.new { |w| w.to_f }

  option :critical_loss,
         :long => "--critical-loss COUNT",
         :default => 0,
         :proc => Proc.new { |c| c.to_f }

  option :warning_loss,
         :long => "--warning-loss COUNT",
         :default => 0,
         :proc => Proc.new { |w| w.to_f }

  option :options,
         :short => '-f OPTIONS',
         :long => "--fping-args OPTIONS",
         :default => nil

  option :verbose,
         :short => '-v',
         :long => "--verbose",
         :boolean => true,
         :default => false

  option :handler,
         :short => '-l HANDLER',
         :long => '--handler HANDLER',
         :default => 'default'

  option :zone,
         :short => '-z ZONE',
         :long => '--zone ZONE',
         :required => true

  option :procs,
         :short => '-p NUM_PROCS',
         :long => '--procs NUM_PROCS',
         :default => 50,
         :proc => Proc.new { |p| p.to_i }

  # pantheon api
  option :api,
         :long => '--api API_URL',
         :default => 'https://redacted:443'

  # Coerced to an integer so a value given on the command line has the same
  # type as the default.
  option :timeout,
         :long => '--api-timeout SECONDS',
         :default => 30,
         :proc => Proc.new { |t| t.to_i }

  option :client_cert,
         :long => '--client-cert FILE',
         :default => 'cert.pem'

  option :ca_file,
         :long => '--ca-file FILE',
         :default => 'ca.pem'

  # Performs a GET against the Pantheon API using the client certificate.
  #
  # @param resource [String] path (and query string) appended to `--api`
  # @param jsonify [Boolean] parse the body as JSON with symbol keys when true,
  #   return the raw response otherwise
  # @return [Object] the parsed JSON document or the raw response
  #
  # Connection, HTTP and JSON errors are reported through the plugin's
  # `warning` helper instead of being raised.
  def pantheon_api(resource, jsonify=true)
    # The same PEM file is read for both the certificate and the private key.
    pem = File.read(config[:client_cert])
    request = RestClient::Resource.new(config[:api] + resource, {
      :timeout => config[:timeout],
      :ssl_client_cert => OpenSSL::X509::Certificate.new(pem),
      :ssl_client_key => OpenSSL::PKey::RSA.new(pem),
      :ssl_ca_file => config[:ca_file],
      # NOTE(review): peer verification is disabled even though a CA file is
      # supplied -- confirm whether this is intentional before changing it.
      :verify_ssl => OpenSSL::SSL::VERIFY_NONE
    })

    response = request.get
    jsonify ? JSON.parse(response, :symbolize_names => true) : response
  rescue Errno::ECONNREFUSED
    warning "Connection refused"
  rescue RestClient::RequestFailed
    warning "Request failed"
  rescue RestClient::RequestTimeout
    warning "Connection timed out"
  rescue RestClient::Unauthorized
    warning "Missing or incorrect Pantheon API credentials"
  rescue JSON::ParserError
    warning "Pantheon API returned invalid JSON"
  end

  # @return [Object] the parsed `/endpoints` document; `run` iterates it as
  #   `uuid => meta` pairs
  def endpoints
    pantheon_api('/endpoints?extended=0&source=check_ping')
  end

  # Sends one newline-terminated message to the local sensu-client socket.
  #
  # @param msg [String] payload (the callers pass a JSON document)
  def sensu_client_socket(msg)
    socket = UDPSocket.new
    socket.send(msg + "\n", 0, SENSU_CLIENT_HOST, SENSU_CLIENT_PORT)
  ensure
    # One socket is opened per event; close it so descriptors are not leaked.
    socket.close if socket
  end

  # Reports an OK (status 0) result for +check_name+.
  def send_ok(check_name, msg)
    send_event(check_name, 0, "OK: #{msg}")
  end

  # Reports a WARNING (status 1) result for +check_name+.
  def send_warning(check_name, msg)
    send_event(check_name, 1, "WARNING: #{msg}")
  end

  # Reports a CRITICAL (status 2) result for +check_name+.
  def send_critical(check_name, msg)
    send_event(check_name, 2, "CRITICAL: #{msg}")
  end

  # Runs `fping -s` against +host+, appending any `--fping-args`.
  #
  # @param host [String] address handed to fping
  # @return [Array] `[cmd, exit_status, stats, errors]`; `exit_status` is nil
  #   when popen4 returns no process status
  def run_fping(host)
    cmd = "fping -s #{host} #{config[:options]}"
    puts "Command:\n#{cmd}" if config[:verbose]

    stats = nil
    errors = nil
    result = POpen4::popen4(cmd) do |_stdin, stdout, stderr, _pid|
      stats = stdout.read
      begin
        errors = stderr.read
      rescue StandardError
        # stderr is not always open for reading.
      end
    end

    exit_status = result ? result.exitstatus : nil
    puts "Output:\n#{stats}" if config[:verbose]
    puts "Exit Status:\n#{exit_status}" if config[:verbose]

    [cmd, exit_status, stats, errors]
  end

  # @param stats [String] text captured from fping
  # @return [Float] the "max round trip time" value in milliseconds
  # @raise [NoMethodError] when the summary line is absent (rescued by ping_host)
  def get_max_rtt(stats)
    stats.match(MAX_RTT_PATTERN)[1].to_f
  end

  # @param stats [String] text captured from fping
  # @return [Integer] echo requests sent minus echo replies received
  # @raise [NoMethodError] when either summary line is absent (rescued by ping_host)
  def get_lost_packet_count(stats)
    # The previous patterns had no capture group and both looked at the "sent"
    # line, so the result was always 0.
    sent = stats.match(SENT_PATTERN)[1].to_i
    received = stats.match(RECEIVED_PATTERN)[1].to_i
    sent - received
  end

  # Pings +host+ and sends exactly one event named +check_name+ describing the
  # outcome.
  #
  # @param check_name [String] name of the event to send
  # @param hostname [String] endpoint hostname, used for verbose output only
  # @param host [String] address to ping
  def ping_host(check_name, hostname, host)
    cmd, exit_status, stats, errors = run_fping(host)
    puts "results from #{host} #{hostname}: #{exit_status}, #{stats}" if config[:verbose]

    case exit_status
    when 0
      begin
        report_ping_stats(check_name, host, stats)
      rescue StandardError
        send_critical check_name, "Error extracting results: [#{cmd}, #{exit_status}, #{stats}, #{errors}]"
      end
    when 1
      send_critical check_name, "Host '#{host}' is unreachable"
    when 2
      send_warning check_name, "Invalid IP address: #{host}"
    when 3
      send_warning check_name, "Invalid fping command: #{cmd}"
    when 4
      send_warning check_name, "Fping system call error: #{cmd}"
    when nil
      send_warning check_name, "Cannot locate 'fping', please add to your system path."
    else
      # Without this branch an unexpected status produced no event at all.
      send_warning check_name, "Unexpected fping exit status #{exit_status}: #{cmd}"
    end
  end

  # this is the main method executed in the child processes
  #
  # @param uuid [Object] endpoint identifier, used for log output only
  # @param meta [Hash] endpoint attributes; reads :host, :hostname, :pool,
  #   :zone and :private_ip
  def process_endpoint(uuid, meta)
    puts "in child process: pid: #{$$}, endpoint: #{uuid}" if config[:verbose]

    if meta[:host].nil? || meta[:hostname].nil?
      puts "skipping endpoint #{uuid}, missing 'host' or 'hostname' attributes."
      return
    end

    public_ip_check_name = "#{meta[:hostname]}_ping_check"
    private_ip_check_name = "#{meta[:hostname]}_private_ip_ping_check"

    if meta[:pool] == 'down'
      # endpoint is marked down, cleanup any open alerts in sensu by sending an 'OK' event
      send_ok public_ip_check_name, "host is marked down. no ping necessary."
      send_ok private_ip_check_name, "host is marked down. no ping necessary."
    else
      ping_host public_ip_check_name, meta[:hostname], meta[:host]
      # only check private_ip if the endpoint is in the same zone specified by the '-z' arg
      if config[:zone] == meta[:zone]
        ping_host private_ip_check_name, meta[:hostname], meta[:private_ip]
      end
    end
  end

  # Entry point: processes every endpoint in its own child process (at most
  # `--procs` at once), waits for all children, then reports OK for the
  # meta-check itself.
  def run
    pm = Parallel::ForkManager.new(config[:procs])

    endpoints.each do |uuid, meta|
      # block until new process slot is available; a truthy return value means
      # this iteration has nothing more to do here (same as `start and next`).
      next if pm.start(uuid)

      process_endpoint(uuid, meta)
      pm.finish(0)
    end

    pm.wait_all_children
    ok "Finished ping checks."
  end

  private

  # Builds the event payload shared by send_ok/send_warning/send_critical and
  # hands it to the sensu-client socket as JSON.
  def send_event(check_name, status, output)
    event = {
      'name' => check_name,
      'status' => status,
      'output' => output,
      'handler' => config[:handler]
    }
    sensu_client_socket event.to_json
  end

  # Compares the parsed fping statistics with the configured thresholds and
  # sends the matching event. RTT thresholds are checked before loss
  # thresholds, critical before warning.
  def report_ping_stats(check_name, host, stats)
    max_rtt = get_max_rtt(stats)
    lost_packets = get_lost_packet_count(stats)

    if config[:critical_rtt] && max_rtt > config[:critical_rtt]
      send_critical check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:critical_rtt]} ms"
    elsif config[:warning_rtt] && max_rtt > config[:warning_rtt]
      # This branch used to call the plugin-level `warning`, which reports on
      # the meta-check itself rather than sending a per-host event.
      send_warning check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:warning_rtt]} ms"
    elsif config[:critical_loss] && lost_packets > config[:critical_loss]
      send_critical check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:critical_loss]} packet"
    elsif config[:warning_loss] && lost_packets > config[:warning_loss]
      send_warning check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:warning_loss]} packet"
    else
      send_ok check_name, "Host '#{host}' reached in #{max_rtt} ms dropping #{lost_packets} packets"
    end
  end
end