Skip to content

Instantly share code, notes, and snippets.

@joemiller
Created June 18, 2013 15:56

Revisions

  1. joemiller created this gist Jun 18, 2013.
    193 changes: 193 additions & 0 deletions pantheon-check-ping-endpionts.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,193 @@
    #!/usr/bin/env ruby
    #
    # This is a special meta-check. It runs ping checks against all hosts returned
    # by the /endpoints API and sends the individual results directly to
    # sensu-client via the udp/3030 client socket. This differs from the normal
    # sensu check model, where individual scripts run and their exit status and
    # output are used to create a single event.
    #
    # The reason for this check is to be able to dynamically ping a list of hosts
    # without the race conditions and timing issues involved in creating
    # individual sensu check definitions using chef.
    #

    require 'rubygems'
    require 'json'
    require 'openssl'
    require 'shellwords'
    require 'socket'

    require 'forkmanager' # gem install parallel-forkmanager
    require 'popen4'
    require 'rest-client'
    require 'sensu-plugin/check/cli'


    class PantheonCheckPingEndpoints < Sensu::Plugin::Check::CLI

    # option :host, :short => '-h HOST', :long => "--host HOST", :required => true

    # Per-host ping thresholds. The RTT thresholds default to nil, in which case
    # ping_host skips the corresponding comparison; the loss thresholds default
    # to 0. All four are converted to Float when given on the command line.
    option :critical_rtt, :short => '-c MS', :long => "--critical-rtt MS", :default => nil, :proc => Proc.new { |c| c.to_f }
    option :warning_rtt, :short => '-w MS', :long => "--warning-rtt MS", :default => nil, :proc => Proc.new { |w| w.to_f }
    option :critical_loss, :long => "--critical-loss COUNT", :default => 0, :proc => Proc.new { |c| c.to_f }
    option :warning_loss, :long => "--warning-loss COUNT", :default => 0, :proc => Proc.new { |w| w.to_f }
    # Extra arguments appended verbatim to the fping command line.
    option :options, :short => '-f OPTIONS', :long => "--fping-args OPTIONS", :default => nil
    option :verbose, :short => '-v', :long => "--verbose", :boolean => true, :default => false
    # Handler name attached to every event sent to the sensu client socket.
    option :handler, :short => '-l HANDLER', :long => '--handler HANDLER', :default => 'default'
    # Zone of this checker; private IPs are only pinged for endpoints in the same zone.
    option :zone, :short => '-z ZONE', :long => '--zone ZONE', :required => true
    # Maximum number of concurrent child processes handed to Parallel::ForkManager.
    option :procs, :short => '-p NUM_PROCS', :long => '--procs NUM_PROCS', :default => 50, :proc => Proc.new { |p| p.to_i }

    # pantheon api
    option :api, :long => '--api API_URL', :default => 'https://redacted:443'
    # Converted to an Integer so that a value supplied on the command line is
    # numeric, like the default, before it is passed to RestClient as :timeout.
    option :timeout, :long => '--api-timeout SECONDS', :default => 30, :proc => Proc.new { |t| t.to_i }
    # PEM file read for both the client certificate and the client key.
    option :client_cert, :long => '--client-cert FILE', :default => 'cert.pem'
    option :ca_file, :long => '--ca-file FILE', :default => 'ca.pem'

    # Performs a GET request against the Pantheon API.
    #
    # resource - path (plus optional query string) appended to config[:api].
    # jsonify  - when true (the default) the response body is parsed as JSON with
    #            symbolized keys; when false the raw response is returned.
    #
    # Connection, HTTP and JSON parsing failures are reported through `warning`,
    # which is not defined in this file (presumably provided by the
    # Sensu::Plugin::Check::CLI base class -- confirm its exit semantics there).
    # Errors raised while reading the certificate file are not rescued.
    def pantheon_api(resource, jsonify=true)
      # The same PEM file supplies both the client certificate and its key,
      # so read it once and parse it twice.
      pem = File.read(config[:client_cert])
      request = RestClient::Resource.new(config[:api] + resource, {
        :timeout => config[:timeout],
        :ssl_client_cert => OpenSSL::X509::Certificate.new(pem),
        :ssl_client_key => OpenSSL::PKey::RSA.new(pem),
        :ssl_ca_file => config[:ca_file],
        # NOTE(review): VERIFY_NONE looks like it disables verification of the
        # server certificate, which would leave :ssl_ca_file unused. Left as-is
        # to avoid breaking existing deployments; consider VERIFY_PEER.
        :verify_ssl => OpenSSL::SSL::VERIFY_NONE
      })
      response = request.get
      jsonify ? JSON.parse(response, :symbolize_names => true) : response
    rescue Errno::ECONNREFUSED
      warning "Connection refused"
    # The specific RestClient errors must be listed before RequestFailed:
    # RequestTimeout (and Unauthorized in newer rest-client releases) inherit
    # from it and would otherwise never reach their own handler.
    rescue RestClient::RequestTimeout
      warning "Connection timed out"
    rescue RestClient::Unauthorized
      warning "Missing or incorrect Pantheon API credentials"
    rescue RestClient::RequestFailed
      warning "Request failed"
    rescue JSON::ParserError
      warning "Pantheon API returned invalid JSON"
    end

    # Returns whatever pantheon_api yields for the non-extended /endpoints
    # listing; the query string tags the request with source=check_ping.
    def endpoints
      resource = '/endpoints?extended=0&source=check_ping'
      pantheon_api(resource)
    end

    # Sends one newline-terminated message to UDP port 3030 on 127.0.0.1 (the
    # sensu-client socket described in the file header).
    #
    # msg - String payload; the callers in this file pass a JSON document.
    #
    # Returns the value of UDPSocket#send. The socket is always closed
    # afterwards so repeated calls do not leak file descriptors.
    def sensu_client_socket(msg)
      socket = UDPSocket.new
      socket.send(msg + "\n", 0, '127.0.0.1', 3030)
    ensure
      socket.close if socket
    end

    # Emits a status-0 event named check_name to the sensu client socket.
    # The output is msg prefixed with "OK: "; the handler comes from --handler.
    def send_ok(check_name, msg)
      event = {
        'name' => check_name,
        'status' => 0,
        'output' => 'OK: ' + msg,
        'handler' => config[:handler]
      }
      sensu_client_socket(JSON.generate(event))
    end

    # Emits a status-1 event named check_name to the sensu client socket.
    # The output is msg prefixed with "WARNING: "; the handler comes from --handler.
    def send_warning(check_name, msg)
      event = {
        'name' => check_name,
        'status' => 1,
        'output' => 'WARNING: ' + msg,
        'handler' => config[:handler]
      }
      sensu_client_socket(JSON.generate(event))
    end

    # Emits a status-2 event named check_name to the sensu client socket.
    # The output is msg prefixed with "CRITICAL: "; the handler comes from --handler.
    def send_critical(check_name, msg)
      event = {
        'name' => check_name,
        'status' => 2,
        'output' => 'CRITICAL: ' + msg,
        'handler' => config[:handler]
      }
      sensu_client_socket(JSON.generate(event))
    end

    # Runs `fping -s` against a single host and captures its output.
    #
    # host - hostname or IP address to ping. It originates from the API response,
    #        so it is shell-escaped before being placed on the command line.
    #        config[:options] is appended verbatim because it is an
    #        operator-supplied list of extra fping arguments.
    #
    # Returns [cmd, exit_status, stats, errors]:
    #   cmd         - the command line that was executed
    #   exit_status - fping's exit status, or nil when popen4 returned no status
    #   stats       - text read from the child's second stream (see below)
    #   errors      - text read from the child's first stream, or nil if it
    #                 could not be read
    def run_fping(host)
      cmd = "fping -s #{Shellwords.escape(host)} #{config[:options]}"
      puts "Command:\n#{cmd}" if config[:verbose]
      stats = nil
      errors = nil
      # POpen4 documents its block parameters as (stdout, stderr, stdin, pid).
      # The statistics parsed by get_max_rtt / get_lost_packet_count have always
      # been read from the second stream, i.e. stderr; that is unchanged here,
      # only the names now match.
      # NOTE(review): this relies on fping printing its -s summary on stderr --
      # confirm against the installed fping version.
      result = POpen4::popen4(cmd) do |stdout, stderr, _stdin, _pid|
        stats = stderr.read
        begin
          errors = stdout.read
        rescue IOError
          # Best effort only: this text is used solely for diagnostics.
        end
      end
      exit_status = result ? result.exitstatus : nil
      puts "Output:\n#{stats}" if config[:verbose]
      puts "Exit Status:\n#{exit_status}" if config[:verbose]
      [cmd, exit_status, stats, errors]
    end

    # Extracts the number that precedes " ms (max round trip time)" in the fping
    # statistics text and returns it as a Float.
    #
    # Raises NoMethodError when the line is absent (the match is nil); ping_host
    # relies on its rescue to report that case.
    def get_max_rtt(stats)
      max_line = stats.match(/(\d+[\.\d+]*) ms \(max round trip time\)/)
      max_line[1].to_f
    end

    # Computes how many echo requests went unanswered, from the fping statistics.
    #
    # stats - fping summary text containing the "N ICMP Echos sent" and
    #         "N ICMP Echo Replies received" lines.
    #
    # Returns sent minus received as an Integer.
    # Raises ArgumentError when either counter is missing, so the caller's rescue
    # can report unparsable output instead of silently assuming zero loss.
    def get_lost_packet_count(stats)
      # Both counters need a capture group, and the second one must be read from
      # the "Replies received" line rather than the "sent" line again.
      sent = stats[/(\d+) ICMP Echos sent/, 1]
      received = stats[/(\d+) ICMP Echo Replies received/, 1]
      if sent.nil? || received.nil?
        raise ArgumentError, "unable to find ICMP sent/received counters in fping output"
      end
      sent.to_i - received.to_i
    end

    # Pings one address with fping and reports the outcome as a sensu event.
    #
    # check_name - name of the event sent to the sensu client socket.
    # hostname   - endpoint hostname, used only in verbose output.
    # host       - address handed to run_fping.
    #
    # On exit status 0 the thresholds are evaluated in this order: critical RTT,
    # warning RTT, critical loss, warning loss; the first one exceeded decides
    # the event, otherwise an OK event is sent. Every outcome is delivered via
    # send_ok / send_warning / send_critical so it is attributed to check_name.
    def ping_host(check_name, hostname, host)
      cmd, exit_status, stats, errors = run_fping(host)
      puts "results from #{host} #{hostname}: #{exit_status}, #{stats}" if config[:verbose]
      case exit_status
      when 0
        begin
          max_rtt = get_max_rtt(stats)
          lost_packets = get_lost_packet_count(stats)

          if config[:critical_rtt] && max_rtt > config[:critical_rtt]
            send_critical check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:critical_rtt]} ms"
          elsif config[:warning_rtt] && max_rtt > config[:warning_rtt]
            # Must be send_warning: the bare `warning` helper belongs to the
            # check itself and would not produce an event for check_name.
            send_warning check_name, "Host '#{host}' reached in #{max_rtt} ms, which is greater than specified RTT of #{config[:warning_rtt]} ms"
          elsif config[:critical_loss] && lost_packets > config[:critical_loss]
            send_critical check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:critical_loss]} packet"
          elsif config[:warning_loss] && lost_packets > config[:warning_loss]
            send_warning check_name, "Host '#{host}' dropped #{lost_packets}, which is greater than allowed loss of #{config[:warning_loss]} packet"
          else
            send_ok check_name, "Host '#{host}' reached in #{max_rtt} ms dropping #{lost_packets} packets"
          end
        rescue
          # Typically raised by the parsers when the fping output is not in the
          # expected format.
          send_critical check_name, "Error extracting results: [#{cmd}, #{exit_status}, #{stats}, #{errors}]"
        end
      when 1
        send_critical check_name, "Host '#{host}' is unreachable"
      when 2
        send_warning check_name, "Invalid IP address: #{host}"
      when 3
        send_warning check_name, "Invalid fping command: #{cmd}"
      when 4
        send_warning check_name, "Fping system call error: #{cmd}"
      when nil
        send_warning check_name, "Cannot locate 'fping', please add to your system path."
      else
        # Never stay silent: an unreported run would leave a stale event state.
        send_warning check_name, "Unexpected fping exit status #{exit_status}: #{cmd}"
      end
    end

    # Main method executed in each child process: checks a single endpoint.
    #
    # uuid - endpoint identifier, used only in log output.
    # meta - Hash of endpoint attributes; the keys read here are :host,
    #        :hostname, :pool, :zone and :private_ip.
    #
    # Endpoints missing :host or :hostname are skipped. Endpoints whose pool is
    # 'down' get OK events for both check names so open alerts are cleared.
    # Otherwise the public address is always pinged; the private address is
    # pinged only when the endpoint's zone equals the '-z' argument and a
    # private address is actually present.
    def process_endpoint(uuid, meta)
      puts "in child process: pid: #{$$}, endpoint: #{uuid}" if config[:verbose]
      if meta[:host].nil? || meta[:hostname].nil?
        puts "skipping endpoint #{uuid}, missing 'host' or 'hostname' attributes."
        return
      end
      public_ip_check_name = "#{meta[:hostname]}_ping_check"
      private_ip_check_name = "#{meta[:hostname]}_private_ip_ping_check"

      if meta[:pool] == 'down'
        # endpoint is marked down, cleanup any open alerts in sensu by sending an 'OK' event
        send_ok public_ip_check_name, "host is marked down. no ping necessary."
        send_ok private_ip_check_name, "host is marked down. no ping necessary."
        return
      end

      ping_host public_ip_check_name, meta[:hostname], meta[:host]

      # only check private_ip if the endpoint is in the same zone specified by the '-z' arg
      return unless config[:zone] == meta[:zone]

      # Without an address fping would be started with no target at all.
      if meta[:private_ip].nil?
        puts "skipping private ip check for endpoint #{uuid}, missing 'private_ip' attribute."
        return
      end
      ping_host private_ip_check_name, meta[:hostname], meta[:private_ip]
    end

    # Entry point of the check: hands every endpoint to a
    # Parallel::ForkManager limited to config[:procs] processes, waits for all
    # children, then reports the meta-check itself as OK.
    def run
      fork_manager = Parallel::ForkManager.new(config[:procs])

      endpoints.each do |uuid, meta|
        # start blocks until a new process slot is available. A truthy return
        # value means there is nothing more to do for this endpoint here, so
        # move on to the next one; otherwise do the work and finish.
        next if fork_manager.start(uuid)

        process_endpoint(uuid, meta)
        fork_manager.finish(0)
      end

      fork_manager.wait_all_children
      ok "Finished ping checks."
    end

    end