Skip to content

Instantly share code, notes, and snippets.

@h0hmj
Created August 15, 2024 03:26
Show Gist options
  • Save h0hmj/df8abd125b4648e71e4611ac82bc2348 to your computer and use it in GitHub Desktop.
Save h0hmj/df8abd125b4648e71e4611ac82bc2348 to your computer and use it in GitHub Desktop.
ceph laggy count
from prometheus_client import start_http_server, Counter
from threading import Thread, Lock
import subprocess
import json
import time
from datetime import datetime, timedelta
# Prometheus counter labelled by host IP; count_helper() adds each host's
# per-poll tally of laggy-PG acting OSDs to it.
LAGGY_COUNT = Counter("laggy_count", "count of node may be laggy", ["ip"])
# Cache mapping OSD id -> host IP string. Replaced wholesale by
# refresh_osd2host() and filled one entry at a time by _fill_osd2host().
OSD2HOST = {}
# Guards every read and write of OSD2HOST, which is touched by both the
# refresh thread and the counting thread started in main().
MUTEX = Lock()
def refresh_osd2host():
    """Rebuild the OSD-id -> host-IP cache from ``ceph osd dump``.

    The freshly built map replaces ``OSD2HOST`` as a whole while holding
    ``MUTEX``. If the command fails, its output is not valid JSON, or the
    output has no ``"osds"`` list, the problem is printed and the previous
    cache is left untouched. OSD entries whose address cannot be extracted
    are reported and skipped.
    """
    global OSD2HOST

    try:
        osd_dump = json.loads(
            subprocess.check_output(["ceph", "osd", "dump", "-f", "json"])
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # SubprocessError: non-zero exit; OSError: binary missing or not
        # executable; ValueError: output is not decodable JSON.
        print(str(e))
        return

    try:
        osds = osd_dump["osds"]
    except (KeyError, TypeError):
        # Keep the old cache rather than letting the exception escape and
        # terminate the thread that calls this periodically.
        print("unexpected 'ceph osd dump' output: no 'osds' list")
        return

    osd2host = {}
    for osd in osds:
        try:
            addr = osd["public_addrs"]["addrvec"][0]["addr"]
            # Drop only the trailing ":port" so that a bracketed IPv6
            # address ("[::1]:6800") is not cut at its first colon, then
            # remove the brackets. IPv4 "a.b.c.d:port" yields "a.b.c.d".
            osd2host[osd["osd"]] = addr.rsplit(":", 1)[0].strip("[]")
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            print(str(e))

    with MUTEX:
        OSD2HOST = osd2host
def _find_host_ip(osd_id):
    """Return the host IP of one OSD as reported by ``ceph osd find``.

    Args:
        osd_id: OSD identifier; it is passed to the CLI as ``str(osd_id)``.

    Returns:
        The address part of the first ``addrvec`` entry, without the port
        and without IPv6 brackets.

    Raises:
        Whatever ``subprocess.check_output``, ``json.loads`` or the
        key/index lookups raise; the caller is expected to handle it.
    """
    osd_find = json.loads(
        subprocess.check_output(["ceph", "osd", "find", str(osd_id), "-f", "json"])
    )
    addr = osd_find["addrs"]["addrvec"][0]["addr"]
    # Strip only the trailing ":port". Splitting on the first colon would
    # reduce a bracketed IPv6 address such as "[::1]:6800" to "[".
    return addr.rsplit(":", 1)[0].strip("[]")
def _fill_osd2host(osd_id, host_ip):
    """Store ``host_ip`` for ``osd_id`` in ``OSD2HOST`` while holding ``MUTEX``."""
    with MUTEX:
        OSD2HOST.update({osd_id: host_ip})
def get_host_ip(osd_id):
    """Return the host IP for ``osd_id``, or ``None`` if it cannot be found.

    The shared cache is consulted first. On a miss the address is looked up
    with ``_find_host_ip`` and stored with ``_fill_osd2host``; any failure
    during that step is printed and reported to the caller as ``None``.
    """
    # Fast path: answer from the cache under the lock.
    with MUTEX:
        try:
            return OSD2HOST[osd_id]
        except KeyError:
            pass

    # Slow path: query the cluster, then remember the answer.
    try:
        resolved = _find_host_ip(osd_id)
        _fill_osd2host(osd_id, resolved)
    except Exception as err:
        print("failed to get hostip")
        print(str(err))
        return None
    return resolved
def dump_laggy_pg_with_osd():
    """Return ``{pgid: acting}`` for every PG whose state contains "laggy".

    Runs ``ceph pg dump pgs_brief -f json`` with stderr discarded and reads
    the ``pg_stats`` list from its output. Errors from the command or from
    JSON decoding propagate to the caller.
    """
    command = ["ceph", "pg", "dump", "pgs_brief", "-f", "json"]
    raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    stats = json.loads(raw_output)["pg_stats"]
    return {
        entry["pgid"]: entry["acting"]
        for entry in stats
        if "laggy" in entry["state"]
    }
def count_host_freq(laggy_pg_with_osd):
    """Count how many times each host appears among the given acting sets.

    Args:
        laggy_pg_with_osd: mapping of PG id to its list of acting OSD ids.

    Returns:
        A dict of host IP -> number of acting-OSD occurrences. OSDs whose
        host cannot be resolved (``get_host_ip`` returns ``None``) are
        left out.
    """
    tally = {}
    for acting in laggy_pg_with_osd.values():
        for osd in acting:
            ip = get_host_ip(osd)
            if ip is not None:
                tally[ip] = tally.get(ip, 0) + 1
    return tally
def refresh_osd2host_helper():
    """Thread target: call ``refresh_osd2host`` forever, pausing between runs."""
    pause_seconds = 60
    while True:
        refresh_osd2host()
        time.sleep(pause_seconds)
def _print_time():
    """Print the current wall-clock time in the fixed UTC+8 offset.

    Debug helper. Uses a timezone-aware ``datetime.now`` instead of the
    deprecated, naive ``datetime.utcnow`` plus a manual eight-hour shift;
    the printed text keeps the same ``YYYY-MM-DD HH:MM:SS`` format.
    """
    # Local import: the file's top-level imports only bring in
    # datetime and timedelta.
    from datetime import timezone

    utc_plus_8 = timezone(timedelta(hours=8))
    time_string = datetime.now(utc_plus_8).strftime("%Y-%m-%d %H:%M:%S")
    print("Current time in UTC+8:", time_string)
def count_helper():
    """Thread target: poll for laggy PGs and feed the ``laggy_count`` metric.

    On each pass the laggy PGs are fetched, their acting OSDs are mapped to
    host IPs, and each host's tally is added to ``LAGGY_COUNT``; the loop
    then sleeps for ten seconds. A failed pass is reported and skipped so
    that one bad ``ceph`` invocation does not end the thread while the
    exporter keeps serving stale values.
    """
    while True:
        # _print_time()
        try:
            pgs = dump_laggy_pg_with_osd()
            host_freq = count_host_freq(pgs)
        except Exception as e:
            # Thread boundary: log and retry on the next pass instead of
            # letting the exception terminate the polling loop.
            print("failed to collect laggy pg stats")
            print(str(e))
        else:
            if host_freq:
                print("found laggy pg")
                print(host_freq)
                for host, freq in host_freq.items():
                    LAGGY_COUNT.labels(ip=host).inc(freq)
        time.sleep(10)
def main():
    """Start the metrics endpoint and the two background workers.

    The HTTP endpoint on port 9999 is bound before any worker starts, so a
    bind failure (for example, port already in use) aborts immediately
    instead of leaving worker threads running with nothing to serve.
    The workers are daemon threads, so they do not keep the process alive
    once the main thread exits (for example, on Ctrl-C).
    """
    start_http_server(9999)

    # ensure cache once at start
    refresh_osd2host()

    refresh_thread = Thread(target=refresh_osd2host_helper, daemon=True)
    count_thread = Thread(target=count_helper, daemon=True)
    refresh_thread.start()
    count_thread.start()

    # Both workers loop forever; joining keeps the main thread (and with it
    # the process) alive.
    refresh_thread.join()
    count_thread.join()
if __name__ == "__main__":
    # Run the exporter only when executed as a script, so importing this
    # module does not spawn threads or bind the metrics port.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment