Created
August 15, 2024 03:26
-
-
Save h0hmj/df8abd125b4648e71e4611ac82bc2348 to your computer and use it in GitHub Desktop.
ceph laggy count
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import subprocess
import time
from datetime import datetime, timedelta, timezone
from threading import Thread, Lock

from prometheus_client import start_http_server, Counter
# Prometheus counter, labelled by host IP, incremented by count_helper() with
# the number of times a host appeared in the acting set of a laggy PG.
LAGGY_COUNT = Counter(
    "laggy_count",
    "count of node may be laggy",
    labelnames=["ip"],
)

# Cache mapping OSD id -> host IP. Replaced wholesale by refresh_osd2host()
# and filled in entry by entry through _fill_osd2host().
OSD2HOST = dict()

# Guards every read and write of OSD2HOST.
MUTEX = Lock()
def refresh_osd2host():
    """Rebuild the OSD id -> host IP cache from ``ceph osd dump``.

    Runs ``ceph osd dump -f json``, takes the first public address of every
    entry under ``"osds"`` and swaps the resulting mapping into the
    module-level ``OSD2HOST`` while holding ``MUTEX``.

    The refresh is best-effort: if the command fails, its output cannot be
    parsed, or the document has no usable ``"osds"`` list, the error is
    printed and the previous cache is left untouched. An OSD entry whose
    address cannot be extracted is reported and skipped.
    """
    global OSD2HOST

    try:
        osd_dump = json.loads(
            subprocess.check_output(["ceph", "osd", "dump", "-f", "json"])
        )
        # Resolve the OSD list inside the try so that an unexpected document
        # shape is reported like any other failure instead of raising out of
        # the caller (which would end the periodic refresh thread).
        osds = list(osd_dump["osds"])
    except Exception as e:
        print(str(e))
        return

    osd2host = {}
    for osd in osds:
        try:
            addr = osd["public_addrs"]["addrvec"][0]["addr"]
            # Keep only the text before the first ":" of the address.
            osd2host[osd["osd"]] = addr.split(":")[0]
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            # Malformed entry: report it and carry on with the other OSDs.
            print(str(e))

    # Swap in the complete mapping at once so readers never observe a
    # half-built cache.
    with MUTEX:
        OSD2HOST = osd2host
def _find_host_ip(osd_id):
    """Look up the host IP of one OSD via ``ceph osd find``.

    Runs ``ceph osd find <osd_id> -f json`` and returns the part of the first
    ``addrs.addrvec`` address that precedes the first ``":"``. A failing
    command or an unexpected output shape propagates as an exception.
    """
    command = ["ceph", "osd", "find", str(osd_id), "-f", "json"]
    raw_output = subprocess.check_output(command)
    located = json.loads(raw_output)
    first_addr = located["addrs"]["addrvec"][0]["addr"]
    ip, _, _ = first_addr.partition(":")
    return ip
def _fill_osd2host(osd_id, host_ip):
    """Store ``host_ip`` for ``osd_id`` in OSD2HOST while holding MUTEX."""
    entry = {osd_id: host_ip}
    with MUTEX:
        OSD2HOST.update(entry)
def get_host_ip(osd_id):
    """Return the host IP for ``osd_id``, or ``None`` when it cannot be found.

    The OSD2HOST cache is consulted first (under MUTEX). On a miss the IP is
    looked up with ``_find_host_ip`` and stored through ``_fill_osd2host``.
    Any exception during that lookup is printed and turned into ``None``.
    """
    # Fast path: answer from the cache.
    with MUTEX:
        try:
            return OSD2HOST[osd_id]
        except KeyError:
            pass

    # Slow path: query the cluster and remember the answer.
    try:
        resolved = _find_host_ip(osd_id)
        _fill_osd2host(osd_id, resolved)
    except Exception as err:
        print("failed to get hostip")
        print(str(err))
        return None
    return resolved
def dump_laggy_pg_with_osd():
    """Return ``{pgid: acting}`` for every PG whose state contains "laggy".

    The data comes from ``ceph pg dump pgs_brief -f json``; the command's
    stderr is discarded. Command or parse failures propagate to the caller.
    """
    command = ["ceph", "pg", "dump", "pgs_brief", "-f", "json"]
    raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    brief = json.loads(raw_output)
    return {
        entry["pgid"]: entry["acting"]
        for entry in brief["pg_stats"]
        if "laggy" in entry["state"]
    }
def count_host_freq(laggy_pg_with_osd):
    """Count how often each host IP occurs in the given acting sets.

    ``laggy_pg_with_osd`` maps a PG id to its list of acting OSD ids. Every
    OSD id is resolved with ``get_host_ip``; ids that resolve to ``None`` are
    ignored. Returns a plain dict of ``host_ip -> occurrences``.
    """
    tally = {}
    for acting in laggy_pg_with_osd.values():
        for osd in acting:
            ip = get_host_ip(osd)
            if ip is not None:
                tally[ip] = tally.get(ip, 0) + 1
    return tally
def refresh_osd2host_helper(interval=60):
    """Refresh the OSD -> host cache forever, sleeping between rounds.

    Intended as a thread target; never returns.

    Args:
        interval: Seconds to sleep after each refresh (default 60).
    """
    while True:
        try:
            refresh_osd2host()
        except Exception as e:
            # An unexpected error in one round must not end the thread,
            # otherwise the cache would silently stop being refreshed.
            print("failed to refresh osd2host")
            print(str(e))
        time.sleep(interval)
def _print_time():
    """Print the current wall-clock time in UTC+8 (debugging aid)."""
    # datetime.utcnow() is deprecated since Python 3.12. Request an aware
    # timestamp in a fixed UTC+8 zone instead of shifting a naive UTC value
    # by hand; the printed text is the same.
    utc_plus_8 = timezone(timedelta(hours=8))
    time_string = datetime.now(utc_plus_8).strftime("%Y-%m-%d %H:%M:%S")
    print("Current time in UTC+8:", time_string)
def count_helper(interval=10):
    """Poll for laggy PGs forever and feed the Prometheus counter.

    Each round dumps the laggy PGs, counts how often every host IP appears in
    their acting sets, and adds those counts to ``LAGGY_COUNT`` under the
    ``ip`` label. Intended as a thread target; never returns.

    Args:
        interval: Seconds to sleep between polls (default 10).
    """
    while True:
        # _print_time()  # enable to timestamp each poll while debugging
        try:
            pgs = dump_laggy_pg_with_osd()
            host_freq = count_host_freq(pgs)
        except Exception as e:
            # A failed poll (command error, unparsable output) must not end
            # the thread; otherwise the HTTP endpoint would keep serving a
            # counter that never moves again.
            print("failed to count laggy pg")
            print(str(e))
        else:
            if host_freq:
                print("found laggy pg")
                print(host_freq)
            for host, freq in host_freq.items():
                LAGGY_COUNT.labels(ip=host).inc(freq)
        time.sleep(interval)
def main(port=9999):
    """Start the exporter: prime the cache, launch workers, serve metrics.

    Args:
        port: TCP port for the Prometheus HTTP endpoint (default 9999).
    """
    # ensure cache once at start
    refresh_osd2host()

    # Both workers loop forever. Marking them as daemon threads lets the
    # process exit when the main thread is interrupted instead of hanging
    # on them during interpreter shutdown.
    refresh_thread = Thread(target=refresh_osd2host_helper, daemon=True)
    refresh_thread.start()
    count_thread = Thread(target=count_helper, daemon=True)
    count_thread.start()

    start_http_server(port)

    # The workers never finish; joining simply keeps the main thread alive.
    refresh_thread.join()
    count_thread.join()
# Start the exporter only when run as a script, so importing this module
# (e.g. to reuse its helpers) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment