Last active
February 29, 2024 10:46
-
-
Save FlorianHeigl/7790daac7f84a491e10f59dfd862432a to your computer and use it in GitHub Desktop.
omd tuning script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# License: BSD
# Author: Florian Heigl
#
# OMD tuning script.
# Reads (does not export): OMD_ROOT, OMD_SITE and CONFIG_* variables used by
# the test commands further down.
# NOTE(review): OMD_ROOT / OMD_SITE are presumably provided by the site user's
# environment and CONFIG_* by site.conf -- confirm.
set -eu

# Pull in the site configuration when it is available. This stays best effort:
# an unreadable file or one that fails the syntax check is skipped rather than
# aborting here. `bash -n` only parses the file, nothing is executed.
SITECFG=~/etc/omd/site.conf
if [[ -r "$SITECFG" ]] && bash -n "$SITECFG"; then
  # shellcheck source=/dev/null
  source "$SITECFG"
fi

# Determine total memory in KiB.
# Read MemTotal from /proc/meminfo instead of parsing `free`: the row label
# printed by `free` depends on the locale, the kernel's field name does not.
TOTAL=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)
: "${TOTAL:?could not read MemTotal from /proc/meminfo}"
# List of test commands, keyed by the same index as the labels in messages[].
# Every entry is a shell snippet that is passed to eval when the results are
# printed. Each `$` is escaped so that variables and command substitutions are
# expanded at that moment and not while this list is being built: a helper
# that finds nothing (for example no running rrdcached) then only affects its
# own output line instead of aborting the whole script through `set -e`.
tests[1]="grep -c ^processor /proc/cpuinfo"
# TOTAL is expected in KiB, hence two divisions to get to GB.
tests[2]="echo \$(( TOTAL / 1024 / 1024 )) GB"
tests[3]="cat /proc/sys/kernel/pid_max"
tests[4]="cat /proc/sys/fs/file-max"
tests[5]="awk '{print \$1}' /proc/sys/fs/file-nr"
# Mount options of the /dev/* device backing OMD_ROOT; prints nothing when
# OMD_ROOT does not live on a /dev/* device.
tests[6]="dev=\$(df --output=source \"\$OMD_ROOT\" | grep ^/dev) && awk -v dev=\"\$dev\" '\$1 == dev {print \$4}' /proc/mounts"
# Sum of column 6 (RSS) of the site's processes as a percentage of TOTAL.
tests[7]="ps hux -U \"\$OMD_SITE\" | awk -v total=\"\$TOTAL\" '{ sum += \$6 } END { printf \"%.2f%%\n\", sum / total * 100}'"
# ulimit -u is the process limit, ulimit -n the open-file limit; the order
# matches the labels "process limit" (8) and "file limit" (9).
tests[8]="ulimit -u"
tests[9]="ulimit -n"
tests[10]="lsof | grep -c \"\$OMD_SITE\""
# pgrep instead of `ps -ef | grep -c`, which also counted the grep itself.
tests[11]="pgrep -u \"\$OMD_SITE\" | wc -l"
# rrdcached can kill the system if it falls behind, the worker setting in OMD is not correctly applied, so you normally
# only have the default number of workers!
# /proc/<pid>/io counts bytes, so three divisions by 1024 are needed for GB.
# pgrep -o selects a single (the oldest) rrdcached so the path stays valid.
tests[12]="awk '/^write_bytes/ {print \$2 / 1024 / 1024 / 1024}' \"/proc/\$(pgrep -o -u \"\$OMD_SITE\" rrdcached)/io\""
tests[13]="awk '/^read_bytes/ {print \$2 / 1024 / 1024 / 1024}' \"/proc/\$(pgrep -o -u \"\$OMD_SITE\" rrdcached)/io\""
# active check worker, can block and _will_ block on down hosts (they're executed anyway)
tests[14]="pgrep -u \"\$OMD_SITE\" checkhelper | wc -l"
# cmk workers, needed to get check throughput. beware there are rolling restart issues with some buggy checks that OOM them.
tests[15]="pgrep -u \"\$OMD_SITE\" -f \"python /omd/sites/\$OMD_SITE/bin/cmk --keepalive\" | wc -l"
tests[16]="grep -c 'Resource temp' ~/var/log/cmc.log"
# connected to number of livestatus slots
tests[17]="pgrep -u \"\$OMD_SITE\" -f \"python /omd/sites/\$OMD_SITE/bin/liveproxyd\" | wc -l"
tests[18]="grep -c 'Site is considered dead. Closing all connections.' ~/var/log/liveproxyd.log"
tests[19]="grep -c -E 'Cannot forward next' ~/var/log/liveproxyd.log"
tests[20]="curl -s \"localhost:\${CONFIG_APACHE_TCP_PORT}/server-status\" | grep -E 'requests currently being processed'"
tests[21]="grep -c 'WARNING: ping-queueing has lasted' ~/var/log/cmc.log" # correct log?
# NOTE(review): the interface name eth0 is hard-coded in the next two tests.
tests[22]="ethtool -g eth0 | grep -A1 'Current hardware' | grep ^RX | awk '{print \$2}'"
tests[23]="ethtool -S eth0 | grep -i OOB | awk '{sum+=\$4} END {print sum}'"
#tests[24]="" # lost the command
# List of labels, one per test, prefixed with the component the test belongs
# to. A label is printed together with the output of the tests[] entry that
# has the same index.
messages[1]="System: CPU Cores"
messages[2]="System: Total Memory"
messages[3]="System: process limit"
messages[4]="System: file limit"
messages[5]="System: open file handles"
messages[6]="System: OMD_ROOT mount info"
messages[7]="OMD Site: Total memory used"
messages[8]="OMD Site: process limit"
messages[9]="OMD Site: file limit"
messages[10]="OMD Site: open file handles"
messages[11]="OMD Site: running processes"
messages[12]="rrdcached: written GB"
messages[13]="rrdcached: read GB"
messages[14]="CMC: active check workers"
messages[15]="CMC: check_mk workers"
messages[16]="CMC: resources exhaustion errors"
messages[17]="Liveproxyd: processes"
messages[18]="Liveproxyd: remote site conn dead errors"
messages[19]="Liveproxyd: remote site query aborted errors"
messages[20]="Apache: worker usage"
messages[21]="icmphelper: ping-queue over 100ms errors "
messages[22]="System: RX buffer size "
messages[23]="System: NIC out of buffer errrors "
# Disabled for as long as tests[24] is commented out: a label without a test
# command makes the output loop expand an unset array element, which is an
# error under `set -u`. Re-enable together with tests[24].
#messages[24]="System: driver rx/tx drop errors "
#TODO:
# would be nice, but these can just as well be monitored directly
# memory stats per process group
# wait on the site's processes
# determine site uptime and relate it to process usage (but then they can simply be monitored as well)
# possibly still to be added:
# compressing / caching in site.conf, sensible settings yes/no
# Run the tests and print the values.
#
# Globals:   messages (read) - labels, indexed by test number
#            tests    (read) - shell snippets, indexed by test number
# Outputs:   one "<label> : <output>" line per entry of messages[] on stdout
#
# The snippets are fixed strings defined in this script, which is why passing
# them to eval is acceptable here. A label that has no test command is
# reported as such instead of expanding an unset array element, which would
# be an error under `set -u`.
run_all_tests() {
  local index
  for index in "${!messages[@]}"; do
    if [[ -z "${tests[$index]:-}" ]]; then
      printf '%s : %s\n' "${messages[$index]}" "n/a (no test defined)"
      continue
    fi
    # A failing test command only leaves its value empty; the exit status of
    # a command substitution used as an argument does not trigger `set -e`.
    printf '%s : %s\n' "${messages[$index]}" "$(eval "${tests[$index]}")"
  done
}

run_all_tests
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment