FlorianHeigl · February 29, 2024 10:46
diff --git a/omd-tuning.sh b/omd-tuning.sh
 #!/usr/bin/env bash

 # License: BSD
 # Author: Florian Heigl
 #
 # Prints tuning-relevant figures (limits, process counts, error counters)
 # for an OMD site, one "label : value" line per check.
 #
 # Variables read:
 #   OMD_SITE, OMD_ROOT      - required; from the environment or site.conf
 #   CONFIG_APACHE_TCP_PORT  - required; presumably provided by site.conf
 #                             (assumption, verify against your site)
 #   NIC                     - interface for the ethtool checks (default: eth0)

 set -eu

 # Pull in the site configuration if it is readable and passes a syntax check.
 SITECFG=~/etc/omd/site.conf
 if test -r "$SITECFG" && bash -n "$SITECFG"; then
   source "$SITECFG"
 fi

 # Fail early with a clear message instead of a bare "unbound variable" later.
 : "${OMD_SITE:?OMD_SITE is not set}"
 : "${OMD_ROOT:?OMD_ROOT is not set}"
 : "${CONFIG_APACHE_TCP_PORT:?CONFIG_APACHE_TCP_PORT is not set}"

 # Network interface inspected by the ethtool checks.
 NIC=${NIC:-eth0}

 # Total memory in KiB. Read from /proc/meminfo so the value does not depend
 # on the (possibly localized) column labels printed by free.
 TOTAL=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)

 declare -a tests=() messages=()

 # List of test commands. Each entry is a shell snippet that is eval'ed in the
 # loop at the bottom. "\$" defers an expansion until the snippet runs, so a
 # failing lookup only affects its own line instead of aborting under set -e.
 tests[1]="grep -c ^processor /proc/cpuinfo"
 tests[2]="echo $(( TOTAL / 1024 / 1024 )) GB"
 tests[3]="cat /proc/sys/kernel/pid_max"
 tests[4]="cat /proc/sys/fs/file-max"
 tests[5]="awk '{print \$1}' /proc/sys/fs/file-nr"
 # Only query /proc/mounts when df reported a /dev source; an empty pattern
 # would otherwise make grep wait on stdin.
 tests[6]="dev=\$(df --output=source $OMD_ROOT | grep ^/dev) && grep \"\$dev\" /proc/mounts | awk '{print \$4}'"
 tests[7]="ps hux -U $OMD_SITE | awk -v total=$TOTAL '{ sum += \$6 } END { printf \"%.2f%%\n\", sum / total * 100}'"
 tests[8]="ulimit -n"
 tests[9]="ulimit -u"
 tests[10]="lsof | grep -c $OMD_SITE"
 # NOTE(review): this count likely includes the grep process itself and any
 # other command line that merely mentions the site name.
 tests[11]="ps -ef | grep -c ${OMD_SITE}"
 # rrdcached can kill the system if it falls behind, the worker setting in OMD is not correctly applied, so you normally
 # only have the default number of workers!
 # The io counters are in bytes; divide three times by 1024 to match the "GB" labels.
 tests[12]="grep ^write_bytes /proc/\$(pgrep -u $OMD_SITE rrdcached)/io | awk '{print \$2 / 1024 / 1024 / 1024}'"
 tests[13]="grep ^read_bytes /proc/\$(pgrep -u $OMD_SITE rrdcached)/io | awk '{print \$2 / 1024 / 1024 / 1024}'"
 # active check worker, can block and _will_ block on down hosts (they're executed anyway)
 tests[14]="pgrep -u $OMD_SITE checkhelper | wc -l"
 # cmk workers, needed to get check throughput. beware there are rolling restart issues with some buggy checks that OOM them.
 tests[15]="pgrep -u $OMD_SITE -f 'python /omd/sites/$OMD_SITE/bin/cmk --keepalive' | wc -l"
 tests[16]="grep -c 'Resource temp' ~/var/log/cmc.log"
 # connected to number of livestatus slots
 tests[17]="pgrep -u $OMD_SITE -f 'python /omd/sites/$OMD_SITE/bin/liveproxyd' | wc -l"
 tests[18]="grep -c 'Site is considered dead. Closing all connections.' ~/var/log/liveproxyd.log"
 tests[19]="grep -c -E 'Cannot forward next' ~/var/log/liveproxyd.log"
 tests[20]="curl -s localhost:${CONFIG_APACHE_TCP_PORT}/server-status | grep -E 'requests currently being processed'"
 tests[21]="grep -c 'WARNING: ping-queueing has lasted' ~/var/log/cmc.log" # is this the right log?
 tests[22]="ethtool -g $NIC | grep -A1 'Current hardware' | grep ^RX | awk '{print \$2}'"
 tests[23]="ethtool -S $NIC | grep -i OOB | awk '{sum+=\$4} END {print sum}'"
 #tests[24]="" # lost the command


 # List of labels, named after the component each test belongs to.
 messages[1]="System: CPU Cores"
 messages[2]="System: Total Memory"
 messages[3]="System: process limit"
 messages[4]="System: file limit"
 messages[5]="System: open file handles"
 messages[6]="System: OMD_ROOT mount info"
 messages[7]="OMD Site: Total memory used"
 messages[8]="OMD Site: process limit"
 messages[9]="OMD Site: file limit"
 messages[10]="OMD Site: open file handles"
 messages[11]="OMD Site: running processes"
 messages[12]="rrdcached: written GB"
 messages[13]="rrdcached: read GB"
 messages[14]="CMC: active check workers"
 messages[15]="CMC: check_mk workers"
 messages[16]="CMC: resources exhaustion errors"
 messages[17]="Liveproxyd: processes"
 messages[18]="Liveproxyd: remote site conn dead errors"
 messages[19]="Liveproxyd: remote site query aborted errors"
 messages[20]="Apache: worker usage"
 messages[21]="icmphelper:    ping-queue over 100ms errors    "
 messages[22]="System:        RX buffer size                  "
 messages[23]="System:        NIC out of buffer errrors       "
 messages[24]="System:        driver rx/tx drop errors        "


 # TODO:
 # would be nice, but these can also simply be monitored directly:
 # memory stats per process group
 # wait on the site's processes
 # determine site uptime and relate it to process usage (but then one can just monitor them as well)
 # possibly add:
 # compressing / caching in site.conf, sensible settings yes/no


 # Run the tests and print their values.
 for index in "${!messages[@]}"; do
   cmd=${tests[$index]:-}
   if [ -z "$cmd" ]; then
     # A label without a command (e.g. the lost test 24) must not trip set -u.
     printf '%s : %s\n' "${messages[$index]}" "n/a"
     continue
   fi
   # eval is required because the snippets contain pipelines and quoting; all
   # of them are defined in this script. The substitution is kept inline so a
   # failing test does not abort the script under set -e.
   printf '%s : %s\n' "${messages[$index]}" "$(eval "$cmd")"
 done
	#!/usr/bin/env bash

	# License: BSD
	# Author: Florian Heigl
	#
	# Prints tuning-relevant figures (limits, process counts, error counters)
	# for an OMD site, one "label : value" line per check.
	#
	# Variables read:
	#   OMD_SITE, OMD_ROOT      - required; from the environment or site.conf
	#   CONFIG_APACHE_TCP_PORT  - required; presumably provided by site.conf
	#                             (assumption, verify against your site)
	#   NIC                     - interface for the ethtool checks (default: eth0)

	set -eu

	# Pull in the site configuration if it is readable and passes a syntax check.
	SITECFG=~/etc/omd/site.conf
	if test -r "$SITECFG" && bash -n "$SITECFG"; then
	  source "$SITECFG"
	fi

	# Fail early with a clear message instead of a bare "unbound variable" later.
	: "${OMD_SITE:?OMD_SITE is not set}"
	: "${OMD_ROOT:?OMD_ROOT is not set}"
	: "${CONFIG_APACHE_TCP_PORT:?CONFIG_APACHE_TCP_PORT is not set}"

	# Network interface inspected by the ethtool checks.
	NIC=${NIC:-eth0}

	# Total memory in KiB. Read from /proc/meminfo so the value does not depend
	# on the (possibly localized) column labels printed by free.
	TOTAL=$(awk '/^MemTotal:/ {print $2}' /proc/meminfo)

	declare -a tests=() messages=()

	# List of test commands. Each entry is a shell snippet that is eval'ed in the
	# loop at the bottom. Pipes must be plain "|" (an escaped "\|" would be passed
	# to the command as a literal argument). "\$" defers an expansion until the
	# snippet runs, so a failing lookup only affects its own line instead of
	# aborting under set -e.
	tests[1]="grep -c ^processor /proc/cpuinfo"
	tests[2]="echo $(( TOTAL / 1024 / 1024 )) GB"
	tests[3]="cat /proc/sys/kernel/pid_max"
	tests[4]="cat /proc/sys/fs/file-max"
	tests[5]="awk '{print \$1}' /proc/sys/fs/file-nr"
	# Only query /proc/mounts when df reported a /dev source; an empty pattern
	# would otherwise make grep wait on stdin.
	tests[6]="dev=\$(df --output=source $OMD_ROOT | grep ^/dev) && grep \"\$dev\" /proc/mounts | awk '{print \$4}'"
	tests[7]="ps hux -U $OMD_SITE | awk -v total=$TOTAL '{ sum += \$6 } END { printf \"%.2f%%\n\", sum / total * 100}'"
	tests[8]="ulimit -n"
	tests[9]="ulimit -u"
	tests[10]="lsof | grep -c $OMD_SITE"
	# NOTE(review): this count likely includes the grep process itself and any
	# other command line that merely mentions the site name.
	tests[11]="ps -ef | grep -c ${OMD_SITE}"
	# rrdcached can kill the system if it falls behind, the worker setting in OMD is not correctly applied, so you normally
	# only have the default number of workers!
	# The io counters are in bytes; divide three times by 1024 to match the "GB" labels.
	tests[12]="grep ^write_bytes /proc/\$(pgrep -u $OMD_SITE rrdcached)/io | awk '{print \$2 / 1024 / 1024 / 1024}'"
	tests[13]="grep ^read_bytes /proc/\$(pgrep -u $OMD_SITE rrdcached)/io | awk '{print \$2 / 1024 / 1024 / 1024}'"
	# active check worker, can block and _will_ block on down hosts (they're executed anyway)
	tests[14]="pgrep -u $OMD_SITE checkhelper | wc -l"
	# cmk workers, needed to get check throughput. beware there are rolling restart issues with some buggy checks that OOM them.
	tests[15]="pgrep -u $OMD_SITE -f 'python /omd/sites/$OMD_SITE/bin/cmk --keepalive' | wc -l"
	tests[16]="grep -c 'Resource temp' ~/var/log/cmc.log"
	# connected to number of livestatus slots
	tests[17]="pgrep -u $OMD_SITE -f 'python /omd/sites/$OMD_SITE/bin/liveproxyd' | wc -l"
	tests[18]="grep -c 'Site is considered dead. Closing all connections.' ~/var/log/liveproxyd.log"
	tests[19]="grep -c -E 'Cannot forward next' ~/var/log/liveproxyd.log"
	tests[20]="curl -s localhost:${CONFIG_APACHE_TCP_PORT}/server-status | grep -E 'requests currently being processed'"
	tests[21]="grep -c 'WARNING: ping-queueing has lasted' ~/var/log/cmc.log" # is this the right log?
	tests[22]="ethtool -g $NIC | grep -A1 'Current hardware' | grep ^RX | awk '{print \$2}'"
	tests[23]="ethtool -S $NIC | grep -i OOB | awk '{sum+=\$4} END {print sum}'"
	#tests[24]="" # lost the command


	# List of labels, named after the component each test belongs to.
	messages[1]="System: CPU Cores"
	messages[2]="System: Total Memory"
	messages[3]="System: process limit"
	messages[4]="System: file limit"
	messages[5]="System: open file handles"
	messages[6]="System: OMD_ROOT mount info"
	messages[7]="OMD Site: Total memory used"
	messages[8]="OMD Site: process limit"
	messages[9]="OMD Site: file limit"
	messages[10]="OMD Site: open file handles"
	messages[11]="OMD Site: running processes"
	messages[12]="rrdcached: written GB"
	messages[13]="rrdcached: read GB"
	messages[14]="CMC: active check workers"
	messages[15]="CMC: check_mk workers"
	messages[16]="CMC: resources exhaustion errors"
	messages[17]="Liveproxyd: processes"
	messages[18]="Liveproxyd: remote site conn dead errors"
	messages[19]="Liveproxyd: remote site query aborted errors"
	messages[20]="Apache: worker usage"
	messages[21]="icmphelper: ping-queue over 100ms errors "
	messages[22]="System: RX buffer size "
	messages[23]="System: NIC out of buffer errrors "
	messages[24]="System: driver rx/tx drop errors "


	# TODO:
	# would be nice, but these can also simply be monitored directly:
	# memory stats per process group
	# wait on the site's processes
	# determine site uptime and relate it to process usage (but then one can just monitor them as well)
	# possibly add:
	# compressing / caching in site.conf, sensible settings yes/no


	# Run the tests and print their values.
	for index in "${!messages[@]}"; do
	  cmd=${tests[$index]:-}
	  if [ -z "$cmd" ]; then
	    # A label without a command (e.g. the lost test 24) must not trip set -u.
	    printf '%s : %s\n' "${messages[$index]}" "n/a"
	    continue
	  fi
	  # eval is required because the snippets contain pipelines and quoting; all
	  # of them are defined in this script. The substitution is kept inline so a
	  # failing test does not abort the script under set -e.
	  printf '%s : %s\n' "${messages[$index]}" "$(eval "$cmd")"
	done