Last active
November 23, 2023 02:49
-
-
Save utdrmac/43725169cd67e552becb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Gracewatch 1.0
 * Matthew Boehm <[email protected]>
 *
 * Gracewatch is a multi-threaded MySQL monitoring solution developed for
 * a client that had no in-house monitoring team.
 *
 * Using libConfig (http://www.hyperrealm.com/libconfig/), gracewatch reads
 * a list of servers and credentials and spawns a pthread for each server.
 * The thread connects to the host and every minute performs a mysql_ping()
 * to verify the server is up and attempts to get slave status. If the slave
 * is not running or is more than 600 seconds behind, an email is sent to
 * the DBA.
 *
 * Gracewatch does not repeat the alarm, to reduce the number of emails sent.
 * It will send out a notification if the situation corrects itself (i.e. the
 * slave catches up).
 *
 */
#include <errno.h> // for errno, EINTR
#include <signal.h> // for signals
#include <stdarg.h> // for va_list
#include <stdio.h> // for printf
#include <stdlib.h> // for malloc
#include <string.h> // for strcmp
#include <time.h> // for time printing
#include <sys/types.h> // for pid_t
#include <sys/wait.h> // for waitpid
#include <unistd.h> // for sleep
#include <pthread.h> // for pthreads
#include <libconfig.h> // for reading config
#include <mysql.h> // for mysql stuff
// Recipient and sender handed to the mailer script.
// NOTE(review): both values look like redacted placeholders - confirm the
// real addresses before deploying.
#define EMAIL_TO "[email protected]"
#define EMAIL_FR "[email protected]"

// Print a message to stderr, release the config and log file, and return
// EXIT_FAILURE from the calling function. Only usable where `cfg` (config_t)
// and `logFd` are in scope and the caller returns int. The expansion is one
// brace-enclosed block so it stays a single statement even under an unbraced
// `if`; call sites use it without a trailing semicolon.
#define ERROR_DIE(x) { fprintf(stderr, "%s\n", (x)); config_destroy(&cfg); if(logFd) { fclose(logFd); } return(EXIT_FAILURE); }

// Same as ERROR_DIE, but also reports libconfig's error file, line and text.
#define CFG_ERROR_DIE(x) { fprintf(stderr, "%s - %s:%d - %s\n", (x), config_error_file(&cfg), config_error_line(&cfg), config_error_text(&cfg)); config_destroy(&cfg); if(logFd) { fclose(logFd); } return(EXIT_FAILURE); }

// Log a message together with the server identity and the last MySQL error.
// Requires `displayName`, `ip` and `mysql` to be in scope at the call site.
// The trailing semicolon is part of the macro because call sites omit it.
#define GLOG(x) glog("%s on %s (%s) - %s", (x), displayName, ip, mysql_error(mysql));
// random globals
// shutdown flag: written by the signal handler, polled by the worker loops,
// hence volatile sig_atomic_t rather than plain int
volatile sig_atomic_t needToExit = 0;
// number of entries in `threads`
int numServers = 0;
// 1 = run as a daemon and log to logFd, 0 = stay in foreground and log to stdout
int daemonize = 1;
// log file stream; NULL until main() opens it
FILE *logFd = NULL;
// global thread array
pthread_t *threads = NULL;
// globals for connecting
const char *username = NULL;
const char *password = NULL;
// the mutex for writing output
pthread_mutex_t writerMutex = PTHREAD_MUTEX_INITIALIZER;
// logger | |
static void glog(const char *format, ...) | |
{ | |
va_list args; | |
va_start (args, format); | |
time_t t = time(NULL); | |
struct tm *tmp; | |
char timeString[20]; | |
tmp = localtime(&t); | |
strftime(timeString, sizeof(timeString), "%F %T", tmp); | |
// so only one can write at a time | |
pthread_mutex_lock(&writerMutex); | |
if(!daemonize) | |
{ | |
fprintf(stdout, "%s - ", timeString); | |
vfprintf(stdout, format, args); | |
fprintf(stdout, "\n"); | |
} | |
else | |
{ | |
fprintf(logFd, "%s - ", timeString); | |
vfprintf(logFd, format, args); | |
fprintf(logFd, "\n"); | |
fflush(logFd); | |
} | |
pthread_mutex_unlock(&writerMutex); | |
va_end (args); | |
} | |
// emailer | |
int sendEmailError(char *msg, const char *disp, const char *ip) | |
{ | |
char cmd[500] = ""; | |
snprintf(cmd, 500, "/usr/local/bin/mailer.pl %s %s 'GraceWatch - %s' 'Host: %s\nIP : %s\n\n%s'", EMAIL_TO, EMAIL_FR, disp, disp, ip, msg); | |
return system(cmd); | |
} | |
// Cleanup Worker Thread | |
static void workerCleanup(void *arg) | |
{ | |
MYSQL *mysql = (MYSQL *)arg; | |
char host_info[30]; | |
const char *info = mysql_get_host_info(mysql); | |
snprintf(host_info, sizeof(host_info), "%s", info); | |
mysql_close(mysql); | |
mysql_thread_end(); | |
glog("Thread closed on %s", host_info); | |
} | |
// Worker Thread | |
static void *workerThread(void *arg) | |
{ | |
(void)mysql_thread_init(); | |
MYSQL *mysql = mysql_init(NULL); | |
MYSQL_RES *res = NULL; | |
MYSQL_ROW row = NULL; | |
MYSQL_FIELD *fields = NULL; | |
config_setting_t *server = (config_setting_t *)arg; | |
const char *displayName, *ip; | |
int i, numFields, isSlave = 1, sentReplNotification = 0, sentLagNotification = 0; | |
int slaveIoRunning = 0, slaveSqlRunning = 0, secondsBehindMaster = 0; | |
int connected = 0, counter = 0; | |
unsigned long threadId = 0; | |
unsigned int connTimeout = 10; | |
my_bool reconnect = 1; | |
// set clean up function on thread terminiation | |
pthread_cleanup_push(workerCleanup, (void *)mysql); | |
// get settings from config | |
config_setting_lookup_string(server, "displayName", &displayName); | |
config_setting_lookup_string(server, "ip", &ip); | |
config_setting_lookup_bool(server, "isSlave", &isSlave); | |
// set auto-reconnect flag | |
mysql_options(mysql, MYSQL_OPT_RECONNECT, &reconnect); | |
mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, &connTimeout); | |
connect: | |
while(!connected) | |
{ | |
// connect to MySQL | |
if(!mysql_real_connect(mysql, ip, username, password, NULL, 3306, NULL, CLIENT_REMEMBER_OPTIONS)) | |
{ | |
GLOG("Unable to connect to MySQL. Sleeping 1 min before attempting again") | |
sleep(60); | |
} | |
else | |
{ | |
connected = 1; | |
} | |
} | |
while(!needToExit) | |
{ | |
// get our thread Id for reconnect checking | |
threadId = mysql_thread_id(mysql); | |
// ping first to check connection and do auto-reconnect | |
if(mysql_ping(mysql) != 0) | |
{ | |
GLOG("Ping Failure. Auto-Reconnect Failure") | |
connected = 0; | |
goto connect; | |
} | |
else | |
{ | |
// ping is ok. let's force a log entry every 60 iterations | |
if(counter >= 60) | |
{ | |
GLOG("Ping OK") | |
counter = 0; | |
} | |
} | |
// did we reconnect? if so, make note in log | |
if(threadId != mysql_thread_id(mysql)) | |
{ | |
GLOG("Auto-Reconnect Occured") | |
} | |
// if not a slave, we can just continue from here to next iteration | |
if(!isSlave) | |
{ | |
goto cycle; | |
} | |
// get slave status | |
if((i = mysql_query(mysql, "SHOW SLAVE STATUS"))) | |
{ | |
GLOG("Could not get slave status") | |
goto cycle; | |
} | |
if(!(res = mysql_store_result(mysql))) | |
{ | |
GLOG("Could not store slave status") | |
goto cycle; | |
} | |
numFields = mysql_num_fields(res); | |
fields = mysql_fetch_fields(res); | |
if(!(row = mysql_fetch_row(res))) | |
{ | |
GLOG("Unable to get slave status row") | |
goto cycle; | |
} | |
if(!fields) | |
{ | |
GLOG("Did not get fields info") | |
goto cycle; | |
} | |
// loop through the fields of this one row looking | |
// for the fields we want | |
for(i = 0; i < numFields; i++) | |
{ | |
if(!fields[i].name) | |
{ | |
GLOG("Got empty name field") | |
continue; | |
} | |
// get row fields in to easy C vars | |
if(strcmp(fields[i].name, "Slave_IO_Running") == 0) | |
{ | |
slaveIoRunning = (strcmp(row[i], "Yes") == 0) ? 1 : 0; | |
} | |
if(strcmp(fields[i].name, "Slave_SQL_Running") == 0) | |
{ | |
slaveSqlRunning = (strcmp(row[i], "Yes") == 0) ? 1 : 0; | |
} | |
if(strcmp(fields[i].name, "Seconds_Behind_Master") == 0) | |
{ | |
secondsBehindMaster = (!row[i] ? -1 : atoi(row[i])); | |
} | |
} | |
// we've fetched the row; dont need result handler any longer | |
mysql_free_result(res); | |
// alert logic - slave not running | |
if(slaveIoRunning == 0 || slaveSqlRunning == 0) | |
{ | |
GLOG("One or both slave threads is not running") | |
if(sentReplNotification == 0) | |
{ | |
sentReplNotification = 1; | |
sendEmailError("One or both slave threads is not running.", displayName, ip); | |
} | |
} | |
// reset sent notification flag if replication is running again | |
if(slaveIoRunning == 1 && slaveSqlRunning == 1 && sentReplNotification == 1) | |
{ | |
sentReplNotification = 0; | |
} | |
// slave is 10 min behind | |
if(secondsBehindMaster > 600) | |
{ | |
GLOG("Host/Slave is more than 10 minutes behind in replication") | |
if(sentLagNotification == 0) | |
{ | |
sentLagNotification = 1; | |
sendEmailError("Host/Slave is more than 10 minutes behind in replication.", displayName, ip); | |
} | |
} | |
// slave is OK. send Clear email. | |
if(slaveIoRunning == 1 && slaveSqlRunning == 1 && secondsBehindMaster < 50 && sentLagNotification == 1) | |
{ | |
sentLagNotification = 0; | |
GLOG("Host/Slave replication is running and has caught up in replication") | |
sendEmailError("Host/Slave replication is running and has caught up in replication.", displayName, ip); | |
} | |
cycle: | |
// wait 1 minute and check again | |
counter++; | |
sleep(60); | |
} | |
pthread_cleanup_pop(1); | |
// cleanly exit thread | |
pthread_exit(NULL); | |
} | |
static void catchSignal(int sig) | |
{ | |
int i = 0; | |
needToExit = 1; | |
glog("Caught signal %d. Shutting down threads...", sig); | |
for(i = 0; i < numServers; i++) | |
{ | |
pthread_cancel(threads[i]); | |
} | |
glog("Threads canceled."); | |
} | |
int main(int argc, char** argv) | |
{ | |
pid_t pid, sid; | |
pthread_attr_t threadAttr; | |
config_t cfg; | |
config_setting_t *setting; | |
const char *logFile; | |
config_init(&cfg); | |
// setup signal handling | |
if(signal(SIGTERM, catchSignal) == SIG_ERR || signal(SIGINT, catchSignal) == SIG_ERR) | |
{ | |
ERROR_DIE("An error occurred while setting a signal handler.") | |
} | |
// Read the file. If there is an error, report it and exit. | |
if(!config_read_file(&cfg, "/etc/gracewatch.cfg")) | |
{ | |
CFG_ERROR_DIE("Unable to load/read config file.") | |
} | |
// get log file from config | |
if(!(config_lookup_string(&cfg, "log", &logFile))) | |
{ | |
CFG_ERROR_DIE("Unable to find log file path setting in config.") | |
} | |
// open log file if daemonized | |
if(daemonize && !(logFd = fopen(logFile, "a"))) | |
{ | |
ERROR_DIE("Unable to open log file.") | |
} | |
glog("-- Gracewatch Started --"); | |
// get list of servers and connect | |
setting = config_lookup(&cfg, "servers"); | |
if(!setting) | |
{ | |
CFG_ERROR_DIE("No servers found in config file.") | |
} | |
// set globals from read-config | |
if(!(config_lookup_string(&cfg, "username", &username) | |
&& config_lookup_string(&cfg, "password", &password))) | |
{ | |
CFG_ERROR_DIE("Unable to find username/password in config.") | |
} | |
numServers = config_setting_length(setting); | |
int i; | |
glog("Found %d servers in config.", numServers); | |
// initialize the pthreads array | |
if((threads = (pthread_t *) malloc(numServers * sizeof(pthread_t))) == NULL) | |
{ | |
ERROR_DIE("Unable to allocate memory for threads array.") | |
} | |
// in case we dont want to daemonize | |
if(argc == 2 && !strcmp(argv[1], "-nd")) | |
{ | |
daemonize = 0; | |
} | |
// we can fork/detach here | |
if(daemonize) | |
{ | |
pid = fork(); | |
if(pid < 0) | |
{ | |
ERROR_DIE("Unable to fork parent process.") | |
} | |
if(pid > 0) | |
{ | |
// we are parent and can exit safely | |
return (EXIT_SUCCESS); | |
} | |
// child process continues here | |
sid = setsid(); | |
if(sid < 0) | |
{ | |
ERROR_DIE("Unable to continue child process") | |
} | |
// no need keeping these open for daemon | |
close(STDIN_FILENO); | |
close(STDOUT_FILENO); | |
close(STDERR_FILENO); | |
} | |
// Set up threaded MySQL-client stuff | |
mysql_library_init(0, NULL, NULL); | |
mysql_thread_init(); | |
// some pthread initialization. make each one joinable (ie: wait for it) | |
pthread_attr_init(&threadAttr); | |
pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_JOINABLE); | |
for(i = 0; i < numServers; ++i) | |
{ | |
config_setting_t *serverCfg = config_setting_get_elem(setting, i); | |
const char *displayName, *ip; | |
// if config doesnt have both display name and ip, go to next entry | |
if(!(config_setting_lookup_string(serverCfg, "displayName", &displayName) | |
&& config_setting_lookup_string(serverCfg, "ip", &ip))) | |
{ | |
continue; | |
} | |
// launch pthread here | |
pthread_create(&threads[i], &threadAttr, workerThread, (void *)serverCfg); | |
glog("Launched monitor thread on %s (%s).", displayName, ip); | |
} | |
// join threads together | |
for(i = 0; i < numServers; i++) | |
{ | |
pthread_join(threads[i], NULL); | |
} | |
// child process now waits here for all threads to cleanly | |
// exit before continuing on to final clean-up below | |
// now that threads have ended, clean up | |
config_destroy(&cfg); | |
pthread_attr_destroy(&threadAttr); | |
mysql_library_end(); | |
glog("-- Gracewatch Ended --"); | |
if(daemonize) | |
fclose(logFd); | |
return(EXIT_SUCCESS); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment