Skip to content

Instantly share code, notes, and snippets.

@opmat
Forked from utdrmac/gracewatch.c
Created November 23, 2023 02:49
Show Gist options
  • Save opmat/4897641390f9e60d12f5a9d7cd2548f4 to your computer and use it in GitHub Desktop.
Save opmat/4897641390f9e60d12f5a9d7cd2548f4 to your computer and use it in GitHub Desktop.
/*
* Gracewatch 1.0
* Matthew Boehm <[email protected]>
*
* Gracewatch is a multi-threaded MySQL monitoring solution developed for
* a client that had no in-house monitoring team.
*
* Using libConfig (http://www.hyperrealm.com/libconfig/), gracewatch reads
* a list of servers and credentials and spawns a pthread for each server.
* The thread connects to the host and every minute performs a mysql_ping()
* to verify server is up and attempts to get slave status. If slave is not
* running or is more than 600 seconds behind, an email is sent to DBA.
*
* Gracewatch does not repeat the alarm to reduce emails sent. It will
* send out notification if the situation corrects itself (ie: slave
* catches up).
*
*/
#include <stdlib.h> // for malloc
#include <stdarg.h> // for va_list
#include <stdio.h> // for printf
#include <unistd.h> // for sleep
#include <string.h> // for strcmp
#include <signal.h> // for signals
#include <libconfig.h> // for reading config
#include <mysql.h> // for mysql stuff
#include <pthread.h> // for pthreads
#include <time.h> // for time printing
// Recipient and sender addresses passed to the mailer script by sendEmailError().
#define EMAIL_TO "[email protected]"
#define EMAIL_FR "[email protected]"
// Fatal-error helper for main(): prints the message to stderr, destroys the
// config object, closes the log file if one is open and returns EXIT_FAILURE.
// The expansion is a run of bare statements (no do/while wrapper), so it must
// only be used as the content of a braced block, and it needs a `config_t cfg`
// in scope at the point of use.
#define ERROR_DIE(x) fprintf(stderr, "%s\n", x); config_destroy(&cfg); if(logFd) { fclose(logFd); } return(EXIT_FAILURE);
// Same as ERROR_DIE, but also prints libconfig's error file, line and text.
#define CFG_ERROR_DIE(x) fprintf(stderr, "%s - %s:%d - %s\n", x, config_error_file(&cfg), config_error_line(&cfg), config_error_text(&cfg)); config_destroy(&cfg); if(logFd) { fclose(logFd); } return(EXIT_FAILURE);
// Worker-thread log helper: logs the message together with the server's
// display name, IP and the current mysql_error() text. Requires locals named
// `displayName`, `ip` and `mysql` in scope. The expansion already ends in ';',
// which is why call sites are written without one.
#define GLOG(x) glog("%s on %s (%s) - %s", x, displayName, ip, mysql_error(mysql));
// random globals
// Shutdown flag: set by the signal handler, polled by the worker loops.
// It is volatile sig_atomic_t because that is the object type ISO C allows a
// signal handler to write safely.
volatile sig_atomic_t needToExit = 0;
// number of entries in the `threads` array that the signal handler cancels
int numServers = 0;
// 1 = detach and log to the log file, 0 = stay in the foreground and log to stdout
int daemonize = 1;
// log stream used while daemonized; NULL until main() opens it
FILE *logFd;
// global thread array
pthread_t *threads;
// globals for connecting (shared by every worker; they point into the config object)
const char *username = NULL;
const char *password = NULL;
// the mutex for writing output
pthread_mutex_t writerMutex = PTHREAD_MUTEX_INITIALIZER;
// logger
//
// Writes one line of the form "<YYYY-MM-DD HH:MM:SS> - <message>\n", where the
// message is built printf-style from `format` and the variadic arguments.
// Output goes to stdout when running in the foreground and to the log file
// (flushed after every line) when daemonized. Calls are serialised through
// writerMutex so lines from different threads never interleave.
static void glog(const char *format, ...)
{
    va_list args;
    char timeString[20] = "";   // "YYYY-MM-DD HH:MM:SS" plus the terminator
    time_t now = time(NULL);
    struct tm *tmp;
    FILE *out;

    // so only one can write at a time. The lock is taken before localtime()
    // because localtime() hands back a pointer to shared static storage that
    // another logging thread could overwrite while we format it.
    pthread_mutex_lock(&writerMutex);

    tmp = localtime(&now);
    if(!tmp || strftime(timeString, sizeof(timeString), "%F %T", tmp) == 0)
    {
        // never pass a NULL tm to strftime; log with a placeholder instead
        snprintf(timeString, sizeof(timeString), "%s", "unknown time");
    }

    // Pick the destination. If we are in daemon mode but the log file is not
    // open yet, fall back to stderr rather than dereferencing a NULL stream.
    out = daemonize ? logFd : stdout;
    if(!out)
    {
        out = stderr;
    }

    va_start(args, format);
    fprintf(out, "%s - ", timeString);
    vfprintf(out, format, args);
    fprintf(out, "\n");
    va_end(args);

    // the log file is flushed per line so entries survive an abrupt exit
    if(daemonize)
    {
        fflush(out);
    }

    pthread_mutex_unlock(&writerMutex);
}
// emailer helper
//
// Writes `src` into `dst` as one single-quoted shell word. Every embedded
// single quote becomes the four characters '\'' (close quote, escaped quote,
// reopen quote), so the text cannot terminate the quoting early.
// Returns 0 on success, -1 if `dst` (of `cap` bytes) is too small.
static int shellQuote(char *dst, size_t cap, const char *src)
{
    size_t used = 0;

    if(cap < 3)
    {
        return -1;   // no room for even an empty quoted word
    }
    dst[used++] = '\'';
    for(; *src != '\0'; src++)
    {
        if(*src == '\'')
        {
            // 4 bytes now, plus the closing quote and the terminator later
            if(used + 6 > cap)
            {
                return -1;
            }
            memcpy(dst + used, "'\\''", 4);
            used += 4;
        }
        else
        {
            // 1 byte now, plus the closing quote and the terminator later
            if(used + 3 > cap)
            {
                return -1;
            }
            dst[used++] = *src;
        }
    }
    dst[used++] = '\'';
    dst[used] = '\0';
    return 0;
}
// emailer
//
// Sends a notification by running /usr/local/bin/mailer.pl through system()
// with four arguments: EMAIL_TO, EMAIL_FR, a subject ("GraceWatch - <disp>")
// and a body holding the host name, IP and `msg`.
//
//   msg  - alert text for the body (NULL is treated as empty)
//   disp - display name of the server (NULL is treated as empty)
//   ip   - address of the server (NULL is treated as empty)
//
// Returns the value of system(), or -1 if the command line could not be built.
// Overlong subject/body text is clipped before quoting, so a long message is
// shortened rather than producing a malformed command.
int sendEmailError(char *msg, const char *disp, const char *ip)
{
    char subject[256];
    char body[1024];
    // worst case every character is a quote (4 bytes each) plus 2 quotes + NUL
    char quotedSubject[4 * sizeof(subject) + 3];
    char quotedBody[4 * sizeof(body) + 3];
    char cmd[sizeof(quotedSubject) + sizeof(quotedBody) + 256];
    const char *text = msg ? msg : "";
    const char *name = disp ? disp : "";
    const char *addr = ip ? ip : "";
    int len;

    if(snprintf(subject, sizeof(subject), "GraceWatch - %s", name) < 0
       || snprintf(body, sizeof(body), "Host: %s\nIP : %s\n\n%s", name, addr, text) < 0)
    {
        return -1;
    }

    // the display name and IP come from the config file; quote them so an
    // apostrophe (or anything else the shell treats specially) stays literal
    if(shellQuote(quotedSubject, sizeof(quotedSubject), subject) != 0
       || shellQuote(quotedBody, sizeof(quotedBody), body) != 0)
    {
        return -1;
    }

    len = snprintf(cmd, sizeof(cmd), "/usr/local/bin/mailer.pl %s %s %s %s",
                   EMAIL_TO, EMAIL_FR, quotedSubject, quotedBody);
    if(len < 0 || (size_t)len >= sizeof(cmd))
    {
        return -1;   // never hand a truncated command line to the shell
    }
    return system(cmd);
}
// Cleanup Worker Thread
//
// Thread cleanup handler installed by workerThread(). `arg` is the worker's
// MYSQL handle. Records the connection's host description, closes the handle,
// releases the MySQL per-thread state and logs that the thread has ended.
static void workerCleanup(void *arg)
{
    MYSQL *mysql = (MYSQL *)arg;
    char host_info[128] = "unknown host";

    if(mysql)
    {
        // copy the description before mysql_close() releases the handle;
        // it may be NULL if the handle never completed a connection
        const char *info = mysql_get_host_info(mysql);
        if(info)
        {
            snprintf(host_info, sizeof(host_info), "%s", info);
        }
        mysql_close(mysql);
    }
    mysql_thread_end();
    glog("Thread closed on %s", host_info);
}
// Worker Thread
//
// Monitors one server. `arg` is the server's config_setting_t entry, from which
// displayName, ip and the optional isSlave flag (default: true) are read.
//
// The thread connects (retrying every 60 seconds), then once a minute:
//   - pings the server, relying on the client's auto-reconnect, and goes back
//     to the connect loop if the ping fails;
//   - logs "Ping OK" after every 60 cycles and notes any auto-reconnect;
//   - for slaves, runs SHOW SLAVE STATUS and e-mails once when a replication
//     thread is stopped or when lag exceeds 600 seconds, and e-mails again
//     when replication is running with lag back under 50 seconds.
//
// The loops end when needToExit is set; workerCleanup() releases the
// connection on normal exit and on cancellation. Always returns NULL.
static void *workerThread(void *arg)
{
    (void)mysql_thread_init();
    MYSQL *mysql = mysql_init(NULL);
    MYSQL_RES *res = NULL;
    MYSQL_ROW row = NULL;
    MYSQL_FIELD *fields = NULL;
    config_setting_t *server = (config_setting_t *)arg;
    // placeholders keep the log macros well-defined if a lookup below fails;
    // main() only launches a thread for entries that have both settings
    const char *displayName = "(unnamed)", *ip = "(no ip)";
    int i, numFields, isSlave = 1, sentReplNotification = 0, sentLagNotification = 0;
    int slaveIoRunning = 0, slaveSqlRunning = 0, secondsBehindMaster = 0;
    int connected = 0, counter = 0;
    unsigned long threadId = 0;
    unsigned int connTimeout = 10;
    my_bool reconnect = 1;

    // without a handle there is nothing to monitor (and nothing to clean up)
    if(!mysql)
    {
        glog("Unable to allocate a MySQL handle. Monitor thread exiting");
        mysql_thread_end();
        return NULL;
    }

    // set clean up function on thread termination
    pthread_cleanup_push(workerCleanup, (void *)mysql);

    // get settings from config
    config_setting_lookup_string(server, "displayName", &displayName);
    config_setting_lookup_string(server, "ip", &ip);
    config_setting_lookup_bool(server, "isSlave", &isSlave);

    // set auto-reconnect flag and a 10 second connect timeout
    mysql_options(mysql, MYSQL_OPT_RECONNECT, &reconnect);
    mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, &connTimeout);

connect:
    // keep trying until we are connected or have been asked to shut down
    while(!connected && !needToExit)
    {
        // connect to MySQL
        if(!mysql_real_connect(mysql, ip, username, password, NULL, 3306, NULL, CLIENT_REMEMBER_OPTIONS))
        {
            GLOG("Unable to connect to MySQL. Sleeping 1 min before attempting again")
            sleep(60);
        }
        else
        {
            connected = 1;
        }
    }

    while(!needToExit)
    {
        // get our thread Id for reconnect checking
        threadId = mysql_thread_id(mysql);

        // ping first to check connection and do auto-reconnect
        if(mysql_ping(mysql) != 0)
        {
            GLOG("Ping Failure. Auto-Reconnect Failure")
            connected = 0;
            goto connect;
        }

        // ping is ok. let's force a log entry every 60 iterations
        if(counter >= 60)
        {
            GLOG("Ping OK")
            counter = 0;
        }

        // did we reconnect? if so, make note in log
        if(threadId != mysql_thread_id(mysql))
        {
            GLOG("Auto-Reconnect Occured")
        }

        // if not a slave, we can just continue from here to next iteration
        if(!isSlave)
        {
            goto cycle;
        }

        // get slave status
        if(mysql_query(mysql, "SHOW SLAVE STATUS") != 0)
        {
            GLOG("Could not get slave status")
            goto cycle;
        }
        if(!(res = mysql_store_result(mysql)))
        {
            GLOG("Could not store slave status")
            goto cycle;
        }

        // From here on every path must release `res`; an empty result set
        // would otherwise leak one result per minute.
        numFields = mysql_num_fields(res);
        fields = mysql_fetch_fields(res);
        if(!(row = mysql_fetch_row(res)))
        {
            GLOG("Unable to get slave status row")
            mysql_free_result(res);
            res = NULL;
            goto cycle;
        }
        if(!fields)
        {
            GLOG("Did not get fields info")
            mysql_free_result(res);
            res = NULL;
            goto cycle;
        }

        // loop through the fields of this one row looking
        // for the fields we want
        for(i = 0; i < numFields; i++)
        {
            const char *name = fields[i].name;
            const char *value = row[i];   // NULL when the column is SQL NULL

            if(!name)
            {
                GLOG("Got empty name field")
                continue;
            }

            // get row fields in to easy C vars; a NULL value counts as "not running"
            if(strcmp(name, "Slave_IO_Running") == 0)
            {
                slaveIoRunning = (value && strcmp(value, "Yes") == 0) ? 1 : 0;
            }
            else if(strcmp(name, "Slave_SQL_Running") == 0)
            {
                slaveSqlRunning = (value && strcmp(value, "Yes") == 0) ? 1 : 0;
            }
            else if(strcmp(name, "Seconds_Behind_Master") == 0)
            {
                secondsBehindMaster = (!value ? -1 : atoi(value));
            }
        }

        // we've fetched the row; dont need result handler any longer
        mysql_free_result(res);
        res = NULL;

        // alert logic - slave not running (mail only on the first occurrence)
        if(slaveIoRunning == 0 || slaveSqlRunning == 0)
        {
            GLOG("One or both slave threads is not running")
            if(sentReplNotification == 0)
            {
                sentReplNotification = 1;
                sendEmailError("One or both slave threads is not running.", displayName, ip);
            }
        }

        // reset sent notification flag if replication is running again
        if(slaveIoRunning == 1 && slaveSqlRunning == 1 && sentReplNotification == 1)
        {
            sentReplNotification = 0;
        }

        // slave is 10 min behind (mail only on the first occurrence)
        if(secondsBehindMaster > 600)
        {
            GLOG("Host/Slave is more than 10 minutes behind in replication")
            if(sentLagNotification == 0)
            {
                sentLagNotification = 1;
                sendEmailError("Host/Slave is more than 10 minutes behind in replication.", displayName, ip);
            }
        }

        // slave is OK. send Clear email.
        if(slaveIoRunning == 1 && slaveSqlRunning == 1 && secondsBehindMaster < 50 && sentLagNotification == 1)
        {
            sentLagNotification = 0;
            GLOG("Host/Slave replication is running and has caught up in replication")
            sendEmailError("Host/Slave replication is running and has caught up in replication.", displayName, ip);
        }

cycle:
        // wait 1 minute and check again
        counter++;
        sleep(60);
    }

    // run workerCleanup() now that the loop has ended
    pthread_cleanup_pop(1);
    // cleanly exit thread
    pthread_exit(NULL);
}
// Handler installed for SIGTERM and SIGINT: raises the shutdown flag, then
// requests cancellation of every worker thread, logging before and after.
// NOTE(review): glog() takes a mutex and uses stdio, which are not
// async-signal-safe; consider moving this work out of the handler.
static void catchSignal(int sig)
{
    int idx = 0;

    needToExit = 1;
    glog("Caught signal %d. Shutting down threads...", sig);

    while(idx < numServers)
    {
        pthread_cancel(threads[idx]);
        idx++;
    }

    glog("Threads canceled.");
}
// Program entry point.
//
// Reads /etc/gracewatch.cfg, optionally detaches from the terminal, starts one
// monitor thread per usable entry of the "servers" list and waits for all of
// them to finish before cleaning up.
//
//   argv[1] == "-nd"  stay in the foreground and log to stdout
//
// Returns EXIT_SUCCESS on a clean shutdown (and in the parent after a
// successful fork), EXIT_FAILURE on any start-up error.
int main(int argc, char** argv)
{
    pid_t pid, sid;
    pthread_attr_t threadAttr;
    config_t cfg;
    config_setting_t *setting;
    const char *logFile;
    int i, serverCount;

    config_init(&cfg);

    // in case we dont want to daemonize. This is decided before anything is
    // logged so that foreground runs neither open nor write to the log file.
    if(argc == 2 && !strcmp(argv[1], "-nd"))
    {
        daemonize = 0;
    }

    // setup signal handling
    if(signal(SIGTERM, catchSignal) == SIG_ERR || signal(SIGINT, catchSignal) == SIG_ERR)
    {
        ERROR_DIE("An error occurred while setting a signal handler.")
    }

    // Read the file. If there is an error, report it and exit.
    if(!config_read_file(&cfg, "/etc/gracewatch.cfg"))
    {
        CFG_ERROR_DIE("Unable to load/read config file.")
    }

    // get log file from config
    if(!(config_lookup_string(&cfg, "log", &logFile)))
    {
        CFG_ERROR_DIE("Unable to find log file path setting in config.")
    }

    // open log file if daemonized
    if(daemonize && !(logFd = fopen(logFile, "a")))
    {
        ERROR_DIE("Unable to open log file.")
    }
    glog("-- Gracewatch Started --");

    // get list of servers and connect
    setting = config_lookup(&cfg, "servers");
    if(!setting)
    {
        CFG_ERROR_DIE("No servers found in config file.")
    }

    // set globals from read-config
    if(!(config_lookup_string(&cfg, "username", &username)
         && config_lookup_string(&cfg, "password", &password)))
    {
        CFG_ERROR_DIE("Unable to find username/password in config.")
    }

    serverCount = config_setting_length(setting);
    glog("Found %d servers in config.", serverCount);
    if(serverCount <= 0)
    {
        // nothing to monitor; also avoids a zero-sized allocation below
        ERROR_DIE("No servers found in config file.")
    }

    // initialize the pthreads array. numServers stays 0 until threads really
    // exist, so the signal handler never cancels an unused slot.
    if((threads = calloc((size_t)serverCount, sizeof(*threads))) == NULL)
    {
        ERROR_DIE("Unable to allocate memory for threads array.")
    }

    // we can fork/detach here
    if(daemonize)
    {
        int redirected;

        pid = fork();
        if(pid < 0)
        {
            ERROR_DIE("Unable to fork parent process.")
        }
        if(pid > 0)
        {
            // we are parent and can exit safely
            return (EXIT_SUCCESS);
        }

        // child process continues here
        sid = setsid();
        if(sid < 0)
        {
            ERROR_DIE("Unable to continue child process")
        }

        // The daemon has no use for the terminal streams. Point them at
        // /dev/null instead of closing them, so descriptors 0-2 cannot be
        // reused by a socket or file opened later.
        redirected = (freopen("/dev/null", "r", stdin) != NULL);
        redirected &= (freopen("/dev/null", "w", stdout) != NULL);
        redirected &= (freopen("/dev/null", "w", stderr) != NULL);
        if(!redirected)
        {
            glog("Unable to redirect standard streams to /dev/null.");
        }
    }

    // Set up threaded MySQL-client stuff
    if(mysql_library_init(0, NULL, NULL))
    {
        glog("Unable to initialize the MySQL client library.");
        ERROR_DIE("Unable to initialize the MySQL client library.")
    }
    mysql_thread_init();

    // some pthread initialization. make each one joinable (ie: wait for it)
    pthread_attr_init(&threadAttr);
    pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_JOINABLE);

    for(i = 0; i < serverCount; ++i)
    {
        config_setting_t *serverCfg = config_setting_get_elem(setting, i);
        const char *displayName, *ip;
        int rc;

        // if config doesnt have both display name and ip, go to next entry
        if(!serverCfg
           || !(config_setting_lookup_string(serverCfg, "displayName", &displayName)
                && config_setting_lookup_string(serverCfg, "ip", &ip)))
        {
            glog("Skipping server entry %d: displayName or ip is missing.", i);
            continue;
        }

        // launch pthread here. Threads are stored contiguously and counted
        // only once created, so every slot below numServers is a live thread.
        rc = pthread_create(&threads[numServers], &threadAttr, workerThread, (void *)serverCfg);
        if(rc != 0)
        {
            glog("Unable to launch monitor thread on %s (%s): error %d.", displayName, ip, rc);
            continue;
        }
        numServers++;
        glog("Launched monitor thread on %s (%s).", displayName, ip);
    }

    // join threads together.
    // child process now waits here for all threads to cleanly
    // exit before continuing on to final clean-up below
    for(i = 0; i < numServers; i++)
    {
        pthread_join(threads[i], NULL);
    }

    // now that threads have ended, clean up. The count is zeroed first so a
    // late signal finds nothing to cancel in the freed array.
    numServers = 0;
    free(threads);
    threads = NULL;
    config_destroy(&cfg);
    pthread_attr_destroy(&threadAttr);
    mysql_library_end();
    glog("-- Gracewatch Ended --");
    if(daemonize)
    {
        fclose(logFd);
    }
    return(EXIT_SUCCESS);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment