Created
April 14, 2016 16:55
-
-
Save Uberi/f17a3647ea3f9d087c576e8fe5b112c0 to your computer and use it in GitHub Desktop.
crash-rate-aggregates-watchdog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # Crash Rate Aggregates Watchdog
# Watches the output directory of the [crash rate aggregates](https://github.com/mozilla/moz-crash-rate-aggregates) job on S3 to make sure it's properly outputting results. If the crash rate aggregates job ever fails, this notebook detects that and sends out an alert email.
# Configuration options:
# In[ ]: | |
# Settings read by is_job_failing() (bucket/prefix) and by the alerting code (addresses).
S3_BUCKET = "telemetry-parquet" # name of the S3 bucket that is inspected
S3_PREFIX = "crash_aggregates/v1/" # key prefix that is listed inside the bucket; must end with a slash
FROM_ADDR = "[email protected]" # address alerts are sent from; NOTE(review): value looks redacted in this copy - confirm the real address
GENERAL_TELEMETRY_ALERT = "[email protected]" # address that receives the alerts; NOTE(review): value looks redacted in this copy - confirm the real address
# In[ ]: | |
import boto | |
from email.mime.application import MIMEApplication | |
from email.mime.multipart import MIMEMultipart | |
from email.mime.text import MIMEText | |
def send_ses(fromaddr,
             subject,
             body,
             recipient,
             filename=''):
    """Send an email via the Amazon SES service.

    Example:
        send_ses('me@example.com', 'greetings', "Hi!", 'you@example.com')

    Args:
        fromaddr: value for the message's From header.
        subject: value for the message's Subject header.
        body: text attached as the first MIME part.
        recipient: value for the message's To header.
        filename: optional path of a file to attach; '' (the default)
            sends the message without an attachment.

    Return:
        If 'ErrorResponse' appears in the return message from SES,
        return the message, otherwise return an empty '' string.
    """
    msg = MIMEMultipart()
    msg['Subject'] = subject
    msg['From'] = fromaddr
    msg['To'] = recipient
    msg.attach(MIMEText(body))

    if filename:
        # Read through a context manager so the file handle is always
        # closed, even if reading raises.
        with open(filename, 'rb') as attachment_file:
            attachment = attachment_file.read()
        part = MIMEApplication(attachment)
        part.add_header('Content-Disposition', 'attachment', filename=filename)
        msg.attach(part)

    conn = boto.connect_ses()
    result = conn.send_raw_email(msg.as_string())
    return result if 'ErrorResponse' in result else ''
# In[ ]: | |
import os | |
import sys | |
import re | |
from datetime import datetime, date, timedelta | |
import boto | |
def print_help():
    """Print a usage summary for this script to standard output.

    The program name shown in the usage lines is taken from sys.argv[0].
    Each print call takes a single parenthesised argument, matching the
    print(...) form used by the alerting code in this file; that form
    behaves the same as a print statement on Python 2 and is also valid
    on Python 3.
    """
    program = sys.argv[0]
    print("Check if the crash rate aggregator job is giving the expected output.")
    print("Usage: {} email|test".format(program))
    print(" {} email [YYYY-MM-DD] if crash aggregates haven't been updated in about a day as of YYYY-MM-DD (defaults to current date), email the telemetry alerts mailing list saying so".format(program))
    print(" {} test [YYYY-MM-DD] print out whether crash aggregates have been updated in about a day as of YYYY-MM-DD (defaults to current date)".format(program))
def is_job_failing(current_date):
    """Report whether the crash aggregates output looks stale.

    Lists the prefixes directly under S3_PREFIX in S3_BUCKET and looks for
    one named ``submission_date=YYYY-MM-DD/`` whose date falls between
    ``current_date - 2 days`` and ``current_date`` (both inclusive).

    Args:
        current_date: the datetime.date to evaluate the job as of.

    Returns:
        False if a partition in that window exists; True if none exists or
        if S3 answers with an S3ResponseError (e.g. the bucket is missing).
    """
    conn = boto.s3.connect_to_region("us-west-2", host="s3-us-west-2.amazonaws.com")
    start, end = current_date - timedelta(days=2), current_date
    # Compiled once here rather than re-looked-up for every listed prefix.
    partition_pattern = re.compile(r"/submission_date=(\d\d\d\d-\d\d-\d\d)/$")

    try:
        # With validate=False, get_bucket() does not contact S3, so a missing
        # bucket only surfaces once the listing below is iterated. The listing
        # therefore has to sit inside the same try block.
        bucket = conn.get_bucket(S3_BUCKET, validate=False)
        for partition in bucket.list(prefix=S3_PREFIX, delimiter="/"):
            match = partition_pattern.search(partition.name)
            if not match:
                continue
            submission_date = datetime.strptime(match.group(1), "%Y-%m-%d").date()
            if start <= submission_date <= end:
                return False  # found suitable partition, job is working
    except boto.exception.S3ResponseError:  # bucket doesn't exist
        return True

    # no suitable partition found, job is failing
    return True
# In[ ]: | |
# Evaluate the job as of today's date and email the alert address only when
# no recent crash aggregates partition is found.
now = date.today()
if is_job_failing(now):
    alert_subject = "[FAILURE] Crash aggregates not updating"
    body_template = (
        "As of {}, the daily crash aggregates job [1] has not output results for 2 days. This is an automated message from Cerberus [2].\n"
        "\n"
        "[1]: https://github.com/mozilla/moz-crash-rate-aggregates\n"
        "[2]: https://github.com/mozilla/cerberus\n"
    )
    print("Sending email notification about crash aggregates not being updated to {}.".format(GENERAL_TELEMETRY_ALERT))
    email_body = body_template.format(now)
    send_ses(FROM_ADDR, alert_subject, email_body, GENERAL_TELEMETRY_ALERT)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment