Checks now have a new status: "paused". The sendalerts management command will mark a check as paused if sending its notification throws an exception. This should avoid potential infinite loops of sendalerts crashes/respawns.

This commit is contained in:
Pēteris Caune 2015-10-30 14:21:12 +02:00
parent 99b6030eeb
commit c1840a92bd
8 changed files with 92 additions and 39 deletions

3
.gitignore vendored
View File

@ -1,5 +1,6 @@
__pycache__/ __pycache__/
*.pyc *.pyc
.coverage .coverage
local_settings.py hc.sqlite
hc/local_settings.py
static-collected static-collected

View File

@ -1,17 +1,63 @@
import logging
import sys import sys
import time import time
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db.models import Q
from django.utils import timezone from django.utils import timezone
from hc.api.models import Check from hc.api.models import Check
logger = logging.getLogger(__name__)
def _log(message):
def _stdout(message):
sys.stdout.write(message) sys.stdout.write(message)
sys.stdout.flush() sys.stdout.flush()
def handle_one():
    """Send an alert for a single check.

    Selects one check that belongs to a user and whose stored status
    disagrees with its alert deadline: stored as "up" while ``alert_after``
    is already in the past, or stored as "down" while ``alert_after`` is
    in the future. The check's status is refreshed via ``get_status()``,
    the alert is sent, and the check is saved.

    If sending the alert raises an ordinary exception, the error is logged
    and the check is saved with status "paused" so it is not selected again.

    Return True if an appropriate check was selected and processed.
    Return False if no checks need to be processed.
    """

    now = timezone.now()
    going_down = Q(alert_after__lt=now, status="up")
    going_up = Q(alert_after__gt=now, status="down")

    query = Check.objects.filter(user__isnull=False)
    query = query.filter(going_down | going_up)

    try:
        check = query[0]
    except IndexError:
        # Nothing matches right now; the caller decides when to poll again.
        return False

    check.status = check.get_status()

    tmpl = "\nSending alert, status=%s, code=%s\n"
    _stdout(tmpl % (check.status, check.code))

    try:
        check.send_alert()
    except Exception:
        # Catch every ordinary error. If we crash here, what can happen is:
        # - the sendalerts command will crash
        # - supervisor will respawn sendalerts command
        # - sendalerts will try same thing again, resulting in infinite loop
        # So instead we catch and log all exceptions, and mark
        # the checks as paused so they are not retried.
        #
        # KeyboardInterrupt / SystemExit are deliberately NOT caught: an
        # operator stopping the command must not flip a check to "paused",
        # and the unsent alert should be retried on the next run.
        logger.error("Could not alert %s", check.code, exc_info=True)
        check.status = "paused"

    # Reached only after the alert was sent or the failure was handled above;
    # an interrupt propagates without persisting the new status.
    check.save()
    return True
class Command(BaseCommand): class Command(BaseCommand):
help = 'Sends UP/DOWN email alerts' help = 'Sends UP/DOWN email alerts'
@ -19,36 +65,12 @@ class Command(BaseCommand):
ticks = 0 ticks = 0
while True: while True:
# Gone down? success = True
query = Check.objects while success:
query = query.filter(alert_after__lt=timezone.now()) success = handle_one()
query = query.filter(user__isnull=False) ticks = 0 if success else ticks + 1
query = query.filter(status="up")
for check in query:
check.status = "down"
_log("\nSending notification(s) about going down for %s\n" % check.code)
check.send_alert()
ticks = 0
# Save status after the notification is sent
check.save()
# Gone up?
query = Check.objects
query = query.filter(alert_after__gt=timezone.now())
query = query.filter(user__isnull=False)
query = query.filter(status="down")
for check in query:
check.status = "up"
_log("\nSending notification(s) about going up for %s\n" % check.code)
check.send_alert()
ticks = 0
# Save status after the notification is sent
check.save()
time.sleep(1) time.sleep(1)
ticks = (ticks + 1) % 80 _stdout(".")
_log("." + ("\n" if ticks == 0 else "")) if ticks % 60 == 0:
_stdout("\n")

View File

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import models, migrations
class Migration(migrations.Migration):
    """Alter ``Check.status`` so its choices include the "paused" value."""

    # Must be applied on top of the previous migration of the ``api`` app.
    dependencies = [
        ('api', '0015_auto_20151022_1008'),
    ]

    operations = [
        migrations.AlterField(
            model_name='check',
            name='status',
            field=models.CharField(
                default='new',
                max_length=6,
                choices=[
                    ('up', 'Up'),
                    ('down', 'Down'),
                    ('new', 'New'),
                    ('paused', 'Paused'),
                ],
            ),
        ),
    ]

View File

@ -15,8 +15,12 @@ import requests
from hc.lib import emails from hc.lib import emails
STATUSES = (
STATUSES = (("up", "Up"), ("down", "Down"), ("new", "New")) ("up", "Up"),
("down", "Down"),
("new", "New"),
("paused", "Paused")
)
DEFAULT_TIMEOUT = td(days=1) DEFAULT_TIMEOUT = td(days=1)
DEFAULT_GRACE = td(hours=1) DEFAULT_GRACE = td(hours=1)
CHANNEL_KINDS = (("email", "Email"), ("webhook", "Webhook"), CHANNEL_KINDS = (("email", "Email"), ("webhook", "Webhook"),
@ -60,8 +64,8 @@ class Check(models.Model):
channel.notify(self) channel.notify(self)
def get_status(self): def get_status(self):
if self.status == "new": if self.status in ("new", "paused"):
return "new" return self.status
now = timezone.now() now = timezone.now()

View File

@ -42,7 +42,7 @@ body {
font-size: small; font-size: small;
} }
.glyphicon.up, .glyphicon.new, .glyphicon.grace, .glyphicon.down { .glyphicon.up, .glyphicon.new, .glyphicon.paused, .glyphicon.grace, .glyphicon.down {
font-size: 22px; font-size: 22px;
} }
@ -50,7 +50,7 @@ body {
color: #5cb85c; color: #5cb85c;
} }
.glyphicon.new { .glyphicon.new, .glyphicon.paused {
color: #AAA; color: #AAA;
} }

View File

@ -19,6 +19,7 @@
} }
.new { background: #AAA; } .new { background: #AAA; }
.paused { background: #AAA; }
.up { background: #5cb85c; } .up { background: #5cb85c; }
.grace { background: #f0ad4e; } .grace { background: #f0ad4e; }
.down { background: #d9534f; } .down { background: #d9534f; }
@ -55,6 +56,8 @@
<span class="badge grace">LATE</span> <span class="badge grace">LATE</span>
{% elif check.get_status == "down" %} {% elif check.get_status == "down" %}
<span class="badge down">DOWN</span> <span class="badge down">DOWN</span>
{% elif check.get_status == "paused" %}
<span class="badge paused">PAUSED</span>
{% endif %} {% endif %}
</td> </td>
<td> <td>

View File

@ -26,6 +26,8 @@
<span class="glyphicon glyphicon-exclamation-sign grace"></span> <span class="glyphicon glyphicon-exclamation-sign grace"></span>
{% elif check.get_status == "down" %} {% elif check.get_status == "down" %}
<span class="glyphicon glyphicon-exclamation-sign down"></span> <span class="glyphicon glyphicon-exclamation-sign down"></span>
{% elif check.get_status == "paused" %}
<span class="glyphicon glyphicon-minus-sign paused"></span>
{% endif %} {% endif %}
</td> </td>
<td class="name-cell"> <td class="name-cell">

View File

@ -31,6 +31,8 @@
<span class="label label-warning">LATE</span> <span class="label label-warning">LATE</span>
{% elif check.get_status == "down" %} {% elif check.get_status == "down" %}
<span class="label label-danger">DOWN</span> <span class="label label-danger">DOWN</span>
{% elif check.get_status == "paused" %}
<span class="label label-default">PAUSED</span>
{% endif %} {% endif %}
</td> </td>
</tr> </tr>