forked from GithubBackups/healthchecks
Checks now have a new status: "paused". The sendalerts management command will mark a check as paused if sending its notification throws an exception. This should avoid potential infinite loops of sendalerts crashes/respawns.
This commit is contained in:
parent
99b6030eeb
commit
c1840a92bd
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,5 +1,6 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.coverage
|
||||
local_settings.py
|
||||
hc.sqlite
|
||||
hc/local_settings.py
|
||||
static-collected
|
@ -1,17 +1,63 @@
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models import Q
|
||||
from django.utils import timezone
|
||||
|
||||
from hc.api.models import Check
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _log(message):
|
||||
|
||||
def _stdout(message):
    """Write *message* to standard output and flush it right away."""
    stream = sys.stdout
    stream.write(message)
    # Flush explicitly so the text shows up immediately instead of sitting
    # in the stream's buffer.
    stream.flush()
|
||||
|
||||
|
||||
def handle_one():
    """Send an alert for a single check.

    Picks one check that belongs to a user and whose stored status is
    out of date: stored as "up" with ``alert_after`` already in the past,
    or stored as "down" with ``alert_after`` still in the future. The
    check's status is refreshed via ``check.get_status()``, an alert is
    sent, and the check is saved.

    If sending the alert raises, the error is logged and the check is
    marked "paused", so it no longer matches the selection query.

    Return True if an appropriate check was selected and processed.
    Return False if no checks need to be processed.
    """
    now = timezone.now()
    going_down = Q(alert_after__lt=now, status="up")
    going_up = Q(alert_after__gt=now, status="down")

    query = Check.objects.filter(user__isnull=False)
    query = query.filter(going_down | going_up)

    try:
        check = query[0]
    except IndexError:
        # Nothing matched: no check needs an alert right now.
        return False

    check.status = check.get_status()

    tmpl = "\nSending alert, status=%s, code=%s\n"
    _stdout(tmpl % (check.status, check.code))

    try:
        check.send_alert()
    except Exception:
        # Catch every ordinary error. If we crash here, what can happen is:
        # - the sendalerts command will crash
        # - supervisor will respawn sendalerts command
        # - sendalerts will try same thing again, resulting in infinite loop
        # So instead we catch and log all exceptions, and mark
        # the check as paused so it is not retried.
        #
        # KeyboardInterrupt / SystemExit are deliberately NOT caught: a
        # requested shutdown is not a delivery failure and must not pause
        # the check.
        logger.exception("Could not alert %s", check.code)
        check.status = "paused"

    # Save only after the notification attempt has finished. If the process
    # is interrupted while sending, nothing is persisted and the check is
    # selected again on the next run instead of silently losing its alert.
    check.save()

    return True
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = 'Sends UP/DOWN email alerts'
|
||||
|
||||
@ -19,36 +65,12 @@ class Command(BaseCommand):
|
||||
|
||||
ticks = 0
|
||||
while True:
|
||||
# Gone down?
|
||||
query = Check.objects
|
||||
query = query.filter(alert_after__lt=timezone.now())
|
||||
query = query.filter(user__isnull=False)
|
||||
query = query.filter(status="up")
|
||||
for check in query:
|
||||
check.status = "down"
|
||||
|
||||
_log("\nSending notification(s) about going down for %s\n" % check.code)
|
||||
check.send_alert()
|
||||
ticks = 0
|
||||
|
||||
# Save status after the notification is sent
|
||||
check.save()
|
||||
|
||||
# Gone up?
|
||||
query = Check.objects
|
||||
query = query.filter(alert_after__gt=timezone.now())
|
||||
query = query.filter(user__isnull=False)
|
||||
query = query.filter(status="down")
|
||||
for check in query:
|
||||
check.status = "up"
|
||||
|
||||
_log("\nSending notification(s) about going up for %s\n" % check.code)
|
||||
check.send_alert()
|
||||
ticks = 0
|
||||
|
||||
# Save status after the notification is sent
|
||||
check.save()
|
||||
success = True
|
||||
while success:
|
||||
success = handle_one()
|
||||
ticks = 0 if success else ticks + 1
|
||||
|
||||
time.sleep(1)
|
||||
ticks = (ticks + 1) % 80
|
||||
_log("." + ("\n" if ticks == 0 else ""))
|
||||
_stdout(".")
|
||||
if ticks % 60 == 0:
|
||||
_stdout("\n")
|
||||
|
19
hc/api/migrations/0016_auto_20151030_1107.py
Normal file
19
hc/api/migrations/0016_auto_20151030_1107.py
Normal file
@ -0,0 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import models, migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter the ``status`` field of the ``check`` model.

    The field's choices now include 'paused' alongside 'up', 'down'
    and 'new'.
    """

    dependencies = [
        ('api', '0015_auto_20151022_1008'),
    ]

    operations = [
        migrations.AlterField(
            model_name='check',
            name='status',
            field=models.CharField(
                default='new',
                # 'paused' is the longest choice value (6 characters).
                max_length=6,
                choices=[
                    ('up', 'Up'),
                    ('down', 'Down'),
                    ('new', 'New'),
                    ('paused', 'Paused'),
                ],
            ),
        ),
    ]
|
@ -15,8 +15,12 @@ import requests
|
||||
|
||||
from hc.lib import emails
|
||||
|
||||
|
||||
STATUSES = (("up", "Up"), ("down", "Down"), ("new", "New"))
|
||||
STATUSES = (
|
||||
("up", "Up"),
|
||||
("down", "Down"),
|
||||
("new", "New"),
|
||||
("paused", "Paused")
|
||||
)
|
||||
DEFAULT_TIMEOUT = td(days=1)
|
||||
DEFAULT_GRACE = td(hours=1)
|
||||
CHANNEL_KINDS = (("email", "Email"), ("webhook", "Webhook"),
|
||||
@ -60,8 +64,8 @@ class Check(models.Model):
|
||||
channel.notify(self)
|
||||
|
||||
def get_status(self):
|
||||
if self.status == "new":
|
||||
return "new"
|
||||
if self.status in ("new", "paused"):
|
||||
return self.status
|
||||
|
||||
now = timezone.now()
|
||||
|
||||
|
@ -42,7 +42,7 @@ body {
|
||||
font-size: small;
|
||||
}
|
||||
|
||||
.glyphicon.up, .glyphicon.new, .glyphicon.grace, .glyphicon.down {
|
||||
.glyphicon.up, .glyphicon.new, .glyphicon.paused, .glyphicon.grace, .glyphicon.down {
|
||||
font-size: 22px;
|
||||
}
|
||||
|
||||
@ -50,7 +50,7 @@ body {
|
||||
color: #5cb85c;
|
||||
}
|
||||
|
||||
.glyphicon.new {
|
||||
.glyphicon.new, .glyphicon.paused {
|
||||
color: #AAA;
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
}
|
||||
|
||||
.new { background: #AAA; }
|
||||
.paused { background: #AAA; }
|
||||
.up { background: #5cb85c; }
|
||||
.grace { background: #f0ad4e; }
|
||||
.down { background: #d9534f; }
|
||||
@ -55,6 +56,8 @@
|
||||
<span class="badge grace">LATE</span>
|
||||
{% elif check.get_status == "down" %}
|
||||
<span class="badge down">DOWN</span>
|
||||
{% elif check.get_status == "paused" %}
|
||||
<span class="badge paused">PAUSED</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
|
@ -26,6 +26,8 @@
|
||||
<span class="glyphicon glyphicon-exclamation-sign grace"></span>
|
||||
{% elif check.get_status == "down" %}
|
||||
<span class="glyphicon glyphicon-exclamation-sign down"></span>
|
||||
{% elif check.get_status == "paused" %}
|
||||
<span class="glyphicon glyphicon-minus-sign paused"></span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="name-cell">
|
||||
|
@ -31,6 +31,8 @@
|
||||
<span class="label label-warning">LATE</span>
|
||||
{% elif check.get_status == "down" %}
|
||||
<span class="label label-danger">DOWN</span>
|
||||
{% elif check.get_status == "paused" %}
|
||||
<span class="label label-default">PAUSED</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
|
Loading…
x
Reference in New Issue
Block a user