forked from GithubBackups/healthchecks
Checks now have a new status: "paused". The sendalerts management command will mark a check as paused if sending a notification throws an exception. This should avoid potential infinite loops of sendalerts crashes/respawns.
This commit is contained in:
parent
99b6030eeb
commit
c1840a92bd
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,5 +1,6 @@
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
.coverage
|
.coverage
|
||||||
local_settings.py
|
hc.sqlite
|
||||||
|
hc/local_settings.py
|
||||||
static-collected
|
static-collected
|
@ -1,17 +1,63 @@
|
|||||||
|
import logging
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from django.db.models import Q
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from hc.api.models import Check
|
from hc.api.models import Check
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def _log(message):
|
|
||||||
|
def _stdout(message):
|
||||||
sys.stdout.write(message)
|
sys.stdout.write(message)
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
|
||||||
|
def handle_one():
    """ Send an alert for a single check.

    Selects one check that has an owner and whose stored status no longer
    matches its alert deadline (stored "up" but `alert_after` is in the
    past, or stored "down" but `alert_after` is in the future), sends the
    alert for it and saves the updated status.

    Return True if an appropriate check was selected and processed.
    Return False if no checks need to be processed.

    """

    query = Check.objects.filter(user__isnull=False)

    now = timezone.now()
    going_down = Q(alert_after__lt=now, status="up")
    going_up = Q(alert_after__gt=now, status="down")
    query = query.filter(going_down | going_up)

    try:
        check = query[0]
    except IndexError:
        return False

    check.status = check.get_status()

    tmpl = "\nSending alert, status=%s, code=%s\n"
    _stdout(tmpl % (check.status, check.code))

    try:
        check.send_alert()
    except Exception:
        # Catch every ordinary error. If we crash here, what can happen is:
        # - the sendalerts command will crash
        # - supervisor will respawn sendalerts command
        # - sendalerts will try same thing again, resulting in infinite loop
        # So instead we catch and log all exceptions, and mark
        # the checks as paused so they are not retried.
        #
        # KeyboardInterrupt and SystemExit are deliberately NOT caught:
        # they are operator/interpreter shutdown requests, not delivery
        # failures, so they must propagate and must not pause the check.
        logger.error("Could not alert %s", check.code, exc_info=True)
        check.status = "paused"
    finally:
        # Persist the status whether or not sending succeeded, so the same
        # check is not selected again on the next pass.
        check.save()

    return True
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
help = 'Sends UP/DOWN email alerts'
|
help = 'Sends UP/DOWN email alerts'
|
||||||
|
|
||||||
@ -19,36 +65,12 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
ticks = 0
|
ticks = 0
|
||||||
while True:
|
while True:
|
||||||
# Gone down?
|
success = True
|
||||||
query = Check.objects
|
while success:
|
||||||
query = query.filter(alert_after__lt=timezone.now())
|
success = handle_one()
|
||||||
query = query.filter(user__isnull=False)
|
ticks = 0 if success else ticks + 1
|
||||||
query = query.filter(status="up")
|
|
||||||
for check in query:
|
|
||||||
check.status = "down"
|
|
||||||
|
|
||||||
_log("\nSending notification(s) about going down for %s\n" % check.code)
|
|
||||||
check.send_alert()
|
|
||||||
ticks = 0
|
|
||||||
|
|
||||||
# Save status after the notification is sent
|
|
||||||
check.save()
|
|
||||||
|
|
||||||
# Gone up?
|
|
||||||
query = Check.objects
|
|
||||||
query = query.filter(alert_after__gt=timezone.now())
|
|
||||||
query = query.filter(user__isnull=False)
|
|
||||||
query = query.filter(status="down")
|
|
||||||
for check in query:
|
|
||||||
check.status = "up"
|
|
||||||
|
|
||||||
_log("\nSending notification(s) about going up for %s\n" % check.code)
|
|
||||||
check.send_alert()
|
|
||||||
ticks = 0
|
|
||||||
|
|
||||||
# Save status after the notification is sent
|
|
||||||
check.save()
|
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
ticks = (ticks + 1) % 80
|
_stdout(".")
|
||||||
_log("." + ("\n" if ticks == 0 else ""))
|
if ticks % 60 == 0:
|
||||||
|
_stdout("\n")
|
||||||
|
19
hc/api/migrations/0016_auto_20151030_1107.py
Normal file
19
hc/api/migrations/0016_auto_20151030_1107.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import models, migrations
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Alter `Check.status` so that its choices include 'paused'."""

    dependencies = [('api', '0015_auto_20151022_1008')]

    operations = [
        migrations.AlterField(
            model_name='check',
            name='status',
            field=models.CharField(
                choices=[
                    ('up', 'Up'),
                    ('down', 'Down'),
                    ('new', 'New'),
                    ('paused', 'Paused'),
                ],
                default='new',
                max_length=6,
            ),
        ),
    ]
|
@ -15,8 +15,12 @@ import requests
|
|||||||
|
|
||||||
from hc.lib import emails
|
from hc.lib import emails
|
||||||
|
|
||||||
|
STATUSES = (
|
||||||
STATUSES = (("up", "Up"), ("down", "Down"), ("new", "New"))
|
("up", "Up"),
|
||||||
|
("down", "Down"),
|
||||||
|
("new", "New"),
|
||||||
|
("paused", "Paused")
|
||||||
|
)
|
||||||
DEFAULT_TIMEOUT = td(days=1)
|
DEFAULT_TIMEOUT = td(days=1)
|
||||||
DEFAULT_GRACE = td(hours=1)
|
DEFAULT_GRACE = td(hours=1)
|
||||||
CHANNEL_KINDS = (("email", "Email"), ("webhook", "Webhook"),
|
CHANNEL_KINDS = (("email", "Email"), ("webhook", "Webhook"),
|
||||||
@ -60,8 +64,8 @@ class Check(models.Model):
|
|||||||
channel.notify(self)
|
channel.notify(self)
|
||||||
|
|
||||||
def get_status(self):
|
def get_status(self):
|
||||||
if self.status == "new":
|
if self.status in ("new", "paused"):
|
||||||
return "new"
|
return self.status
|
||||||
|
|
||||||
now = timezone.now()
|
now = timezone.now()
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ body {
|
|||||||
font-size: small;
|
font-size: small;
|
||||||
}
|
}
|
||||||
|
|
||||||
.glyphicon.up, .glyphicon.new, .glyphicon.grace, .glyphicon.down {
|
.glyphicon.up, .glyphicon.new, .glyphicon.paused, .glyphicon.grace, .glyphicon.down {
|
||||||
font-size: 22px;
|
font-size: 22px;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -50,7 +50,7 @@ body {
|
|||||||
color: #5cb85c;
|
color: #5cb85c;
|
||||||
}
|
}
|
||||||
|
|
||||||
.glyphicon.new {
|
.glyphicon.new, .glyphicon.paused {
|
||||||
color: #AAA;
|
color: #AAA;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.new { background: #AAA; }
|
.new { background: #AAA; }
|
||||||
|
.paused { background: #AAA; }
|
||||||
.up { background: #5cb85c; }
|
.up { background: #5cb85c; }
|
||||||
.grace { background: #f0ad4e; }
|
.grace { background: #f0ad4e; }
|
||||||
.down { background: #d9534f; }
|
.down { background: #d9534f; }
|
||||||
@ -55,6 +56,8 @@
|
|||||||
<span class="badge grace">LATE</span>
|
<span class="badge grace">LATE</span>
|
||||||
{% elif check.get_status == "down" %}
|
{% elif check.get_status == "down" %}
|
||||||
<span class="badge down">DOWN</span>
|
<span class="badge down">DOWN</span>
|
||||||
|
{% elif check.get_status == "paused" %}
|
||||||
|
<span class="badge paused">PAUSED</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
|
@ -26,6 +26,8 @@
|
|||||||
<span class="glyphicon glyphicon-exclamation-sign grace"></span>
|
<span class="glyphicon glyphicon-exclamation-sign grace"></span>
|
||||||
{% elif check.get_status == "down" %}
|
{% elif check.get_status == "down" %}
|
||||||
<span class="glyphicon glyphicon-exclamation-sign down"></span>
|
<span class="glyphicon glyphicon-exclamation-sign down"></span>
|
||||||
|
{% elif check.get_status == "paused" %}
|
||||||
|
<span class="glyphicon glyphicon-minus-sign paused"></span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
<td class="name-cell">
|
<td class="name-cell">
|
||||||
|
@ -31,6 +31,8 @@
|
|||||||
<span class="label label-warning">LATE</span>
|
<span class="label label-warning">LATE</span>
|
||||||
{% elif check.get_status == "down" %}
|
{% elif check.get_status == "down" %}
|
||||||
<span class="label label-danger">DOWN</span>
|
<span class="label label-danger">DOWN</span>
|
||||||
|
{% elif check.get_status == "paused" %}
|
||||||
|
<span class="label label-default">PAUSED</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user