Pēteris Caune 7ba5fcbb71
Fix sendalerts to clear Profile.next_nag_date if all checks up
Profile.next_nag_date tracks when the next hourly/daily reminder
should be sent. Normally, sendalerts sets this field when
a check goes down, and sendreports clears it out whenever
it is about to send a reminder but realizes all checks are up.

The problem: sendalerts can set next_nag_date to a non-null
value, but it does not clear it out when all checks are up.
This can result in an hourly/daily reminder being sent out
at the wrong time. Specific example, assuming hourly reminders:

13:00: Check A goes down. next_nag_date gets set to 14:00.
13:05: Check A goes up. next_nag_date remains set to 14:00.
13:55: Check B goes down. next_nag_date remains set to 14:00.
14:00: Healthchecks sends an hourly reminder, just 5 minutes
       after Check B went down. It should have sent the reminder
       at 13:55 + 1 hour = 14:55.

The fix: sendalerts can now both set and clear the next_nag_date
field. The main changes are in Project.update_next_nag_dates()
and in Profile.update_next_nag_date(). With the fix:

13:00: Check A goes down. next_nag_date gets set to 14:00.
13:05: Check A goes up. next_nag_date gets set to null.
13:55: Check B goes down. next_nag_date gets set to 14:55.
14:55: Healthchecks sends an hourly reminder.
2021-03-15 12:34:39 +02:00

163 lines
4.9 KiB
Python

from datetime import timedelta as td
import time
from threading import Thread
from django.core.management.base import BaseCommand
from django.utils import timezone
from hc.api.models import Check, Flip
from statsd.defaults.env import statsd
# Progress line written before a flip's alerts are sent.
# Filled with (flip's new status, check code).
SENDING_TMPL = "Sending alert, status=%s, code=%s\n"
# Progress line written after all alerts for a flip have been sent.
# Filled with (elapsed seconds, check code).
SEND_TIME_TMPL = "Sending took %.1fs, code=%s\n"
def notify(flip_id, stdout):
    """Send the alerts for a single flip and log progress to ``stdout``.

    Looks up the Flip with id ``flip_id``, refreshes the owning project's
    nag dates, sends the flip's alerts, writes one line per alert result to
    ``stdout`` and finally reports timing metrics to statsd.

    ``stdout`` only needs to provide a ``write(str)`` method.
    """
    flip = Flip.objects.get(id=flip_id)
    check = flip.owner

    # Give the in-memory check the status recorded by the flip, without
    # persisting it. (Passing the status explicitly would be cleaner.)
    check.status = flip.new_status
    # Guard against the historic status being saved by accident later on:
    # with save replaced by None, any attempt to call it fails.
    check.save = None

    stdout.write(SENDING_TMPL % (flip.new_status, check.code))

    # Set or clear the dates for follow-up nags
    check.project.update_next_nag_dates()

    # Send the notifications, logging one row per result
    started = timezone.now()
    for channel, error, secs in flip.send_alerts():
        if error:
            label = "ERROR"
        elif secs > 5:
            label = "SLOW"
        else:
            label = "OK"

        row = (label, secs, channel.kind, channel.code, error)
        stdout.write(" * %-5s %4.1fs %-10s %s %s\n" % row)

    elapsed = timezone.now() - started
    stdout.write(SEND_TIME_TMPL % (elapsed.total_seconds(), check.code))

    # dwellTime: delay between the flip's creation and the start of sending.
    statsd.timing("hc.sendalerts.dwellTime", started - flip.created)
    statsd.timing("hc.sendalerts.sendTime", elapsed)
def notify_on_thread(flip_id, stdout):
    """Run ``notify(flip_id, stdout)`` on a newly started thread.

    The thread is started and not joined, so this returns immediately.
    """
    worker = Thread(target=notify, args=(flip_id, stdout))
    worker.start()
class Command(BaseCommand):
    """Management command that creates "down" flips and sends their alerts."""

    help = "Sends UP/DOWN email alerts"

    def add_arguments(self, parser):
        """Register the ``--no-loop`` and ``--no-threads`` options."""
        parser.add_argument(
            "--no-loop",
            dest="loop",
            action="store_false",
            default=True,
            help="Do not keep running indefinitely in a 2 second wait loop",
        )

        # NOTE(review): with action="store_false" AND default=False the parsed
        # value of use_threads is False whether or not the flag is given, so
        # the flag is currently a no-op. Confirm whether this is intentional
        # before changing the default.
        parser.add_argument(
            "--no-threads",
            dest="use_threads",
            action="store_false",
            default=False,
            help="Send alerts synchronously, without using threads",
        )

    def process_one_flip(self, use_threads=True):
        """Claim one unprocessed flip and send its notifications.

        Returns False when there is no unprocessed flip left, True otherwise
        (including when another worker claimed the flip first).
        """
        # Order by processed, otherwise Django will automatically order by id
        # and make the query less efficient
        pending = Flip.objects.filter(processed=None).order_by("processed")
        flip = pending.first()
        if flip is None:
            return False

        # Claim the flip: the update only matches while it is still
        # unprocessed.
        claim = Flip.objects.filter(id=flip.id, processed=None)
        if claim.update(processed=timezone.now()) != 1:
            # Nothing got updated: another worker process got there first.
            return True

        send = notify_on_thread if use_threads else notify
        send(flip.id, self.stdout)
        return True

    def handle_going_down(self):
        """Process a single check whose ``alert_after`` time has passed.

        Returns False when no such check exists, True otherwise. Any
        exception raised by ``check.get_status()`` is re-raised after the
        check's ``alert_after`` has been pushed one hour into the future.
        """
        now = timezone.now()

        overdue = Check.objects.filter(alert_after__lt=now).exclude(status="down")
        # Sort by alert_after, to avoid unnecessary sorting by id:
        check = overdue.order_by("alert_after").first()
        if check is None:
            return False

        old_status = check.status
        # Matches the row only while its status is still the one we loaded,
        # so updates through it cannot overwrite another worker's change.
        unchanged = Check.objects.filter(id=check.id, status=old_status)

        try:
            status = check.get_status()
        except Exception as exc:
            # Make sure we don't trip on this check again for an hour:
            # otherwise sendalerts may end up in a crash loop.
            unchanged.update(alert_after=now + td(hours=1))
            # Then re-raise the exception:
            raise exc

        if status != "down":
            # It is not down yet. Update alert_after
            unchanged.update(alert_after=check.going_down_after())
            return True

        # Atomically update status
        flip_time = check.going_down_after()
        if unchanged.update(alert_after=None, status="down") != 1:
            # Nothing got updated: another worker process got there first.
            return True

        # Record the transition; process_one_flip() picks it up later.
        flip = Flip(owner=check)
        flip.created = flip_time
        flip.old_status = old_status
        flip.new_status = "down"
        flip.save()

        return True

    def handle(self, use_threads=True, loop=True, *args, **options):
        """Run the alerting loop.

        Each pass first creates flips for checks going down, then processes
        all unprocessed flips. With ``loop`` false a single pass is made;
        otherwise the command sleeps 2 seconds between passes and writes a
        MARK line every 60 passes. Returns a summary string.
        """
        self.stdout.write("sendalerts is now running\n")

        passes = 0
        sent = 0
        while True:
            # Create flips for any checks going down
            while self.handle_going_down():
                pass

            # Process the unprocessed flips
            while self.process_one_flip(use_threads):
                sent += 1

            if not loop:
                return "Sent %d alert(s)" % sent

            time.sleep(2)
            passes += 1
            if passes % 60 == 0:
                timestamp = timezone.now().isoformat()
                self.stdout.write("-- MARK %s --\n" % timestamp)