# healthchecks/hc/api/management/commands/sendalerts.py

from datetime import timedelta as td
import time
from threading import Thread

from django.core.management.base import BaseCommand
from django.utils import timezone
from hc.api.models import Check, Flip
from statsd.defaults.env import statsd

# Log line written by notify() before alerts go out.
# Placeholders: (flip's new status, check code).
SENDING_TMPL = "Sending alert, status=%s, code=%s\n"
# Log line written by notify() after all alerts have been attempted.
# Placeholders: (elapsed seconds as a float, check code).
SEND_TIME_TMPL = "Sending took %.1fs, code=%s\n"


def notify(flip_id, stdout):
    """Send the alerts for one flip and log the outcome.

    Loads the Flip with the given id, writes a progress line to *stdout*,
    refreshes the project's nag dates, sends the alerts and finally reports
    timing both to *stdout* and to statsd.

    Args:
        flip_id: primary key of the Flip to process.
        stdout: object with a ``write(str)`` method used for log output.

    Raises:
        Whatever ``Flip.objects.get`` raises when no matching row exists.
    """
    flip = Flip.objects.get(id=flip_id)
    check = flip.owner

    # Give the in-memory check the status recorded in the flip, without
    # persisting it. Passing the status explicitly would be cleaner.
    check.status = flip.new_status
    # Disable save() on this instance so the historic status cannot be
    # written back to the database by accident.
    check.save = None

    stdout.write(SENDING_TMPL % (flip.new_status, check.code))

    # Set or clear dates for followup nags
    check.project.update_next_nag_dates()

    started = timezone.now()

    # One log row per result yielded by send_alerts().
    for channel, error, elapsed in flip.send_alerts():
        if error:
            label = "ERROR"
        elif elapsed > 5:
            # No error, but the value reported for this channel exceeds 5.
            label = "SLOW"
        else:
            label = "OK"

        row = (label, elapsed, channel.kind, channel.code, error)
        stdout.write(" * %-5s %4.1fs %-10s %s %s\n" % row)

    send_time = timezone.now() - started
    stdout.write(SEND_TIME_TMPL % (send_time.total_seconds(), check.code))

    # dwellTime: gap between the flip's creation time and the start of sending.
    statsd.timing("hc.sendalerts.dwellTime", started - flip.created)
    # sendTime: how long the send loop above took.
    statsd.timing("hc.sendalerts.sendTime", send_time)


def notify_on_thread(flip_id, stdout):
    """Run ``notify(flip_id, stdout)`` on a newly started thread.

    The thread is started and not joined, so this function returns without
    waiting for the notifications to finish.
    """
    worker = Thread(target=notify, args=(flip_id, stdout))
    worker.start()


class Command(BaseCommand):
    """Management command that detects checks going down and sends alerts.

    Each pass of the main loop first creates Flip records for checks whose
    ``alert_after`` time has passed, then processes every Flip that has not
    been processed yet.
    """

    help = "Sends UP/DOWN email alerts"

    def add_arguments(self, parser):
        """Register the ``--no-loop`` and ``--no-threads`` switches."""
        parser.add_argument(
            "--no-loop",
            action="store_false",
            dest="loop",
            default=True,
            help="Do not keep running indefinitely in a 2 second wait loop",
        )

        parser.add_argument(
            "--no-threads",
            action="store_false",
            dest="use_threads",
            # Must default to True: with a "store_false" action a False
            # default makes the flag a no-op and threads could never be
            # enabled from the command line.
            default=True,
            help="Send alerts synchronously, without using threads",
        )

    def process_one_flip(self, use_threads=True):
        """Find one unprocessed flip and send its notifications.

        Args:
            use_threads: when true, notifications are sent on a separate
                thread; otherwise they are sent synchronously.

        Returns:
            False when there is no unprocessed flip left, True otherwise
            (including the case where another worker claimed the flip first).
        """

        # Order by processed, otherwise Django will automatically order by id
        # and make the query less efficient
        q = Flip.objects.filter(processed=None).order_by("processed")
        flip = q.first()
        if flip is None:
            return False

        # Claim the flip: the update only matches while it is still
        # unprocessed, so at most one worker sees num_updated == 1.
        q = Flip.objects.filter(id=flip.id, processed=None)
        num_updated = q.update(processed=timezone.now())
        if num_updated != 1:
            # Nothing got updated: another worker process got there first.
            return True

        if use_threads:
            notify_on_thread(flip.id, self.stdout)
        else:
            notify(flip.id, self.stdout)

        return True

    def handle_going_down(self):
        """Process a single check going down.

        Returns:
            False when no check has an ``alert_after`` in the past (ignoring
            checks already marked "down"), True when a candidate was handled.

        Raises:
            Any exception raised by ``check.get_status()``; before it is
            re-raised the check's ``alert_after`` is pushed one hour ahead.
        """

        now = timezone.now()

        q = Check.objects.filter(alert_after__lt=now).exclude(status="down")
        # Sort by alert_after, to avoid unnecessary sorting by id:
        check = q.order_by("alert_after").first()
        if check is None:
            return False

        old_status = check.status
        # Every update below is conditional on the status not having changed
        # since the check was loaded.
        q = Check.objects.filter(id=check.id, status=old_status)

        try:
            status = check.get_status()
        except Exception:
            # Make sure we don't trip on this check again for an hour:
            # Otherwise sendalerts may end up in a crash loop.
            q.update(alert_after=now + td(hours=1))
            # Then re-raise the original exception with its traceback intact:
            raise

        if status != "down":
            # It is not down yet. Update alert_after
            q.update(alert_after=check.going_down_after())
            return True

        # Atomically update status
        flip_time = check.going_down_after()
        num_updated = q.update(alert_after=None, status="down")
        if num_updated != 1:
            # Nothing got updated: another worker process got there first.
            return True

        # Record the transition; process_one_flip() picks it up later.
        flip = Flip(owner=check)
        flip.created = flip_time
        flip.old_status = old_status
        flip.new_status = "down"
        flip.save()

        return True

    def handle(self, use_threads=True, loop=True, *args, **options):
        """Run the alert loop.

        Args:
            use_threads: passed through to ``process_one_flip``.
            loop: when false, run a single pass and return instead of
                repeating with a 2 second sleep between passes.

        Returns:
            A summary string. The counter increases every time
            ``process_one_flip`` returns True, which includes flips that
            another worker claimed first.
        """
        self.stdout.write("sendalerts is now running\n")

        i, sent = 0, 0
        while True:
            # Create flips for any checks going down
            while self.handle_going_down():
                pass

            # Process the unprocessed flips
            while self.process_one_flip(use_threads):
                sent += 1

            if not loop:
                break

            time.sleep(2)
            i += 1
            # Every 60th pass, write a timestamped marker line.
            if i % 60 == 0:
                timestamp = timezone.now().isoformat()
                self.stdout.write("-- MARK %s --\n" % timestamp)

        return "Sent %d alert(s)" % sent