Skip to content

Commit

Permalink
Introduce health checking based on docker events API
Browse files Browse the repository at this point in the history
Uses docker system events API to count containers being destroyed and
created within a time window. Requires a scraper container on each node.
  • Loading branch information
brablc committed Jun 3, 2024
1 parent 07796e6 commit 1f517c5
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 32 deletions.
65 changes: 38 additions & 27 deletions docker-cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

script_path=$(readlink -f $0)
script_dir=${script_path%/*}
source "$script_dir/config.sh"
source "$script_dir/logger.sh"
cd "$script_dir"
source "./config.sh"
source "./logger.sh"

LOOP_SLEEP=${LOOP_SLEEP:-10s}

Expand All @@ -12,18 +13,33 @@ if [[ ! -S /var/run/docker.sock ]]; then
exit 1
fi

test -z "$ALERT_SCRIPT" && log_warn "Env ALERT_SCRIPT not defined - alerting disabled"
test -z "$SWARM_NAME" && log_warn "Env SWARM_NAME not defined using default"
if [[ -z $ALERT_SCRIPT || ! -f $ALERT_SCRIPT ]]; then
log_error "Alert script not defined or not accessible on \"$ALERT_SCRIPT\" path!"
ALERT_SCRIPT="jq ."
fi

test -z "$SWARM_NAME" && log_warn "Env SWARM_NAME not defined using default"
swarm_name="${SWARM_NAME:-Swarm}"
DATA_DIR=${DATA_DIR:-$script_dir/data}
mkdir -p $DATA_DIR

if [[ -n $ALERT_SCRIPT && ! -f $ALERT_SCRIPT ]]; then
log_error "Alert script defined but not accessible on $ALERT_SCRIPT path!"
ALERT_SCRIPT="jq ."
# On all nodes start scraper, on manager node start alerter

services=$(./services.sh 2>&1)
if [ $? = 0 ]; then
    log_info "Initial list of services (run services.sh using docker exec to see actual):"
    echo "$services"
    log_info "Starting event alerter ..."
    ./event-alerter.py &
    alerter_pid=$!
    log_info "Starting event scraper ..."
    ./event-scraper.sh &
    scraper_pid=$!
    # BUG FIX: a second `trap ... EXIT` replaces the first, so only the scraper
    # was killed on exit and the alerter leaked. Kill both in a single trap.
    trap 'kill $alerter_pid $scraper_pid 2>/dev/null' EXIT
else
    # services.sh failed => not a manager node; run the per-node monitor only.
    ./event-monitor.sh
    exit
fi

### Manager code only

function check_services() {
while read service_name network_alias port; do
unique_name=$(echo "${swarm_name} ${service_name} ${network_alias} ${port}" )
Expand All @@ -33,55 +49,50 @@ function check_services() {
prefix="$DATA_DIR/${unique_code}"
pending_file="${prefix}.pending"
log_file="${prefix}.log"

# used for testing
real_port="$port"
if [[ -f "$DATA_DIR/test-change-port-$port" ]]; then
real_port=$(< "$DATA_DIR/test-change-port-$port")
fi

action=""
appendix=""
message="${swarm_name} service ${service_name} (${network_alias}:${port})"
./dockerize -timeout 5s -wait tcp://$network_alias:$real_port true 2>$log_file
if [ $? -ne 0 ]; then
if [[ -f $pending_file ]]; then
log_warn "$service_name|$network_alias:$port|Pending alert"
log_warn "Pending alert: $message"
else
log_error "$service_name|$network_alias:$port|Creating alert"
echo "$unique_id" > $pending_file
action="create"
appendix="not available"
fi
else
if [[ -f $pending_file ]]; then
log_info "$service_name|$network_alias:$port|Resolving alert"
action="resolve"
appendix="is available"
unique_id=$(cat $pending_file)
rm -f $pending_file
fi
fi
if [[ -n $action ]]; then
jq -n \
--arg action "$action" \
--arg unique_id "$unique_id" \
--arg swarm_name "$swarm_name" \
--arg service_name "$service_name" \
--arg network_alias "$network_alias" \
--arg port "$port" \
--arg log "$(jq -R -s @json $log_file)" \
--arg action "$action" \
--arg unique_id "$unique_id" \
--arg message "$message $appendix" \
--arg summary "$(jq -R -s @json $log_file)" \
'{
"action": $action,
"unique_id": $unique_id,
"swarm_name": $swarm_name,
"service_name": $service_name,
"network_alias": $network_alias,
"port": $port,
"log": $log
"message": $message,
"summary": $log
}' | /bin/bash -c "$ALERT_SCRIPT"
fi
rm -f $log_file
done < <(./services.sh)
}

log_info "Initial list of services (run services.sh using docker exec to see actual):"
./services.sh

log_info "Entering loop with ${LOOP_SLEEP} sleep on entry ..."

while true; do
Expand Down
160 changes: 160 additions & 0 deletions event-alerter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#!/usr/bin/env python3

import hashlib
import json
import os
import secrets
import string
import subprocess
import time
import threading
import urllib.parse as urlparse

from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import defaultdict, deque
from logger import log_info, log_error

# How often (seconds) the background thread re-evaluates the event window.
CLEANUP_INTERVAL = 60

# Sliding window length (seconds) and per-service create/destroy count that
# triggers an alert; both overridable via environment.
EVENTS_WINDOW = int(os.getenv("EVENTS_WINDOW", "300"))
EVENTS_THRESHOLD = int(os.getenv("EVENTS_THRESHOLD", "3"))
# Shell command that receives the alert JSON on stdin; default pretty-prints.
ALERT_SCRIPT = os.getenv("ALERT_SCRIPT", "jq .")
SWARM_NAME = os.getenv("SWARM_NAME", "Swarm")

# Shared mutable state: recent events, alerts awaiting resolution, and the
# lock guarding both (HTTP handler thread vs. cleanup thread).
events = deque()
pending_alerts = {}
lock = threading.Lock()


def get_random_str(length):
    """Return a cryptographically random alphanumeric string of `length` chars."""
    alphabet = string.ascii_letters + string.digits
    picks = [secrets.choice(alphabet) for _ in range(length)]
    return "".join(picks)


def process_events():
    """Evaluate the sliding event window; create and resolve alerts.

    Caller must hold `lock` — this mutates the module-level `events` deque
    and `pending_alerts` dict.
    """
    current_time = time.time()

    counts = defaultdict(lambda: {"create": 0, "destroy": 0})
    hosts = defaultdict(set)
    seen_services = set()

    # Remove events older than EVENTS_WINDOW
    while events and events[0]["ts"] <= current_time - EVENTS_WINDOW:
        events.popleft()

    # Count events per service
    for event in events:
        counts[event["service_name"]][event["action"]] += 1
        hosts[event["service_name"]].add(event["host"])
        seen_services.add(event["service_name"])

    for service_name, actions in counts.items():
        # Alert only when churn in either direction crosses the threshold.
        if (
            actions["destroy"] < EVENTS_THRESHOLD
            and actions["create"] < EVENTS_THRESHOLD
        ):
            continue

        # Don't re-alert while one is already pending for this service.
        if service_name in pending_alerts:
            continue

        # BUG FIX: the f-strings used double quotes nested inside
        # double-quoted f-strings ({actions["create"]}), which is a syntax
        # error before Python 3.12 (PEP 701). Single quotes keep it portable.
        data = {
            "action": "create",
            "unique_id": calculate_md5(
                f"{SWARM_NAME} {service_name} {get_random_str(10)}"
            ),
            "message": f"{SWARM_NAME} service {service_name} not healthy",
            "summary": (
                f"There were {actions['create']} containers created and "
                f"{actions['destroy']} destroyed within {EVENTS_WINDOW} seconds.\n"
                f"Reported by {list(hosts[service_name])} hosts."
            ),
        }
        pending_alerts[service_name] = data
        log_error(f"Creating alert: {data['message']}")
        send_alert(data)

    # Resolve pending alerts for services with no events left in the window.
    for service_name in list(pending_alerts.keys()):
        if service_name in seen_services:
            continue

        data = {
            "action": "resolve",
            "unique_id": pending_alerts[service_name]["unique_id"],
            "message": f"{SWARM_NAME} service {service_name} is healthy",
            "summary": f"No events in last {EVENTS_WINDOW} seconds, assuming service is healthy (or stopped)",
        }
        del pending_alerts[service_name]
        log_info(f"Resolving alert: {data['message']}")
        send_alert(data)


def calculate_md5(input_str):
    """Return the hex MD5 digest of the UTF-8 encoding of `input_str`."""
    return hashlib.md5(input_str.encode("utf-8")).hexdigest()


def send_alert(data):
    """Serialize `data` to JSON and pipe it to the ALERT_SCRIPT shell command."""
    if not ALERT_SCRIPT:
        return

    payload = json.dumps(data)
    # Equivalent to Popen + communicate: feed JSON on stdin and wait.
    subprocess.run(["/bin/bash", "-c", ALERT_SCRIPT], input=payload, text=True)


def resolve_pending():
    # Background loop: re-run event processing every CLEANUP_INTERVAL seconds
    # so that pending alerts get resolved once their events age out of the
    # window, even when no new HTTP requests arrive to trigger processing.
    while True:
        time.sleep(CLEANUP_INTERVAL)
        with lock:
            process_events()


class EventHandler(BaseHTTPRequestHandler):
    """HTTP endpoint receiving scraped docker events as GET ?payload=<json>."""

    def do_GET(self):
        parsed_path = urlparse.urlparse(self.path)
        query = urlparse.parse_qs(parsed_path.query)
        payload = query.get("payload", [None])[0]
        if not payload:
            self.send_response(400)
            self.end_headers()
            self.wfile.write(b"No payload received")
            return

        # ROBUSTNESS FIX: a malformed or incomplete payload previously raised
        # an unhandled exception and killed the request; answer 400 instead.
        try:
            payload_data = json.loads(payload)
            host = payload_data["host"]
            timestamp = payload_data["ts"]
            action = payload_data["action"]
            service_name = payload_data["service_name"]
        except (ValueError, KeyError, TypeError) as exc:
            log_error(f"Invalid payload: {exc}")
            self.send_response(400)
            self.end_headers()
            self.wfile.write(b"Invalid payload")
            return

        # Record the event and immediately re-evaluate the window under lock.
        with lock:
            events.append(
                {
                    "ts": timestamp,
                    "action": action,
                    "service_name": service_name,
                    "host": host,
                }
            )
            process_events()

        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"OK")

    def log_message(self, format, *args):
        # Suppress BaseHTTPRequestHandler's default per-request stderr logging.
        return


def main():
    # Start the periodic cleanup thread, then serve the event-ingest HTTP
    # endpoint forever; any fatal error is logged before the process exits.
    try:
        cleanup_thread = threading.Thread(target=resolve_pending, daemon=True)
        cleanup_thread.start()

        # NOTE(review): binds port 80 on all interfaces — assumes this runs
        # inside a container; confirm before running directly on a host.
        server = HTTPServer(("0.0.0.0", 80), EventHandler)
        server.serve_forever()
    except Exception as e:
        log_error(f"{e}")


if __name__ == "__main__":
main()
35 changes: 35 additions & 0 deletions event-scraper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Streams docker container create/destroy events through a FIFO and forwards
# each one to the alerter service (ALERTER_URL) as a URI-encoded JSON payload.

script_path=$(readlink -f "$0")
script_dir=${script_path%/*}
cd "$script_dir"
source "./config.sh"
source "./logger.sh"

if [[ ! -S /var/run/docker.sock ]]; then
    log_error "Mount to /var/run/docker.sock missing?"
    exit 1
fi

if [[ -z $ALERTER_URL ]]; then
    log_warn "Missing ALERTER_URL, not passing scraped data"
else
    # Wait until the alerter's TCP port accepts connections before streaming.
    ./dockerize -wait "${ALERTER_URL/http/tcp}" -timeout 10s true
fi

FIFO="$DATA_DIR/fifo_events"

# BUG FIX: mkfifo fails if the FIFO survived a previous run; only create it
# when missing. Also quote all path expansions.
[[ -p "$FIFO" ]] || mkfifo "$FIFO"
exec 3<> "$FIFO" # keep open so the writer never sees EOF

./docker-api.sh /events filters '{"type":["container"],"event":["create","destroy"]}' > "$FIFO" &
events_pid=$!
# BUG FIX: the background event stream was leaked on exit; kill it too.
trap 'kill $events_pid 2>/dev/null; rm -f "$FIFO"' EXIT

while read -r event < "$FIFO"; do
    result=$(jq --arg host "$HOSTNAME" -r '. | { host: $host, ts: .time, action: .Action, service_name: .Actor.Attributes["com.docker.swarm.service.name"]}' <<< "$event")
    if [ $? != 0 ]; then
        log_warn "Cannot parse event (multiple writers?):"
        echo "$event"
        continue
    fi
    [[ -z $ALERTER_URL ]] && continue
    curl -s -S "$ALERTER_URL?payload=$(echo "$result" | jq -s -R -r @uri)" -o /dev/null
done
14 changes: 9 additions & 5 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@ docker run --rm \
-it \
--name $NAME \
--network $network \
--env LOOP_SLEEP="$LOOP_SLEEP" \
--env ALERT_SCRIPT="$ALERT_SCRIPT" \
--env SWARM_NAME="$SWARM_NAME" \
--env ZENDUTY_API_KEY="$ZENDUTY_API_KEY" \
--env DATA_DIR=/app/data \
--env ALERTER_URL="${ALERTER_URL:-http://localhost:80}" \
--env ALERT_SCRIPT="${ALERT_SCRIPT}" \
--env DATA_DIR=${DATA_DIR:-/app/data} \
--env EVENTS_THRESHOLD="${EVENTS_THRESHOLD:-3}" \
--env EVENTS_WINDOW="${EVENTS_WINDOW:-60}" \
--env LOGGER_USE_TS="${LOGGER_USE_TS:-1}" \
--env LOOP_SLEEP="${LOOP_SLEEP:-10s}" \
--env SWARM_NAME="${SWARM_NAME:-Swarm}" \
--env ZENDUTY_API_KEY="${ZENDUTY_API_KEY:-N/A}" \
--volume /var/run/docker.sock:/var/run/docker.sock \
--volume .:/app/ \
brablc/swarm-health-alerter:dev "$@"

0 comments on commit 1f517c5

Please sign in to comment.