From b03da3553332f14b53ca65249c74998eb28474b0 Mon Sep 17 00:00:00 2001 From: Ondrej Brablc Date: Mon, 3 Jun 2024 21:11:36 +0200 Subject: [PATCH] Refactor - same container globally each reports failing services for its own host --- .gitignore | 1 + Dockerfile | 3 ++ config.sh | 23 ++++++++- docker-cmd.sh | 77 +++------------------------- event-alerter.py | 130 ++++++++++++++++++++++------------------------- event-scraper.sh | 35 ------------- port-alerter.sh | 68 +++++++++++++++++++++++++ requirements.txt | 6 +++ 8 files changed, 168 insertions(+), 175 deletions(-) delete mode 100755 event-scraper.sh create mode 100755 port-alerter.sh create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index ebe34ff..681efec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ dockerize data __pycache__ +.venv diff --git a/Dockerfile b/Dockerfile index 1b822d3..68ae575 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,9 +11,12 @@ RUN mkdir -p "$DATA_DIR" COPY *.sh ./ COPY *.py ./ COPY integrations/ integrations/ +COPY requirements.txt ./ RUN apk update --no-cache \ && apk add --no-cache bash curl jq openssl \ && curl -s -L https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz | tar xzf - -C . +RUN pip install --no-warn-script-location --no-cache-dir -r requirements.txt + CMD ["./docker-cmd.sh"] diff --git a/config.sh b/config.sh index 210e290..548a2e7 100644 --- a/config.sh +++ b/config.sh @@ -1,5 +1,26 @@ -export SCRIPT_NAME=${0##*/} +SCRIPT_NAME=${0##*/} + +export LOOP_SLEEP=${LOOP_SLEEP:-10} +export ALERTER_URL=${ALERTER_URL:-http://alerter:80} +export SWARM_NAME=${SWARM_NAME:-Swarm} export LOGGER_USE_TS=1 export LOGGER_USE_SYSLOG=0 + export DATA_DIR=${DATA_DIR:-$script_dir/data} mkdir -p $DATA_DIR + +if [[ ! -S /var/run/docker.sock ]]; then + log_error "Mount to /var/run/docker.sock missing?" + exit 1 +fi + +if [[ -z $ALERT_SCRIPT ]]; then + log_error "Alert script not defined!" + export ALERT_SCRIPT="jq ." +fi + +if [[ ! -f $ALERT_SCRIPT ]]; then + || ! -f $ALERT_SCRIPT ]]; then + log_error "Alert script not accessible on $ALERT_SCRIPT path!" + export ALERT_SCRIPT="jq ." +fi diff --git a/docker-cmd.sh b/docker-cmd.sh index ce75256..817361d 100755 --- a/docker-cmd.sh +++ b/docker-cmd.sh @@ -20,82 +20,17 @@ if [[ -z $ALERT_SCRIPT || ! -f $ALERT_SCRIPT ]]; then export ALERT_SCRIPT="jq ." fi -# On all nodes start scraper, on manager node start alerter +log_info "Starting event alerter ..." +./event-alerter.py & +trap "kill $!" EXIT services=$(./services.sh 2>&1) if [ $? != 0 ]; then - ./event-scraper.sh exit fi -log_info "Initial list of services (run services.sh using docker exec to see actual):" +log_info "Initial list of services:" echo "$services" -log_info "Starting event alerter ..." -./event-alerter.py & -trap "kill $!" EXIT -log_info "Starting event scraper ..." -./event-scraper.sh & -trap "kill $!" EXIT - -### Manager code only - -function check_services() { - local swarm_name=$SWARM_NAME - while read service_name network_alias port; do - unique_name=$(echo "${swarm_name} ${service_name} ${network_alias} ${port}" ) - unique_code=$(echo "${unique_name,,}" | sed -e 's/ /_/g' -e 's/[^a-zA-Z0-9_-]/_/g') - random_str=$(tr -dc 'a-zA-Z0-9' < /dev/urandom | head -c 10) - read unique_id _ < <(echo -n "$unique_name $random_str" | md5sum) - prefix="$DATA_DIR/${unique_code}" - pending_file="${prefix}.pending" - log_file="${prefix}.log" - - # used for testing - real_port="$port" - if [[ -f "$DATA_DIR/test-change-port-$port" ]]; then - real_port=$(< "$DATA_DIR/test-change-port-$port") - fi - - action="" - appendix="" - message="${swarm_name} service ${service_name} (${network_alias}:${port})" - ./dockerize -timeout 5s -wait tcp://$network_alias:$real_port true 2>$log_file - if [ $? -ne 0 ]; then - if [[ -f $pending_file ]]; then - log_warn "Pending alert: $message" - else - echo "$unique_id" > $pending_file - action="create" - appendix="not available" - fi - else - if [[ -f $pending_file ]]; then - action="resolve" - appendix="is available" - unique_id=$(cat $pending_file) - rm -f $pending_file - fi - fi - if [[ -n $action ]]; then - jq -n \ - --arg action "$action" \ - --arg unique_id "$unique_id" \ - --arg message "$message $appendix" \ - --arg summary "$(jq -R -s @json $log_file)" \ - '{ - "action": $action, - "unique_id": $unique_id, - "message": $message, - "summary": $summary - }' | /bin/bash -c "$ALERT_SCRIPT" - fi - rm -f $log_file - done < <(./services.sh) -} - -log_info "Entering loop with ${LOOP_SLEEP} sleep on entry ..." -while true; do - sleep $LOOP_SLEEP - check_services -done +log_info "Starting port alerter ..." +./port-alerter.sh diff --git a/event-alerter.py b/event-alerter.py index a14350a..078ac59 100755 --- a/event-alerter.py +++ b/event-alerter.py @@ -3,20 +3,23 @@ import hashlib import json import os +import requests +import requests_unixsocket import secrets +import socket import string import subprocess import time import threading import urllib.parse as urlparse -from http.server import BaseHTTPRequestHandler, HTTPServer from collections import defaultdict, deque from logger import log_info, log_error ALERT_SCRIPT = os.getenv("ALERT_SCRIPT", "jq .") -EVENTS_WINDOW = int(os.getenv("EVENTS_WINDOW", "300")) -EVENTS_THRESHOLD = int(os.getenv("EVENTS_THRESHOLD", "3")) +EVENTS_WINDOW = int(os.getenv("EVENTS_WINDOW", "60")) +EVENTS_THRESHOLD = int(os.getenv("EVENTS_THRESHOLD", "2")) +HOSTNAME = os.getenv("HOSTNAME", socket.gethostname()) LOOP_SLEEP = int(os.getenv("LOOP_SLEEP", "10")) SWARM_NAME = os.getenv("SWARM_NAME", "Swarm") @@ -25,16 +28,51 @@ lock = threading.Lock() +def calculate_md5(input_str): + md5_hash = hashlib.md5() + md5_hash.update(input_str.encode("utf-8")) + return md5_hash.hexdigest() + + +def send_alert(data): + if not ALERT_SCRIPT: + return + + json_data = json.dumps(data) + process = subprocess.Popen( + ["/bin/bash", "-c", ALERT_SCRIPT], stdin=subprocess.PIPE, text=True + ) + process.communicate(input=json_data) + + def get_random_str(length): characters = string.ascii_letters + string.digits return "".join(secrets.choice(characters) for _ in range(length)) +def docker_events_stream(): + base_url = "http+unix://" + socket_path = "/var/run/docker.sock" + endpoint = "/v1.41/events" + url = f"{base_url}{urlparse.quote(socket_path, safe='')}{endpoint}" + + params = { + "filters": json.dumps({"type": ["container"], "event": ["create", "destroy"]}) + } + + session = requests_unixsocket.Session() + response = session.get(url, params=params, stream=True) + response.raise_for_status() + + for line in response.iter_lines(): + if line: + yield json.loads(line.decode("utf-8")) + + def process_events(): current_time = time.time() counts = defaultdict(lambda: {"create": 0, "destroy": 0}) - hosts = defaultdict(set) seen_services = set() # Remove events older than EVENTS_WINDOW @@ -43,7 +81,6 @@ def process_events(): for event in events: counts[event["service_name"]][event["action"]] += 1 - hosts[event["service_name"]].add(event["host"]) seen_services.add(event["service_name"]) for service_name, actions in counts.items(): @@ -61,8 +98,8 @@ def process_events(): "unique_id": calculate_md5( f"{SWARM_NAME} {service_name} {get_random_str(10)}" ), - "message": f"{SWARM_NAME} service {service_name} not healthy", - "summary": f"There were {actions["create"]} containers created and {actions["destroy"]} destroyed within {EVENTS_WINDOW} seconds.\nReported by {list(hosts[service_name])} host(s).", + "message": f"{SWARM_NAME} service {service_name} failing on {HOSTNAME}", + "summary": f"There were {actions["create"]} containers created and {actions["destroy"]} destroyed within {EVENTS_WINDOW} seconds.", } pending_alerts[service_name] = data send_alert(data) @@ -74,30 +111,13 @@ def process_events(): data = { "action": "resolve", "unique_id": pending_alerts[service_name]["unique_id"], - "message": f"{SWARM_NAME} service {service_name} is healthy", + "message": f"{SWARM_NAME} service {service_name} stable on {HOSTNAME}", "summary": f"No events in last {EVENTS_WINDOW} seconds, assuming service is healthy (or stopped)", } del pending_alerts[service_name] send_alert(data) -def calculate_md5(input_str): - md5_hash = hashlib.md5() - md5_hash.update(input_str.encode("utf-8")) - return md5_hash.hexdigest() - - -def send_alert(data): - if not ALERT_SCRIPT: - return - - json_data = json.dumps(data) - process = subprocess.Popen( - ["/bin/bash", "-c", ALERT_SCRIPT], stdin=subprocess.PIPE, text=True - ) - process.communicate(input=json_data) - - def resolve_pending(): while True: time.sleep(LOOP_SLEEP) @@ -105,51 +125,25 @@ def resolve_pending(): process_events() -class EventHandler(BaseHTTPRequestHandler): - def do_GET(self): - parsed_path = urlparse.urlparse(self.path) - query = urlparse.parse_qs(parsed_path.query) - payload = query.get("payload", [None])[0] - if not payload: - self.send_response(400) - self.end_headers() - self.wfile.write(b"No payload received") - return - - payload_data = json.loads(payload) - host = payload_data["host"] - timestamp = payload_data["ts"] - action = payload_data["action"] - service_name = payload_data["service_name"] - - with lock: - events.append( - { - "ts": timestamp, - "action": action, - "service_name": service_name, - "host": host, - } - ) - process_events() - - self.send_response(200) - self.end_headers() - self.wfile.write(b"OK") - - def log_message(self, format, *args): - return - - def main(): try: - cleanup_thread = threading.Thread(target=resolve_pending, daemon=True) - cleanup_thread.start() - - server = HTTPServer(("0.0.0.0", 80), EventHandler) - server.serve_forever() - except Exception as e: - log_error(f"{e}") + resolving_thread = threading.Thread(target=resolve_pending, daemon=True) + resolving_thread.start() + + for event in docker_events_stream(): + data = { + "ts": event["time"], + "action": event["Action"], + "service_name": event["Actor"]["Attributes"].get( + "com.docker.swarm.service.name", "unknown" + ), + } + + # log_info(json.dumps(data)) + events.append(data) + process_events() + except requests.exceptions.RequestException as e: + log_error(f"Error connecting to Docker API: {e}") if __name__ == "__main__": diff --git a/event-scraper.sh b/event-scraper.sh deleted file mode 100755 index 2b327c3..0000000 --- a/event-scraper.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -script_path=$(readlink -f $0) -script_dir=${script_path%/*} -cd "$script_dir" -source "./config.sh" -source "./logger.sh" - -if [[ ! -S /var/run/docker.sock ]]; then - log_error "Mount to /var/run/docker.sock missing?" - exit 1 -fi - -if [[ -z $ALERTER_URL ]]; then - log_warn "Missing ALERTER_URL, not passing scraped data" -else - ./dockerize -wait ${ALERTER_URL/http/tcp} -timeout 10s true -fi - -FIFO="$DATA_DIR/fifo_events" - -mkfifo $FIFO -trap "rm -f $FIFO" EXIT -exec 3<> $FIFO # keep open -./docker-api.sh /events filters '{"type":["container"],"event":["create","destroy"]}' > $FIFO & -while read -r event < $FIFO; do - result=$(jq --arg host "$HOSTNAME" -r '. | { host: $host, ts: .time, action: .Action, service_name: .Actor.Attributes["com.docker.swarm.service.name"]}' <<< "$event") - if [ $? != 0 ]; then - log_warn "Cannot parse event (multiple writers?):" - echo "$event" - continue - fi - [[ -z $ALERTER_URL ]] && continue - curl -s -S "$ALERTER_URL?payload=$(echo "$result" | jq -s -R -r @uri)" -o /dev/null -done diff --git a/port-alerter.sh b/port-alerter.sh new file mode 100755 index 0000000..87cf8f1 --- /dev/null +++ b/port-alerter.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +script_path=$(readlink -f $0) +script_dir=${script_path%/*} +cd "$script_dir" +source "./config.sh" +source "./logger.sh" + +function check_services() { + local swarm_name=$SWARM_NAME + while read service_name network_alias port; do + unique_name=$(echo "${swarm_name} ${service_name} ${network_alias} ${port}" ) + unique_code=$(echo "${unique_name,,}" | sed -e 's/ /_/g' -e 's/[^a-zA-Z0-9_-]/_/g') + random_str=$(tr -dc 'a-zA-Z0-9' < /dev/urandom | head -c 10) + read unique_id _ < <(echo -n "$unique_name $random_str" | md5sum) + prefix="$DATA_DIR/${unique_code}" + pending_file="${prefix}.pending" + log_file="${prefix}.log" + + # used for testing + real_port="$port" + if [[ -f "$DATA_DIR/test-change-port-$port" ]]; then + real_port=$(< "$DATA_DIR/test-change-port-$port") + fi + + action="" + appendix="" + message="${swarm_name} service ${service_name} (${network_alias}:${port})" + ./dockerize -timeout 5s -wait tcp://$network_alias:$real_port true 2>$log_file + if [ $? -ne 0 ]; then + if [[ -f $pending_file ]]; then + log_warn "Pending alert: $message" + else + echo "$unique_id" > $pending_file + action="create" + appendix="not available" + fi + else + if [[ -f $pending_file ]]; then + action="resolve" + appendix="is available" + unique_id=$(cat $pending_file) + rm -f $pending_file + fi + fi + if [[ -n $action ]]; then + jq -n \ + --arg action "$action" \ + --arg unique_id "$unique_id" \ + --arg message "$message $appendix" \ + --arg summary "$(jq -R -s @json $log_file)" \ + '{ + "action": $action, + "unique_id": $unique_id, + "message": $message, + "summary": $summary + }' | /bin/bash -c "$ALERT_SCRIPT" + fi + rm -f $log_file + done < <(./services.sh) +} + +log_info "Entering loop with ${LOOP_SLEEP} sleep on entry ..." + +while true; do + sleep $LOOP_SLEEP + check_services +done diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b0f5670 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +certifi==2024.6.2 +chardet==4.0.0 +idna==2.10 +requests==2.25.1 +requests-unixsocket==0.3.0 +urllib3==1.26.18