diff --git a/justfile b/justfile
index 0056277..3f84982 100644
--- a/justfile
+++ b/justfile
@@ -1,5 +1,6 @@
 set positional-arguments
 set dotenv-load
+set export
 
 IMAGE := env_var_or_default("IMAGE", "debian-11")
 IMAGE_SRC := env_var_or_default("IMAGE_SRC", "debian-11")
diff --git a/nfpm.yaml b/nfpm.yaml
index e9a74fc..84bc679 100644
--- a/nfpm.yaml
+++ b/nfpm.yaml
@@ -23,6 +23,14 @@ contents:
       group: tedge
       owner: tedge
 
+  - src: ./src/tedge-monit-setup/env
+    dst: /etc/tedge-monit-setup/env
+    type: config|noreplace
+    file_info:
+      mode: 0644
+      group: root
+      owner: root
+
   - src: ./src/bin/monit-tedge-message
     dst: /usr/bin/
     file_info:
@@ -30,6 +38,13 @@ contents:
       group: root
       owner: root
 
+  - src: ./src/bin/monit-tedge-reconnect
+    dst: /usr/bin/
+    file_info:
+      mode: 0755
+      group: root
+      owner: root
+
 overrides:
   apk:
     depends:
diff --git a/src/bin/monit-tedge-reconnect b/src/bin/monit-tedge-reconnect
new file mode 100755
index 0000000..e18b4b1
--- /dev/null
+++ b/src/bin/monit-tedge-reconnect
@@ -0,0 +1,197 @@
+#!/bin/sh
+set -e
+
+help() {
+    cat << EOT
+Collect debug information and repair the given cloud connection
+
+$0
+
+EXAMPLE
+
+    $0 c8y
+    # Repair the Cumulocity IoT cloud connection (and collect debug information)
+
+EOT
+}
+
+log() { echo "$@" >&2; }
+
+CLOUD=
+LOG_DIR=/var/log/tedge-monit-setup
+
+# Only read the file if it has the correct permissions, to prevent people from editing it
+# and side-loading functions
+SETTINGS_FILE=/etc/tedge-monit-setup/env
+FOUND_FILE=
+if [ -f "$SETTINGS_FILE" ]; then
+    FOUND_FILE=$(find "$SETTINGS_FILE" -perm 644 | head -n 1)
+fi
+
+if [ -n "$FOUND_FILE" ]; then
+    log "Loading setting file: $SETTINGS_FILE"
+    # shellcheck disable=SC1091,SC1090
+    . "$SETTINGS_FILE"
+fi
+
+#
+# Parse arguments
+#
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -h|--help)
+            help
+            exit 0
+            ;;
+        --log-dir)
+            if [ $# -lt 2 ]; then
+                log "Missing value for flag. $1"
+                help
+                exit 1
+            fi
+            LOG_DIR="$2"
+            shift
+            ;;
+        --*|-*)
+            log "Unknown flag. $1"
+            help
+            exit 1
+            ;;
+        *)
+            if [ -z "$CLOUD" ]; then
+                CLOUD="$1"
+            fi
+            ;;
+    esac
+    shift
+done
+
+if [ -z "$CLOUD" ]; then
+    log "Cloud setting is empty"
+    help
+    exit 1
+fi
+
+# An empty log dir would turn the temporary folder into /tmp which is deleted at the end
+if [ -z "$LOG_DIR" ]; then
+    log "Log directory setting is empty"
+    help
+    exit 1
+fi
+
+remove_old_files() {
+    # Nothing to clean up if the log directory does not exist yet (e.g. on the first run)
+    [ -d "$LOG_DIR" ] || return 0
+
+    # Keep at most MAX_FILES archives (including the one which is created by this run).
+    # Sort by modification time (newest first) so that the oldest archives are removed
+    MAX_FILES=${MAX_FILES:-10}
+    if [ "$MAX_FILES" -gt 0 ]; then
+        # shellcheck disable=SC2012
+        ls -1t "$LOG_DIR"/tedge-debug-*.tar.gz 2>/dev/null | tail -n +"$MAX_FILES" | while IFS= read -r archive; do
+            # Also remove the concatenated log file which was created from the archive
+            rm -f "$archive" "${archive%.tar.gz}.log"
+        done
+    fi
+
+    # Only keep files newer than x days (protect against an empty log dir)
+    MAX_DAYS="${MAX_DAYS:-30}"
+    if [ "$MAX_DAYS" -gt 0 ] && [ -n "$LOG_DIR" ]; then
+        find "$LOG_DIR" \( -name "tedge-debug-*.tar.gz" -o -name "tedge-debug-*.log" \) -mtime +"$MAX_DAYS" -exec rm -f {} \; ||:
+    fi
+}
+
+pre_log_collection() {
+    #
+    # Collect logs before the corrective action has been executed
+    #
+    mkdir -p "$LOGD" ||:
+
+    journalctl -u "tedge-mapper-$CLOUD" -n 100 > "$LOGD/tedge-mapper-$CLOUD.log" ||:
+    journalctl -u tedge-agent -n 100 > "$LOGD/tedge-agent.log" ||:
+
+    tedge config list > "$LOGD/tedge-config-list.txt"
+
+    if [ -f /etc/tedge/.tedge-mapper-c8y/entity_store.jsonl ]; then
+        cp /etc/tedge/.tedge-mapper-c8y/entity_store.jsonl "$LOGD/" ||:
+    fi
+
+    if [ -f /var/log/mosquitto/mosquitto.log ]; then
+        tail -n 100 /var/log/mosquitto/mosquitto.log > "$LOGD/mosquitto.log" ||:
+    fi
+
+    monit summary > "$LOGD/monit.summary.txt" ||:
+    monit status > "$LOGD/monit.status.txt" ||:
+}
+
+post_log_collection() {
+    #
+    # Collect logs after the corrective action has been executed
+    #
+    mkdir -p "$LOGD" ||:
+
+    if command -V timeout >/dev/null 2>&1; then
+        timeout 5 tedge mqtt sub '#' > "$LOGD/mqtt.txt" ||:
+    fi
+}
+
+publish_event() {
+    attempt=0
+    code=1
+    while [ "$attempt" -lt 5 ]; do
+        if MONIT_EVENT="Reconnected to cloud ($CLOUD)" MONIT_DESCRIPTION="A monit rule was able to reconnect to the cloud. Check the $LOG_DIR directory for files" /usr/bin/monit-tedge-message event "${CLOUD}_reconnected"; then
+            code=0
+            break
+        fi
+        attempt=$((attempt + 1))
+        sleep 5
+    done
+    if [ "$code" != 0 ]; then
+        log "Warning: Could not publish reconnected event"
+    fi
+    return "$code"
+}
+
+LOGD="$LOG_DIR/tmp"
+log "Logging files to $LOGD"
+
+if ! remove_old_files; then
+    log "Unexpected error whilst cleaning up old log files"
+fi
+
+# Collect logs
+if ! pre_log_collection; then
+    log "Warning: Unknown error whilst collecting pre debug information"
+fi
+
+# Perform corrective action to try to reestablish the cloud connection.
+# Don't let "set -e" abort the script on failure, otherwise the debug information
+# would never be archived in the case where it is needed the most
+RECONNECT_CODE=0
+tedge reconnect "$CLOUD" || RECONNECT_CODE=$?
+
+if [ "$RECONNECT_CODE" -eq 0 ]; then
+    # send event so it is easier to query the cloud to find devices where a problem occurred
+    publish_event ||:
+else
+    log "Warning: Failed to reconnect to cloud ($CLOUD). code=$RECONNECT_CODE"
+fi
+
+# Collect logs after the connection has hopefully been repaired
+if ! post_log_collection; then
+    log "Warning: Unknown error whilst collecting post debug information"
+fi
+
+# Compress archive
+LOG_NAME="$(date +"tedge-debug-${CLOUD}_%Y%m%d_%H%M")"
+cd "$LOGD" && tar czf "$LOG_DIR/${LOG_NAME}.tar.gz" .
+
+# Also create a concatenated log file so it can be retrieved via the log_upload operation
+if command -V zcat >/dev/null 2>&1; then
+    zcat "$LOG_DIR/${LOG_NAME}.tar.gz" > "$LOG_DIR/${LOG_NAME}.log"
+fi
+
+rm -rf "$LOGD"
+
+# Report the result of the corrective action to the caller
+exit "$RECONNECT_CODE"
diff --git a/src/conf.d/tedge-monitoring.conf b/src/conf.d/tedge-monitoring.conf
index 209e592..c536890 100644
--- a/src/conf.d/tedge-monitoring.conf
+++ b/src/conf.d/tedge-monitoring.conf
@@ -47,7 +47,7 @@ check program c8y-connectivity with path "/usr/bin/tedge connect c8y --test"
     with timeout 60 seconds
     every 120 cycles
     if status != 0 then alert
-    if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect c8y"
+    if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect c8y"
     depends on c8y-enabled
     group c8y
 
@@ -64,7 +64,7 @@ check program az-connectivity with path "/usr/bin/tedge connect az --test"
     with timeout 60 seconds
     every 120 cycles
     if status != 0 then alert
-    if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect az"
+    if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect az"
     depends on az-enabled
     group az
 
@@ -81,6 +81,6 @@ check program aws-connectivity with path "/usr/bin/tedge connect aws --test"
     with timeout 60 seconds
     every 120 cycles
     if status != 0 then alert
-    if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect aws"
+    if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect aws"
     depends on aws-enabled
     group aws
diff --git a/src/tedge-monit-setup/env b/src/tedge-monit-setup/env
new file mode 100644
index 0000000..7de999e
--- /dev/null
+++ b/src/tedge-monit-setup/env
@@ -0,0 +1,8 @@
+# Where to write the log files to
+#LOG_DIR=/var/log/tedge-monit-setup
+
+# Maximum number of days to keep log (tar.gz) files
+#MAX_DAYS=30
+
+# Maximum number of log (tar.gz) files to keep
+#MAX_FILES=10
diff --git a/tests/monit.robot b/tests/monit.robot
index bcf9087..e2b56b9 100644
--- a/tests/monit.robot
+++ b/tests/monit.robot
@@ -20,6 +20,10 @@ Reconnect on cloud connection loss (by stopping mosquitto)
     Execute Command    tedge connect c8y --test    timeout=120
     Execute Command    systemctl is-active mosquitto
     Execute Command    systemctl is-active tedge-mapper-c8y
+    Cumulocity.Device Should Exist    ${DEVICE_SN}
+    Cumulocity.Device Should Have Event/s    expected_text=.*Reconnected to cloud .*c8y.*    type=c8y_reconnected
+    Assert File Count    /var/log/tedge-monit-setup/*.tar.gz    1
+    Assert File Count    /var/log/tedge-monit-setup/*.log    1
 
 *** Keywords ***
 
@@ -35,3 +39,7 @@ Decrease monit intervals
     Execute Command    cmd=sed -i 's/every 120 cycles/every 5 cycles/g' /etc/monit/conf.d/tedge-monitoring.conf
     Execute Command    cmd=sed -i 's/if status != 0 for 10 cycles/if status != 0 for 2 cycles/g' /etc/monit/conf.d/tedge-monitoring.conf
     Execute Command    systemctl restart monit
+
+Assert File Count
+    [Arguments]    ${path}    ${count}
+    DeviceLibrary.Execute Command    cmd=[ $(ls -l ${path} | wc -l | xargs) = ${count} ]