Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: collect logs and send event on reconnect #20

Merged
merged 8 commits into from
Jul 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions justfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set positional-arguments
set dotenv-load
set export

IMAGE := env_var_or_default("IMAGE", "debian-11")
IMAGE_SRC := env_var_or_default("IMAGE_SRC", "debian-11")
Expand Down
15 changes: 15 additions & 0 deletions nfpm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,28 @@ contents:
group: tedge
owner: tedge

- src: ./src/tedge-monit-setup/env
dst: /etc/tedge-monit-setup/env
type: config|noreplace
file_info:
mode: 0644
group: root
owner: root

- src: ./src/bin/monit-tedge-message
dst: /usr/bin/
file_info:
mode: 0755
group: root
owner: root

- src: ./src/bin/monit-tedge-reconnect
dst: /usr/bin/
file_info:
mode: 0755
group: root
owner: root

overrides:
apk:
depends:
Expand Down
169 changes: 169 additions & 0 deletions src/bin/monit-tedge-reconnect
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/bin/sh
set -e

# Print the usage text (description, synopsis and one example) to stdout.
# "$0" is expanded so the text shows the name the script was invoked with.
help() {
  printf '%s\n' \
    "Collect debug information and repair the given cloud connection" \
    "" \
    "$0 <c8y|aws|az>" \
    "" \
    "EXAMPLE" \
    "" \
    "$0 c8y" \
    "# Repair the Cumulocity IoT cloud connection (and collect debug information)" \
    ""
}

# Write a diagnostic message to stderr.
# All arguments are joined with single spaces (default IFS) into one line.
# printf is used instead of echo so that a message such as "-n" or one
# containing backslashes is printed literally by every /bin/sh implementation.
log() { printf '%s\n' "$*" >&2; }

# Defaults used when neither the settings file nor the command line sets them
CLOUD=
LOG_DIR=/var/log/tedge-monit-setup

# The settings file is sourced into this shell, so it is only loaded when its
# permission bits are exactly 644 (find prints the path only when -perm matches).
# This is meant to stop people from editing it and side-loading functions.
# NOTE(review): only the mode is checked, not the file owner - confirm that is sufficient.
SETTINGS_FILE=/etc/tedge-monit-setup/env
FOUND_FILE=
if [ -f "$SETTINGS_FILE" ]; then
  FOUND_FILE=$(find "$SETTINGS_FILE" -perm 644 | head -n 1)
  if [ -n "$FOUND_FILE" ]; then
    log "Loading setting file: $SETTINGS_FILE"
    # shellcheck disable=SC1091,SC1090
    . "$SETTINGS_FILE"
  fi
fi

#
# Parse arguments
#
# Walk the command line: flags may appear before or after the cloud name.
# The first positional argument is taken as the cloud; later ones are ignored.
while [ $# -gt 0 ]; do
  case "$1" in
    -h|--help)
      help
      exit 0
      ;;
    --log-dir)
      # Require a non-empty value. Without this check a trailing "--log-dir"
      # makes the second shift fail (aborting silently under "set -e"), and an
      # empty value would leave LOG_DIR empty, which this script later uses to
      # build the staging directory that it removes recursively.
      if [ $# -lt 2 ] || [ -z "$2" ]; then
        log "Missing value for flag. $1"
        help
        exit 1
      fi
      LOG_DIR="$2"
      shift
      ;;
    --*|-*)
      log "Unknown flag. $1"
      help
      exit 1
      ;;
    *)
      if [ -z "$CLOUD" ]; then
        CLOUD="$1"
      fi
      ;;
  esac
  shift
done

# The cloud name is mandatory
if [ -z "$CLOUD" ]; then
  log "Cloud setting is empty"
  help
  exit 1
fi

# Prune old debug archives from LOG_DIR.
#
# Globals:
#   LOG_DIR   (read)  directory holding the tedge-debug-*.tar.gz archives
#   MAX_FILES (read/defaulted to 10) number of newest archives to keep; 0 disables
#   MAX_DAYS  (read/defaulted to 30) archives older than this many days are removed; 0 disables
#
# The tedge-debug-*.log file that belongs to a removed archive is removed with it.
# Returns 0 when there is nothing to do (LOG_DIR empty or not a directory).
remove_old_files() {
  # Nothing to clean up on the first run, and never operate on an empty path
  if [ -z "$LOG_DIR" ] || [ ! -d "$LOG_DIR" ]; then
    return 0
  fi

  MAX_FILES=${MAX_FILES:-10}
  if [ "$MAX_FILES" -gt 0 ]; then
    # List newest first and skip the first MAX_FILES entries, so that only the
    # oldest archives are deleted. Archive names are generated by this script
    # and contain no newlines, so reading the ls output line by line is safe.
    # shellcheck disable=SC2012
    ls -t -- "$LOG_DIR"/tedge-debug-*.tar.gz 2>/dev/null \
      | tail -n +"$((MAX_FILES + 1))" \
      | while IFS= read -r archive; do
          rm -f -- "$archive" "${archive%.tar.gz}.log"
        done
  fi

  # Only keep files newer than x days.
  # "+N" is required: a bare N only matches files that are exactly N days old.
  MAX_DAYS="${MAX_DAYS:-30}"
  if [ "$MAX_DAYS" -gt 0 ]; then
    find "$LOG_DIR" -type f \
      \( -name "tedge-debug-*.tar.gz" -o -name "tedge-debug-*.log" \) \
      -mtime +"$MAX_DAYS" -exec rm -f {} \; ||:
  fi
}

# Snapshot diagnostic information into $LOGD before the corrective action runs.
#
# Globals: LOGD (read) staging directory, CLOUD (read) cloud name.
# Every step is best effort: a failing collector does not stop the others.
pre_log_collection() {
  mkdir -p "$LOGD" ||:

  # Last 100 journal lines of the cloud mapper and the agent, one file per unit
  for unit in "tedge-mapper-$CLOUD" tedge-agent; do
    journalctl -u "$unit" -n 100 > "$LOGD/$unit.log" ||:
  done

  tedge config list > "$LOGD/tedge-config-list.txt"

  # Optional files: only collected when present
  [ ! -f /etc/tedge/.tedge-mapper-c8y/entity_store.jsonl ] \
    || cp /etc/tedge/.tedge-mapper-c8y/entity_store.jsonl "$LOGD/" ||:

  [ ! -f /var/log/mosquitto/mosquitto.log ] \
    || tail -n 100 /var/log/mosquitto/mosquitto.log > "$LOGD/mosquitto.log" ||:

  # monit's own view of the monitored services
  for monit_cmd in summary status; do
    monit "$monit_cmd" > "$LOGD/monit.$monit_cmd.txt" ||:
  done
}

# Capture diagnostic information into $LOGD after the corrective action ran.
#
# Globals: LOGD (read) staging directory.
# Records the output of "tedge mqtt sub '#'" for 5 seconds; skipped when the
# "timeout" command is not available.
post_log_collection() {
  mkdir -p "$LOGD" ||:

  # Without timeout the subscription could not be bounded, so do nothing
  command -V timeout >/dev/null 2>&1 || return 0

  timeout 5 tedge mqtt sub '#' > "$LOGD/mqtt.txt" ||:
}

# Publish a "<cloud>_reconnected" event through /usr/bin/monit-tedge-message.
#
# Globals: CLOUD (read), LOG_DIR (read, only mentioned in the event description).
# Makes up to 5 attempts with a 5 second pause after each failed one.
# Returns 0 once an attempt succeeds, otherwise 1 after logging a warning.
publish_event() {
  event_text="Reconnected to cloud ($CLOUD)"
  event_description="A monit rule was able to reconnect to the cloud. Check the $LOG_DIR directory for files"

  code=1
  attempt=0
  until [ "$attempt" -ge 5 ]; do
    if MONIT_EVENT="$event_text" MONIT_DESCRIPTION="$event_description" /usr/bin/monit-tedge-message event "${CLOUD}_reconnected"; then
      code=0
      break
    fi
    sleep 5
    attempt=$((attempt + 1))
  done

  [ "$code" = 0 ] || log "Warning: Could not publish reconnected event"
  return "$code"
}

#
# Main flow: clean up, collect logs, reconnect, collect again, archive
#

# Staging directory for the collected files. It is removed recursively below,
# so abort when LOG_DIR is empty (otherwise the path would resolve to /tmp).
LOGD="${LOG_DIR:?LOG_DIR must not be empty}/tmp"
log "Logging files to $LOGD"

if ! remove_old_files; then
  log "Unexpected error whilst cleaning up old log files"
fi

# Start from an empty staging directory so that files left behind by an
# earlier, aborted run do not end up in this archive
rm -rf "$LOGD"

# Collect logs
if ! pre_log_collection; then
  log "Warning: Unknown error whilst collecting pre debug information"
fi

# Perform corrective action to try to reestablish the cloud connection.
# The exit code is captured instead of letting "set -e" abort the script, so
# that the debug archive is still written when the reconnect fails.
reconnect_code=0
tedge reconnect "$CLOUD" || reconnect_code=$?

if [ "$reconnect_code" -eq 0 ]; then
  # send event so it is easier to query the cloud to find devices where a problem occurred
  publish_event ||:
else
  log "Warning: Failed to reconnect to cloud ($CLOUD). exit code=$reconnect_code"
fi

# Collect logs after the connection has hopefully been repaired
if ! post_log_collection; then
  log "Warning: Unknown error whilst collecting post debug information"
fi

# Compress archive (in a subshell so the working directory of the script is untouched)
LOG_NAME="$(date +"tedge-debug-${CLOUD}_%Y%m%d_%H%M")"
(cd "$LOGD" && tar czf "$LOG_DIR/${LOG_NAME}.tar.gz" .)

# Also create a concatenated log file so it can be retrieved via the log_upload operation
if command -V zcat >/dev/null 2>&1; then
  zcat "$LOG_DIR/${LOG_NAME}.tar.gz" > "$LOG_DIR/${LOG_NAME}.log"
fi

rm -rf "$LOGD"

# Report the outcome of the corrective action to the caller
exit "$reconnect_code"
6 changes: 3 additions & 3 deletions src/conf.d/tedge-monitoring.conf
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ check program c8y-connectivity with path "/usr/bin/tedge connect c8y --test"
with timeout 60 seconds
every 120 cycles
if status != 0 then alert
if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect c8y"
if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect c8y"
depends on c8y-enabled
group c8y

Expand All @@ -64,7 +64,7 @@ check program az-connectivity with path "/usr/bin/tedge connect az --test"
with timeout 60 seconds
every 120 cycles
if status != 0 then alert
if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect az"
if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect az"
depends on az-enabled
group az

Expand All @@ -81,6 +81,6 @@ check program aws-connectivity with path "/usr/bin/tedge connect aws --test"
with timeout 60 seconds
every 120 cycles
if status != 0 then alert
if status != 0 for 10 cycles then exec "/usr/bin/tedge reconnect aws"
if status != 0 for 10 cycles then exec "/usr/bin/monit-tedge-reconnect aws"
depends on aws-enabled
group aws
8 changes: 8 additions & 0 deletions src/tedge-monit-setup/env
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Where to write the log files to
#LOG_DIR=/var/log/tedge-monit-setup

# Maximum number of days to keep log (tar.gz) files
#MAX_DAYS=30

# Maximum number of log (tar.gz) files to keep
#MAX_FILES=10
8 changes: 8 additions & 0 deletions tests/monit.robot
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ Reconnect on cloud connection loss (by stopping mosquitto)
Execute Command tedge connect c8y --test timeout=120
Execute Command systemctl is-active mosquitto
Execute Command systemctl is-active tedge-mapper-c8y
Cumulocity.Device Should Exist ${DEVICE_SN}
Cumulocity.Device Should Have Event/s expected_text=.*Reconnected to cloud .*c8y.* type=c8y_reconnected
Assert File Count /var/log/tedge-monit-setup/*.tar.gz 1
Assert File Count /var/log/tedge-monit-setup/*.log 1


*** Keywords ***
Expand All @@ -35,3 +39,7 @@ Decrease monit intervals
Execute Command cmd=sed -i 's/every 120 cycles/every 5 cycles/g' /etc/monit/conf.d/tedge-monitoring.conf
Execute Command cmd=sed -i 's/if status != 0 for 10 cycles/if status != 0 for 2 cycles/g' /etc/monit/conf.d/tedge-monitoring.conf
Execute Command systemctl restart monit

# Assert that ${path} (a shell glob is allowed) matches exactly ${count} entries.
# The check runs through DeviceLibrary.Execute Command: it counts the lines printed
# by `ls -l ${path}` and compares the number with ${count}. The `xargs` stage is
# presumably there to trim the whitespace padding some `wc` implementations add.
Assert File Count
[Arguments] ${path} ${count}
DeviceLibrary.Execute Command cmd=[ $(ls -l ${path} | wc -l | xargs) = ${count} ]