From 9997307aab2d583f0b6c9570e6ea44c585ed22fc Mon Sep 17 00:00:00 2001 From: Ken Bannister Date: Sun, 29 Sep 2024 19:59:55 -0400 Subject: [PATCH] Add takeover based upgrade script Change-type: patch Signed-off-by: Ken Bannister --- upgrade-takeover.sh | 494 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 494 insertions(+) create mode 100755 upgrade-takeover.sh diff --git a/upgrade-takeover.sh b/upgrade-takeover.sh new file mode 100755 index 0000000..c24a625 --- /dev/null +++ b/upgrade-takeover.sh @@ -0,0 +1,494 @@ +#!/bin/bash +# +# Upgrades balenaOS version by running takeover to flash storage, potentially +# including repartitioning. See https://github.com/balena-os/takeover +# +# Arguments: +# * --hostos-version is required to specify target OS version +# * See help() below for details +# +# Outputs: +# * Writes log file to /mnt/boot/balenahup +# * Notifies balenaAPI by PATCHing the device via /usr/bin/resin-device-progress script +# +# Results: +# * Flashes new OS and reboots into it on success +# * Returns 9 if another upgrade script may already be running +# * Returns 1 on other errors in this script +# * Reboots into current OS if fails later in the attempt +# +# Notes: +# This script was derived from upgrade-2.x.sh for traditional balenahup upgrades. + +SCRIPTNAME=upgrade-takeover.sh + +# Set up cautious error handling, run ERR trap on failure +set -o errexit +set -E +set -o pipefail + +# Define variables about running balenaOS, including VERSION and SLUG (device type) +# shellcheck disable=SC1091 +source /etc/os-release + +# Set PATH for binary lookups +# shellcheck disable=SC1091 +source /etc/profile + +# Prevent running multiple instances of upgrade +LOCKFILE="/var/lock/resinhup.lock" +LOCKFD=99 +# Private functions +_lock() { flock "-$1" $LOCKFD; } +_exit_handler() { + _exit_status=$? + if [ "${_exit_status}" -ne 0 ]; then + log "Exit on error ${_exit_status}" + if ! report_upgrade_failed > /dev/null 2>&1; then + log "Failed to report progress on exit with status $?" + fi + if [ "${_exit_status}" -eq 9 ]; then + log "No concurrent updates allowed - lock file in place." + fi + fi + _no_more_locking + log "Lock removed - end." +} +_no_more_locking() { _lock u; _lock xn && rm -f $LOCKFILE;rm -f "${outfifo}";rm -f "${errfifo}"; } +_prepare_locking() { eval "exec $LOCKFD>\"$LOCKFILE\""; trap _exit_handler EXIT; } +# Public functions +exlock_now() { _lock xn; } # obtain an exclusive lock immediately or fail + +# Helper function to report progress to device API +# $1 -- integer: completion percentage +# $2 -- string: detail message +function progress { + pct=$1 + message=$2 + resin-device-progress --percentage "${pct}" --state "${message}" > /dev/null || true +} + +function help { + cat << EOF +Upgrade balenaOS via takeover tool + +Options: + -h, --help + Display this help and exit. + + --force-slug + Override slug detection and force this slug to be used for the script. + + --hostos-version + Run the updater for this specific HostOS version as semver or ESR, where + semver is in the format major.minor.patch, like 2.5.1; and ESR is in the + format year.month.patch, like 2024.4.0. The version must begin with a + digit, not a 'v'. This is a mandatory argument. + + --balenaos-registry + Unused but accepted for compatibility with upgrade-2.x.sh when called + by balena-proxy. +EOF +} + +# Notify backend device API that upgrade has failed. This report is essential +# to allow a user to retry the upgrade. Sends the report, and then independently +# verifies success. Tries once per minute for an hour. +function report_upgrade_failed() { + pct=100 + state="OS update failed" + while ! compare_device_state "${pct}" "${state}"; do + ((c++)) && ((c==60)) && break + if resin-device-progress --percentage "${pct}" --state "${state}"; then + continue + fi + log WARN "Retrying failure report - try $c" + sleep 60 + done +} + +# Log operational message; writes provided text to journal and echos to stdout. +# If log at ERROR level, cleanup work directory and exit this script with code 1. +# +# $1 -- optional log level, must be ERROR or WARN; otherwise defaults to INFO +# $2 -- log message +function log { + # Process log level if provided to function + priority=6 + case $1 in + ERROR) + loglevel=ERROR + # 3 is "err" + priority=3 + shift + ;; + WARN) + loglevel=WARNING + # 4 is "warning" + priority=4 + shift + ;; + *) + loglevel=INFO + ;; + esac + echo "${1}" | systemd-cat --level-prefix=0 --identifier="${SCRIPTNAME}" --priority="${priority}" 2> /dev/null || true + endtime=$(date +%s) + printf "[%s][%09d%s%s\n" "$SCRIPTNAME" "$((endtime - starttime))" "][$loglevel]" "$1" + if [ "$loglevel" == "ERROR" ]; then + cleanup_work_dir + exit 1 + fi +} + +# Test if the first arg is greater than the second, when the args are compared +# as semvers. +# Return 0 if the first item is greater; 1 otherwise +# For example, version_gt "1.2.10" "1.2.3" returns true. +# +# $1 -- expected greater version +# $2 -- version to compare +function version_gt() { + test "$(echo "$@" | tr " " "\n" | sort -V | head -n 1)" != "$1" +} + +# Compare the provided percentage and state values with backend device API +# provisioning_progress and provisioning_state. +# +# $1 -- local percentage +# $2 -- local state +# +# Return 0 if the values are equal; otherwise return 1 +function compare_device_state() { + pct=$1 + state=$2 + resp=$(CURL_CA_BUNDLE="${TMPCRT}" ${CURL} --header "Authorization: Bearer ${APIKEY}" \ + "${API_ENDPOINT}/v6/device(uuid='${UUID}')?\$select=provisioning_state,provisioning_progress" | jq '.d[]') + remote_pct=$(echo "${resp}" | jq -r '.provisioning_progress') + remote_state=$(echo "${resp}" | jq -r '.provisioning_state') + if [ -n "${remote_pct}" ] && [ -n "${remote_state}" ]; then + test "${pct}" -eq "${remote_pct}" && test "${state}" = "${remote_state}" + else + return 1 + fi +} + +# Stop Supervisor and related systemd services +function stop_services() { + log "Stopping supervisor and related services..." + systemctl stop update-balena-supervisor.timer > /dev/null 2>&1 || systemctl stop update-resin-supervisor.timer > /dev/null 2>&1 + systemctl stop balena-supervisor > /dev/null 2>&1 || systemctl stop resin-supervisor > /dev/null 2>&1 + ${DOCKER_CMD} rm -f balena_supervisor resin_supervisor > /dev/null 2>&1 || true +} + +# Remove contents of work directory +function cleanup_work_dir() { + # Ensure takover path is not just '/' because deletion command is + if [[ -n "${work_dir}" && "${#work_dir}" -gt 3 ]]; then + if [[ -d "${work_dir}" ]]; then + rm -rf "${work_dir:?}/*" + fi + fi +} + +# Download takeover tool binary for device's architecture +# Requires $work_dir +# $1 -- takeover release version, like "v0.9.0" OR "latest" +# Exits script on download failure +function download_takeover_binary() { + release_version="$1" + + architecture=$(uname -m) + case ${architecture} in + aarch64|x86_64) + log "Using takeover arch ${architecture}" + ;; + *) + log ERROR "Takeover binary for arch: ${architecture} not found" + esac + + if [ "${release_version}" = "latest" ]; then + release_segment="latest/download" + else + release_segment="download/${release_version}" + fi + download_url="https://github.com/balena-os/takeover/releases/${release_segment}/takeover-${architecture}-unknown-linux-musl.tar.gz" + log "Downloading takeover binary ${download_url}" + + ${CURL} -o "${work_dir}/takeover.tar.gz" "$download_url" || log ERROR "Could not download takeover binary, aborting." + + # Extract and prepare for use + tar -C "${work_dir}" -zxvf "${work_dir}/takeover.tar.gz" + chown root:root "${work_dir}/takeover" + rm "${work_dir}/takeover.tar.gz" + + log "Download takeover binary success" +} + +# Download target balenaOS image; supports both public and private images when +# authorized. Also verifies sufficient disk space for download. +# Requires: +# $target_version; already verified +# $SLUG; already verified +# $CURL_NO_FAIL +# $work_dir; already verified -- must be in data partition +# Exits script on download failure +function download_target_image() { + log "Verifying target image size" + + # Download headers and verify content-length less than free space on data partition. + # Still on curl 7.82 so can't use "-w '%header{content-length}'" + status_code=$(\ + CURL_CA_BUNDLE="${TMPCRT}" ${CURL_NO_FAIL} -H "Authorization: Bearer ${APIKEY}" \ + --head -w "%{http_code}" \ + -o "${work_dir}/headers.txt" \ + "${API_ENDPOINT}/download?deviceType=${SLUG}&version=${target_version}&fileType=.gz" \ + 2>/dev/null \ + ) + if [[ "${status_code:0:1}" == "2" && -f "${work_dir}/headers.txt" ]]; then + # Use printf to avoid newline in var. + image_bytes=$( + awk 'BEGIN {FS=": "}/^content-length/{printf "%d",$2}' "${work_dir}/headers.txt" + ) + data_part_kb=$(df --output=avail /mnt/data/ | tail -n 1) + if [[ -n "${image_bytes}" && -n "${data_part_kb}" ]]; then + data_part_bytes=$(( data_part_kb * 1024 )) + log "Target image size: ${image_bytes}" + if [ "${image_bytes}" -ge "${data_part_bytes}" ]; then + log ERROR "Image too big for data partition: ${data_part_bytes}" + fi + fi + else + # Header retrieval ought to work, but not critical if it doesn't. + log WARN "Verify target image; code: ${status_code}" + fi + + log "Downloading target image" + status_code=$(\ + CURL_CA_BUNDLE="${TMPCRT}" ${CURL_NO_FAIL} -H "Authorization: Bearer ${APIKEY}" \ + -H "Content-Type: application/json" -w "%{http_code}" \ + --output "${work_dir}/balenaos.img.gz" \ + "${API_ENDPOINT}/download?deviceType=${SLUG}&version=${target_version}&fileType=.gz" \ + 2>/dev/null \ + ) + # expecting 200 response + if [ "${status_code:0:1}" == "2" ]; then + log "Download image success; code: ${status_code}" + else + log ERROR "Download image failed; code: ${status_code}" + fi + # sanity check + if [ ! -f "${work_dir}/balenaos.img.gz" ]; then + log ERROR "Target image not found" + fi +} + + +### +# Script start +### + +# If no arguments passed, just display the help +if [ $# -eq 0 ]; then + help + exit 0 +fi +# Log timer +starttime=$(date +%s) + +# For compatibility purposes +if [ -d "/mnt/data/resinhup" ] && [ ! -e "/mnt/data/balenahup" ]; then + ln -s "/mnt/data/resinhup" "/mnt/data/balenahup" +fi +# LOGFILE init and header +logtime=$(date +"%Y%m%d_%H%M%S") +LOGFILE="/mnt/data/balenahup/$SCRIPTNAME.${logtime}.log" +mkdir -p "$(dirname "$LOGFILE")" +log "================$SCRIPTNAME HEADER START====================" > "$LOGFILE" +date >> "$LOGFILE" + +log "Loading info from config.json" +if [ -f /mnt/boot/config.json ]; then + CONFIGJSON=/mnt/boot/config.json +else + log ERROR "Can't find config.json" +fi +# Use the user's api key if it exists rather than deviceApiKey; it means we haven't +# done the key exchange yet. +APIKEY=$(jq -r '.apiKey // .deviceApiKey' "${CONFIGJSON}") +UUID=$(jq -r '.uuid' "${CONFIGJSON}") +API_ENDPOINT=$(jq -r '.apiEndpoint' "${CONFIGJSON}") + +[ -z "${APIKEY}" ] && log ERROR "Error parsing config.json" +[ -z "${UUID}" ] && log ERROR "Error parsing config.json" +[ -z "${API_ENDPOINT}" ] && log ERROR "Error parsing config.json" + +# Create a certificate bundle file incorporating any CA provided in config.json. +# We create this primarily for use by curl. The OS started integrating this CA +# with v2.58, but use this variable in case we have an older version. +TMPCRT=$(mktemp) +jq -r '.balenaRootCA' < "${CONFIGJSON}" | base64 -d > "${TMPCRT}" +cat /etc/ssl/certs/ca-certificates.crt >> "${TMPCRT}" + +# Set up curl for use within a script, retry many times for reliability, follow +# redirects, and compress responses. +CURL="curl --silent --retry 10 --fail --location --compressed" +# Variant so can collect response status code +CURL_NO_FAIL="curl --silent --retry 10 --location --compressed" + +_err_handler(){ + log ERROR "Interrupted on error" +} + +_int_handler(){ + log ERROR "Interrupted" +} + +_term_handler(){ + log ERROR "Terminated" +} + +trap '_err_handler' ERR +trap '_int_handler' INT +trap '_term_handler' TERM + +if [ -x "$(command -v balena)" ]; then + DOCKER_CMD="balena" +else + DOCKER_CMD="docker" +fi + +# Redirect all logs to the logfile +outfifo=$(mktemp -u) +errfifo=$(mktemp -u) +mkfifo "${outfifo}" "${errfifo}" +# Read from the stdout FIFO and append to LOGFILE +cat "${outfifo}" >> "${LOGFILE}" & +# Read from the stderr FIFO, append to LOGFILE, and also display to terminal's stderr +cat "${errfifo}" >> "${LOGFILE}" & +# Redirect script's stdout and stderr to the respective FIFOs +exec >"${outfifo}" 2>"${errfifo}" + +# Parse arguments +while [[ $# -gt 0 ]]; do + arg="$1" + + case $arg in + -h|--help) + help + exit 0 + ;; + --force-slug) + if [ -z "$2" ]; then + log ERROR "\"$1\" argument needs a value." + fi + FORCED_SLUG=$2 + shift + ;; + --hostos-version) + if [ -z "$2" ]; then + log ERROR "\"$1\" argument needs a value." + fi + target_version=$2 + log "Raw target version: ${target_version}" + case $target_version in + *.prod) + target_version="${target_version%%.prod}" + log "Normalized target version: ${target_version}" + ;; + esac + shift + ;; + --balenaos-registry) + if [ -z "$2" ]; then + log ERROR "\"$1\" argument needs a value." + fi + shift + ;; + *) + log WARN "Unrecognized option $1." + ;; + esac + shift +done + +# Run on start +_prepare_locking +# Try to get the lock. Exit if unable; another instance is running already. +exlock_now || exit 9 + +if [ -z "$target_version" ]; then + log ERROR "--hostos-version is required." +fi + +progress 25 "Preparing OS update" + +# Retrieve slug (device type) from API, and use this value if not provided to the script. +FETCHED_SLUG=$(CURL_CA_BUNDLE="${TMPCRT}" ${CURL} -H "Authorization: Bearer ${APIKEY}" \ + "${API_ENDPOINT}/v6/device?\$select=is_of__device_type&\$expand=is_of__device_type(\$select=slug)&\$filter=uuid%20eq%20%27${UUID}%27" 2>/dev/null \ + | jq -r '.d[0].is_of__device_type[0].slug' + ) +SLUG=${FORCED_SLUG:-$FETCHED_SLUG} + +# Validate target version in semver (major > 1) or year.month.patch format +if [ -n "$target_version" ]; then + case $target_version in + [2-9].*|[1-9][0-9].*|2[0-9][0-9][0-9].*.*) + log "Target OS version \"$target_version\" OK." + ;; + *) + log ERROR "Target OS version \"$target_version\" not supported." + ;; + esac +else + log ERROR "No target OS version specified." +fi + +# Validate host OS version, similar to target_version above +case $VERSION in + [2-9].*|[1-9][0-9].*|2[0-9][0-9][0-9].*.*) + log "Host OS version \"$VERSION\" OK." + ;; + *) + log ERROR "Host OS version \"$VERSION\" not supported." + ;; +esac + +# Ensure working directory is empty; takeover binary does not cleanup on failure. +cleanup_work_dir + +# Verify data partition filesystem/storage is clean. Flashing over bad sectors +# may brick device. +data_device=$(mount |grep /mnt/data |awk '{print $1}' |head -n 1) +if [ -n "${data_device}" ]; then + e2fsck -n "${data_device}" > /dev/null + if [ $? -ge 4 ]; then + log ERROR "Filesystem check failed on data partition ${data_device}" + fi +fi + +# Ensure takeover environment is prepared. We'll add files here. MUST be in +# data partition since stores sizable downloaded OS image. Conserves memory for +# takeover binary. +work_dir="/mnt/data/takeover" +mkdir -p "${work_dir}" + +# Retrieve target hostOS image and takeover binary +download_target_image +download_takeover_binary "v0.9.0-dev.1" + +progress 50 "Running OS update" + +# Run takeover +# Must run from a writable directory; takeover creates temp files there +cd ${work_dir} + +# No need to specify config.json path; defaults to /mnt/boot. +# API check fails on BoB, so disabled +# We do *not* expect takeover to return on success; it flashes the device with +# the new balenaOS. +res=$(./takeover -i balenaos.img.gz \ + --no-ack --no-nwmgr-check --no-os-check \ + --log-level debug --fallback-log --fallback-log-file "advanced.${logtime}.log" \ + --s2-log-level debug --report-hup-progress) +log ERROR "Takeover result ${res}; OS not updated"