From ae1c53826aa615b296ec8ac1dbe480f3adcdbf7d Mon Sep 17 00:00:00 2001
From: Tim Meusel
Date: Mon, 22 Apr 2024 15:18:40 +0200
Subject: [PATCH] Add plan for Puppet agent state summary

---
 README.md                    | 23 +++++++++++++
 REFERENCE.md                 | 19 +++++++++++
 plans/agent_state_summary.pp | 63 ++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)
 create mode 100644 plans/agent_state_summary.pp

diff --git a/README.md b/README.md
index f39638e1..27e536f6 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,29 @@ environment.
 You can plot it in a more human-readable way with the
 [puppet/format](https://github.com/voxpupuli/puppet-format?tab=readme-ov-file#puppet-format) modules.
+
+The plan `pe_status_check::agent_state_summary` provides a hash of all nodes, grouped by failure state:
+
+```json
+{
+  "noop" : [ ],
+  "failed" : [ ],
+  "changed" : [ "student2.local" ],
+  "unresponsive" : [ "student3.local", "student4.local", "student1.local", "login.local" ],
+  "corrective_changes" : [ ],
+  "used_cached_catalog" : [ ]
+}
+```
+
+* `noop`: the last catalog was applied in noop mode
+* `failed`: the last catalog couldn't be compiled, or applying it raised an error
+* `changed`: the node reported a change
+* `unresponsive`: the last report is older than 30 minutes (configurable via the `runinterval` parameter)
+* `corrective_changes`: the node reported corrective changes
+* `used_cached_catalog`: the node didn't apply a new catalog but used a cached version
+
+The plan is intended to be run before major upgrades, to ensure that your agents are in a healthy state.
+
 ### Using a Puppet Query to report status.
 
 As the pe_status_check module uses Puppet's existing fact behavior to gather the status data from each of the agents,
 it is possible to use PQL (puppet query language) to gather this information.
diff --git a/REFERENCE.md b/REFERENCE.md
index 7e8067c0..fb723411 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -11,6 +11,7 @@
 
 ### Plans
 
+* [`pe_status_check::agent_state_summary`](#pe_status_check--agent_state_summary): provides an overview of all Puppet agents and their error states
 * [`pe_status_check::agent_summary`](#pe_status_check--agent_summary): Summary report of the state of agent_status_check on each node
 Uses the facts task to get the current status from each node and produces
 a summary report in JSON
@@ -84,6 +85,24 @@ Default value: `true`
 
 ## Plans
 
+### `pe_status_check::agent_state_summary`
+
+provides an overview of all Puppet agents and their error states
+
+#### Parameters
+
+The following parameters are available in the `pe_status_check::agent_state_summary` plan:
+
+* [`runinterval`](#-pe_status_check--agent_state_summary--runinterval)
+
+##### `runinterval`
+
+Data type: `Integer[0]`
+
+The runinterval for the Puppet agent, in minutes. Nodes whose latest report is older than the runinterval are considered unresponsive
+
+Default value: `30`
+
 ### `pe_status_check::agent_summary`
 
 Summary report of the state of agent_status_check on each node
diff --git a/plans/agent_state_summary.pp b/plans/agent_state_summary.pp
new file mode 100644
index 00000000..aebebfaa
--- /dev/null
+++ b/plans/agent_state_summary.pp
@@ -0,0 +1,63 @@
+#
+# @summary provides an overview of all Puppet agents and their error states
+#
+# @param runinterval The runinterval for the Puppet agent, in minutes. Nodes whose latest report is older than the runinterval are considered unresponsive
+#
+# @author Tim Meusel
+#
+plan pe_status_check::agent_state_summary (
+  Integer[0] $runinterval = 30,
+){
+  # a list of all nodes and their latest catalog state
+  $nodes = puppetdb_query('nodes[certname,latest_report_noop,latest_report_corrective_change,cached_catalog_status,latest_report_status,report_timestamp]{}')
+
+  # check if the last report is older than the runinterval
+  $current_timestamp = Integer(Timestamp().strftime('%s'))
+  $runinterval_seconds = $runinterval * 60
+  $unresponsive = $nodes.map |$node| {
+    $old_timestamp = Integer(Timestamp($node['report_timestamp']).strftime('%s'))
+    if ($current_timestamp - $old_timestamp) >= $runinterval_seconds {
+      $node['certname']
+    }
+  }.delete_undef_values
+
+  # all nodes that delivered a report in time
+  $responsive = $nodes.map |$node| { $node['certname'] } - $unresponsive
+
+  # all nodes that used noop for the last catalog
+  $noop = $nodes.map |$node| { if ($node['latest_report_noop'] == true) { $node['certname'] } }.delete_undef_values
+
+  # all nodes that reported corrective changes
+  $corrective_changes = $nodes.map |$node| { if ($node['latest_report_corrective_change'] == true) { $node['certname'] } }.delete_undef_values
+
+  # all nodes that used a cached catalog on the last run
+  $used_cached_catalog = $nodes.map |$node| { if ($node['cached_catalog_status'] != 'not_used') { $node['certname'] } }.delete_undef_values
+
+  # all nodes with failed resources in the last report
+  $failed = $nodes.map |$node| { if ($node['latest_report_status'] == 'failed') { $node['certname'] } }.delete_undef_values
+
+  # all nodes with changes in the last report
+  $changed = $nodes.map |$node| { if ($node['latest_report_status'] == 'changed') { $node['certname'] } }.delete_undef_values
+
+  # all nodes that aren't healthy in any form
+  $unhealthy = [$noop, $corrective_changes, $used_cached_catalog, $failed, $changed, $unresponsive].flatten.unique
+
+  # all healthy nodes
+  $healthy = $nodes.map |$node| { $node['certname'] } - $unhealthy
+
+  $data = {
+    'noop'                => $noop,
+    'corrective_changes'  => $corrective_changes,
+    'used_cached_catalog' => $used_cached_catalog,
+    'failed'              => $failed,
+    'changed'             => $changed,
+    'unresponsive'        => $unresponsive,
+    'responsive'          => $responsive,
+    'unhealthy'           => $unhealthy,
+    'unhealthy_counter'   => $unhealthy.count,
+    'healthy'             => $healthy,
+    'healthy_counter'     => $healthy.count,
+  }
+
+  return $data
+}
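
To use the summary as a gate before an upgrade, the plan can be wrapped in another plan. The sketch below is an illustration only and not part of this patch: the plan name `mymodule::upgrade_preflight` is hypothetical, while `run_plan`, `fail_plan`, and `out::message` are standard Bolt plan functions. The summary plan can also be run directly with `bolt plan run pe_status_check::agent_state_summary`.

```puppet
# Hypothetical wrapper plan (not part of this patch): run the summary and
# abort when any agent is unhealthy, otherwise continue with the upgrade.
plan mymodule::upgrade_preflight (
  Integer[0] $runinterval = 30,
) {
  # run_plan() returns the hash that agent_state_summary returns
  $summary = run_plan('pe_status_check::agent_state_summary', { 'runinterval' => $runinterval })

  if $summary['unhealthy_counter'] > 0 {
    fail_plan("Unhealthy agents found, aborting: ${summary['unhealthy'].join(', ')}")
  }

  out::message("All ${summary['healthy_counter']} agents are healthy, continuing")
}
```

Note that `pe_status_check::agent_state_summary` takes no targets; it reads everything from PuppetDB via `puppetdb_query`, so it only needs to run where a PuppetDB connection is configured.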