From c0d0207a21c8ae378e655cc174f03bc4822170aa Mon Sep 17 00:00:00 2001 From: mgunnala <113645315+mgunnala@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:34:44 -0500 Subject: [PATCH] Add retry logic for ExtPolicy "ResourceNotFound" test failure (#3311) * Add retry and documentation * Fix comments * Fix comments * Address review comments 1 --- tests_e2e/tests/ext_policy/ext_policy.py | 103 ++++++++++++++++++----- 1 file changed, 83 insertions(+), 20 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py index 97b78df62..c0da0c502 100644 --- a/tests_e2e/tests/ext_policy/ext_policy.py +++ b/tests_e2e/tests/ext_policy/ext_policy.py @@ -18,6 +18,7 @@ # import json import re +import time import uuid import os from typing import List, Dict, Any @@ -47,11 +48,11 @@ def _create_policy_file(self, policy): Create policy json file and copy to /etc/waagent_policy.json on test machine. """ unique_id = uuid.uuid4() - file_path = "/tmp/waagent_policy_{0}.json".format(unique_id) + file_path = f"/tmp/waagent_policy_{unique_id}.json" with open(file_path, mode='w') as policy_file: json.dump(policy, policy_file, indent=4) policy_file.flush() - log.info("Policy file contents: {0}".format(json.dumps(policy, indent=4))) + log.info(f"Policy file contents: {json.dumps(policy, indent=4)}") remote_path = "/tmp/waagent_policy.json" local_path = policy_file.name @@ -61,30 +62,96 @@ def _create_policy_file(self, policy): self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True) os.remove(file_path) + @staticmethod + def __enable_extension(extension_case, timeout=None): + """Helper to call 'enable' with appropriate parameters.""" + args = {"settings": extension_case.settings} + + # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. + # For all other extensions, always set force_update to true. + if not isinstance(extension_case.extension, VirtualMachineRunCommandClient): + args["force_update"] = True + + # Add timeout only if specified, else use default + if timeout is not None: + args["timeout"] = timeout + + extension_case.extension.enable(**args) + + @staticmethod + def _enable_should_succeed_with_retry(extension_case, retry_on_error, retries=2): + """ + This method was created to work around an intermittent failure for test case 4. Attempts the 'enable' operation, + retrying after a short delay if the error message contains the specified string 'retry_on_error'. + + On test case 4, the Azure SDK/ARM occasionally returns a 'ResourceNotFound' error for the 'enable' operation (#3), + even though all 3 operations in the test case succeed at the agent and CRP level. + 1. Block CSE with policy -> delete CSE (ARM/CRP continues polling until timeout, test moves to next operation) + 2. Allow CSE with policy -> delete CSE (succeeds) + 3. Enable CSE -> SDK returns ResourceNotFound error + + Error details: + (ResourceNotFound) The Resource 'Microsoft.Compute/virtualMachines/lisa-WALinuxAgent-20250124-090654-780-e56-n0/extensions/CustomScript' under resource group 'lisa-WALinuxAgent-20250124-090654-780-e56' was not found. For more details please go to https://aka.ms/ARMResourceNotFoundFix + Code: ResourceNotFound + Message: The Resource 'Microsoft.Compute/virtualMachines/lisa-WALinuxAgent-20250124-090654-780-e56-n0/extensions/CustomScript' under resource group 'lisa-WALinuxAgent-20250124-090654-780-e56' was not found. For more details please go to https://aka.ms/ARMResourceNotFoundFix + + The suspected cause is that ARM receives the enable request (#3) before the second delete operation (#2) has + completed at the ARM level, leading to a conflict. When the first delete operation (#1) fails, the agent reports + a failure status for the extension, but CRP continues to wait for the agent to stop reporting status for that + extension. Once the second delete operation (#2) succeeds, the agent stops reporting status for that extension, + so ARM reports success for the *first* delete operation and reports that the second delete is still in progress. + Consequently, the enable request (#3) is accepted by ARM but conflicts with the ongoing delete operation (#2), + causing the SDK to report a ResourceNotFound error. + + To work around this issue, we retry 'enable' a few times if the string 'ResourceNotFound' is found in the error message. + If the issue continues after retrying, another possible workaround is to wait for the full CRP timeout for delete #1. + """ + log.info("") + log.info(f"Attempting to enable {extension_case.extension}, expected to succeed") + error = None + for attempt in range(retries): + try: + ExtPolicy.__enable_extension(extension_case) + extension_case.extension.assert_instance_view() + log.info(f"Operation 'enable' for {extension_case.extension} succeeded.") + return + + except Exception as e: + error = e + # Only retry if the specified string is found in the error message. + if retry_on_error in str(e): + log.warning(f"Operation 'enable' failed with a {retry_on_error} error on attempt {attempt + 1}, retrying in 30 secs. Error: {e}") + time.sleep(30) + else: + fail( + f"Unexpected error while trying to enable {extension_case.extension}. " + f"Extension is allowed by policy so this operation should have completed successfully.\n" + f"Error: {e}") + + fail(f"Enable {extension_case.extension} failed after {retries} retries. Last error: {error}") + def _operation_should_succeed(self, operation, extension_case): log.info("") - log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed") + log.info(f"Attempting to {operation} {extension_case.extension}, expected to succeed") # Attempt operation. If enabling, assert that the extension is present in instance view. # If deleting, assert that the extension is not present in instance view. try: if operation == "enable": - # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. - if isinstance(extension_case.extension, VirtualMachineRunCommandClient): - extension_case.extension.enable(settings=extension_case.settings) - else: - extension_case.extension.enable(settings=extension_case.settings, force_update=True) + ExtPolicy.__enable_extension(extension_case) extension_case.extension.assert_instance_view() + elif operation == "delete": extension_case.extension.delete() instance_view_extensions = self._context.vm.get_instance_view().extensions if instance_view_extensions is not None and any( e.name == extension_case.extension._resource_name for e in instance_view_extensions): - raise Exception( - "extension {0} still in instance view after attempting to delete".format(extension_case.extension)) - log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.") + raise Exception(f"extension {extension_case.extension} still in instance view after attempting to delete") + + log.info(f"Operation '{operation}' for {extension_case.extension} succeeded as expected.") + except Exception as error: fail( - f"Unexpected error while trying to {operation} {extension_case.extension.__str__()}. " + f"Unexpected error while trying to {operation} {extension_case.extension}. " f"Extension is allowed by policy so this operation should have completed successfully.\n" f"Error: {error}") @@ -94,15 +161,11 @@ def _operation_should_fail(self, operation, extension_case): try: log.info(f"Attempting to enable {extension_case.extension}, should fail fast due to policy.") timeout = (6 * 60) # Fail fast. - # VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter. - if isinstance(extension_case.extension, VirtualMachineRunCommandClient): - extension_case.extension.enable(settings=extension_case.settings, timeout=timeout) - else: - extension_case.extension.enable(settings=extension_case.settings, force_update=True, - timeout=timeout) + ExtPolicy.__enable_extension(extension_case, timeout) fail( f"The agent should have reported an error trying to {operation} {extension_case.extension} " f"because the extension is disallowed by policy.") + except Exception as error: # We exclude the extension name from regex because CRP sometimes installs test extensions with different # names (ex: Microsoft.Azure.Extensions.Edp.RunCommandHandlerLinuxTest instead of Microsoft.CPlat.Core.RunCommandHandlerLinux) @@ -128,6 +191,7 @@ def _operation_should_fail(self, operation, extension_case): extension_case.extension.delete(timeout=timeout) fail(f"CRP should not have successfully completed the delete operation for {extension_case.extension} " f"because the extension is disallowed by policy and agent should have reported a policy failure.") + except TimeoutError: log.info("Delete operation did not complete, as expected. Checking instance view " "and agent log to confirm that delete operation failed due to policy.") @@ -305,7 +369,7 @@ def run(self): # Since CustomScript is marked for deletion by previous test case, we can only retry the delete operation (enable # is not allowed by CRP). So we first delete successfully, and then re-install/enable CustomScript. self._operation_should_succeed("delete", custom_script) - self._operation_should_succeed("enable", custom_script) + self._enable_should_succeed_with_retry(extension_case=custom_script, retry_on_error="ResourceNotFound") # Cleanup after test: disable policy enforcement in conf file. log.info("") @@ -314,7 +378,6 @@ def run(self): self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) log.info("*** Test cleanup complete.") - def get_ignore_error_rules(self) -> List[Dict[str, Any]]: ignore_rules = [ # 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified as an allowed extension. To enable, add the extension to the list of allowed extensions in the policy file ('/etc/waagent_policy.json')., duration=0