Skip to content

Commit

Permalink
Add retry logic for ExtPolicy "ResourceNotFound" test failure (#3311)
Browse files Browse the repository at this point in the history
* Add retry and documentation

* Fix comments

* Fix comments

* Address review comments 1
  • Loading branch information
mgunnala authored Jan 31, 2025
1 parent 0c8db73 commit c0d0207
Showing 1 changed file with 83 additions and 20 deletions.
103 changes: 83 additions & 20 deletions tests_e2e/tests/ext_policy/ext_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#
import json
import re
import time
import uuid
import os
from typing import List, Dict, Any
Expand Down Expand Up @@ -47,11 +48,11 @@ def _create_policy_file(self, policy):
Create policy json file and copy to /etc/waagent_policy.json on test machine.
"""
unique_id = uuid.uuid4()
file_path = "/tmp/waagent_policy_{0}.json".format(unique_id)
file_path = f"/tmp/waagent_policy_{unique_id}.json"
with open(file_path, mode='w') as policy_file:
json.dump(policy, policy_file, indent=4)
policy_file.flush()
log.info("Policy file contents: {0}".format(json.dumps(policy, indent=4)))
log.info(f"Policy file contents: {json.dumps(policy, indent=4)}")

remote_path = "/tmp/waagent_policy.json"
local_path = policy_file.name
Expand All @@ -61,30 +62,96 @@ def _create_policy_file(self, policy):
self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True)
os.remove(file_path)

@staticmethod
def __enable_extension(extension_case, timeout=None):
"""Helper to call 'enable' with appropriate parameters."""
args = {"settings": extension_case.settings}

# VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter.
# For all other extensions, always set force_update to true.
if not isinstance(extension_case.extension, VirtualMachineRunCommandClient):
args["force_update"] = True

# Add timeout only if specified, else use default
if timeout is not None:
args["timeout"] = timeout

extension_case.extension.enable(**args)

@staticmethod
def _enable_should_succeed_with_retry(extension_case, retry_on_error, retries=2):
"""
This method was created to work around an intermittent failure for test case 4. Attempts the 'enable' operation,
retrying after a short delay if the error message contains the specified string 'retry_on_error'.
On test case 4, the Azure SDK/ARM occasionally returns a 'ResourceNotFound' error for the 'enable' operation (#3),
even though all 3 operations in the test case succeed at the agent and CRP level.
1. Block CSE with policy -> delete CSE (ARM/CRP continues polling until timeout, test moves to next operation)
2. Allow CSE with policy -> delete CSE (succeeds)
3. Enable CSE -> SDK returns ResourceNotFound error
Error details:
(ResourceNotFound) The Resource 'Microsoft.Compute/virtualMachines/lisa-WALinuxAgent-20250124-090654-780-e56-n0/extensions/CustomScript' under resource group 'lisa-WALinuxAgent-20250124-090654-780-e56' was not found. For more details please go to https://aka.ms/ARMResourceNotFoundFix
Code: ResourceNotFound
Message: The Resource 'Microsoft.Compute/virtualMachines/lisa-WALinuxAgent-20250124-090654-780-e56-n0/extensions/CustomScript' under resource group 'lisa-WALinuxAgent-20250124-090654-780-e56' was not found. For more details please go to https://aka.ms/ARMResourceNotFoundFix
The suspected cause is that ARM receives the enable request (#3) before the second delete operation (#2) has
completed at the ARM level, leading to a conflict. When the first delete operation (#1) fails, the agent reports
a failure status for the extension, but CRP continues to wait for the agent to stop reporting status for that
extension. Once the second delete operation (#2) succeeds, the agent stops reporting status for that extension,
so ARM reports success for the *first* delete operation and reports that the second delete is still in progress.
Consequently, the enable request (#3) is accepted by ARM but conflicts with the ongoing delete operation (#2),
causing the SDK to report a ResourceNotFound error.
To work around this issue, we retry 'enable' a few times if the string 'ResourceNotFound' is found in the error message.
If the issue continues after retrying, another possible workaround is to wait for the full CRP timeout for delete #1.
"""
log.info("")
log.info(f"Attempting to enable {extension_case.extension}, expected to succeed")
error = None
for attempt in range(retries):
try:
ExtPolicy.__enable_extension(extension_case)
extension_case.extension.assert_instance_view()
log.info(f"Operation 'enable' for {extension_case.extension} succeeded.")
return

except Exception as e:
error = e
# Only retry if the specified string is found in the error message.
if retry_on_error in str(e):
log.warning(f"Operation 'enable' failed with a {retry_on_error} error on attempt {attempt + 1}, retrying in 30 secs. Error: {e}")
time.sleep(30)
else:
fail(
f"Unexpected error while trying to enable {extension_case.extension}. "
f"Extension is allowed by policy so this operation should have completed successfully.\n"
f"Error: {e}")

fail(f"Enable {extension_case.extension} failed after {retries} retries. Last error: {error}")

def _operation_should_succeed(self, operation, extension_case):
log.info("")
log.info(f"Attempting to {operation} {extension_case.extension.__str__()}, expected to succeed")
log.info(f"Attempting to {operation} {extension_case.extension}, expected to succeed")
# Attempt operation. If enabling, assert that the extension is present in instance view.
# If deleting, assert that the extension is not present in instance view.
try:
if operation == "enable":
# VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter.
if isinstance(extension_case.extension, VirtualMachineRunCommandClient):
extension_case.extension.enable(settings=extension_case.settings)
else:
extension_case.extension.enable(settings=extension_case.settings, force_update=True)
ExtPolicy.__enable_extension(extension_case)
extension_case.extension.assert_instance_view()

elif operation == "delete":
extension_case.extension.delete()
instance_view_extensions = self._context.vm.get_instance_view().extensions
if instance_view_extensions is not None and any(
e.name == extension_case.extension._resource_name for e in instance_view_extensions):
raise Exception(
"extension {0} still in instance view after attempting to delete".format(extension_case.extension))
log.info(f"Operation '{operation}' for {extension_case.extension.__str__()} succeeded as expected.")
raise Exception(f"extension {extension_case.extension} still in instance view after attempting to delete")

log.info(f"Operation '{operation}' for {extension_case.extension} succeeded as expected.")

except Exception as error:
fail(
f"Unexpected error while trying to {operation} {extension_case.extension.__str__()}. "
f"Unexpected error while trying to {operation} {extension_case.extension}. "
f"Extension is allowed by policy so this operation should have completed successfully.\n"
f"Error: {error}")

Expand All @@ -94,15 +161,11 @@ def _operation_should_fail(self, operation, extension_case):
try:
log.info(f"Attempting to enable {extension_case.extension}, should fail fast due to policy.")
timeout = (6 * 60) # Fail fast.
# VirtualMachineRunCommandClient (and VirtualMachineRunCommand) does not take force_update_tag as a parameter.
if isinstance(extension_case.extension, VirtualMachineRunCommandClient):
extension_case.extension.enable(settings=extension_case.settings, timeout=timeout)
else:
extension_case.extension.enable(settings=extension_case.settings, force_update=True,
timeout=timeout)
ExtPolicy.__enable_extension(extension_case, timeout)
fail(
f"The agent should have reported an error trying to {operation} {extension_case.extension} "
f"because the extension is disallowed by policy.")

except Exception as error:
# We exclude the extension name from regex because CRP sometimes installs test extensions with different
# names (ex: Microsoft.Azure.Extensions.Edp.RunCommandHandlerLinuxTest instead of Microsoft.CPlat.Core.RunCommandHandlerLinux)
Expand All @@ -128,6 +191,7 @@ def _operation_should_fail(self, operation, extension_case):
extension_case.extension.delete(timeout=timeout)
fail(f"CRP should not have successfully completed the delete operation for {extension_case.extension} "
f"because the extension is disallowed by policy and agent should have reported a policy failure.")

except TimeoutError:
log.info("Delete operation did not complete, as expected. Checking instance view "
"and agent log to confirm that delete operation failed due to policy.")
Expand Down Expand Up @@ -305,7 +369,7 @@ def run(self):
# Since CustomScript is marked for deletion by previous test case, we can only retry the delete operation (enable
# is not allowed by CRP). So we first delete successfully, and then re-install/enable CustomScript.
self._operation_should_succeed("delete", custom_script)
self._operation_should_succeed("enable", custom_script)
self._enable_should_succeed_with_retry(extension_case=custom_script, retry_on_error="ResourceNotFound")

# Cleanup after test: disable policy enforcement in conf file.
log.info("")
Expand All @@ -314,7 +378,6 @@ def run(self):
self._ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True)
log.info("*** Test cleanup complete.")


def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
ignore_rules = [
# 2024-10-24T17:34:20.808235Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Monitor.AzureMonitorLinuxAgent, op=None, message=[ExtensionPolicyError] Extension will not be processed: failed to run extension 'Microsoft.Azure.Monitor.AzureMonitorLinuxAgent' because it is not specified as an allowed extension. To enable, add the extension to the list of allowed extensions in the policy file ('/etc/waagent_policy.json')., duration=0
Expand Down

0 comments on commit c0d0207

Please sign in to comment.