From 753d7970f552e62d09670b1fee6b68f47f908173 Mon Sep 17 00:00:00 2001
From: maddieford <93676569+maddieford@users.noreply.github.com>
Date: Tue, 4 Feb 2025 11:39:41 -0500
Subject: [PATCH] Improve ext dependencies scenarios to share scaleset (#3312)

* Improve ext dependencies scenarios to share scaleset

* Add logging for policy deletion
---
 .../ext_policy_with_dependencies.py           | 340 +++++++++---------
 .../ext_sequencing/ext_seq_test_cases.py      |   1 +
 2 files changed, 174 insertions(+), 167 deletions(-)

diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py
index c4f798848..5830d63e5 100644
--- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py
+++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py
@@ -85,184 +85,190 @@ def _create_policy_file(ssh_client, policy):
 
     def run(self):
 
-        # Set up the test run
         instances_ip_address: List[VmssInstanceIpAddress] = self._context.vmss.get_instances_ip_address()
         ssh_clients: Dict[str, SshClient] = {}
         for instance in instances_ip_address:
-            ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, username=self._context.username, identity_file=self._context.identity_file)
-
-        # Cleanup any extensions left behind by other tests, as they may be blocked by policy and erroneously cause failures.
-        instance_view_ext = self._context.vmss.get_instance_view().extensions
-        if instance_view_ext is not None and len(instance_view_ext) > 0:
-            for ex in instance_view_ext:
-                self._context.vmss.delete_extension(ex.name)
-
-        # Enable policy via conf file.
-        for ssh_client in ssh_clients.values():
-            ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True)
-
-        if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()):
-            raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro")
-
-        # This is the base ARM template that's used for deploying extensions for this scenario.
-        base_extension_template = {
-            "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json",
-            "contentVersion": "1.0.0.0",
-            "resources": [
-                {
-                    "type": "Microsoft.Compute/virtualMachineScaleSets",
-                    "name": f"{self._context.vmss.name}",
-                    "location": "[resourceGroup().location]",
-                    "apiVersion": "2018-06-01",
-                    "properties": {
-                        "virtualMachineProfile": {
-                            "extensionProfile": {
-                                "extensions": []
+            ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address,
+                                                            username=self._context.username,
+                                                            identity_file=self._context.identity_file)
+
+        try:
+            # Clean up any extensions left behind by other tests, as they may be blocked by policy and erroneously cause failures.
+            instance_view_ext = self._context.vmss.get_instance_view().extensions
+            if instance_view_ext is not None and len(instance_view_ext) > 0:
+                for ex in instance_view_ext:
+                    self._context.vmss.delete_extension(ex.name)
+
+            # Enable policy via conf file.
+            for ssh_client in ssh_clients.values():
+                ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True)
+
+            if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()):
+                raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro")
+
+            # This is the base ARM template that's used for deploying extensions for this scenario.
+            base_extension_template = {
+                "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json",
+                "contentVersion": "1.0.0.0",
+                "resources": [
+                    {
+                        "type": "Microsoft.Compute/virtualMachineScaleSets",
+                        "name": f"{self._context.vmss.name}",
+                        "location": "[resourceGroup().location]",
+                        "apiVersion": "2018-06-01",
+                        "properties": {
+                            "virtualMachineProfile": {
+                                "extensionProfile": {
+                                    "extensions": []
+                                }
                             }
                         }
                     }
-                }
-            ]
-        }
-
-        for case in self._test_cases:
-            log.info("")
-            log.info("*** Test case: {0}".format(case.__name__.replace('_', ' ')))
-            test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip()
-            if self._scenario_start == datetime.min:
-                self._scenario_start = test_case_start
-            log.info("Test case start time: {0}".format(test_case_start))
-
-            # Assign unique guid to forceUpdateTag for each extension to make sure they're always unique to force CRP
-            # to generate a new sequence number each time
-            test_guid = str(uuid.uuid4())
-            policy, extensions, expected_errors, deletion_order = case()
-            for ext in extensions:
-                ext["properties"].update({
-                    "forceUpdateTag": test_guid
-                })
-
-            # We update the extension template here with extensions that are specific to the scenario that we want to
-            # test out
-            ext_template = copy.deepcopy(base_extension_template)
-            ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][
-                'extensions'] = extensions
-
-            # Log the dependencies for the extensions in this test case
-            for ext in extensions:
-                provisioned_after = ext['properties'].get('provisionAfterExtensions')
-                depends_on = provisioned_after if provisioned_after else []
-                if depends_on:
-                    dependency_list = ' and '.join(depends_on)
-                    log.info("{0} depends on {1}".format(ext['name'], dependency_list))
-                else:
-                    log.info("{0} does not depend on any extension".format(ext['name']))
-
-            # Copy policy file to each VM instance
-            log.info("Updating policy file with new policy: {0}".format(policy))
-            for ssh_client in ssh_clients.values():
-                self._create_policy_file(ssh_client, policy)
-
-            log.info("Deploying extensions to the scale set...")
-            rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription,
-                                            self._context.vmss.resource_group, self._context.vmss.location)
-
-            # Deploy updated extension template to the scale set.
-            # If test case is supposed to fail, assert that the operation fails with the expected error messages.
-            try:
-                rg_client.deploy_template(template=ext_template)
-                if expected_errors is not None and len(expected_errors) != 0:
-                    fail("Extension deployment was expected to fail with the following errors: {0}".format(expected_errors))
-                log.info("Extension deployment succeeded as expected")
-                log.info("")
-            except Exception as e:
-                if expected_errors is None or len(expected_errors) == 0:
-                    fail("Extension template deployment unexpectedly failed: {0}".format(e))
-                else:
-                    deployment_failure_pattern = r"[\s\S]*\"code\":\s*\"ResourceDeploymentFailure\"[\s\S]*\"details\":\s*\[\s*(?P<error>[\s\S]*)\]"
-                    deployment_failure_match = re.match(deployment_failure_pattern, str(e))
-                    try:
-                        if deployment_failure_match is None:
-                            raise Exception("Unable to match a ResourceDeploymentFailure")
-                        error_json = json.loads(deployment_failure_match.group("error"))
-                        error_message = error_json['message']
-                    except Exception as parse_exc:
-                        fail("Extension template deployment failed as expected, but there was an error in parsing the failure. Parsing failure: {0}\nDeployment Failure: {1}".format(parse_exc, e))
-
-                    for phrase in expected_errors:
-                        if phrase not in error_message:
-                            fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e))
-
-                log.info("Extensions failed as expected.")
-                log.info("")
-                log.info("Expected errors:")
-                for expected_error in expected_errors:
-                    log.info(" - {0}".format(expected_error))
-                log.info("")
+                ]
+            }
+
+            for case in self._test_cases:
                 log.info("")
-                log.info("Actual errors:")
-                log.info(str(e))
-
-            # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall
-            # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly.
-            # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first
-            # update policy to allow all, re-enable all extensions, and then delete them in dependency order.
-            log.info("Starting cleanup for test case...")
-            allow_all_policy = \
-                {
-                    "policyVersion": "0.1.0",
-                    "extensionPolicies": {
-                        "allowListedExtensionsOnly": False
+                log.info("*** Test case: {0}".format(case.__name__.replace('_', ' ')))
+                test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip()
+                if self._scenario_start == datetime.min:
+                    self._scenario_start = test_case_start
+                log.info("Test case start time: {0}".format(test_case_start))
+
+                # Assign unique guid to forceUpdateTag for each extension to make sure they're always unique to force CRP
+                # to generate a new sequence number each time
+                test_guid = str(uuid.uuid4())
+                policy, extensions, expected_errors, deletion_order = case()
+                for ext in extensions:
+                    ext["properties"].update({
+                        "forceUpdateTag": test_guid
+                    })
+
+                # We update the extension template here with extensions that are specific to the scenario that we want to
+                # test out
+                ext_template = copy.deepcopy(base_extension_template)
+                ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][
+                    'extensions'] = extensions
+
+                # Log the dependencies for the extensions in this test case
+                for ext in extensions:
+                    provisioned_after = ext['properties'].get('provisionAfterExtensions')
+                    depends_on = provisioned_after if provisioned_after else []
+                    if depends_on:
+                        dependency_list = ' and '.join(depends_on)
+                        log.info("{0} depends on {1}".format(ext['name'], dependency_list))
+                    else:
+                        log.info("{0} does not depend on any extension".format(ext['name']))
+
+                # Copy policy file to each VM instance
+                log.info("Updating policy file with new policy: {0}".format(policy))
+                for ssh_client in ssh_clients.values():
+                    self._create_policy_file(ssh_client, policy)
+
+                log.info("Deploying extensions to the scale set...")
+                rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription,
+                                                self._context.vmss.resource_group, self._context.vmss.location)
+
+                # Deploy updated extension template to the scale set.
+                # If test case is supposed to fail, assert that the operation fails with the expected error messages.
+                try:
+                    rg_client.deploy_template(template=ext_template)
+                    if expected_errors is not None and len(expected_errors) != 0:
+                        fail("Extension deployment was expected to fail with the following errors: {0}".format(expected_errors))
+                    log.info("Extension deployment succeeded as expected")
+                    log.info("")
+                except Exception as e:
+                    if expected_errors is None or len(expected_errors) == 0:
+                        fail("Extension template deployment unexpectedly failed: {0}".format(e))
+                    else:
+                        deployment_failure_pattern = r"[\s\S]*\"code\":\s*\"ResourceDeploymentFailure\"[\s\S]*\"details\":\s*\[\s*(?P<error>[\s\S]*)\]"
+                        deployment_failure_match = re.match(deployment_failure_pattern, str(e))
+                        try:
+                            if deployment_failure_match is None:
+                                raise Exception("Unable to match a ResourceDeploymentFailure")
+                            error_json = json.loads(deployment_failure_match.group("error"))
+                            error_message = error_json['message']
+                        except Exception as parse_exc:
+                            fail("Extension template deployment failed as expected, but there was an error in parsing the failure. Parsing failure: {0}\nDeployment Failure: {1}".format(parse_exc, e))
+
+                        for phrase in expected_errors:
+                            if phrase not in error_message:
+                                fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e))
+
+                    log.info("Extensions failed as expected.")
+                    log.info("")
+                    log.info("Expected errors:")
+                    for expected_error in expected_errors:
+                        log.info(" - {0}".format(expected_error))
+                    log.info("")
+                    log.info("")
+                    log.info("Actual errors:")
+                    log.info(str(e))
+
+                # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall
+                # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly.
+                # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first
+                # update policy to allow all, re-enable all extensions, and then delete them in dependency order.
+                log.info("Starting cleanup for test case...")
+                allow_all_policy = \
+                    {
+                        "policyVersion": "0.1.0",
+                        "extensionPolicies": {
+                            "allowListedExtensionsOnly": False
+                        }
                     }
-                }
-            for ssh_client in ssh_clients.values():
-                self._create_policy_file(ssh_client, allow_all_policy)
-
-            log.info("Trying to re-enable before deleting extensions...")
-            for ext in extensions:
-                ext["properties"].update({
-                    "forceUpdateTag": str(uuid.uuid4())
-                })
-            ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][
-                'extensions'] = extensions
-            enable_start_time = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip()
-            try:
-                rg_client.deploy_template(template=ext_template)
-            except Exception as err:
-                # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new
-                # sequence number. Only for cases testing no-config extension dependencies, swallow the CRP error and
-                # check agent log instead to confirm that extensions were enabled successfully.
-                test_cases_to_work_around = [
-                    _should_fail_single_config_depends_on_disallowed_no_config
-                ]
-                if case in test_cases_to_work_around:
-                    log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. "
-                             "Error: {0}".format(err))
-                    time.sleep(2 * 60)  # Give extensions some time to finish processing.
-                    extension_list = ' '.join([str(e) for e in deletion_order])
-                    command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' "
-                               f"--operation 'enable' --extension-list {extension_list}")
-                    for ssh_client in ssh_clients.values():
-                        ssh_client.run_command(command, use_sudo=True)
-                    log.info("Agent reported successful status for all extensions, enable succeeded.")
-                else:
-                    fail("Failed to re-enable extensions after allowing with policy.")
-
-            # Delete all extensions in dependency order.
-            for ext_to_delete in deletion_order:
-                ext_name_to_delete = ext_to_delete.type
+                for ssh_client in ssh_clients.values():
+                    self._create_policy_file(ssh_client, allow_all_policy)
+
+                log.info("Trying to re-enable before deleting extensions...")
+                for ext in extensions:
+                    ext["properties"].update({
+                        "forceUpdateTag": str(uuid.uuid4())
+                    })
+                ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][
+                    'extensions'] = extensions
+                enable_start_time = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip()
                 try:
-                    self._context.vmss.delete_extension(ext_name_to_delete)
-                except Exception as crp_err:
-                    fail("Failed to uninstall extension {0}. Exception: {1}".format(ext_name_to_delete, crp_err))
-                log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete))
+                    rg_client.deploy_template(template=ext_template)
+                except Exception as err:
+                    # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new
+                    # sequence number. Only for cases testing no-config extension dependencies, swallow the CRP error and
+                    # check agent log instead to confirm that extensions were enabled successfully.
+                    test_cases_to_work_around = [
+                        _should_fail_single_config_depends_on_disallowed_no_config
+                    ]
+                    if case in test_cases_to_work_around:
+                        log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. "
+                                 "Error: {0}".format(err))
+                        time.sleep(2 * 60)  # Give extensions some time to finish processing.
+                        extension_list = ' '.join([str(e) for e in deletion_order])
+                        command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' "
+                                   f"--operation 'enable' --extension-list {extension_list}")
+                        for ssh_client in ssh_clients.values():
+                            ssh_client.run_command(command, use_sudo=True)
+                        log.info("Agent reported successful status for all extensions, enable succeeded.")
+                    else:
+                        fail("Failed to re-enable extensions after allowing with policy.")
+
+                # Delete all extensions in dependency order.
+                for ext_to_delete in deletion_order:
+                    ext_name_to_delete = ext_to_delete.type
+                    try:
+                        self._context.vmss.delete_extension(ext_name_to_delete)
+                    except Exception as crp_err:
+                        fail("Failed to uninstall extension {0}. Exception: {1}".format(ext_name_to_delete, crp_err))
+                    log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete))
 
-            log.info("Successfully removed all extensions from VMSS")
-            log.info("---------------------------------------------")
+                log.info("Successfully removed all extensions from VMSS")
+                log.info("---------------------------------------------")
 
-        # Disable policy via conf file.
-        for ssh_client in ssh_clients.values():
-            ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True)
+        finally:
+            # Disable policy via conf file and delete policy file.
+            for ssh_client in ssh_clients.values():
+                ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True)
+                ssh_client.run_command("rm -f /etc/waagent_policy.json", use_sudo=True)
+                log.info("")
+                log.info("Successfully disabled policy via config (Debug.EnableExtensionPolicy=n) and removed policy file at /etc/waagent_policy.json")
 
     def get_ignore_errors_before_timestamp(self) -> datetime:
         # Ignore errors in the agent log before the first test case starts
diff --git a/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py b/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py
index d1c942d0a..0f1e926e3 100644
--- a/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py
+++ b/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py
@@ -14,6 +14,7 @@ def add_one_dependent_ext_without_settings():
         {
             "name": "CustomScript",
             "properties": {
+                "provisionAfterExtensions": [],
                 "publisher": "Microsoft.Azure.Extensions",
                 "type": "CustomScript",
                 "typeHandlerVersion": "2.1",