From 753d7970f552e62d09670b1fee6b68f47f908173 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:39:41 -0500 Subject: [PATCH] Improve ext dependencies scenarios to share scaleset (#3312) * Improve ext dependencies scenarios to share scaleset * Add logging for policy deletion --- .../ext_policy_with_dependencies.py | 340 +++++++++--------- .../ext_sequencing/ext_seq_test_cases.py | 1 + 2 files changed, 174 insertions(+), 167 deletions(-) diff --git a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py index c4f798848..5830d63e5 100644 --- a/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py +++ b/tests_e2e/tests/ext_policy/ext_policy_with_dependencies.py @@ -85,184 +85,190 @@ def _create_policy_file(ssh_client, policy): def run(self): - # Set up the test run instances_ip_address: List[VmssInstanceIpAddress] = self._context.vmss.get_instances_ip_address() ssh_clients: Dict[str, SshClient] = {} for instance in instances_ip_address: - ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, username=self._context.username, identity_file=self._context.identity_file) - - # Cleanup any extensions left behind by other tests, as they may be blocked by policy and erroneously cause failures. - instance_view_ext = self._context.vmss.get_instance_view().extensions - if instance_view_ext is not None and len(instance_view_ext) > 0: - for ex in instance_view_ext: - self._context.vmss.delete_extension(ex.name) - - # Enable policy via conf file. 
- for ssh_client in ssh_clients.values(): - ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) - - if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()): - raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro") - - # This is the base ARM template that's used for deploying extensions for this scenario. - base_extension_template = { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json", - "contentVersion": "1.0.0.0", - "resources": [ - { - "type": "Microsoft.Compute/virtualMachineScaleSets", - "name": f"{self._context.vmss.name}", - "location": "[resourceGroup().location]", - "apiVersion": "2018-06-01", - "properties": { - "virtualMachineProfile": { - "extensionProfile": { - "extensions": [] + ssh_clients[instance.instance_name] = SshClient(ip_address=instance.ip_address, + username=self._context.username, + identity_file=self._context.identity_file) + + try: + # Cleanup any extensions left behind by other tests, as they may be blocked by policy and erroneously cause failures. + instance_view_ext = self._context.vmss.get_instance_view().extensions + if instance_view_ext is not None and len(instance_view_ext) > 0: + for ex in instance_view_ext: + self._context.vmss.delete_extension(ex.name) + + # Enable policy via conf file. + for ssh_client in ssh_clients.values(): + ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=y", use_sudo=True) + + if not VmExtensionIds.AzureMonitorLinuxAgent.supports_distro(next(iter(ssh_clients.values())).run_command("get_distro.py").rstrip()): + raise TestSkipped("Currently AzureMonitorLinuxAgent is not supported on this distro") + + # This is the base ARM template that's used for deploying extensions for this scenario. 
+ base_extension_template = { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json", + "contentVersion": "1.0.0.0", + "resources": [ + { + "type": "Microsoft.Compute/virtualMachineScaleSets", + "name": f"{self._context.vmss.name}", + "location": "[resourceGroup().location]", + "apiVersion": "2018-06-01", + "properties": { + "virtualMachineProfile": { + "extensionProfile": { + "extensions": [] + } } } } - } - ] - } - - for case in self._test_cases: - log.info("") - log.info("*** Test case: {0}".format(case.__name__.replace('_', ' '))) - test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() - if self._scenario_start == datetime.min: - self._scenario_start = test_case_start - log.info("Test case start time: {0}".format(test_case_start)) - - # Assign unique guid to forceUpdateTag for each extension to make sure they're always unique to force CRP - # to generate a new sequence number each time - test_guid = str(uuid.uuid4()) - policy, extensions, expected_errors, deletion_order = case() - for ext in extensions: - ext["properties"].update({ - "forceUpdateTag": test_guid - }) - - # We update the extension template here with extensions that are specific to the scenario that we want to - # test out - ext_template = copy.deepcopy(base_extension_template) - ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ - 'extensions'] = extensions - - # Log the dependencies for the extensions in this test case - for ext in extensions: - provisioned_after = ext['properties'].get('provisionAfterExtensions') - depends_on = provisioned_after if provisioned_after else [] - if depends_on: - dependency_list = ' and '.join(depends_on) - log.info("{0} depends on {1}".format(ext['name'], dependency_list)) - else: - log.info("{0} does not depend on any extension".format(ext['name'])) - - # Copy policy file to each VM instance - log.info("Updating policy file with new 
policy: {0}".format(policy)) - for ssh_client in ssh_clients.values(): - self._create_policy_file(ssh_client, policy) - - log.info("Deploying extensions to the scale set...") - rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription, - self._context.vmss.resource_group, self._context.vmss.location) - - # Deploy updated extension template to the scale set. - # If test case is supposed to fail, assert that the operation fails with the expected error messages. - try: - rg_client.deploy_template(template=ext_template) - if expected_errors is not None and len(expected_errors) != 0: - fail("Extension deployment was expected to fail with the following errors: {0}".format(expected_errors)) - log.info("Extension deployment succeeded as expected") - log.info("") - except Exception as e: - if expected_errors is None or len(expected_errors) == 0: - fail("Extension template deployment unexpectedly failed: {0}".format(e)) - else: - deployment_failure_pattern = r"[\s\S]*\"code\":\s*\"ResourceDeploymentFailure\"[\s\S]*\"details\":\s*\[\s*(?P<error>[\s\S]*)\]" - deployment_failure_match = re.match(deployment_failure_pattern, str(e)) - try: - if deployment_failure_match is None: - raise Exception("Unable to match a ResourceDeploymentFailure") - error_json = json.loads(deployment_failure_match.group("error")) - error_message = error_json['message'] - except Exception as parse_exc: - fail("Extension template deployment failed as expected, but there was an error in parsing the failure. Parsing failure: {0}\nDeployment Failure: {1}".format(parse_exc, e)) - - for phrase in expected_errors: - if phrase not in error_message: - fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. 
Actual error: {1}".format(phrase, e)) - - log.info("Extensions failed as expected.") - log.info("") - log.info("Expected errors:") - for expected_error in expected_errors: - log.info(" - {0}".format(expected_error)) - log.info("") + ] + } + + for case in self._test_cases: log.info("") - log.info("Actual errors:") - log.info(str(e)) - - # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall - # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly. - # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first - # update policy to allow all, re-enable all extensions, and then delete them in dependency order. - log.info("Starting cleanup for test case...") - allow_all_policy = \ - { - "policyVersion": "0.1.0", - "extensionPolicies": { - "allowListedExtensionsOnly": False + log.info("*** Test case: {0}".format(case.__name__.replace('_', ' '))) + test_case_start = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() + if self._scenario_start == datetime.min: + self._scenario_start = test_case_start + log.info("Test case start time: {0}".format(test_case_start)) + + # Assign unique guid to forceUpdateTag for each extension to make sure they're always unique to force CRP + # to generate a new sequence number each time + test_guid = str(uuid.uuid4()) + policy, extensions, expected_errors, deletion_order = case() + for ext in extensions: + ext["properties"].update({ + "forceUpdateTag": test_guid + }) + + # We update the extension template here with extensions that are specific to the scenario that we want to + # test out + ext_template = copy.deepcopy(base_extension_template) + ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ + 'extensions'] = extensions + + # Log the dependencies for the extensions in this test case + for ext in extensions: + 
provisioned_after = ext['properties'].get('provisionAfterExtensions') + depends_on = provisioned_after if provisioned_after else [] + if depends_on: + dependency_list = ' and '.join(depends_on) + log.info("{0} depends on {1}".format(ext['name'], dependency_list)) + else: + log.info("{0} does not depend on any extension".format(ext['name'])) + + # Copy policy file to each VM instance + log.info("Updating policy file with new policy: {0}".format(policy)) + for ssh_client in ssh_clients.values(): + self._create_policy_file(ssh_client, policy) + + log.info("Deploying extensions to the scale set...") + rg_client = ResourceGroupClient(self._context.vmss.cloud, self._context.vmss.subscription, + self._context.vmss.resource_group, self._context.vmss.location) + + # Deploy updated extension template to the scale set. + # If test case is supposed to fail, assert that the operation fails with the expected error messages. + try: + rg_client.deploy_template(template=ext_template) + if expected_errors is not None and len(expected_errors) != 0: + fail("Extension deployment was expected to fail with the following errors: {0}".format(expected_errors)) + log.info("Extension deployment succeeded as expected") + log.info("") + except Exception as e: + if expected_errors is None or len(expected_errors) == 0: + fail("Extension template deployment unexpectedly failed: {0}".format(e)) + else: + deployment_failure_pattern = r"[\s\S]*\"code\":\s*\"ResourceDeploymentFailure\"[\s\S]*\"details\":\s*\[\s*(?P<error>[\s\S]*)\]" + deployment_failure_match = re.match(deployment_failure_pattern, str(e)) + try: + if deployment_failure_match is None: + raise Exception("Unable to match a ResourceDeploymentFailure") + error_json = json.loads(deployment_failure_match.group("error")) + error_message = error_json['message'] + except Exception as parse_exc: + fail("Extension template deployment failed as expected, but there was an error in parsing the failure. 
Parsing failure: {0}\nDeployment Failure: {1}".format(parse_exc, e)) + + for phrase in expected_errors: + if phrase not in error_message: + fail("Extension template deployment failed as expected, but with an unexpected error. Error expected to contain message '{0}'. Actual error: {1}".format(phrase, e)) + + log.info("Extensions failed as expected.") + log.info("") + log.info("Expected errors:") + for expected_error in expected_errors: + log.info(" - {0}".format(expected_error)) + log.info("") + log.info("") + log.info("Actual errors:") + log.info(str(e)) + + # Clean up failed extensions to leave VMSS in a good state for the next test. CRP will attempt to uninstall + # leftover extensions in the next test, but uninstall will be disallowed and reach timeout unexpectedly. + # CRP also won't allow deletion of an extension that is dependent on another failed extension, so we first + # update policy to allow all, re-enable all extensions, and then delete them in dependency order. + log.info("Starting cleanup for test case...") + allow_all_policy = \ + { + "policyVersion": "0.1.0", + "extensionPolicies": { + "allowListedExtensionsOnly": False + } } - } - for ssh_client in ssh_clients.values(): - self._create_policy_file(ssh_client, allow_all_policy) - - log.info("Trying to re-enable before deleting extensions...") - for ext in extensions: - ext["properties"].update({ - "forceUpdateTag": str(uuid.uuid4()) - }) - ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ - 'extensions'] = extensions - enable_start_time = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() - try: - rg_client.deploy_template(template=ext_template) - except Exception as err: - # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new - # sequence number. 
Only for cases testing no-config extension dependencies, swallow the CRP error and - # check agent log instead to confirm that extensions were enabled successfully. - test_cases_to_work_around = [ - _should_fail_single_config_depends_on_disallowed_no_config - ] - if case in test_cases_to_work_around: - log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. " - "Error: {0}".format(err)) - time.sleep(2 * 60) # Give extensions some time to finish processing. - extension_list = ' '.join([str(e) for e in deletion_order]) - command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' " - f"--operation 'enable' --extension-list {extension_list}") - for ssh_client in ssh_clients.values(): - ssh_client.run_command(command, use_sudo=True) - log.info("Agent reported successful status for all extensions, enable succeeded.") - else: - fail("Failed to re-enable extensions after allowing with policy.") - - # Delete all extensions in dependency order. - for ext_to_delete in deletion_order: - ext_name_to_delete = ext_to_delete.type + for ssh_client in ssh_clients.values(): + self._create_policy_file(ssh_client, allow_all_policy) + + log.info("Trying to re-enable before deleting extensions...") + for ext in extensions: + ext["properties"].update({ + "forceUpdateTag": str(uuid.uuid4()) + }) + ext_template['resources'][0]['properties']['virtualMachineProfile']['extensionProfile'][ + 'extensions'] = extensions + enable_start_time = random.choice(list(ssh_clients.values())).run_command("date '+%Y-%m-%d %T'").rstrip() try: - self._context.vmss.delete_extension(ext_name_to_delete) - except Exception as crp_err: - fail("Failed to uninstall extension {0}. 
Exception: {1}".format(ext_name_to_delete, crp_err)) - log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete)) + rg_client.deploy_template(template=ext_template) + except Exception as err: + # Known issue - CRP returns a stale status for no-config extensions, because it does not wait for a new + # sequence number. Only for cases testing no-config extension dependencies, swallow the CRP error and + # check agent log instead to confirm that extensions were enabled successfully. + test_cases_to_work_around = [ + _should_fail_single_config_depends_on_disallowed_no_config + ] + if case in test_cases_to_work_around: + log.info("CRP returned error when re-enabling extensions after allowing. Checking agent log to see if enable succeeded. " + "Error: {0}".format(err)) + time.sleep(2 * 60) # Give extensions some time to finish processing. + extension_list = ' '.join([str(e) for e in deletion_order]) + command = (f"agent_ext_policy-verify_operation_success.py --after-timestamp '{enable_start_time}' " + f"--operation 'enable' --extension-list {extension_list}") + for ssh_client in ssh_clients.values(): + ssh_client.run_command(command, use_sudo=True) + log.info("Agent reported successful status for all extensions, enable succeeded.") + else: + fail("Failed to re-enable extensions after allowing with policy.") + + # Delete all extensions in dependency order. + for ext_to_delete in deletion_order: + ext_name_to_delete = ext_to_delete.type + try: + self._context.vmss.delete_extension(ext_name_to_delete) + except Exception as crp_err: + fail("Failed to uninstall extension {0}. 
Exception: {1}".format(ext_name_to_delete, crp_err)) + log.info("Successfully uninstalled extension {0}".format(ext_name_to_delete)) - log.info("Successfully removed all extensions from VMSS") - log.info("---------------------------------------------") + log.info("Successfully removed all extensions from VMSS") + log.info("---------------------------------------------") - # Disable policy via conf file. - for ssh_client in ssh_clients.values(): - ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + finally: + # Disable policy via conf file and delete policy file. + for ssh_client in ssh_clients.values(): + ssh_client.run_command("update-waagent-conf Debug.EnableExtensionPolicy=n", use_sudo=True) + ssh_client.run_command("rm -f /etc/waagent_policy.json", use_sudo=True) + log.info("") + log.info("Successfully disabled policy via config (Debug.EnableExtensionPolicy=n) and removed policy file at /etc/waagent_policy.json") def get_ignore_errors_before_timestamp(self) -> datetime: # Ignore errors in the agent log before the first test case starts diff --git a/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py b/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py index d1c942d0a..0f1e926e3 100644 --- a/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py +++ b/tests_e2e/tests/ext_sequencing/ext_seq_test_cases.py @@ -14,6 +14,7 @@ def add_one_dependent_ext_without_settings(): { "name": "CustomScript", "properties": { + "provisionAfterExtensions": [], "publisher": "Microsoft.Azure.Extensions", "type": "CustomScript", "typeHandlerVersion": "2.1",