diff --git a/global/concourse-main/Chart.yaml b/global/concourse-main/Chart.yaml
index aa5a1ebdfc6..acc51b97b5b 100644
--- a/global/concourse-main/Chart.yaml
+++ b/global/concourse-main/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: concourse-main
 description: An umbrella chart for concourse ci
 type: application
-version: 7.11.2-2
+version: 7.11.2-3
 appVersion: 7.11.2
 dependencies:
 - name: concourse
diff --git a/global/concourse-main/values.yaml b/global/concourse-main/values.yaml
index bbf314fbc73..086e4756d31 100644
--- a/global/concourse-main/values.yaml
+++ b/global/concourse-main/values.yaml
@@ -135,7 +135,7 @@ gitResourceProxy:
   timeout: 60s
   debug: true
   image: keppel.global.cloud.sap/ccloud/concourse-git-resource-proxy
-  imageTag: 0.6.0
+  imageTag: 0.7.0
   imagePullPolicy: IfNotPresent
 
 kubernetes-ingress:
diff --git a/openstack/castellum/alerts/openstack/errors.alerts b/openstack/castellum/alerts/openstack/errors.alerts
index 1d7e8bf01c4..a0149a0185e 100644
--- a/openstack/castellum/alerts/openstack/errors.alerts
+++ b/openstack/castellum/alerts/openstack/errors.alerts
@@ -268,11 +268,11 @@ groups:
         summary: Castellum encountered backend errors while resizing some assets
 
     - alert: OpenstackCastellumAuditEventPublishFailing
-      # Usually, you would check increase() here, but audit events *can* be quite
-      # rare, so we alert if there are any failed audit events at all. To clear this alert,
-      # delete the respective
-      expr: max by (pod) (castellum_failed_auditevent_publish > 0)
-      for: 1h
+      # The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
+      # These are not necessarily fatal because the process will hold them in memory to retry the submission later.
+      # The alert will clear up on its own once submissions start working again.
+      expr: sum by (pod) (changes(audittools_failed_submissions{namespace="castellum"}[1h]) > 0)
+      for: 5m
       labels:
         context: auditeventpublish
         service: castellum
@@ -283,4 +283,4 @@ groups:
         meta: '{{ $labels.pod }}'
       annotations:
         summary: "{{ $labels.pod }} cannot publish audit events"
-        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
+        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
diff --git a/openstack/elektra/values.yaml b/openstack/elektra/values.yaml
index 0c8c5ba78bd..f06a4cc0ae4 100644
--- a/openstack/elektra/values.yaml
+++ b/openstack/elektra/values.yaml
@@ -19,7 +19,7 @@ monsoon_dashboard_mail_user: defined_in_secrets
 monsoon_dashboard_mail_password: defined_in_secrets
 monsoon_dashboard_avatar_url: https://avatars.wdf.sap.corp/avatar/#{current_user.name}?size=24x24
 
-monsoon_dashboard_cam_url: https://spc.ondemand.com/sap/bc/webdynpro/a1sspc/cam_wd_central
+monsoon_dashboard_cam_url: https://cam.int.sap/cam/ui/admin
 #monsoon_openstack_auth_api_endpoint:
 monsoon_openstack_auth_api_userid: dashboard
 monsoon_openstack_auth_api_domain: Default
diff --git a/openstack/keppel/alerts/openstack/api.alerts b/openstack/keppel/alerts/openstack/api.alerts
index 4e80723a91c..9973e77095d 100644
--- a/openstack/keppel/alerts/openstack/api.alerts
+++ b/openstack/keppel/alerts/openstack/api.alerts
@@ -126,11 +126,11 @@ groups:
         starved for CPU time, so try checking the CPU throttling metrics.
 
     - alert: OpenstackKeppelAuditEventPublishFailing
-      # Usually, you would check increase() here, but audit events *can* be quite
-      # rare, so we alert if there are any failed audit events at all. To clear this alert,
-      # delete the respective
-      expr: max by (pod) (keppel_failed_auditevent_publish > 0)
-      for: 1h
+      # The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
+      # These are not necessarily fatal because the process will hold them in memory to retry the submission later.
+      # The alert will clear up on its own once submissions start working again.
+      expr: sum by (pod) (changes(audittools_failed_submissions{namespace="keppel"}[1h]) > 0)
+      for: 5m
       labels:
         context: auditeventpublish
         dashboard: keppel-overview
@@ -141,4 +141,4 @@ groups:
         meta: '{{ $labels.pod }}'
       annotations:
         summary: "{{ $labels.pod }} cannot publish audit events"
-        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
+        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
diff --git a/openstack/limes/alerts/openstack/api.alerts b/openstack/limes/alerts/openstack/api.alerts
index 730cbe2a2fc..6275a89ff54 100644
--- a/openstack/limes/alerts/openstack/api.alerts
+++ b/openstack/limes/alerts/openstack/api.alerts
@@ -209,11 +209,11 @@ groups:
         summary: Limes cannot sync quota overrides.
 
     - alert: OpenstackLimesAuditEventPublishFailing
-      # Usually, you would check increase() here, but audit events *can* be quite
-      # rare, so we alert if there are any failed audit events at all. To clear this alert,
-      # delete the respective
-      expr: max by (pod) (limes_failed_auditevent_publish > 0)
-      for: 1h
+      # The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
+      # These are not necessarily fatal because the process will hold them in memory to retry the submission later.
+      # The alert will clear up on its own once submissions start working again.
+      expr: sum by (pod) (changes(audittools_failed_submissions{namespace="limes"}[1h]) > 0)
+      for: 5m
       labels:
         context: auditeventpublish
         dashboard: limes-overview
@@ -224,7 +224,7 @@ groups:
         meta: '{{ $labels.pod }}'
       annotations:
         summary: "{{ $labels.pod }} cannot publish audit events"
-        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
+        description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
 
     - alert: OpenstackLimesIncompleteProjectResourceData
       expr: max by (service, resource) (limes_project_resources_by_type_count) != on () group_left max(limes_project_count)
diff --git a/openstack/limes/templates/_utils.tpl b/openstack/limes/templates/_utils.tpl
index 237e441eed6..fda63327410 100644
--- a/openstack/limes/templates/_utils.tpl
+++ b/openstack/limes/templates/_utils.tpl
@@ -8,9 +8,7 @@
 
 {{- define "limes_common_envvars" }}
 {{- if $.Values.limes.has_audit_trail }}
-- name: LIMES_AUDIT_ENABLE
-  value: "true"
-- name: LIMES_AUDIT_QUEUE_NAME
+- name: LIMES_AUDIT_RABBITMQ_QUEUE_NAME
   value: "notifications.info"
 - name: LIMES_AUDIT_RABBITMQ_HOSTNAME
   value: "hermes-rabbitmq-notifications.hermes.svc"
diff --git a/openstack/utils/Chart.yaml b/openstack/utils/Chart.yaml
index ab476a7ab05..784773166b1 100644
--- a/openstack/utils/Chart.yaml
+++ b/openstack/utils/Chart.yaml
@@ -1,4 +1,4 @@
 apiVersion: v1
 description: A Helm chart for Kubernetes
 name: utils
-version: 0.19.7
+version: 0.20.0
diff --git a/openstack/utils/templates/snippets/_proxysql.cfg.tpl b/openstack/utils/templates/snippets/_proxysql.cfg.tpl
index 410602c4c70..71c239cabcd 100644
--- a/openstack/utils/templates/snippets/_proxysql.cfg.tpl
+++ b/openstack/utils/templates/snippets/_proxysql.cfg.tpl
@@ -42,6 +42,8 @@ mysql_variables =
     connect_retries_on_failure = {{ default 1000 .global.Values.proxysql.connect_retries_on_failure }}
     connect_retries_delay = {{ default 100 .global.Values.proxysql.connect_retries_delay }} {{- /* The default is 1ms, and that means we will run through the retries on failure in no time */}}
     connect_timeout_server_max = {{ default 100000 .global.Values.proxysql.connect_timeout_server_max }}
+    max_transaction_time = {{ default 60000 .global.Values.proxysql.max_transaction_time }}
+    default_query_timeout = {{ default 90000 .global.Values.proxysql.default_query_timeout }}
 }
 
 mysql_servers =
diff --git a/prometheus-rules/prometheus-vmware-rules/alerts/virtualmachine.alerts b/prometheus-rules/prometheus-vmware-rules/alerts/virtualmachine.alerts
index 764ded8f2fe..550a4ddc58b 100644
--- a/prometheus-rules/prometheus-vmware-rules/alerts/virtualmachine.alerts
+++ b/prometheus-rules/prometheus-vmware-rules/alerts/virtualmachine.alerts
@@ -162,7 +162,7 @@ groups:
         context: "NSXTMgmtVMDistribution"
         dashboard: management-cluster-resources/management-cluster-resources?orgId=1&var-pod=All&var-clusters={{ $labels.vccluster }}&var-hosts={{ $labels.hostsystem }}
         meta: "Too many NSX-T VMs on {{ $labels.hostsystem }}. ({{ $labels.vcenter }})"
-        playbook: docs/devops/alert/nsxt/#NSXTMgmtVMsOddDistribution
+        playbook: docs/devops/alert/vcenter/#nsxtmgmtvmsodddistribution
         no_alert_on_absence: "true"
       annotations:
         description: "Too many NSX-T VMs for the same cluster on {{ $labels.hostsystem }}. Please distribute the VMs across different nodes. ({{ $labels.vcenter }})"
diff --git a/system/kube-system-admin-k3s/Chart.lock b/system/kube-system-admin-k3s/Chart.lock
index d50993e5819..8376ed2ba39 100644
--- a/system/kube-system-admin-k3s/Chart.lock
+++ b/system/kube-system-admin-k3s/Chart.lock
@@ -10,7 +10,7 @@ dependencies:
   version: 6.5.0
 - name: traefik
   repository: https://helm.traefik.io/traefik
-  version: 10.22.0
+  version: 33.1.0
 - name: cc-rbac
   repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
   version: 0.2.7
@@ -53,5 +53,5 @@ dependencies:
 - name: reloader
   repository: oci://ghcr.io/stakater/charts
   version: 1.0.121
-digest: sha256:f4c3563a0619870ec7ff794f2171aac15b2edd0f7542dde378aedb6c908ab24a
-generated: "2024-11-12T09:22:01.009140541Z"
+digest: sha256:b5afbb819c7a233adca48feba80e5c8fb4dfd1b2cc51a6a2749cccef6661720a
+generated: "2024-12-06T14:53:23.875335+02:00"
diff --git a/system/kube-system-admin-k3s/Chart.yaml b/system/kube-system-admin-k3s/Chart.yaml
index e7dcae88e2d..12940b0c435 100644
--- a/system/kube-system-admin-k3s/Chart.yaml
+++ b/system/kube-system-admin-k3s/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 appVersion: "1.0"
 description: Kube-System relevant Service collection for the new admin clusters.
 name: kube-system-admin-k3s
-version: 3.3.24
+version: 3.3.27
 home: https://github.com/sapcc/helm-charts/tree/master/system/kube-system-admin-k3s
 dependencies:
 - name: disco
@@ -16,7 +16,7 @@ dependencies:
   version: 6.5.0
 - name: traefik
   repository: https://helm.traefik.io/traefik
-  version: 10.22.0
+  version: 33.1.0
 - name: cc-rbac
   repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
   version: 0.2.7
diff --git a/system/kube-system-admin-k3s/ci/test-values.yaml b/system/kube-system-admin-k3s/ci/test-values.yaml
index 71812f71cce..729d873fd1d 100644
--- a/system/kube-system-admin-k3s/ci/test-values.yaml
+++ b/system/kube-system-admin-k3s/ci/test-values.yaml
@@ -26,9 +26,6 @@ traefik:
     enabled: true
   additionalArguments:
     - "--test=test"
-  serviceAccount:
-    enabled: true
-    name: test
 
 k3s-backup:
   aws:
diff --git a/system/kube-system-admin-k3s/values.yaml b/system/kube-system-admin-k3s/values.yaml
index 606bc0879c2..4c06ec6ae0d 100644
--- a/system/kube-system-admin-k3s/values.yaml
+++ b/system/kube-system-admin-k3s/values.yaml
@@ -12,6 +12,9 @@ ingress:
   ca_cert:
 
 traefik:
+  instanceLabelOverride: kube-system
+  updateStrategy:
+    type: Recreate
   ingressRoute:
     dashboard:
       enabled: false
@@ -22,7 +25,8 @@ traefik:
     kubernetesIngress: {}
   ports:
     web:
-      expose: false
+      expose:
+        default: false
     websecure:
       tls:
         enabled: true
diff --git a/system/vmware-monitoring/Chart.yaml b/system/vmware-monitoring/Chart.yaml
index 79572b7f16d..0e07bab094c 100644
--- a/system/vmware-monitoring/Chart.yaml
+++ b/system/vmware-monitoring/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: vmware-monitoring
-version: 2.0.4
+version: 2.0.5
 description: VMware Monitoring and Metrics Collection
 dependencies:
 - name: prometheus-server
diff --git a/system/vmware-monitoring/templates/collector-configmap.yaml b/system/vmware-monitoring/templates/collector-configmap.yaml
index 82b23667d7b..340c7af05b0 100644
--- a/system/vmware-monitoring/templates/collector-configmap.yaml
+++ b/system/vmware-monitoring/templates/collector-configmap.yaml
@@ -293,8 +293,8 @@ data:
         key: "config|storageDevice|plugStoreTopology|numberofPath"
       - metric_suffix: "custom_attributes_change_request_info"
        key: "summary|customTag:Change_Request|customTagValue"
-      - metric_suffix: "summary_custom_tag_no_swap_ds"
-        key: "summary|customTag:No_Swap_DS|customTagValue"
+      - metric_suffix: "summary_custom_tag_nvme"
+        key: "summary|customTag:nvme|customTagValue"
     VCenterStatsCollector:
       # INFO - Prefix: vrops_vcenter_