Commit 3d3d7e3

Merge branch 'sapcc:master' into master

himanip94 authored Dec 6, 2024
2 parents 92fd3ef + d088a16 commit 3d3d7e3
Showing 16 changed files with 39 additions and 38 deletions.
2 changes: 1 addition & 1 deletion global/concourse-main/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
name: concourse-main
description: An umbrella chart for concourse ci
type: application
version: 7.11.2-2
version: 7.11.2-3
appVersion: 7.11.2
dependencies:
- name: concourse
2 changes: 1 addition & 1 deletion global/concourse-main/values.yaml
@@ -135,7 +135,7 @@ gitResourceProxy:
timeout: 60s
debug: true
image: keppel.global.cloud.sap/ccloud/concourse-git-resource-proxy
imageTag: 0.6.0
imageTag: 0.7.0
imagePullPolicy: IfNotPresent

kubernetes-ingress:
12 changes: 6 additions & 6 deletions openstack/castellum/alerts/openstack/errors.alerts
@@ -268,11 +268,11 @@ groups:
summary: Castellum encountered backend errors while resizing some assets

- alert: OpenstackCastellumAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (castellum_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="castellum"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
service: castellum
@@ -283,4 +283,4 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
2 changes: 1 addition & 1 deletion openstack/elektra/values.yaml
@@ -19,7 +19,7 @@ monsoon_dashboard_mail_user: defined_in_secrets
monsoon_dashboard_mail_password: defined_in_secrets

monsoon_dashboard_avatar_url: https://avatars.wdf.sap.corp/avatar/#{current_user.name}?size=24x24
monsoon_dashboard_cam_url: https://spc.ondemand.com/sap/bc/webdynpro/a1sspc/cam_wd_central
monsoon_dashboard_cam_url: https://cam.int.sap/cam/ui/admin
#monsoon_openstack_auth_api_endpoint:
monsoon_openstack_auth_api_userid: dashboard
monsoon_openstack_auth_api_domain: Default
12 changes: 6 additions & 6 deletions openstack/keppel/alerts/openstack/api.alerts
@@ -126,11 +126,11 @@ groups:
starved for CPU time, so try checking the CPU throttling metrics.
- alert: OpenstackKeppelAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (keppel_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="keppel"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
dashboard: keppel-overview
@@ -141,4 +141,4 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
12 changes: 6 additions & 6 deletions openstack/limes/alerts/openstack/api.alerts
@@ -209,11 +209,11 @@ groups:
summary: Limes cannot sync quota overrides.

- alert: OpenstackLimesAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (limes_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="limes"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
dashboard: limes-overview
@@ -224,7 +224,7 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."

- alert: OpenstackLimesIncompleteProjectResourceData
expr: max by (service, resource) (limes_project_resources_by_type_count) != on () group_left max(limes_project_count)
4 changes: 1 addition & 3 deletions openstack/limes/templates/_utils.tpl
@@ -8,9 +8,7 @@

{{- define "limes_common_envvars" }}
{{- if $.Values.limes.has_audit_trail }}
- name: LIMES_AUDIT_ENABLE
value: "true"
- name: LIMES_AUDIT_QUEUE_NAME
- name: LIMES_AUDIT_RABBITMQ_QUEUE_NAME
value: "notifications.info"
- name: LIMES_AUDIT_RABBITMQ_HOSTNAME
value: "hermes-rabbitmq-notifications.hermes.svc"
2 changes: 1 addition & 1 deletion openstack/utils/Chart.yaml
@@ -1,4 +1,4 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: utils
version: 0.19.7
version: 0.20.0
2 changes: 2 additions & 0 deletions openstack/utils/templates/snippets/_proxysql.cfg.tpl
@@ -42,6 +42,8 @@ mysql_variables =
connect_retries_on_failure = {{ default 1000 .global.Values.proxysql.connect_retries_on_failure }}
connect_retries_delay = {{ default 100 .global.Values.proxysql.connect_retries_delay }} {{- /* The default is 1ms, and that means we will run through the retries on failure in no time */}}
connect_timeout_server_max = {{ default 100000 .global.Values.proxysql.connect_timeout_server_max }}
max_transaction_time = {{ default 60000 .global.Values.proxysql.max_transaction_time }}
default_query_timeout = {{ default 90000 .global.Values.proxysql.default_query_timeout }}
}

mysql_servers =
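The two new ProxySQL settings use the same Helm default() pattern as the surrounding variables, so operators can override them per deployment from values. A minimal, hypothetical override (the key names mirror the template lookups above; the numbers are illustrative, the built-in fallbacks being 60000 and 90000 milliseconds):

    proxysql:
      max_transaction_time: 120000     # milliseconds; falls back to 60000 when unset
      default_query_timeout: 180000    # milliseconds; falls back to 90000 when unset
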
@@ -162,7 +162,7 @@ groups:
context: "NSXTMgmtVMDistribution"
dashboard: management-cluster-resources/management-cluster-resources?orgId=1&var-pod=All&var-clusters={{ $labels.vccluster }}&var-hosts={{ $labels.hostsystem }}
meta: "Too many NSX-T VMs on {{ $labels.hostsystem }}. ({{ $labels.vcenter }})"
playbook: docs/devops/alert/nsxt/#NSXTMgmtVMsOddDistribution
playbook: docs/devops/alert/vcenter/#nsxtmgmtvmsodddistribution
no_alert_on_absence: "true"
annotations:
description: "Too many NSX-T VMs for the same cluster on {{ $labels.hostsystem }}. Please distribute the VMs across different nodes. ({{ $labels.vcenter }})"
6 changes: 3 additions & 3 deletions system/kube-system-admin-k3s/Chart.lock
@@ -10,7 +10,7 @@ dependencies:
version: 6.5.0
- name: traefik
repository: https://helm.traefik.io/traefik
version: 10.22.0
version: 33.1.0
- name: cc-rbac
repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
version: 0.2.7
@@ -53,5 +53,5 @@ dependencies:
- name: reloader
repository: oci://ghcr.io/stakater/charts
version: 1.0.121
digest: sha256:f4c3563a0619870ec7ff794f2171aac15b2edd0f7542dde378aedb6c908ab24a
generated: "2024-11-12T09:22:01.009140541Z"
digest: sha256:b5afbb819c7a233adca48feba80e5c8fb4dfd1b2cc51a6a2749cccef6661720a
generated: "2024-12-06T14:53:23.875335+02:00"
4 changes: 2 additions & 2 deletions system/kube-system-admin-k3s/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
appVersion: "1.0"
description: Kube-System relevant Service collection for the new admin clusters.
name: kube-system-admin-k3s
version: 3.3.24
version: 3.3.27
home: https://github.com/sapcc/helm-charts/tree/master/system/kube-system-admin-k3s
dependencies:
- name: disco
@@ -16,7 +16,7 @@ dependencies:
version: 6.5.0
- name: traefik
repository: https://helm.traefik.io/traefik
version: 10.22.0
version: 33.1.0
- name: cc-rbac
repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
version: 0.2.7
3 changes: 0 additions & 3 deletions system/kube-system-admin-k3s/ci/test-values.yaml
@@ -26,9 +26,6 @@ traefik:
enabled: true
additionalArguments:
- "--test=test"
serviceAccount:
enabled: true
name: test

k3s-backup:
aws:
6 changes: 5 additions & 1 deletion system/kube-system-admin-k3s/values.yaml
@@ -12,6 +12,9 @@ ingress:
ca_cert:

traefik:
instanceLabelOverride: kube-system
updateStrategy:
type: Recreate
ingressRoute:
dashboard:
enabled: false
@@ -22,7 +25,8 @@ traefik:
kubernetesIngress: {}
ports:
web:
expose: false
expose:
default: false
websecure:
tls:
enabled: true
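The change from "expose: false" to "expose: {default: false}" tracks the newer upstream Traefik chart schema pulled in by the 10.22.0 to 33.1.0 dependency bump, where ports.<name>.expose is a map keyed by Service name rather than a boolean. A sketch of the new shape, assuming the upstream v33 values layout (not taken from this repository):

    traefik:
      ports:
        web:
          expose:
            default: false   # keep the plain-HTTP entrypoint off the chart's default Service
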
2 changes: 1 addition & 1 deletion system/vmware-monitoring/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
name: vmware-monitoring
version: 2.0.4
version: 2.0.5
description: VMware Monitoring and Metrics Collection
dependencies:
- name: prometheus-server
4 changes: 2 additions & 2 deletions system/vmware-monitoring/templates/collector-configmap.yaml
@@ -293,8 +293,8 @@ data:
key: "config|storageDevice|plugStoreTopology|numberofPath"
- metric_suffix: "custom_attributes_change_request_info"
key: "summary|customTag:Change_Request|customTagValue"
- metric_suffix: "summary_custom_tag_no_swap_ds"
key: "summary|customTag:No_Swap_DS|customTagValue"
- metric_suffix: "summary_custom_tag_nvme"
key: "summary|customTag:nvme|customTagValue"
VCenterStatsCollector:
# INFO - Prefix: vrops_vcenter_