Commit 3d3d7e3

Merge branch 'sapcc:master' into master

himanip94 authored Dec 6, 2024
2 parents 92fd3ef + d088a16 commit 3d3d7e3
Showing 16 changed files with 39 additions and 38 deletions.
2 changes: 1 addition & 1 deletion global/concourse-main/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
name: concourse-main
description: An umbrella chart for concourse ci
type: application
version: 7.11.2-2
version: 7.11.2-3
appVersion: 7.11.2
dependencies:
- name: concourse
2 changes: 1 addition & 1 deletion global/concourse-main/values.yaml
@@ -135,7 +135,7 @@ gitResourceProxy:
timeout: 60s
debug: true
image: keppel.global.cloud.sap/ccloud/concourse-git-resource-proxy
imageTag: 0.6.0
imageTag: 0.7.0
imagePullPolicy: IfNotPresent

kubernetes-ingress:
12 changes: 6 additions & 6 deletions openstack/castellum/alerts/openstack/errors.alerts
@@ -268,11 +268,11 @@ groups:
summary: Castellum encountered backend errors while resizing some assets

- alert: OpenstackCastellumAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (castellum_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="castellum"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
service: castellum
@@ -283,4 +283,4 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
2 changes: 1 addition & 1 deletion openstack/elektra/values.yaml
@@ -19,7 +19,7 @@ monsoon_dashboard_mail_user: defined_in_secrets
monsoon_dashboard_mail_password: defined_in_secrets

monsoon_dashboard_avatar_url: https://avatars.wdf.sap.corp/avatar/#{current_user.name}?size=24x24
monsoon_dashboard_cam_url: https://spc.ondemand.com/sap/bc/webdynpro/a1sspc/cam_wd_central
monsoon_dashboard_cam_url: https://cam.int.sap/cam/ui/admin
#monsoon_openstack_auth_api_endpoint:
monsoon_openstack_auth_api_userid: dashboard
monsoon_openstack_auth_api_domain: Default
12 changes: 6 additions & 6 deletions openstack/keppel/alerts/openstack/api.alerts
@@ -126,11 +126,11 @@ groups:
starved for CPU time, so try checking the CPU throttling metrics.
- alert: OpenstackKeppelAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (keppel_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="keppel"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
dashboard: keppel-overview
@@ -141,4 +141,4 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."
12 changes: 6 additions & 6 deletions openstack/limes/alerts/openstack/api.alerts
@@ -209,11 +209,11 @@ groups:
summary: Limes cannot sync quota overrides.

- alert: OpenstackLimesAuditEventPublishFailing
# Usually, you would check increase() here, but audit events *can* be quite
# rare, so we alert if there are any failed audit events at all. To clear this alert,
# delete the respective
expr: max by (pod) (limes_failed_auditevent_publish > 0)
for: 1h
# The underlying metric counts failed submission attempts, e.g. because the hermes-rabbitmq server is restarting.
# These are not necessarily fatal because the process will hold them in memory to retry the submission later.
# The alert will clear up on its own once submissions start working again.
expr: sum by (pod) (changes(audittools_failed_submissions{namespace="limes"}[1h]) > 0)
for: 5m
labels:
context: auditeventpublish
dashboard: limes-overview
@@ -224,7 +224,7 @@
meta: '{{ $labels.pod }}'
annotations:
summary: "{{ $labels.pod }} cannot publish audit events"
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for details. Once the underlying issue was addressed, delete the offending pod to clear this alert."
description: "Audit events from {{ $labels.pod }} could not be published to the RabbitMQ server. Check the pod log for detailed error messages. Affected audit events are held in memory until publishing succeeds."

- alert: OpenstackLimesIncompleteProjectResourceData
expr: max by (service, resource) (limes_project_resources_by_type_count) != on () group_left max(limes_project_count)
4 changes: 1 addition & 3 deletions openstack/limes/templates/_utils.tpl
@@ -8,9 +8,7 @@

{{- define "limes_common_envvars" }}
{{- if $.Values.limes.has_audit_trail }}
- name: LIMES_AUDIT_ENABLE
value: "true"
- name: LIMES_AUDIT_QUEUE_NAME
- name: LIMES_AUDIT_RABBITMQ_QUEUE_NAME
value: "notifications.info"
- name: LIMES_AUDIT_RABBITMQ_HOSTNAME
value: "hermes-rabbitmq-notifications.hermes.svc"
2 changes: 1 addition & 1 deletion openstack/utils/Chart.yaml
@@ -1,4 +1,4 @@
apiVersion: v1
description: A Helm chart for Kubernetes
name: utils
version: 0.19.7
version: 0.20.0
2 changes: 2 additions & 0 deletions openstack/utils/templates/snippets/_proxysql.cfg.tpl
@@ -42,6 +42,8 @@ mysql_variables =
connect_retries_on_failure = {{ default 1000 .global.Values.proxysql.connect_retries_on_failure }}
connect_retries_delay = {{ default 100 .global.Values.proxysql.connect_retries_delay }} {{- /* The default is 1ms, and that means we will run through the retries on failure in no time */}}
connect_timeout_server_max = {{ default 100000 .global.Values.proxysql.connect_timeout_server_max }}
max_transaction_time = {{ default 60000 .global.Values.proxysql.max_transaction_time }}
default_query_timeout = {{ default 90000 .global.Values.proxysql.default_query_timeout }}
}

mysql_servers =
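The two new ProxySQL settings use the same Helm default() pattern as the surrounding variables, so operators can override them per deployment from values. A minimal, hypothetical override (the key names mirror the template lookups above; the numbers are illustrative, the built-in fallbacks being 60000 and 90000 milliseconds):

    proxysql:
      max_transaction_time: 120000     # milliseconds; falls back to 60000 when unset
      default_query_timeout: 180000    # milliseconds; falls back to 90000 when unset
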
@@ -162,7 +162,7 @@ groups:
context: "NSXTMgmtVMDistribution"
dashboard: management-cluster-resources/management-cluster-resources?orgId=1&var-pod=All&var-clusters={{ $labels.vccluster }}&var-hosts={{ $labels.hostsystem }}
meta: "Too many NSX-T VMs on {{ $labels.hostsystem }}. ({{ $labels.vcenter }})"
playbook: docs/devops/alert/nsxt/#NSXTMgmtVMsOddDistribution
playbook: docs/devops/alert/vcenter/#nsxtmgmtvmsodddistribution
no_alert_on_absence: "true"
annotations:
description: "Too many NSX-T VMs for the same cluster on {{ $labels.hostsystem }}. Please distribute the VMs across different nodes. ({{ $labels.vcenter }})"
6 changes: 3 additions & 3 deletions system/kube-system-admin-k3s/Chart.lock
@@ -10,7 +10,7 @@ dependencies:
version: 6.5.0
- name: traefik
repository: https://helm.traefik.io/traefik
version: 10.22.0
version: 33.1.0
- name: cc-rbac
repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
version: 0.2.7
@@ -53,5 +53,5 @@ dependencies:
- name: reloader
repository: oci://ghcr.io/stakater/charts
version: 1.0.121
digest: sha256:f4c3563a0619870ec7ff794f2171aac15b2edd0f7542dde378aedb6c908ab24a
generated: "2024-11-12T09:22:01.009140541Z"
digest: sha256:b5afbb819c7a233adca48feba80e5c8fb4dfd1b2cc51a6a2749cccef6661720a
generated: "2024-12-06T14:53:23.875335+02:00"
4 changes: 2 additions & 2 deletions system/kube-system-admin-k3s/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
appVersion: "1.0"
description: Kube-System relevant Service collection for the new admin clusters.
name: kube-system-admin-k3s
version: 3.3.24
version: 3.3.27
home: https://github.com/sapcc/helm-charts/tree/master/system/kube-system-admin-k3s
dependencies:
- name: disco
@@ -16,7 +16,7 @@ dependencies:
version: 6.5.0
- name: traefik
repository: https://helm.traefik.io/traefik
version: 10.22.0
version: 33.1.0
- name: cc-rbac
repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm
version: 0.2.7
3 changes: 0 additions & 3 deletions system/kube-system-admin-k3s/ci/test-values.yaml
@@ -26,9 +26,6 @@ traefik:
enabled: true
additionalArguments:
- "--test=test"
serviceAccount:
enabled: true
name: test

k3s-backup:
aws:
6 changes: 5 additions & 1 deletion system/kube-system-admin-k3s/values.yaml
@@ -12,6 +12,9 @@ ingress:
ca_cert:

traefik:
instanceLabelOverride: kube-system
updateStrategy:
type: Recreate
ingressRoute:
dashboard:
enabled: false
@@ -22,7 +25,8 @@ traefik:
kubernetesIngress: {}
ports:
web:
expose: false
expose:
default: false
websecure:
tls:
enabled: true
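The change from "expose: false" to "expose: {default: false}" tracks the newer upstream Traefik chart schema pulled in by the 10.22.0 to 33.1.0 dependency bump, where ports.<name>.expose is a map keyed by Service name rather than a boolean. A sketch of the new shape, assuming the upstream v33 values layout (not taken from this repository):

    traefik:
      ports:
        web:
          expose:
            default: false   # keep the plain-HTTP entrypoint off the chart's default Service
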
2 changes: 1 addition & 1 deletion system/vmware-monitoring/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
name: vmware-monitoring
version: 2.0.4
version: 2.0.5
description: VMware Monitoring and Metrics Collection
dependencies:
- name: prometheus-server
4 changes: 2 additions & 2 deletions system/vmware-monitoring/templates/collector-configmap.yaml
@@ -293,8 +293,8 @@ data:
key: "config|storageDevice|plugStoreTopology|numberofPath"
- metric_suffix: "custom_attributes_change_request_info"
key: "summary|customTag:Change_Request|customTagValue"
- metric_suffix: "summary_custom_tag_no_swap_ds"
key: "summary|customTag:No_Swap_DS|customTagValue"
- metric_suffix: "summary_custom_tag_nvme"
key: "summary|customTag:nvme|customTagValue"
VCenterStatsCollector:
# INFO - Prefix: vrops_vcenter_