diff --git a/addons/grafana/dashboards/dcgm_metrics.json b/addons/grafana/dashboards/dcgm_metrics.json index 4a52d111e..fd0894f09 100644 --- a/addons/grafana/dashboards/dcgm_metrics.json +++ b/addons/grafana/dashboards/dcgm_metrics.json @@ -21,6 +21,7 @@ "fiscalYearStartMonth": 0, "gnetId": 12239, "graphTooltip": 0, + "id": 4, "links": [], "panels": [ { @@ -120,12 +121,12 @@ "disableTextWrap": false, "editorMode": "builder", "exemplar": false, - "expr": "avg by(instance, gpu, porter_run_app_name) (gpu_memory_utilization_ratio{instance=~\"$instance\", gpu=~\"$gpu\", porter_run_app_name=\"$application\"}) * 100", + "expr": "avg by(Hostname, gpu, porter_run_app_name) (gpu_memory_utilization_ratio{Hostname=~\"$Hostname\", gpu=~\"$gpu\", porter_run_app_name=~\"$application\"}) * 100", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "GPU VRAM utilization", "useBackend": false @@ -228,11 +229,11 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"${application}\"})", + "expr": "avg by(Hostname, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_UTIL{Hostname=~\"${Hostname}\", gpu=~\"${gpu}\", porter_run_app_name=~\"${application}\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "A", "useBackend": false @@ -333,12 +334,12 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(instance, gpu, 
porter_run_app_name) (DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=\"$application\"})", + "expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "refId": "A", "useBackend": false } @@ -411,11 +412,11 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=\"$application\"})", + "expr": "avg by(gpu, Hostname) (DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", - "legendFormat": "", + "legendFormat": "{{Hostname}}__{{gpu}}", "range": true, "refId": "A", "useBackend": false @@ -517,11 +518,11 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", + "expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "A", "useBackend": false @@ -595,7 +596,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - 
"expr": "sum by() (DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", + "expr": "sum by() (DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", @@ -685,7 +686,9 @@ ], "displayMode": "table", "placement": "bottom", - "showLegend": true + "showLegend": true, + "sortBy": "Name", + "sortDesc": true }, "tooltip": { "maxHeight": 600, @@ -701,13 +704,13 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(pod, porter_run_app_name, instance, gpu) (DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"}) * 1000000", + "expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_SM_CLOCK{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"}) * 1000000", "format": "time_series", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", "intervalFactor": 1, - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "A", "useBackend": false @@ -810,11 +813,11 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(gpu, instance, pod, porter_run_app_name) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", + "expr": "avg by(gpu, Hostname, porter_run_app_name) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: 
\"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "A", "useBackend": false @@ -914,11 +917,11 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "avg by(instance, gpu, pod, porter_run_app_name) (DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", + "expr": "avg by(gpu, porter_run_app_name, Hostname) (DCGM_FI_DEV_FB_USED{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})", "fullMetaSearch": false, "includeNullMetadata": true, "interval": "", - "legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"", + "legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"", "range": true, "refId": "A", "useBackend": false @@ -935,24 +938,29 @@ "list": [ { "current": { - "selected": false, - "text": "All", - "value": "$__all" + "selected": true, + "text": [ + "amd-full-call-cobra" + ], + "value": [ + "amd-full-call-cobra" + ] }, "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "definition": "label_values(DCGM_FI_DEV_GPU_TEMP,instance)", + "definition": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)", + "description": "This is the name of the application that is running on the GPU at the moment the metrics were retrieved ", "hide": 0, "includeAll": true, - "label": "instance", + "label": "Application", "multi": true, - "name": "instance", + "name": "application", "options": [], "query": { "qryType": 1, - "query": "label_values(DCGM_FI_DEV_GPU_TEMP,instance)", + "query": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -963,27 +971,31 @@ }, { "current": { - "selected": false, - "text": "All", - "value": "$__all" + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "definition": 
"label_values(DCGM_FI_DEV_GPU_TEMP,gpu)", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP,Hostname)", "hide": 0, "includeAll": true, - "label": "gpu", + "label": "Hostname", "multi": true, - "name": "gpu", + "name": "Hostname", "options": [], "query": { "qryType": 1, - "query": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)", + "query": "label_values(DCGM_FI_DEV_GPU_TEMP,Hostname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -991,28 +1003,31 @@ }, { "current": { - "selected": false, - "text": "test", - "value": "test" + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] }, "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "definition": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)", - "description": "This is the name of the application that is running on the GPU at the moment the metrics were retrieved ", + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)", "hide": 0, "includeAll": true, - "label": "Application", + "label": "gpu", "multi": true, - "name": "application", + "name": "gpu", "options": [], "query": { "qryType": 1, - "query": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)", + "query": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -1021,7 +1036,7 @@ ] }, "time": { - "from": "now-15m", + "from": "now-5m", "to": "now" }, "timepicker": { @@ -1039,8 +1054,8 @@ ] }, "timezone": "", - "title": "NVIDIA DCGM Exporter Dashboard", - "uid": "cdveeo686dr0gc", - "version": 1, + "title": "NVIDIA DCGM Exporter Dashboard 2", + "uid": "cdveeo686dr0gd", + "version": 7, "weekStart": "" } \ No newline at end of file diff --git a/applications/job/templates/cronjob.yaml b/applications/job/templates/cronjob.yaml index 06eb4ac5f..93c945739 100644 --- a/applications/job/templates/cronjob.yaml 
+++ b/applications/job/templates/cronjob.yaml @@ -79,6 +79,9 @@ spec: imagePullSecrets: - name: {{ .Values.global.image.imagePullSecret }} {{- end }} + {{- if .Values.enableHostIpc }} + hostIPC: true + {{- end }} containers: - name: {{ .Chart.Name }} {{- if .Values.global }} diff --git a/applications/job/values.yaml b/applications/job/values.yaml index a2982527e..f8f9a0c8a 100644 --- a/applications/job/values.yaml +++ b/applications/job/values.yaml @@ -99,3 +99,8 @@ nodeGroups: [] fileSecretMounts: enabled: false mounts: [] + +# enableHostIpc sets hostIPC: true so the pod shares the host's IPC namespace. +# This is needed, for example, by pods that use MPS-sliced GPUs. +# Sharing the host IPC namespace weakens pod isolation; enable only when required. +enableHostIpc: false \ No newline at end of file diff --git a/applications/web/templates/deployment-blue-green-legacy.yaml b/applications/web/templates/deployment-blue-green-legacy.yaml index f1aece245..e15d728c1 100644 --- a/applications/web/templates/deployment-blue-green-legacy.yaml +++ b/applications/web/templates/deployment-blue-green-legacy.yaml @@ -39,6 +39,9 @@ spec: imagePullSecrets: - name: {{ $.Values.image.imagePullSecret }} {{- end }} + {{- if $.Values.enableHostIpc }} + hostIPC: true + {{- end }} containers: - name: {{ $.Chart.Name }} securityContext: diff --git a/applications/web/templates/deployment.yaml b/applications/web/templates/deployment.yaml index 8bfe2803f..df25c1ef5 100644 --- a/applications/web/templates/deployment.yaml +++ b/applications/web/templates/deployment.yaml @@ -92,6 +92,9 @@ spec: imagePullSecrets: - name: {{ .Values.global.image.imagePullSecret }} {{- end }} + {{- if .Values.enableHostIpc }} + hostIPC: true + {{- end }} initContainers: # this is used for ensuring the kubelet is ready on new nodes, and can injected any downward API keys - name: downward-api diff --git a/applications/web/values.yaml b/applications/web/values.yaml index cb877b88c..8d786d7cc 100644 --- a/applications/web/values.yaml +++ 
b/applications/web/values.yaml @@ -310,3 +310,8 @@ metricsScraping: enabled: false port: 80 path: "/metrics" + +# enableHostIpc sets hostIPC: true so the pod shares the host's IPC namespace. +# This is needed, for example, by pods that use MPS-sliced GPUs. +# Sharing the host IPC namespace weakens pod isolation; enable only when required. +enableHostIpc: false \ No newline at end of file diff --git a/applications/worker/templates/deployment.yaml b/applications/worker/templates/deployment.yaml index f8c3e689d..353a73ced 100644 --- a/applications/worker/templates/deployment.yaml +++ b/applications/worker/templates/deployment.yaml @@ -69,6 +69,9 @@ spec: imagePullSecrets: - name: {{ .Values.global.image.imagePullSecret }} {{- end }} + {{- if .Values.enableHostIpc }} + hostIPC: true + {{- end }} initContainers: # this is used for ensuring the kubelet is ready on new nodes, and can injected any downward API keys - name: downward-api diff --git a/applications/worker/values.yaml b/applications/worker/values.yaml index 2e377398f..5fd529d2b 100644 --- a/applications/worker/values.yaml +++ b/applications/worker/values.yaml @@ -187,3 +187,8 @@ additionalVolumes: # type: "" # mountPath: "" # volumeOptions: {} + +# enableHostIpc sets hostIPC: true so the pod shares the host's IPC namespace. +# This is needed, for example, by pods that use MPS-sliced GPUs. +# Sharing the host IPC namespace weakens pod isolation; enable only when required. +enableHostIpc: false \ No newline at end of file