Skip to content

Commit

Permalink
Merge pull request #1494 from porter-dev/main
Browse files Browse the repository at this point in the history
Moving default grafana dashboard and app changes to prod
  • Loading branch information
yosefmih authored Feb 14, 2025
2 parents c8172c4 + 569eb06 commit 9672c7d
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 46 deletions.
107 changes: 61 additions & 46 deletions addons/grafana/dashboards/dcgm_metrics.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"fiscalYearStartMonth": 0,
"gnetId": 12239,
"graphTooltip": 0,
"id": 4,
"links": [],
"panels": [
{
Expand Down Expand Up @@ -120,12 +121,12 @@
"disableTextWrap": false,
"editorMode": "builder",
"exemplar": false,
"expr": "avg by(instance, gpu, porter_run_app_name) (gpu_memory_utilization_ratio{instance=~\"$instance\", gpu=~\"$gpu\", porter_run_app_name=\"$application\"}) * 100",
"expr": "avg by(Hostname, gpu, porter_run_app_name) (gpu_memory_utilization_ratio{Hostname=~\"$Hostname\", gpu=~\"$gpu\", porter_run_app_name=~\"$application\"}) * 100",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "GPU VRAM utilization",
"useBackend": false
Expand Down Expand Up @@ -228,11 +229,11 @@
},
"disableTextWrap": false,
"editorMode": "code",
"expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"${application}\"})",
"expr": "avg by(Hostname, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_UTIL{Hostname=~\"${Hostname}\", gpu=~\"${gpu}\", porter_run_app_name=~\"${application}\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -333,12 +334,12 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=\"$application\"})",
"expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"refId": "A",
"useBackend": false
}
Expand Down Expand Up @@ -411,11 +412,11 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=\"$application\"})",
"expr": "avg by(gpu, Hostname) (DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"legendFormat": "",
"legendFormat": "{{Hostname}}__{{gpu}}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -517,11 +518,11 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(instance, gpu, porter_run_app_name) (DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -595,7 +596,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by() (DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"expr": "sum by() (DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
Expand Down Expand Up @@ -685,7 +686,9 @@
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
"showLegend": true,
"sortBy": "Name",
"sortDesc": true
},
"tooltip": {
"maxHeight": 600,
Expand All @@ -701,13 +704,13 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(pod, porter_run_app_name, instance, gpu) (DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"}) * 1000000",
"expr": "avg by(porter_run_app_name, Hostname, gpu) (DCGM_FI_DEV_SM_CLOCK{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"}) * 1000000",
"format": "time_series",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -810,11 +813,11 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(gpu, instance, pod, porter_run_app_name) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"expr": "avg by(gpu, Hostname, porter_run_app_name) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -914,11 +917,11 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "avg by(instance, gpu, pod, porter_run_app_name) (DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"expr": "avg by(gpu, porter_run_app_name, Hostname) (DCGM_FI_DEV_FB_USED{Hostname=~\"$Hostname\", gpu=~\"${gpu}\", porter_run_app_name=~\"$application\"})",
"fullMetaSearch": false,
"includeNullMetadata": true,
"interval": "",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Instance: \"{{instance}}\", GPU: \"{{gpu}}\"",
"legendFormat": "Application: \"{{porter_run_app_name}}\", Host: \"{{Hostname}}\", GPU: \"{{gpu}}\"",
"range": true,
"refId": "A",
"useBackend": false
Expand All @@ -935,24 +938,29 @@
"list": [
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP,instance)",
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)",
"description": "This is the name of the application that is running on the GPU at the moment the metrics were retrieved ",
"hide": 0,
"includeAll": true,
"label": "instance",
"label": "Application",
"multi": true,
"name": "instance",
"name": "application",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(DCGM_FI_DEV_GPU_TEMP,instance)",
"query": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
Expand All @@ -963,56 +971,63 @@
},
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)",
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP,Hostname)",
"hide": 0,
"includeAll": true,
"label": "gpu",
"label": "Hostname",
"multi": true,
"name": "gpu",
"name": "Hostname",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)",
"query": "label_values(DCGM_FI_DEV_GPU_TEMP,Hostname)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 2,
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": false,
"text": "test",
"value": "test"
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"definition": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)",
"description": "This is the name of the application that is running on the GPU at the moment the metrics were retrieved ",
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)",
"hide": 0,
"includeAll": true,
"label": "Application",
"label": "gpu",
"multi": true,
"name": "application",
"name": "gpu",
"options": [],
"query": {
"qryType": 1,
"query": "label_values(DCGM_FI_DEV_GPU_UTIL,porter_run_app_name)",
"query": "label_values(DCGM_FI_DEV_GPU_TEMP,gpu)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
Expand All @@ -1021,7 +1036,7 @@
]
},
"time": {
"from": "now-15m",
"from": "now-5m",
"to": "now"
},
"timepicker": {
Expand All @@ -1039,8 +1054,8 @@
]
},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "cdveeo686dr0gc",
"version": 1,
"title": "NVIDIA DCGM Exporter Dashboard 2",
"uid": "cdveeo686dr0gd",
"version": 7,
"weekStart": ""
}
3 changes: 3 additions & 0 deletions applications/job/templates/cronjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ spec:
imagePullSecrets:
- name: {{ .Values.global.image.imagePullSecret }}
{{- end }}
{{- if .Values.enableHostIpc }}
hostIPC: true
{{- end }}
containers:
- name: {{ .Chart.Name }}
{{- if .Values.global }}
Expand Down
5 changes: 5 additions & 0 deletions applications/job/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,8 @@ nodeGroups: []
fileSecretMounts:
enabled: false
mounts: []

# hostIPC is required for pods that need access to the host's IPC namespace,
# for example pods that use MPS-sliced GPUs.
# Enable this conservatively: it weakens the isolation between the pod and the host.
enableHostIpc: false
3 changes: 3 additions & 0 deletions applications/web/templates/deployment-blue-green-legacy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ spec:
imagePullSecrets:
- name: {{ $.Values.image.imagePullSecret }}
{{- end }}
{{- if $.Values.enableHostIpc }}
hostIPC: true
{{- end }}
containers:
- name: {{ $.Chart.Name }}
securityContext:
Expand Down
3 changes: 3 additions & 0 deletions applications/web/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ spec:
imagePullSecrets:
- name: {{ .Values.global.image.imagePullSecret }}
{{- end }}
{{- if .Values.enableHostIpc }}
hostIPC: true
{{- end }}
initContainers:
# this is used for ensuring the kubelet is ready on new nodes, and can inject any downward API keys
- name: downward-api
Expand Down
5 changes: 5 additions & 0 deletions applications/web/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,8 @@ metricsScraping:
enabled: false
port: 80
path: "/metrics"

# hostIPC is required for pods that need access to the host's IPC namespace,
# for example pods that use MPS-sliced GPUs.
# Enable this conservatively: it weakens the isolation between the pod and the host.
enableHostIpc: false
3 changes: 3 additions & 0 deletions applications/worker/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ spec:
imagePullSecrets:
- name: {{ .Values.global.image.imagePullSecret }}
{{- end }}
{{- if .Values.enableHostIpc }}
hostIPC: true
{{- end }}
initContainers:
# this is used for ensuring the kubelet is ready on new nodes, and can inject any downward API keys
- name: downward-api
Expand Down
5 changes: 5 additions & 0 deletions applications/worker/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,8 @@ additionalVolumes:
# type: ""
# mountPath: ""
# volumeOptions: {}

# hostIPC is required for pods that need access to the host's IPC namespace,
# for example pods that use MPS-sliced GPUs.
# Enable this conservatively: it weakens the isolation between the pod and the host.
enableHostIpc: false

0 comments on commit 9672c7d

Please sign in to comment.