From d8a60ef8b7c98b039beec9a1df85a7af8ebcbb7f Mon Sep 17 00:00:00 2001 From: Saravjeet 'Aman' Singh Date: Wed, 31 Jul 2024 22:32:22 +0530 Subject: [PATCH] Metrics: Producer metric overhaul + grafana dashboard (#90) * publisher: kinesis: add metric for hitting throughput limits * publisher: kinesis: refactor metrics reporting * publisher: kinesis: add metrics for stream creation limit exceeded * chore: kinesis: make internal names consistent * publisher: pubsub: add metric for hitting throughput limits * publisher: pubsub: add metric for exceeding topic creation limits * kinesis: document bug with reporting throughput errors * publisher: kafka: remove false positives from delivery metrics * publisher: kafka: fix tests * publisher: pubsub: remove false positives from delivery metrics * publisher: kinesis: remove false positives from delivery report * publisher: emit non-delivery metrics on topic/stream existence errors * chore: remove redundant imports * misc: use better var names * publishers: add topic name to *_messages_delivered_total metric * publisher: kinesis: make partition-key fixed length * chore: remove typo in comments * metrics: update metric metadata * metrics: add topic/stream label for failed deliveries * docs: update delivery metrics with topic/stream name * docs: add ${PUBLISHER}_producebulk_tt_ms metric documentation * misc: add dashboard with support for prometheus data source * docs: add info about grafana dashboard * chore: make dashboard portable * misc: rename dashboard to grafana.json for better semantics --- dashboards/grafana.json | 4067 +++++++++++++++++++++++++++++++ docs/docs/guides/monitoring.md | 8 + docs/docs/reference/metrics.md | 73 +- integration/integration_test.go | 10 +- metrics/prometheus.go | 24 +- metrics/prometheus_test.go | 2 +- publisher/kafka/kafka.go | 61 +- publisher/kafka/kafka_test.go | 20 +- publisher/kinesis/kinesis.go | 169 +- publisher/pubsub/error.go | 2 +- publisher/pubsub/pubsub.go | 152 +- worker/worker.go | 2 
+- 12 files changed, 4409 insertions(+), 181 deletions(-) create mode 100644 dashboards/grafana.json diff --git a/dashboards/grafana.json b/dashboards/grafana.json new file mode 100644 index 00000000..7fc67f4e --- /dev/null +++ b/dashboards/grafana.json @@ -0,0 +1,4067 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "duration from the time request is processed to the time events are published. 
This metric is calculated per event by the following formula: (PublishedTime - ProcessedTime)/CountEvents", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(worker_processing_duration_milliseconds_bucket[1m])) by (le, worker))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Worker Processing Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Duration from the time request is received to the time events are published. 
This metric is calculated per event by the following formula: (PublishedTime - ReceivedTime)/CountEvents", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(server_processing_latency_milliseconds_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Server Processing Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Duration from the time request is sent to the time events are published. 
This metric is calculated per event by the following formula: (PublishedTime - SentTime)/CountEvents", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(event_processing_duration_milliseconds_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Event Processing Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Duration from when the request is received to when the request is processed. 
High value of this metric indicates the publisher is slow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(batch_idle_in_channel_milliseconds_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Batch idle time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total bytes received in requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(events_rx_bytes_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Data Ingested per minute", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Request count", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(batches_read_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Batches read per minute", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events received in requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 52 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" 
+ }, + "editorMode": "code", + "expr": "rate(events_rx_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Events Ingested per minute", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of duplicate events", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "events_duplicate_total", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Duplicate Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Time taken from event being consumed from the queue to being acked by the ack handler.", + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 68 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(event_rtt_ms_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Event Round Trip Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Time difference between ack function called by producer and processing by the ack handler.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false 
+ }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 76 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(ack_event_rtt_ms_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Ack Latency", + "type": "timeseries" + } + ], + "title": "Delivery", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 1, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Bytes of allocated heap objects", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "server_mem_heap_alloc_bytes_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Heap (Allocated)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "in-use spans of heap in bytes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "server_mem_heap_inuse_bytes_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Heap (In Use)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of allocated heap objects\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "server_mem_heap_objects_total_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Objects in Heap", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", 
+ "uid": "${DS_PROMETHEUS}" + }, + "description": "Bytes in stack spans\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "server_mem_stack_inuse_bytes_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Stack Size (In Use)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of goroutines spawned in a single flush", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "server_go_routines_count_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Goroutine Count", + "type": "timeseries" + } + ], + "title": "Process", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 16, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of successful connections established to the server (per minute)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(user_connection_success_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Successful Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of failed connections established to the server (per minute)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(user_connection_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Failed Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of alive connections\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "connections_count_current", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Active Connections", + "type": "timeseries" + }, 
+ { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Duration of alive connection per session per connection", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(user_session_duration_milliseconds_bucket[1m])) by (le))", + "instant": false, + "legendFormat": "Average Time", + "range": true, + "refId": "A" + } + ], + "title": "User Session Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of connection close errors encountered", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": 
"", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 55 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(conn_close_err_count[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Connection Close errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total ping that server fails to send (per minute)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(server_ping_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Ping Failure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total pong that server fails to send (per minute)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + 
"tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(server_pong_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pong Failure", + "type": "timeseries" + } + ], + "title": "Server", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 22, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Response time of produce batch method of the kafka producer", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 90 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.50, 
sum(rate(kafka_producebulk_tt_ms_bucket[1m])) by (le))) ", + "instant": false, + "legendFormat": "Average Time", + "range": true, + "refId": "A" + } + ], + "title": "Average Produce Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events delivered to Kafka.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 97 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kafka_messages_delivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Events delivered", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events not delivered to Kafka.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 104 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kafka_messages_undelivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Undelivered Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Rate of delivery failures caused by non-existent topics (per second, averaged over 1 minute)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 110 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kafka_unknown_topic_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unknown Topic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total number of requests sent to Kafka brokers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 
24, + "x": 0, + "y": 116 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "kafka_brokers_tx_total", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Requests sent to Brokers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total number of bytes transmitted to Kafka brokers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 122 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "kafka_brokers_tx_bytes_total", + "instant": false, + 
"legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Data transfer to brokers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Broker latency / round-trip time in milliseconds", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 128 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "kafka_brokers_rtt_average_milliseconds", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Broker round trip time", + "type": "timeseries" + } + ], + "title": "Publisher: Kafka", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 30, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": 
"Number of events delivered to PubSub.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(pubsub_messages_delivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Delivered Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events that were not delivered to PubSub.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 35, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(pubsub_messages_undelivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Undelivered Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused by non-existence of topic in PubSub.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" 
+ }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(pubsub_unknown_topic_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unknown Topic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused by exceeding throughput limits on PubSub.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(pubsub_topic_throughput_exceeded_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Topic Throughput exceeded errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused by exceeding the limit on number of Topics on PubSub.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 71 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(pubsub_topics_limit_exceeded_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Topic Limit exceeded errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROMETHEUS}" + }, + "description": "Response time of produce batch method of the pubsub producer", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 79 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.50, sum(rate(pubsub_producebulk_tt_ms_bucket[1m])) by (le))) ", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average Produce Time", + "type": "timeseries" + } + ], + "title": "Publisher: PubSub", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 37, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events successfully delivered to Kinesis.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 43, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kinesis_messages_delivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Events Delivered", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of events not delivered to Kinesis.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 49 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kinesis_messages_undelivered_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Undelivered Events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused by non-existence of stream in Kinesis.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + 
"y": 56 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kinesis_unknown_stream_failure_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unknown Stream Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused by exceeding shard throughput limits. This error can also occur if the message size of an event exceeds message size limit ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"editorMode": "code", + "expr": "rate(kinesis_stream_throughput_exceeded_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Stream Throughput exceeded errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of delivery failures caused due to too many streams in CREATING mode. AWS Kinesis limits how many stream creation requests can be submitted in parallel to 5.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(kinesis_streams_limit_exceeded_total[1m])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Stream Limit exceeded errors", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Response time of produce batch method of the kinesis producer", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 77 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(histogram_quantile(0.50, sum(rate(kinesis_producebulk_tt_ms_bucket[1m])) by (le))) ", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average Produce Time", + "type": "timeseries" + } + ], + "title": "Publisher: Kinesis", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "2024-07-27T17:19:04.007Z", + "to": "2024-07-27T19:52:31.823Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Raccoon", + "uid": "ddt4tpfg6k5q8f", + "version": 35, + "weekStart": "" +} diff 
--git a/docs/docs/guides/monitoring.md b/docs/docs/guides/monitoring.md index 899d0d5d..974b87de 100644 --- a/docs/docs/guides/monitoring.md +++ b/docs/docs/guides/monitoring.md @@ -153,3 +153,11 @@ Raccoon provides fine-grained metrics that denote latency. That gives clues as t - [`event_processing_duration_milliseconds`](reference/metrics.md#event_processing_duration_milliseconds) This metrics denotes overall latency. You need to look at other latency metrics to find the root cause when this metric is high. - [`server_processing_latency_milliseconds`](reference/metrics.md#server_processing_latency_milliseconds) Correlate this metric with `event_processing_duration_milliseconds` to infer whether the issue is with Raccoon itself, or something wrong with the network, or the way [sent_time](https://github.com/raystack/proton/blob/main/raystack/raccoon/v1beta1/raccoon.proto#L47) is generated.- - [`worker_processing_duration_milliseconds`](reference/metrics.md#worker_processing_duration_milliseconds) High value of this metric indicates that the publisher is slow or can't keep up. + + +### Dashboard + +There is a pre-built [Grafana dashboard](https://github.com/raystack/raccoon/blob/main/dashboards/grafana.json) available with support for Prometheus data source. + +If you're running the statsd + telegraf setup, you can configure telegraf to push metrics to Prometheus. + diff --git a/docs/docs/reference/metrics.md b/docs/docs/reference/metrics.md index 7d099725..f09ef189 100644 --- a/docs/docs/reference/metrics.md +++ b/docs/docs/reference/metrics.md @@ -14,9 +14,6 @@ This page contains the reference for all the metrics exposed by Raccoon. 
- [Kafka](#kafka) - [PubSub](#pubsub) - [Kinesis](#kinesis) -- [PubSub Publisher](metrics.md#pubsub-publisher) -- [Kinesis Publisher](metrics.md#kinesis-publisher) -- [Resource Usage](metrics.md#resource-usage) - [Event Delivery](metrics.md#event-delivery) ## Server Connection @@ -74,17 +71,17 @@ Number of connection close errors encountered ### Kafka #### `kafka_messages_delivered_total` -Number of delivered events to Kafka. The metric also contains false increments. To find the true value, one should use the difference between this and `kafka_messages_undelivered_total` metric for the same tag/labels. +Number of events delivered to Kafka. - Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `topic=topicname` `conn_group=*` `event_type=*` #### `kafka_messages_undelivered_total` -The count of false increments done by `kafka_messages_delivered_total`. To be used in conjunction with the former for accurate metrics. +Number of events not delivered to Kafka. - Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `topic=topicname` `conn_group=*` `event_type=*` #### `kafka_unknown_topic_failure_total` @@ -153,50 +150,92 @@ Response time of produce batch method of the kafka producer #### `pubsub_messages_delivered_total` -Number of delivered events to PubSub. The metric also contains false increments. To find the true value, one should use the difference between this and `pubsub_messages_undelivered_total` metric for the same tag/labels. +Number of events delivered to PubSub. - Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `topic=topicname` `conn_group=*` `event_type=*` #### `pubsub_messages_undelivered_total` -The count of false increments done by `pubsub_messages_delivered_total`. To be used in conjunction with the former for accurate metrics. +Number of events that were not delivered to PubSub. 
- Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `topic=topicname` `conn_group=*` `event_type=*` #### `pubsub_unknown_topic_failure_total` -Number of delivery failure caused by topic does not exist in PubSub. +Number of delivery failures caused by non-existence of topic in PubSub. - Type: `Count` - Tags: `topic=topicname` `event_type=*` `conn_group=*` +#### `pubsub_topic_throughput_exceeded_total` + +Number of delivery failures caused by exceeding throughput limits on PubSub. + +- Type: `Count` +- Tags: `topic=topicname` `event_type=*` `conn_group=*` + +#### `pubsub_topics_limit_exceeded_total` + +Number of delivery failures caused by exceeding the limit on number of Topics on PubSub. + +- Type: `Count` +- Tags: `topic=topicname` `event_type=*` `conn_group=*` + +#### `pubsub_producebulk_tt_ms` + +Response time of produce batch method of the pubsub producer + +- Type: `Timing` +- Tags: NA + ### Kinesis #### `kinesis_messages_delivered_total` -Number of delivered events to Kinesis. The metric also contains false increments. To find the true value, one should use the difference between this and `kinesis_messages_undelivered_total` metric for the same tag/labels. +Number of events successfully delivered to Kinesis. - Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `stream=streamname` `conn_group=*` `event_type=*` #### `kinesis_messages_undelivered_total` -The count of false increments done by `kinesis_messages_delivered_total`. To be used in conjunction with the former for accurate metrics. +Number of events not delivered to Kinesis. - Type: `Count` -- Tags: `success=false` `success=true` `conn_group=*` `event_type=*` +- Tags: `stream=streamname` `conn_group=*` `event_type=*` #### `kinesis_unknown_stream_failure_total` -Number of delivery failure caused by stream does not exist in Kinesis. +Number of delivery failures caused by non-existence of stream in Kinesis. 
- Type: `Count` - Tags: `stream=streamname` `event_type=*` `conn_group=*` +#### `kinesis_stream_throughput_exceeded_total` + +Number of delivery failures caused by exceeding shard throughput limits. This error can also occur if the message size of an event exceeds message size limit (1MiB as of the day of this writing). See [Limits and Quotas on Kinesis](https://docs.aws.amazon.com/streams/latest/dev/service-sizes-and-limits.html) + +- Type: `Count` +- Tags: `stream=streamname` `event_type=*` `conn_group=*` + +#### `kinesis_streams_limit_exceeded_total` + +Number of delivery failures caused by too many streams in `CREATING` mode. AWS Kinesis limits how many stream creation requests can be submitted in parallel to 5. + +- Type: `Count` +- Tags: `stream=streamname` `event_type=*` `conn_group=*` + +#### `kinesis_producebulk_tt_ms` + +Response time of produce batch method of the kinesis producer + +- Type: `Timing` +- Tags: NA + ## Resource Usage ### `server_mem_gc_triggered_current` diff --git a/integration/integration_test.go b/integration/integration_test.go index be7b4979..c73ff08a 100644 --- a/integration/integration_test.go +++ b/integration/integration_test.go @@ -30,18 +30,24 @@ var url, wsurl string var bootstrapServers string var grpcServerAddr string +const envTestHost = "INTEGTEST_HOST" + func TestMain(m *testing.M) { uuid = fmt.Sprintf("%d-test", rand.Int()) timeout = 20 * time.Second topicFormat = os.Getenv("INTEGTEST_TOPIC_FORMAT") - wsurl = fmt.Sprintf("ws://%v/api/v1/events", os.Getenv("INTEGTEST_HOST")) - url = fmt.Sprintf("http://%v/api/v1/events", os.Getenv("INTEGTEST_HOST")) + wsurl = fmt.Sprintf("ws://%v/api/v1/events", os.Getenv(envTestHost)) + url = fmt.Sprintf("http://%v/api/v1/events", os.Getenv(envTestHost)) grpcServerAddr = os.Getenv("GRPC_SERVER_ADDR") bootstrapServers = os.Getenv("INTEGTEST_BOOTSTRAP_SERVER") os.Exit(m.Run()) } func TestIntegration(t *testing.T) { + if os.Getenv(envTestHost) == "" { + t.Errorf("cannot run tests because 
%s env variable is not set", envTestHost) + return + } var err error assert.NoError(t, err) header := http.Header{ diff --git a/metrics/prometheus.go b/metrics/prometheus.go index c126ac6a..ba608c5a 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -140,22 +140,34 @@ func getCounterMap() map[string]CounterVec { counters["kafka_messages_delivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "kafka_messages_delivered_total", - Help: "Number of delivered events to Kafka"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of delivered events to Kafka"}, []string{"topic", "conn_group", "event_type"}) counters["kafka_messages_undelivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "kafka_messages_undelivered_total", - Help: "Number of delivered events to Kafka which failed while reading delivery report"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of events that failed delivery"}, []string{"topic", "conn_group", "event_type"}) counters["pubsub_messages_delivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "pubsub_messages_delivered_total", - Help: "Number of delivered events to Kafka"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of delivered events to PubSub"}, []string{"topic", "conn_group", "event_type"}) counters["pubsub_messages_undelivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "pubsub_messages_undelivered_total", - Help: "Number of delivered events to PubSub which failed while reading delivery report"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of events that failed delivery"}, []string{"topic", "conn_group", "event_type"}) counters["kinesis_messages_delivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "kinesis_messages_delivered_total", - Help: "Number of delivered events to Kafka"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of 
delivered events to Kinesis"}, []string{"stream", "conn_group", "event_type"}) counters["kinesis_messages_undelivered_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "kinesis_messages_undelivered_total", - Help: "Number of delivered events to kinesis which failed while reading delivery report"}, []string{"success", "conn_group", "event_type"}) + Help: "Number of events that failed delivery"}, []string{"stream", "conn_group", "event_type"}) + counters["kinesis_stream_throughput_exceeded_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "kinesis_stream_throughput_exceeded_total", + Help: "Number of messages that failed to deliver because the operation exceeded shard limit or if the message size was too big"}, []string{"stream", "conn_group", "event_type"}) + counters["kinesis_streams_limit_exceeded_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "kinesis_streams_limit_exceeded_total", + Help: "Number of messages that failed to deliver because target stream creation failed due to too many stream creation requests"}, []string{"stream", "conn_group", "event_type"}) + counters["pubsub_topic_throughput_exceeded_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "pubsub_topic_throughput_exceeded_total", + Help: "Number of messages that failed to deliver because pub/sub throughput limits were exceeded"}, []string{"topic", "conn_group", "event_type"}) + counters["pubsub_topics_limit_exceeded_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "pubsub_topics_limit_exceeded_total", + Help: "Number of messages that failed to deliver because pub/sub topic limits were exceeded"}, []string{"topic", "conn_group", "event_type"}) counters["events_rx_bytes_total"] = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "events_rx_bytes_total", Help: "Total byte receieved in requests"}, []string{"conn_group", "event_type"}) diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index 
42c1d948..cad1d8d7 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -117,7 +117,7 @@ func (m *mockObserver) Observe(f float64) { func (promSuite *PrometheusTestSuite) Test_Prometheus_Collector_Metrics_Initialised() { // NOTE(turtledev): what are we even testing here? - numCounters := 18 + numCounters := 22 numGauge := 15 numHistogram := 10 var err error diff --git a/publisher/kafka/kafka.go b/publisher/kafka/kafka.go index d93c14e5..c35ad454 100644 --- a/publisher/kafka/kafka.go +++ b/publisher/kafka/kafka.go @@ -49,7 +49,6 @@ type Kafka struct { } // ProduceBulk messages to kafka. Block until all messages are sent. Return array of error. Order of Errors is guaranteed. -// DeliveryChannel needs to be exclusive. DeliveryChannel is exposed for recyclability purpose. func (pr *Kafka) ProduceBulk(events []*pb.Event, connGroup string) error { errors := make([]error, len(events)) totalProcessed := 0 @@ -64,38 +63,60 @@ func (pr *Kafka) ProduceBulk(events []*pb.Event, connGroup string) error { err := pr.kp.Produce(message, deliveryChannel) if err != nil { - metrics.Increment("kafka_messages_delivered_total", map[string]string{"success": "false", "conn_group": connGroup, "event_type": event.Type}) + metrics.Increment( + "kafka_messages_undelivered_total", + map[string]string{ + "topic": topic, + "conn_group": connGroup, + "event_type": event.Type, + }, + ) if err.Error() == "Local: Unknown topic" { errors[order] = fmt.Errorf("%v %s", err, topic) - metrics.Increment("kafka_unknown_topic_failure_total", map[string]string{"topic": topic, "event_type": event.Type, "conn_group": connGroup}) + metrics.Increment( + "kafka_unknown_topic_failure_total", + map[string]string{ + "topic": topic, + "event_type": event.Type, + "conn_group": connGroup, + }, + ) } else { errors[order] = err } continue } - metrics.Increment( - "kafka_messages_delivered_total", - map[string]string{ - "success": "true", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) 
- totalProcessed++ } // Wait for deliveryChannel as many as processed for range totalProcessed { - d := <-deliveryChannel - m := d.(*kafka.Message) - if m.TopicPartition.Error != nil { - order := m.Opaque.(int) - eventType := events[order].Type - metrics.Increment("kafka_messages_undelivered_total", map[string]string{"success": "true", "conn_group": connGroup, "event_type": eventType}) - metrics.Increment("kafka_messages_delivered_total", map[string]string{"success": "false", "conn_group": connGroup, "event_type": eventType}) - errors[order] = m.TopicPartition.Error + var ( + deliveryReport = <-deliveryChannel + msg = deliveryReport.(*kafka.Message) + order = msg.Opaque.(int) + eventType = events[order].Type + ) + if msg.TopicPartition.Error != nil { + metrics.Increment( + "kafka_messages_undelivered_total", + map[string]string{ + "topic": *msg.TopicPartition.Topic, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + errors[order] = msg.TopicPartition.Error + continue } + metrics.Increment( + "kafka_messages_delivered_total", + map[string]string{ + "topic": *msg.TopicPartition.Topic, + "conn_group": connGroup, + "event_type": eventType, + }, + ) } if cmp.Or(errors...) 
!= nil { diff --git a/publisher/kafka/kafka_test.go b/publisher/kafka/kafka_test.go index 6826a3e8..ad5425db 100644 --- a/publisher/kafka/kafka_test.go +++ b/publisher/kafka/kafka_test.go @@ -50,6 +50,7 @@ func TestKafka_ProduceBulk(suite *testing.T) { Offset: 0, Error: nil, }, + Opaque: 0, } }() }) @@ -65,16 +66,15 @@ func TestKafka_ProduceBulk(suite *testing.T) { client := &mockClient{} client.On("Produce", mock.Anything, mock.Anything).Return(fmt.Errorf("buffer full")).Once() client.On("Produce", mock.Anything, mock.Anything).Return(nil).Run(func(args mock.Arguments) { - go func() { - args.Get(1).(chan kafka.Event) <- &kafka.Message{ - TopicPartition: kafka.TopicPartition{ - Topic: args.Get(0).(*kafka.Message).TopicPartition.Topic, - Partition: 0, - Offset: 0, - Error: nil, - }, - } - }() + args.Get(1).(chan kafka.Event) <- &kafka.Message{ + TopicPartition: kafka.TopicPartition{ + Topic: args.Get(0).(*kafka.Message).TopicPartition.Topic, + Partition: 0, + Offset: 0, + Error: nil, + }, + Opaque: 1, + } }).Once() client.On("Produce", mock.Anything, mock.Anything).Return(fmt.Errorf("buffer full")).Once() kp := NewFromClient(client, 10, "%s", 1) diff --git a/publisher/kinesis/kinesis.go b/publisher/kinesis/kinesis.go index 24af3697..7ff1cbfc 100644 --- a/publisher/kinesis/kinesis.go +++ b/publisher/kinesis/kinesis.go @@ -29,7 +29,7 @@ type Publisher struct { streamAutocreate bool streamProbeInterval time.Duration streamMode types.StreamMode - defaultShardCount int32 + streamShards int32 publishTimeout time.Duration } @@ -47,39 +47,13 @@ func (p *Publisher) ProduceBulk(events []*pb.Event, connGroup string) error { if p.streamAutocreate { err := p.ensureStream(ctx, streamName) if err != nil { - metrics.Increment( - "kinesis_messages_delivered_total", - map[string]string{ - "success": "false", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - if p.isErrNotFound(err) { - metrics.Increment( - "kinesis_unknown_stream_failure_total", - map[string]string{ - 
"stream": streamName, - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - } + reportStreamError(err, streamName, connGroup, event.Type) errors[order] = err continue } } - metrics.Increment( - "kinesis_messages_delivered_total", - map[string]string{ - "success": "true", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - - partitionKey := fmt.Sprintf("%d", rand.Int31()) + partitionKey := fmt.Sprintf("%010d", rand.Int31()) _, err := p.client.PutRecord( ctx, &kinesis.PutRecordInput{ @@ -90,35 +64,19 @@ func (p *Publisher) ProduceBulk(events []*pb.Event, connGroup string) error { ) if err != nil { - metrics.Increment( - "kinesis_messages_delivered_total", - map[string]string{ - "success": "false", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - metrics.Increment( - "kinesis_messages_undelivered_total", - map[string]string{ - "success": "true", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - if p.isErrNotFound(err) { - metrics.Increment( - "kinesis_unknown_stream_failure_total", - map[string]string{ - "stream": streamName, - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - } + reportPutError(err, streamName, connGroup, event.Type) errors[order] = err continue } + + metrics.Increment( + "kinesis_messages_delivered_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": event.Type, + }, + ) } if cmp.Or(errors...) 
!= nil { return &publisher.BulkError{Errors: errors} @@ -145,7 +103,7 @@ func (p *Publisher) ensureStream(ctx context.Context, name string) error { ) if err != nil { - if !p.isErrNotFound(err) { + if !isErrNotFound(err) { return err } @@ -153,7 +111,7 @@ func (p *Publisher) ensureStream(ctx context.Context, name string) error { ctx, &kinesis.CreateStreamInput{ StreamName: aws.String(name), - ShardCount: aws.Int32(p.defaultShardCount), + ShardCount: aws.Int32(p.streamShards), StreamModeDetails: &types.StreamModeDetails{ StreamMode: p.streamMode, }, @@ -190,14 +148,6 @@ func (p *Publisher) ensureStream(ctx context.Context, name string) error { return nil } -func (*Publisher) isErrNotFound(e error) bool { - var ( - errNotFound *types.ResourceNotFoundException - isErrNotFound = errors.As(e, &errNotFound) - ) - return isErrNotFound -} - func (*Publisher) Name() string { return "kinesis" } func (*Publisher) Close() error { return nil } @@ -233,7 +183,7 @@ func WithStreamMode(mode types.StreamMode) Opt { func WithShards(n uint32) Opt { return func(p *Publisher) error { - p.defaultShardCount = int32(n) + p.streamShards = int32(n) return nil } } @@ -263,7 +213,7 @@ func New(client *kinesis.Client, opts ...Opt) (*Publisher, error) { p := &Publisher{ client: client, streamPattern: "%s", - defaultShardCount: 1, + streamShards: 1, streamProbeInterval: time.Second, streamMode: types.StreamModeOnDemand, streams: make(map[string]bool), @@ -277,3 +227,88 @@ func New(client *kinesis.Client, opts ...Opt) (*Publisher, error) { } return p, nil } + +func isErrNotFound(e error) bool { + var t *types.ResourceNotFoundException + return errors.As(e, &t) +} + +func isErrThroughputExceeded(e error) bool { + var t *types.ProvisionedThroughputExceededException + return errors.As(e, &t) +} + +func isErrLimitExceeded(e error) bool { + var t *types.LimitExceededException + return errors.As(e, &t) +} + +func reportPutError(err error, streamName, connGroup, eventType string) { + metrics.Increment( + 
"kinesis_messages_undelivered_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + if isErrNotFound(err) { + metrics.Increment( + "kinesis_unknown_stream_failure_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } + + // BUG: AWS Kinesis API returns types.ProvisionedThroughputExceededException + // (which is checked by isErrThroughputExceeded) when there are too many + // `put` requests in flight. However, the same error is also returned + // when the size of an individual message exceeds the message size threshold. + // That means we need a more fine-grained method of detecting which + // of the two cases we're getting this error for. + if isErrThroughputExceeded(err) { + metrics.Increment( + "kinesis_stream_throughput_exceeded_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } +} + +func reportStreamError(err error, streamName, connGroup, eventType string) { + metrics.Increment( + "kinesis_messages_undelivered_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + if isErrNotFound(err) { + metrics.Increment( + "kinesis_unknown_stream_failure_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } + + if isErrLimitExceeded(err) { + metrics.Increment( + "kinesis_streams_limit_exceeded_total", + map[string]string{ + "stream": streamName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } +} diff --git a/publisher/pubsub/error.go b/publisher/pubsub/error.go index 01dc33e3..afdd6ff9 100644 --- a/publisher/pubsub/error.go +++ b/publisher/pubsub/error.go @@ -6,7 +6,7 @@ type unknownTopicError struct { Topic, Project string } -func (e *unknownTopicError) Error() string { +func (e unknownTopicError) Error() string { return 
fmt.Sprintf( `topic %q doesn't exist in %q project`, e.Topic, e.Project, ) diff --git a/publisher/pubsub/pubsub.go b/publisher/pubsub/pubsub.go index 988a7ddf..1ef0a543 100644 --- a/publisher/pubsub/pubsub.go +++ b/publisher/pubsub/pubsub.go @@ -3,6 +3,7 @@ package pubsub import ( "cmp" "context" + "errors" "fmt" "strings" "sync" @@ -44,29 +45,11 @@ func (p *Publisher) ProduceBulk(events []*pb.Event, connGroup string) error { results := make([]*pubsub.PublishResult, len(events)) for order, event := range events { - topicId := strings.Replace(p.topicFormat, "%s", event.Type, 1) + topicName := p.topicNameFromEvent(event) - topic, err := p.topic(ctx, topicId) + topic, err := p.topic(ctx, topicName) if err != nil { - metrics.Increment( - "pubsub_messages_delivered_total", - map[string]string{ - "success": "false", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - _, isUnknownTopic := err.(*unknownTopicError) - if isUnknownTopic { - metrics.Increment( - "pubsub_unknown_topic_failure_total", - map[string]string{ - "topic": topicId, - "conn_group": connGroup, - "event_type": event.Type, - }, - ) - } + reportTopicError(err, topicName, connGroup, event.Type) errors[order] = err continue } @@ -74,42 +57,31 @@ func (p *Publisher) ProduceBulk(events []*pb.Event, connGroup string) error { results[order] = topic.Publish(ctx, &pubsub.Message{ Data: event.EventBytes, }) - - metrics.Increment( - "pubsub_messages_delivered_total", - map[string]string{ - "success": "true", - "conn_group": connGroup, - "event_type": event.Type, - }, - ) } for order, result := range results { if result == nil { continue } + var ( + event = events[order] + topicName = p.topicNameFromEvent(event) + ) _, err := result.Get(ctx) if err != nil { - metrics.Increment( - "pubsub_messages_delivered_total", - map[string]string{ - "success": "false", - "conn_group": connGroup, - "event_type": events[order].Type, - }, - ) - metrics.Increment( - "pubsub_messages_undelivered_total", - 
map[string]string{ - "success": "true", - "conn_group": connGroup, - "event_type": events[order].Type, - }, - ) + reportPublishError(err, topicName, connGroup, event.Type) errors[order] = err continue } + + metrics.Increment( + "pubsub_messages_delivered_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": events[order].Type, + }, + ) } if cmp.Or(errors...) != nil { @@ -118,6 +90,10 @@ func (p *Publisher) ProduceBulk(events []*pb.Event, connGroup string) error { return nil } +func (p *Publisher) topicNameFromEvent(event *pb.Event) string { + return strings.Replace(p.topicFormat, "%s", event.Type, 1) +} + func (p *Publisher) topic(ctx context.Context, id string) (*pubsub.Topic, error) { p.topicLock.RLock() topic, exists := p.topics[id] @@ -145,7 +121,7 @@ func (p *Publisher) topic(ctx context.Context, id string) (*pubsub.Topic, error) if !exists { if !p.autoCreateTopic { - return nil, &unknownTopicError{Topic: id, Project: p.client.Project()} + return nil, unknownTopicError{Topic: id, Project: p.client.Project()} } cfg := &pubsub.TopicConfig{} @@ -156,7 +132,7 @@ func (p *Publisher) topic(ctx context.Context, id string) (*pubsub.Topic, error) topic, err = p.client.CreateTopicWithConfig(ctx, id, cfg) if err != nil { // in case a service replica created this topic before we could - if p.isAlreadyExistsError(err) { + if isErrAlreadyExists(err) { topic = p.client.Topic(id) } else { return nil, fmt.Errorf("error creating topic %q: %w", id, err) @@ -182,14 +158,6 @@ func (p *Publisher) Name() string { return "pubsub" } -func (p *Publisher) isAlreadyExistsError(e error) bool { - apiError, ok := e.(*apierror.APIError) - if !ok { - return false - } - return apiError.GRPCStatus().Code() == codes.AlreadyExists -} - type Opt func(*Publisher) func WithTopicAutocreate(autocreate bool) Opt { @@ -250,3 +218,75 @@ func New(client *pubsub.Client, opts ...Opt) (*Publisher, error) { return p, nil } + +func isErrUnknownTopic(e error) bool { + 
return errors.As(e, &unknownTopicError{}) +} + +func isErrAlreadyExists(e error) bool { + return hasAPIErrorCode(e, codes.AlreadyExists) +} + +func isErrResourceExhausted(e error) bool { + return hasAPIErrorCode(e, codes.ResourceExhausted) +} + +func hasAPIErrorCode(e error, code codes.Code) bool { + apiError, ok := e.(*apierror.APIError) + if !ok { + return false + } + return apiError.GRPCStatus().Code() == code +} + +func reportTopicError(err error, topicName, connGroup, eventType string) { + metrics.Increment( + "pubsub_messages_undelivered_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + switch { + case isErrUnknownTopic(err): + metrics.Increment( + "pubsub_unknown_topic_failure_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + case isErrResourceExhausted(err): + metrics.Increment( + "pubsub_topics_limit_exceeded_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } +} + +func reportPublishError(err error, topicName, connGroup, eventType string) { + metrics.Increment( + "pubsub_messages_undelivered_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + if isErrResourceExhausted(err) { + metrics.Increment( + "pubsub_topic_throughput_exceeded_total", + map[string]string{ + "topic": topicName, + "conn_group": connGroup, + "event_type": eventType, + }, + ) + } +} diff --git a/worker/worker.go b/worker/worker.go index 8bf48372..44af302d 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -71,7 +71,7 @@ func (w *Pool) worker(name string) { case *publisher.BulkError: for _, e := range et.Errors { if e != nil { - logger.Errorf("[worker] Fail to publish message to kafka %v", e) + logger.Errorf("[worker] Fail to publish message: %v", e) totalErr++ } }