From 0f9306112b6135fcf2ccfe5001e895fea48c0ee7 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:50:02 -0700 Subject: [PATCH 01/85] Deploy signoz --- .../stacks/dpe-k8s-deployments/main.tf | 25 +- modules/signoz/main.tf | 39 + modules/signoz/templates/values.yaml | 2388 +++++++++++++++++ modules/signoz/variables.tf | 27 + modules/signoz/versions.tf | 24 + 5 files changed, 2501 insertions(+), 2 deletions(-) create mode 100644 modules/signoz/main.tf create mode 100644 modules/signoz/templates/values.yaml create mode 100644 modules/signoz/variables.tf create mode 100644 modules/signoz/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 7c227ded..c46b44f3 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -44,6 +44,8 @@ module "trivy-operator" { } module "airflow" { + # TODO: This is temporary + count = 0 depends_on = [module.victoria-metrics, module.argo-cd] source = "spacelift.io/sagebionetworks/airflow/aws" version = "0.4.0" @@ -54,6 +56,8 @@ module "airflow" { } module "postgres-cloud-native-operator" { + # TODO: This is temporary + count = 0 depends_on = [module.argo-cd] source = "spacelift.io/sagebionetworks/postgres-cloud-native-operator/aws" version = "0.4.0" @@ -63,13 +67,30 @@ module "postgres-cloud-native-operator" { } module "postgres-cloud-native-database" { + # TODO: This is temporary + count = 0 depends_on = [module.postgres-cloud-native-operator, module.airflow, module.argo-cd] source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" version = "0.5.0" - auto_deploy = true - auto_prune = true + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune git_revision = var.git_revision namespace = "airflow" argo_deployment_name = "airflow-postgres-cloud-native" } + +module "signoz" { + # TODO: This is temporary + count = 0 + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" + # version = "0.5.0" + source = "../../../modules/signoz" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_revision + namespace = "signoz" + argo_deployment_name = "signoz" +} + diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf new file mode 100644 index 00000000..7e187c8f --- /dev/null +++ b/modules/signoz/main.tf @@ -0,0 +1,39 @@ + +resource "kubernetes_namespace" "signoz" { + metadata { + name = var.namespace + } +} + +resource "kubectl_manifest" "signoz-deployment" { + depends_on = [kubernetes_namespace.signoz] + + yaml_body = <`. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. + # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 20Gi + + # -- Clickhouse user profile configuration. + # You can use this to override profile settings, for example + # `default/max_memory_usage: 40000000000` or `default/max_concurrent_queries: 200` + # + # For the full list of settings, see: + # - https://clickhouse.com/docs/en/operations/settings/settings-profiles/ + # - https://clickhouse.com/docs/en/operations/settings/settings/ + # + profiles: {} + + # -- Default user profile configuration for Clickhouse. !!! Please DO NOT override this !!! 
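+  # As a purely illustrative sketch of the `profiles` override documented
+  # above -- both values are assumptions to tune, not recommendations:
+  #
+  # profiles:
+  #   default/max_memory_usage: "40000000000"
+  #   default/max_concurrent_queries: "200"
+  #
+  # (The defaultProfiles that follow are required by SigNoz; leave them as
+  # shipped.)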
+ defaultProfiles: + default/allow_experimental_window_functions: "1" + default/allow_nondeterministic_mutations: "1" + + # -- Clickhouse init container to copy histogramQuantile UDF + # @default -- See `values.yaml` for defaults + initContainers: + enabled: true + udf: + enabled: true + image: + registry: docker.io + repository: alpine + tag: 3.18.2 + pullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -x + wget -O /tmp/histogramQuantile https://github.com/SigNoz/signoz/raw/develop/deploy/docker/clickhouse-setup/user_scripts/histogramQuantile + mv /tmp/histogramQuantile /var/lib/clickhouse/user_scripts/histogramQuantile + chmod +x /var/lib/clickhouse/user_scripts/histogramQuantile + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + - /bin/sh + - -c + - | + set -e + until curl -s -o /dev/null http://signoz-clickhouse:8123/ + do sleep 1 + done + + # -- Clickhouse cluster layout. (Experimental, use at own risk) + # For a full list of options, see https://github.com/Altinity/clickhouse-operator/blob/master/docs/custom_resource_explained.md + # section on clusters and layouts. + # + layout: + shardsCount: 1 + replicasCount: 1 + + # -- ClickHouse settings configuration. + # You can use this to override settings, for example `prometheus/port: 9363` + # For the full list of settings, see: + # - https://clickhouse.com/docs/en/operations/settings/settings/ + # + settings: + # Uncomment those lines if you want to enable the built-in Prometheus HTTP endpoint in ClickHouse. + prometheus/endpoint: /metrics + prometheus/port: 9363 + # prometheus/metrics: true + # prometheus/events: true + # prometheus/asynchronous_metrics: true + + # -- Default settings configuration for ClickHouse. !!! Please DO NOT override this !!! + defaultSettings: + format_schema_path: /etc/clickhouse-server/config.d/ + user_scripts_path: /var/lib/clickhouse/user_scripts/ + user_defined_executable_functions_config: '/etc/clickhouse-server/functions/custom-functions.xml' + + # -- ClickHouse pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '9363' + signoz.io/path: /metrics + + # -- Topologies on how to distribute the ClickHouse pod. + # Possible values can be found here: + # - https://github.com/Altinity/clickhouse-operator/blob/1414503921da3ae475eb6f9a296d3475a6993768/docs/chi-examples/99-clickhouseinstallation-max.yaml#L428-L481 + podDistribution: [] + # - type: ShardAntiAffinity + # topologyKey: kubernetes.io/hostname + # - type: ReplicaAntiAffinity + # topologyKey: kubernetes.io/hostname + # - type: MaxNumberPerNode + # number: 2 + # topologyKey: kubernetes.io/hostname + + # Cold storage configuration + coldStorage: + # -- Whether to enable S3 cold storage + enabled: false + # -- Reserve free space on default disk (in bytes) + # Default value is 10MiB + defaultKeepFreeSpaceBytes: "10485760" + # -- Type of cold storage: s3 or gcs + type: s3 + # -- Endpoint for S3 or GCS + # For S3, if region is us-east-1, endpoint can be https://s3.amazonaws.com + # if region is not us-east-1, endpoint should be https://s3-.amazonaws.com + # For GCS, endpoint should be https://storage.googleapis.com//data/ + endpoint: https://.s3-.amazonaws.com/data/ + # -- Access Key for S3 or GCS + accessKey: + # -- Secret Access Key for S3 or GCS + secretAccess: + # AWS role configuration - to use environment variables instead of passing access and secret keys + role: + # -- Whether to enable AWS IAM ARN role. 
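+    # A hedged end-to-end sketch of S3 cold storage with an IAM role for
+    # service accounts instead of static keys. Bucket, region, and role ARN
+    # below are placeholders, not working values:
+    #
+    # coldStorage:
+    #   enabled: true
+    #   type: s3
+    #   endpoint: https://my-bucket.s3-us-east-2.amazonaws.com/data/
+    #   role:
+    #     enabled: true
+    #     annotations:
+    #       eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/signoz-clickhouse
+    #
+    # When the role is used, accessKey/secretAccess can stay empty. The
+    # chart default below keeps the role off: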
+ enabled: false + # -- Annotations to use by service account associated to Clickhouse instance + annotations: + # aws role arn + eks.amazonaws.com/role-arn: arn:aws:iam::******:role/***** + + # -- Clickhouse configuration files. + # + # Refs: + # - https://clickhouse.com/docs/en/operations/configuration-files/ + # - https://github.com/Altinity/clickhouse-operator/blob/master/docs/chi-examples/05-settings-05-files-nested.yaml + files: {} + # config.d/log_rotation.xml: | + # + # + # trace + # true + # /var/log/clickhouse-server/clickhouse-server.err.log + # /var/log/clickhouse-server/clickhouse-server.log + # 100M + # 10 + # + # + # test.xml: | + # + # some-value + # + + ### + ### + ### ---- MISC ---- + ### + ### + + # -- When the `installCustomStorageClass` is enabled with `cloud` set as `gcp` or `aws`, + # it creates custom storage class with volume expansion permission. + installCustomStorageClass: false + + ### + ### + ### ---- CLICKHOUSE OPERATOR ---- + ### + ### + clickhouseOperator: + # -- name of the component + name: operator + + # -- Version of the operator + version: 0.21.2 + + # -- Clickhouse Operator image + image: + # -- Clickhouse Operator image registry to use. + registry: docker.io + # -- Clickhouse Operator image repository to use. + repository: altinity/clickhouse-operator + # -- Clickhouse Operator image tag. + tag: 0.21.2 + # -- Clickhouse Operator image pull policy. + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Clickhouse Operator. + # If global.imagePullSecrets is set as well, it will merged. + imagePullSecrets: [] + # - "clickhouseOperator-pull-secret" + + # ClickHouse Operator Service Account + serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- Annotations to add to the service account + annotations: {} + # -- The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Clickhouse logging config + logger: + # -- Logging level. Acceptable values: trace, debug, information, warning, error. + level: information + # -- Size of the file. Applies to log and errorlog. Once the file reaches size, + # ClickHouse archives and renames it, and creates a new log file in its place. + size: 1000M + # -- The number of archived log files that ClickHouse stores. + count: 10 + # -- Whether to send log and errorlog to the console instead of file. To enable, set to 1 or true. + console: 1 + + # Query Log table configuration + queryLog: + # -- The number of days to keep the data in the query_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the query_log table. + flushInterval: 7500 + # Part Log table configuration + partLog: + # -- The number of days to keep the data in the part_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the part_log table. + flushInterval: 7500 + # Trace Log table configuration + traceLog: + # -- The number of days to keep the data in the trace_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the trace_log table. + flushInterval: 7500 + + asynchronousInsertLog: + # -- The number of days to keep the data in the asynchronous_insert_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the asynchronous_insert_log table. + flushInterval: 7500 + asynchronousMetricLog: + # -- The number of days to keep the data in the asynchronous_metric_log table. 
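+    # Every system-log table in this section repeats the same two knobs;
+    # e.g. a sketch that shortens retention (7 days is an arbitrary example,
+    # not a recommendation):
+    #
+    # asynchronousMetricLog:
+    #   ttl: 7
+    #   flushInterval: 7500
+    #
+    # The chart default below keeps 30 days: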
+ ttl: 30 + # -- Time interval in milliseconds between flushes of the asynchronous_metric_log table. + flushInterval: 7500 + backupLog: + # -- The number of days to keep the data in the backup_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the backup_log table. + flushInterval: 7500 + blobStorageLog: + # -- The number of days to keep the data in the blob_storage_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the blob_storage_log table. + flushInterval: 7500 + crashLog: + # -- The number of days to keep the data in the crash_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the crash_log table. + flushInterval: 7500 + metricLog: + # -- The number of days to keep the data in the metric_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the metric_log table. + flushInterval: 7500 + queryThreadLog: + # -- The number of days to keep the data in the query_thread_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the query_thread_log table. + flushInterval: 7500 + queryViewsLog: + # -- The number of days to keep the data in the query_views_log table. + ttl: 15 + # -- Time interval in milliseconds between flushes of the query_views_log table. + flushInterval: 7500 + sessionLog: + # -- The number of days to keep the data in the session_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the session_log table. + flushInterval: 7500 + zookeeperLog: + # -- The number of days to keep the data in the zookeeper_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the zookeeper_log table. + flushInterval: 7500 + processorsProfileLog: + # -- The number of days to keep the data in the processors_profile_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the processors_profile_log table. + flushInterval: 7500 + + # -- Clickhouse Operator pod(s) annotation. + podAnnotations: + signoz.io/port: '8888' + signoz.io/scrape: 'true' + + # -- Clickhouse Operator node selector + nodeSelector: {} + + # -- Metrics Exporter config. + metricsExporter: + # -- name of the component + name: metrics-exporter + + # -- Metrics Exporter service + service: + # -- Annotations to use by service associated to Metrics Exporter + annotations: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Metrics Exporter port + port: 8888 + + # -- Metrics Exporter image + image: + # -- Metrics Exporter image registry to use. + registry: docker.io + # -- Metrics Exporter image repository to use. + repository: altinity/metrics-exporter + # -- Metrics Exporter image tag. + tag: 0.21.2 + # -- Metrics Exporter image pull policy. + pullPolicy: IfNotPresent + + +## External clickhouse configuration +## This is required when clickhouse.enabled is false +## +# TODO: Implement external clickhouse configuration +externalClickhouse: + # -- Host of the external cluster. + host: + # -- Name of the external cluster to run DDL queries on. + cluster: cluster + # -- Database name for the external cluster + database: signoz_metrics + # -- Clickhouse trace database (SigNoz Traces) + traceDatabase: signoz_traces + # -- Clickhouse log database (SigNoz Logs) + logDatabase: signoz_logs + # -- User name for the external cluster to connect to the external cluster as + user: "" + # -- Password for the cluster. 
Ignored if externalClickhouse.existingSecret is set + password: "" + # -- Name of an existing Kubernetes secret object containing the password + existingSecret: + # -- Name of the key pointing to the password in your Kubernetes secret + existingSecretPasswordKey: + # -- Whether to use TLS connection connecting to ClickHouse + secure: false + # -- Whether to verify TLS connection connecting to ClickHouse + verify: false + # -- HTTP port of Clickhouse + httpPort: 8123 + # -- TCP port of Clickhouse + tcpPort: 9000 + +# Default values for query-service +queryService: + name: "query-service" + replicaCount: 1 + image: + registry: docker.io + repository: signoz/query-service + tag: 0.52.0 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Query-Service + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # Query-Service Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Query-Service service + service: + # -- Annotations to use by service associated to Query-Service + annotations: {} + # -- Labels to use by service associated to Query-Service + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Query-Service HTTP port + port: 8080 + # -- Query-Service Internal port + internalPort: 8085 + # -- Query-Service OpAMP Internal port + opampPort: 4320 + # -- Set this to you want to force a specific nodePort for http. + # Must be use with service.type=NodePort + nodePort: null + # -- Set this to you want to force a specific nodePort for internal. + # Must be use with service.type=NodePort + internalNodePort: null + + # -- Query-Service annotations + annotations: + "helm.sh/hook-weight": "2" + + # -- Query-Service additional arguments for command line + additionalArgs: [] + # - --prefer-delta=true + + # -- Additional environments to set for queryService + additionalEnvs: {} + # env_key: env_value + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting query service now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + migration: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + args: [] + command: [] + # - sh + # - -c + # - | + # echo "Running migration" + # sleep 10 # Replace with actual migration command + # echo "Migration completed" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + configVars: + storage: clickhouse + # ClickHouse URL is set and applied internally. + # Don't override unless you know what you are doing. 
+ # clickHouseUrl: tcp://clickhouse_operator:clickhouse_operator_password@my-release-clickhouse:9000/signoz_traces + goDebug: netdns=go + telemetryEnabled: true + deploymentType: kubernetes-helm + + # Query-Service cache options + cache: + # -- Whether to enable cache for Query-Service + enabled: true + # -- Cache flux interval for Query-Service + fluxInterval: 30m + # -- Cache configurations for Query-Service + config: + name: cache + provider: inmemory + inmemory: + ttl: 168h + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + # -- Configure liveness and readiness probes. + # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + livenessProbe: + enabled: true + port: http + path: /api/v1/health + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: http + path: /api/v1/health?live=1 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + # -- Custom liveness probe + customLivenessProbe: {} + # -- Custom readiness probe + customReadinessProbe: {} + + ingress: + # -- Enable ingress for Query-Service + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Query-Service Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Query-Service Ingress Host names with their path details + hosts: + - host: query-service.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 8080 + # -- Query-Service Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - query-service.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 750m + # memory: 1000Mi + + # -- QueryService priority class name + priorityClassName: "" + # -- Node selector for settings for QueryService pod + nodeSelector: {} + # -- Toleration labels for QueryService pod assignment + tolerations: [] + # -- Affinity settings for QueryService pod + affinity: {} + # -- TopologySpreadConstraints describes how QueryService pods ought to spread + topologySpreadConstraints: [] + + persistence: + # -- Enable data persistence using PVC for SQLiteDB data. + enabled: true + + # -- Name of an existing PVC to use (only when deploying a single replica) + existingClaim: "" + + # -- Persistent Volume Storage Class to use. + # If defined, `storageClassName: `. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. 
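+    # For instance (the `gp3` class name is an assumption; it must already
+    # exist in the cluster):
+    #
+    # storageClass: gp3    # pin the PVC to an explicit class
+    # storageClass: "-"    # or disable dynamic provisioning entirely
+    #
+    # The chart default below defers to the cluster's default provisioner: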
+ # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 1Gi + + +# Default values for frontend +frontend: + name: "frontend" + replicaCount: 1 + + image: + registry: docker.io + repository: signoz/frontend + tag: 0.52.0 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Frontend + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # Frontend Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Frontend service + service: + # -- Annotations to use by service associated to Frontend + annotations: {} + # -- Labels to use by service associated to Frontend + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Frontend HTTP port + port: 3301 + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /api/v1/health?live=1 + waitMessage: "waiting for query-service" + doneMessage: "query-service ready, starting frontend now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 11 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + behavior: {} + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 2 + # periodSeconds: 60 + + autoscalingTemplate: [] + keda: + enabled: false + pollingInterval: "30" # check 30sec periodically for metrics data + cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale + minReplicaCount: "1" # should be >= replicaCount specified in values.yaml + maxReplicaCount: "5" + triggers: + - type: memory + metadata: + type: Utilization + value: "80" # hpa make sure average Utilization <=80 by adding new pods + - type: cpu + metadata: + type: Utilization + value: "80" # hpa make sure average Utlization <=80 by adding new pods + + configVars: {} + + # -- Frontend deployment annotations + annotations: + "helm.sh/hook-weight": "5" + # -- Frontend pod security context + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + ingress: + # -- Enable ingress for Frontend + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Frontend Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Frontend Ingress Host names with their path details + hosts: + - host: frontend.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 3301 + # -- Frontend Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - frontend.domain.com + + # -- Frontend Nginx extra configurations + nginxExtraConfig: | + client_max_body_size 24M; + 
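+    # Any valid nginx directive can be appended in this block; e.g. an
+    # assumed, illustrative addition for slow upstreams would be
+    # `proxy_read_timeout 600;`. Both active directives here are chart
+    # defaults.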
large_client_header_buffers 8 16k; + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + + # -- Frontend priority class name + priorityClassName: "" + # -- Node selector for settings for Frontend pod + nodeSelector: {} + # -- Toleration labels for Frontend pod assignment + tolerations: [] + # -- Affinity settings for Frontend pod + affinity: {} + # -- TopologySpreadConstraints describes how Frontend pods ought to spread + topologySpreadConstraints: [] + +# Default values for Alertmanager +alertmanager: + enabled: false + name: "alertmanager" + replicaCount: 1 + + image: + registry: docker.io + repository: signoz/alertmanager + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: 0.23.5 + + # -- Image Registry Secret Names for Alertmanager + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # -- Alertmanager custom command override + command: [] + # -- Alertmanager extra Arguments + extraArgs: {} + + # Alertmanager Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Alertmanager service + service: + # -- Annotations to use by service associated to Alertmanager + annotations: {} + # -- Labels to use by service associated to Alertmanager + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Alertmanager HTTP port + port: 9093 + # -- Alertmanager cluster port + clusterPort: 9094 + # -- Set this to you want to force a specific nodePort. 
Must be use with service.type=NodePort + nodePort: null + + # -- Additional environments to set for Alertmanager + additionalEnvs: {} + # env_key: env_value + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /api/v1/health?live=1 + waitMessage: "waiting for query-service" + doneMessage: "query-service ready, starting alertmanager now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + podSecurityContext: + fsGroup: 65534 + dnsConfig: {} + # nameservers: + # - 1.2.3.4 + # searches: + # - ns1.svc.cluster-domain.example + # - my.dns.search.suffix + # options: + # - name: ndots + # value: "2" + # - name: edns0 + securityContext: + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + + additionalPeers: [] + + livenessProbe: + httpGet: + path: / + port: http + + readinessProbe: + httpGet: + path: / + port: http + + ingress: + # -- Enable ingress for Alertmanager + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Alertmanager Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Alertmanager Ingress Host names with their path details + hosts: + - host: alertmanager.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 9093 + # -- Alertmanager Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - alertmanager.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + + # -- Alertmanager priority class name + priorityClassName: "" + # -- Node selector for settings for Alertmanager pod + nodeSelector: {} + # -- Toleration labels for Alertmanager pod assignment + tolerations: [] + # -- Affinity settings for Alertmanager pod + affinity: {} + # -- TopologySpreadConstraints describes how Alertmanager pods ought to spread + topologySpreadConstraints: [] + + statefulSet: + annotations: + "helm.sh/hook-weight": "4" + + podAnnotations: {} + podLabels: {} + + # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ + podDisruptionBudget: {} + # maxUnavailable: 1 + # minAvailable: 1 + + persistence: + # -- Enable data persistence using PVC for Alertmanager data. + enabled: true + + # -- Name of an existing PVC to use (only when deploying a single replica) + existingClaim: "" + + # -- Persistent Volume Storage Class to use. + # If defined, `storageClassName: `. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. + # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 100Mi + + ## Using the config, alertmanager.yml file is created. + ## We no longer need the config file as query services + ## delivers the required config. 
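+  ## For completeness, a minimal sketch of turning this component on (the
+  ## persistence size is illustrative only):
+  ##
+  ## alertmanager:
+  ##   enabled: true
+  ##   persistence:
+  ##     size: 500Mi
+  ##
+  ## The legacy file-based configuration is kept below, commented out, for
+  ## reference only: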
+ # config: + # global: + # resolve_timeout: 1m + # slack_api_url: 'https://hooks.slack.com/services/xxx' + + # templates: + # - '/etc/alertmanager/*.tmpl' + + # receivers: + # - name: 'slack-notifications' + # slack_configs: + # - channel: '#alerts' + # send_resolved: true + # icon_url: https://avatars3.githubusercontent.com/u/3380462 + # title: '{{ template "slack.title" . }}' + # text: '{{ template "slack.text" . }}' + + # route: + # receiver: 'slack-notifications' + + ## Templates are no longer needed as they are included + ## from frontend placeholder while creating alert channels. + # templates: + # title.tmpl: |- + # {{ define "slack.title" }} + # [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + # {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + # {{" "}}( + # {{- with .CommonLabels.Remove .GroupLabels.Names }} + # {{- range $index, $label := .SortedPairs -}} + # {{ if $index }}, {{ end }} + # {{- $label.Name }}="{{ $label.Value -}}" + # {{- end }} + # {{- end -}} + # ) + # {{- end }} + # {{ end }} + # text.tmpl: |- + # {{ define "slack.text" }} + # {{ range .Alerts -}} + # *Alert:* {{ .Labels.alertname }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + # *Summary:* {{ .Annotations.summary }} + # *Description:* {{ .Annotations.description }} + + # *Details:* + # {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + # {{ end }} + # {{ end }} + # {{ end }} + + ## Monitors ConfigMap changes and POSTs to a URL + ## Ref: https://github.com/jimmidyson/configmap-reload + ## + configmapReload: + ## If false, the configmap-reload container will not be deployed + ## + enabled: false + + ## configmap-reload container name + ## + name: configmap-reload + + ## configmap-reload container image + ## + image: + repository: jimmidyson/configmap-reload + tag: v0.5.0 + pullPolicy: IfNotPresent + + # containerPort: 9533 + + # -- Configure resource requests and limits. Update as per your need. 
+ # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + +# Default values for schemaMigrator +schemaMigrator: + enabled: true + name: "schema-migrator" + + image: + registry: docker.io + repository: signoz/signoz-schema-migrator + tag: 0.102.4 + pullPolicy: IfNotPresent + + args: {} + annotations: + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": "before-hook-creation" + + # -- Whether to enable replication for schemaMigrator + enableReplication: false + + # -- Node selector for settings for schemaMigrator + nodeSelector: {} + # -- Toleration labels for schemaMigrator assignment + tolerations: [] + # -- Affinity settings for schemaMigrator + affinity: {} + # -- TopologySpreadConstraints describes how schemaMigrator pods ought to spread + topologySpreadConstraints: [] + + initContainers: + wait: + image: + registry: docker.io + repository: groundnuty/k8s-wait-for + tag: v2.0 + pullPolicy: IfNotPresent + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting schema migrator now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + chReady: + enabled: true + image: + registry: docker.io + repository: clickhouse/clickhouse-server + tag: 24.1.2-alpine + pullPolicy: IfNotPresent + command: + - "sh" + - "-c" + - | + echo "Running clickhouse ready check" + while true + do + version="$(CLICKHOUSE_VERSION)" + shards="$(CLICKHOUSE_SHARDS)" + replicas="$(CLICKHOUSE_REPLICAS)" + current_version="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT version()")" + if [ -z "$current_version" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ -z "$(echo "$current_version" | grep "$version")" ]; then + echo "expected version: $version, current version: $current_version" + echo "waiting for clickhouse with correct version" + sleep 5 + continue + fi + current_shards="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(shard_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" + if [ -z "$current_shards" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ "$current_shards" -ne "$shards" ]; then + echo "expected shard count: $shards, current shard count: $current_shards" + echo "waiting for clickhouse with correct shard count" + sleep 5 + continue + fi + current_replicas="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(replica_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" + if [ -z "$current_replicas" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ "$current_replicas" -ne "$replicas" ]; then + echo "expected replica count: $replicas, current replica count: $current_replicas" + echo "waiting for clickhouse with correct replica count" + sleep 5 + continue + fi + break + done + echo "clickhouse ready, starting schema migrator now" + resources: {} + # 
requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + +# Default values for OtelCollector +otelCollector: + name: "otel-collector" + image: + registry: docker.io + repository: signoz/signoz-otel-collector + tag: 0.102.4 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for OtelCollector + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + initContainers: + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting otel collector now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + # OpenTelemetry Collector executable + command: + # -- OtelCollector command name + name: /signoz-collector + # -- OtelCollector command extra arguments + extraArgs: + - --feature-gates=-pkg.translator.prometheus.NormalizeName + + configMap: + # -- Specifies whether a configMap should be created (true by default) + create: true + + # OtelCollector Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # OtelCollector service + service: + # -- Annotations to use by service associated to OtelCollector + annotations: {} + # -- Labels to use by service associated to OtelCollector + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + + # -- OtelCollector Deployment annotation. + annotations: + "helm.sh/hook-weight": "3" + # -- OtelCollector pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '8888' + + # -- OtelCollector pod(s) labels. + podLabels: {} + + # -- Additional environments to set for OtelCollector + additionalEnvs: {} + # env_key: env_value + + # -- Whether to enable grouping of exceptions with same name and different stack trace. + # This is useful when you have a lot of exceptions with same name but different stack trace. + # This is a tradeoff between cardinality and accuracy of exception grouping. + lowCardinalityExceptionGrouping: false + + minReadySeconds: 5 + progressDeadlineSeconds: 120 + replicaCount: 1 + + # OtelCollector RBAC config + clusterRole: + # -- Specifies whether a clusterRole should be created + create: true + # -- Annotations to add to the clusterRole + annotations: {} + # -- The name of the clusterRole to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + # -- A set of rules as documented here. 
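+    # Extra rules can be appended to the defaults below in the same shape
+    # if additional receivers are enabled later; e.g. a hypothetical entry
+    # for Kubernetes events (only grant what your config actually reads):
+    #
+    # - apiGroups: [""]
+    #   resources: ["events"]
+    #   verbs: ["get", "list", "watch"]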
+ # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # @default -- See `values.yaml` for defaults + rules: + # k8sattributes processor requires these permissions + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] + + # OtelCollector clusterRoleBinding + clusterRoleBinding: + # Annotations to add to the clusterRoleBinding + annotations: {} + # The name of the clusterRoleBinding to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + + # Configuration for ports + ports: + otlp: + # -- Whether to enable service port for OTLP gRPC + enabled: true + # -- Container port for OTLP gRPC + containerPort: 4317 + # -- Service port for OTLP gRPC + servicePort: 4317 + # -- Node port for OTLP gRPC + nodePort: "" + # -- Protocol to use for OTLP gRPC + protocol: TCP + otlp-http: + # -- Whether to enable service port for OTLP HTTP + enabled: true + # -- Container port for OTLP HTTP + containerPort: 4318 + # -- Service port for OTLP HTTP + servicePort: 4318 + # -- Node port for OTLP HTTP + nodePort: "" + # -- Protocol to use for OTLP HTTP + protocol: TCP + jaeger-compact: + # -- Whether to enable service port for Jaeger Compact + enabled: false + # -- Container port for Jaeger Compact + containerPort: 6831 + # -- Service port for Jaeger Compact + servicePort: 6831 + # -- Node port for Jaeger Compact + nodePort: "" + # -- Protocol to use for Jaeger Compact + protocol: UDP + jaeger-thrift: + # -- Whether to enable service port for Jaeger Thrift HTTP + enabled: true + # -- Container port for Jaeger Thrift + containerPort: 14268 + # -- Service port for Jaeger Thrift + servicePort: 14268 + # -- Node port for Jaeger Thrift + nodePort: "" + # -- Protocol to use for Jaeger Thrift + protocol: TCP + jaeger-grpc: + # -- Whether to enable service port for Jaeger gRPC + enabled: true + # -- Container port for Jaeger gRPC + containerPort: 14250 + # -- Service port for Jaeger gRPC + servicePort: 14250 + # -- Node port for Jaeger gRPC + nodePort: "" + # -- Protocol to use for Jaeger gRPC + protocol: TCP + zipkin: + # -- Whether to enable service port for Zipkin + enabled: false + # -- Container port for Zipkin + containerPort: 9411 + # -- Service port for Zipkin + servicePort: 9411 + # -- Node port for Zipkin + nodePort: "" + # -- Protocol to use for Zipkin + protocol: TCP + prometheus: + # -- Whether to enable service port for SigNoz exported prometheus metrics + enabled: false + # -- Container port for SigNoz exported prometheus metrics + containerPort: 8889 + # -- Service port for SigNoz exported prometheus metrics + servicePort: 8889 + # -- Node port for SigNoz exported prometheus metrics + nodePort: "" + # -- Protocol to use for SigNoz exported prometheus metrics + protocol: TCP + metrics: + # -- Whether to enable service port for internal metrics + enabled: true + # -- Container port for internal metrics + containerPort: 8888 + # -- Service port for internal metrics + servicePort: 8888 + # -- Node port for internal metrics + nodePort: "" + # -- Protocol to use for internal metrics + protocol: TCP + zpages: + # -- Whether to enable service port for ZPages + enabled: false + # -- Container port for Zpages + containerPort: 55679 + # -- Service port 
for Zpages + servicePort: 55679 + # -- Node port for Zpages + nodePort: "" + # -- Protocol to use for Zpages + protocol: TCP + pprof: + # -- Whether to enable service port for pprof + enabled: false + # -- Container port for pprof + containerPort: 1777 + # -- Service port for pprof + servicePort: 1777 + # -- Node port for pprof + nodePort: "" + # -- Protocol to use for pprof + protocol: TCP + logsheroku: + # -- Whether to enable service port for logsheroku + enabled: true + # -- Container port for logsheroku + containerPort: 8081 + # -- Service port for logsheroku + servicePort: 8081 + # -- Node port for logsheroku + nodePort: "" + # -- Protocol to use for logsheroku + protocol: TCP + logsjson: + # -- Whether to enable service port for logsjson + enabled: true + # -- Container port for logsjson + containerPort: 8082 + # -- Service port for logsjson + servicePort: 8082 + # -- Node port for logsjson + nodePort: "" + # -- Protocol to use for logsjson + protocol: TCP + + # -- Configure liveness and readiness probes. + # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + livenessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + # -- Custom liveness probe + customLivenessProbe: {} + # -- Custom readiness probe + customReadinessProbe: {} + + # -- Extra volumes mount for OtelCollector pod + extraVolumeMounts: [] + # -- Extra volumes for OtelCollector pod + extraVolumes: [] + + ingress: + # -- Enable ingress for OtelCollector + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to OtelCollector Ingress + annotations: {} + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- OtelCollector Ingress Host names with their path details + hosts: + - host: otelcollector.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 4318 + # -- OtelCollector Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - otelcollector.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. 
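+  # With the OTLP ports above enabled, in-cluster workloads can point their
+  # OpenTelemetry SDKs at this collector. A sketch: the service name and
+  # namespace are assumptions for this deployment -- confirm the actual
+  # service with `kubectl get svc -n signoz`:
+  #
+  # env:
+  #   - name: OTEL_EXPORTER_OTLP_ENDPOINT
+  #     value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
+  #   - name: OTEL_EXPORTER_OTLP_PROTOCOL
+  #     value: "http/protobuf"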
+ # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 200Mi + # limits: + # cpu: "1" + # memory: 2Gi + + # -- OtelCollector priority class name + priorityClassName: "" + # -- Node selector for settings for OtelCollector pod + nodeSelector: {} + # -- Toleration labels for OtelCollector pod assignment + tolerations: [] + # -- Affinity settings for OtelCollector pod + affinity: {} + # -- TopologySpreadConstraints describes how OtelCollector pods ought to spread + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/component: otel-collector + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 11 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + behavior: {} + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 2 + # periodSeconds: 60 + + autoscalingTemplate: [] + keda: + enabled: false + pollingInterval: "30" # check 30sec periodically for metrics data + cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale + minReplicaCount: "1" # should be >= replicaCount specified in values.yaml + maxReplicaCount: "5" + triggers: [] + # - type: memory + # metadata: + # type: Utilization + # value: "80" # hpa make sure average Utilization <=80 by adding new pods + # - type: cpu + # metadata: + # type: Utilization + # value: "80" # hpa make sure average Utlization <=80 by adding new pods + + # -- Configurations for OtelCollector + # @default -- See `values.yaml` for defaults + config: + receivers: + otlp/spanmetrics: + protocols: + grpc: + endpoint: localhost:12345 + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + max_recv_msg_size_mib: 16 + http: + endpoint: 0.0.0.0:4318 + jaeger: + protocols: + grpc: + endpoint: 0.0.0.0:14250 + thrift_http: + endpoint: 0.0.0.0:14268 + # Uncomment to enable thift_company receiver. 
+ # You will also have set set enable it in `otelCollector.ports + # thrift_compact: + # endpoint: 0.0.0.0:6831 + hostmetrics: + collection_interval: 30s + scrapers: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + httplogreceiver/heroku: + # endpoint specifies the network interface and port which will receive data + endpoint: 0.0.0.0:8081 + source: heroku + httplogreceiver/json: + # endpoint specifies the network interface and port which will receive data + endpoint: 0.0.0.0:8082 + source: json + processors: + # default parsing of logs + # logstransform/internal: + # operators: + # - type: regex_parser + # id: traceid + # # https://regex101.com/r/yFW5UC/1 + # regex: '(?i)(^trace|(("| )+trace))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' + # parse_from: body + # parse_to: attributes.temp_trace + # if: 'body matches "(?i)(^trace|((\"| )+trace))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' + # output: spanid + # - type: regex_parser + # id: spanid + # # https://regex101.com/r/DZ2gng/1 + # regex: '(?i)(^span|(("| )+span))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' + # parse_from: body + # parse_to: attributes.temp_trace + # if: 'body matches "(?i)(^span|((\"| )+span))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' + # output: trace_parser + # - type: trace_parser + # id: trace_parser + # trace_id: + # parse_from: attributes.temp_trace.trace_id + # span_id: + # parse_from: attributes.temp_trace.span_id + # output: remove_temp + # - type: remove + # id: remove_temp + # field: attributes.temp_trace + # if: '"temp_trace" in attributes' + # Batch processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md + batch: + send_batch_size: 50000 + timeout: 1s + # Resource detection processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md + resourcedetection: + # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure + # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar + detectors: + - env + # - elastic_beanstalk + # - eks + # - ecs + # - ec2 + # - gcp + # - azure + # - heroku + - system + timeout: 2s + system: + hostname_sources: [dns, os] + # Memory Limiter processor. + # If not set, will be overridden with values based on k8s resource limits. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiterprocessor/README.md + # memory_limiter: null + signozspanmetrics/cumulative: + metrics_exporter: clickhousemetricswrite + latency_histogram_buckets: + [ + 100us, + 1ms, + 2ms, + 6ms, + 10ms, + 50ms, + 100ms, + 250ms, + 500ms, + 1000ms, + 1400ms, + 2000ms, + 5s, + 10s, + 20s, + 40s, + 60s, + ] + dimensions_cache_size: 100000 + dimensions: + - name: service.namespace + default: default + - name: deployment.environment + default: default + - name: signoz.collector.id + signozspanmetrics/delta: + metrics_exporter: clickhousemetricswrite + latency_histogram_buckets: + [ + 100us, + 1ms, + 2ms, + 6ms, + 10ms, + 50ms, + 100ms, + 250ms, + 500ms, + 1000ms, + 1400ms, + 2000ms, + 5s, + 10s, + 20s, + 40s, + 60s, + ] + dimensions_cache_size: 100000 + dimensions: + - name: service.namespace + default: default + - name: deployment.environment + default: default + - name: signoz.collector.id + aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA + # K8s Attribute processor config. 
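+      # Both span-metrics processors above take extra dimensions in the
+      # same shape; e.g. an illustrative addition (the attribute name is an
+      # assumption about your resource attributes):
+      #
+      # dimensions:
+      #   - name: k8s.cluster.name
+      #     default: unknown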
+ # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/k8sattributesprocessor/README.md + k8sattributes: + # -- Whether to detect the IP address of agents and add it as an attribute to all telemetry resources. + # If set to true, Agents will not make any k8s API calls, do any discovery of pods or extract any metadata. + passthrough: false + # -- Filters can be used to limit each OpenTelemetry agent to query pods based on specific + # selector to only dramatically reducing resource requirements for very large clusters. + filter: + # -- Restrict each OpenTelemetry agent to query pods running on the same node + node_from_env_var: K8S_NODE_NAME + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + extract: + metadata: + - k8s.namespace.name + - k8s.pod.name + - k8s.pod.uid + - k8s.pod.start_time + - k8s.deployment.name + - k8s.node.name + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: localhost:55679 + pprof: + endpoint: localhost:1777 + exporters: + clickhousetraces: + datasource: tcp://${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}@${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}/${CLICKHOUSE_TRACE_DATABASE} + low_cardinal_exception_grouping: ${LOW_CARDINAL_EXCEPTION_GROUPING} + clickhousemetricswrite: + endpoint: tcp://${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}@${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}/${CLICKHOUSE_DATABASE} + timeout: 15s + resource_to_telemetry_conversion: + enabled: true + clickhouselogsexporter: + dsn: tcp://${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}@${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}/${CLICKHOUSE_LOG_DATABASE} + timeout: 10s + prometheus: + endpoint: 0.0.0.0:8889 + service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] + pipelines: + traces: + receivers: [otlp, jaeger] + processors: [signozspanmetrics/cumulative, signozspanmetrics/delta, batch] + exporters: [clickhousetraces] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [clickhousemetricswrite] + metrics/internal: + receivers: [hostmetrics] + processors: [resourcedetection, k8sattributes, batch] + exporters: [clickhousemetricswrite] + logs: + receivers: [otlp, httplogreceiver/heroku, httplogreceiver/json] + processors: [batch] + exporters: [clickhouselogsexporter] + +# Default values for OtelCollectorMetrics +otelCollectorMetrics: + enabled: true + name: "otel-collector-metrics" + image: + registry: docker.io + repository: signoz/signoz-otel-collector + tag: 0.102.4 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for OtelCollector + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # OpenTelemetry Collector executable + command: + # -- OtelCollectorMetrics command name + name: /signoz-collector + # -- OtelCollectorMetrics command extra arguments + extraArgs: + - --feature-gates=-pkg.translator.prometheus.NormalizeName + + configMap: + # -- Specifies whether a configMap should be created (true by default) + create: true + + # OtelCollectorMetrics Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # OtelCollectorMetrics service + service: + # -- Annotations to use by service associated to OtelCollectorMetrics + annotations: {} + # -- Labels to use by service associated to OtelCollectorMetrics + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + + # -- OtelCollectorMetrics Deployment annotation. + annotations: + "helm.sh/hook-weight": "3" + # -- OtelCollectorMetrics pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '8888' + + # -- Additional environments to set for OtelCollectorMetrics + additionalEnvs: {} + # env_key: env_value + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + minReadySeconds: 5 + progressDeadlineSeconds: 120 + replicaCount: 1 + + initContainers: + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting otel collector metrics now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + # Configuration for ports + ports: + metrics: + # -- Whether to enable service port for internal metrics + enabled: false + # -- Container port for internal metrics + containerPort: 8888 + # -- Service port for internal metrics + servicePort: 8888 + # -- Protocol to use for internal metrics + protocol: TCP + zpages: + # -- Whether to enable service port for ZPages + enabled: false + # -- Container port for Zpages + containerPort: 55679 + # -- Service port for Zpages + servicePort: 55679 + # -- Protocol to use for Zpages + protocol: TCP + health-check: + # -- Whether to enable service port for health check + enabled: true + # -- Container port for health check + containerPort: 13133 + # -- Service port for health check + servicePort: 13133 + # -- Protocol to use for health check + protocol: TCP + pprof: + # -- Whether to enable service port for pprof + enabled: false + # -- Container port for pprof + containerPort: 1777 + # -- Service port for pprof + servicePort: 1777 + # -- Protocol to use for pprof + protocol: TCP + + + ## Configure liveness and readiness probes. 
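+  ## For example, to let Prometheus-style scrapers reach this collector's
+  ## own internal metrics, the stanza above can be switched on (a sketch):
+  ##
+  ## ports:
+  ##   metrics:
+  ##     enabled: true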
+ ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + ## + livenessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + ## Custom liveness and readiness probes + customLivenessProbe: {} + customReadinessProbe: {} + + # -- Extra volumes mount for OtelCollectorMetrics pod + extraVolumeMounts: [] + # -- Extra volumes for OtelCollectorMetrics pod + extraVolumes: [] + + ingress: + # -- Enable ingress for OtelCollectorMetrics + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to OtelCollectorMetrics Ingress + annotations: {} + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- OtelCollectorMetrics Ingress Host names with their path details + hosts: + - host: otelcollector-metrics.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 13133 + # -- OtelCollectorMetrics Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - otelcollector-metrics.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: "1" + # memory: 2Gi + + # -- OtelCollectorMetrics priority class name + priorityClassName: "" + # -- Node selector for settings for OtelCollectorMetrics pod + nodeSelector: {} + # -- Toleration labels for OtelCollectorMetrics pod assignment + tolerations: [] + # -- Affinity settings for OtelCollectorMetrics pod + affinity: {} + # -- TopologySpreadConstraints describes how OtelCollectorMetrics pods ought to spread + topologySpreadConstraints: [] + + # OtelCollectorMetrics RBAC config + clusterRole: + # -- Specifies whether a clusterRole should be created + create: true + # -- Annotations to add to the clusterRole + annotations: {} + # -- The name of the clusterRole to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + # -- A set of rules as documented here. 
+ # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # @default -- See `values.yaml` for defaults + rules: + # k8sattributes processor requires these permissions + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + # other processors and receivers require these permissions + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "services", "endpoints"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["ingresses"] + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] + + # OtelCollectorMetrics clusterRoleBinding + clusterRoleBinding: + # -- Annotations to add to the clusterRoleBinding + annotations: {} + # -- The name of the clusterRoleBinding to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + + # -- Configurations for OtelCollectorMetrics + # @default -- See `values.yaml` for defaults + config: + receivers: + # prometheus scrape config + prometheus: + config: + scrape_configs: + # generic prometheus metrics scraper (scrapped when signoz.io pod annotations are set) + - job_name: "generic-collector" + scrape_interval: 60s + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: + [__meta_kubernetes_pod_annotation_signoz_io_scrape] + action: keep + regex: true + - source_labels: + [__meta_kubernetes_pod_annotation_signoz_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + [ + __meta_kubernetes_pod_ip, + __meta_kubernetes_pod_annotation_signoz_io_port, + ] + action: replace + separator: ":" + target_label: __address__ + - target_label: job_name + replacement: generic-collector + # Uncomment line below to include all labels of the pod + # - action: labelmap + # regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: replace + target_label: signoz_k8s_name + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] + action: replace + target_label: signoz_k8s_instance + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + action: replace + target_label: signoz_k8s_component + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: k8s_namespace_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: k8s_pod_name + - source_labels: [__meta_kubernetes_pod_uid] + action: replace + target_label: k8s_pod_uid + - source_labels: [__meta_kubernetes_pod_container_name] + action: replace + target_label: k8s_container_name + - source_labels: [__meta_kubernetes_pod_container_name] + regex: (.+)-init + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: k8s_node_name + - source_labels: [__meta_kubernetes_pod_ready] + action: replace + target_label: k8s_pod_ready + - source_labels: [__meta_kubernetes_pod_phase] + action: replace + target_label: k8s_pod_phase + processors: + # Batch processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md + batch: + send_batch_size: 10000 + timeout: 1s + # Resource detection processor config. 
+ # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md + resourcedetection: + # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure + # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar + detectors: + - env + # - elastic_beanstalk + # - eks + # - ecs + # - ec2 + # - gcp + # - azure + # - heroku + - system + timeout: 2s + system: + hostname_sources: [dns, os] + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: localhost:55679 + pprof: + endpoint: localhost:1777 + exporters: + clickhousemetricswrite: + timeout: 15s + endpoint: tcp://${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}@${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}/${CLICKHOUSE_DATABASE} + clickhousemetricswrite/hostmetrics: + endpoint: tcp://${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}@${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT}/${CLICKHOUSE_DATABASE} + resource_to_telemetry_conversion: + enabled: true + service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] + pipelines: + metrics: + receivers: [prometheus] + processors: [batch] + exporters: [clickhousemetricswrite] + +signoz-otel-gateway: + enabled: false + diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf new file mode 100644 index 00000000..770d80bd --- /dev/null +++ b/modules/signoz/variables.tf @@ -0,0 +1,27 @@ +variable "auto_deploy" { + description = "Auto deploy through ArgoCD" + type = bool + default = false +} + +variable "auto_prune" { + description = "Auto prune through ArgoCD" + type = bool + default = false +} + +variable "git_revision" { + description = "The git revision to deploy" + type = string + default = "main" +} + +variable "argo_deployment_name" { + description = "The name of the ArgoCD deployment, must be globally unique" + type = string +} + +variable "namespace" { + description = "The namespace to deploy into" + type = string +} diff --git a/modules/signoz/versions.tf b/modules/signoz/versions.tf new file mode 100644 index 00000000..28b5ab89 --- /dev/null +++ b/modules/signoz/versions.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } + } + # TODO: Move to this provider + # required_providers { + # argocd = { + # source = "oboukili/argocd" + # version = "6.1.1" + # } + # } +} + From 73bb182e223cfaf6375459b90becd96945d46604 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 29 Aug 2024 12:00:48 -0700 Subject: [PATCH 02/85] Deploy signoz --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 -- 1 file changed, 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index c46b44f3..736d4e27 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -81,8 +81,6 @@ module "postgres-cloud-native-database" { module "signoz" { - # TODO: This is temporary - count = 0 depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" From 4e3c472ff4737c006add632a6a59405ce3a55263 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:11:19 
-0700
Subject: [PATCH 03/85] Update signoz readme

---
 modules/signoz/README.md | 65 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 modules/signoz/README.md

diff --git a/modules/signoz/README.md b/modules/signoz/README.md
new file mode 100644
index 00000000..1cac8fc7
--- /dev/null
+++ b/modules/signoz/README.md
@@ -0,0 +1,65 @@
+# Purpose
+The purpose of this module is to deploy the `Signoz` helm chart.
+
+SigNoz is an open-source Application Performance Monitoring (APM) and
+observability tool. It helps developers monitor their applications and
+troubleshoot problems, serving as an open-source alternative to DataDog,
+New Relic, etc.
+
+
+## This module is a work in progress
+This was hastily thrown together to get a tool in place to ingest telemetry data.
+A number of items are still needed:
+
+- Updating the clickhouse install to cluster mode, potentially with this operator: https://github.com/Altinity/clickhouse-operator
+- Setting up backups and data retention
+- Trimming down the number of ports exposed by the service
+- Double-checking the entire `values.yaml` file
+- Setting up accounts and access to the service declaratively
+
+## Accessing signoz
+
+### Pre-req
+This assumes that you have accessed the k8s cluster before, using `k9s` or another tool.
+If you have not, read over this documentation:
+
+- 
+- Description of port-forwarding via `k9s`: 
+
+### Connecting to signoz
+After signoz has been deployed to the k8s cluster you will need to port-forward to 2
+pods/services:
+
+- `signoz-frontend`
+- `signoz-otel-collector`
+
+The frontend is how you'll access all of the data contained within signoz. Once you
+port-forward and access it via your web browser, you'll need to sign up and log in.
+TODO: The steps for this are not fleshed out; this is going to be a manual step that the
+admin of the server will need to help you with.
+
+
+#### Sending data into signoz
+Once you find the `signoz-otel-collector` you'll need to start a port-forward session in
+order to pass data along to it from your local machine. Here are the settings you'll use
+for the port-forward:
+
+Windows/Linux:
+```
+Container Port: collector/otlp:4317,collector/otlp-http:4318
+Local Port: 4317,4318
+```
+
+Mac:
+```
+Container Port: collector::4317,collector::4318
+Local Port: 4317,4318
+```
+
+Some data will be present in those fields by default; delete what is there and copy the
+above data into them.
+
+### Application side
+Once you're connected via a port-forward session, the next item is to make sure that the
+application you're sending data from is instrumented with open-telemetry. This is going
+to be application-specific, so instructions will need to live within the application
+you are using.
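For illustration, the application-side wiring usually amounts to pointing an OTLP-capable SDK at the port-forwarded collector. A minimal sketch (assumed, not part of this repo; `my-app` and `app.py` are placeholders) using the standard OpenTelemetry environment variables:

```
# Standard OpenTelemetry environment variables, honored by most OTLP-capable SDKs.
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
export OTEL_SERVICE_NAME=my-app

# Example: Python auto-instrumentation, assuming the opentelemetry-distro and
# opentelemetry-exporter-otlp packages are installed in the application's environment.
opentelemetry-instrument python app.py
```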
From 6b457dd2857199040a6cedbd4a5b0d79fcc732e2 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:38:10 -0700 Subject: [PATCH 04/85] Disable a handful of items not needed --- modules/signoz/templates/values.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index f9831027..5dc9b966 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -1540,7 +1540,7 @@ otelCollector: protocol: UDP jaeger-thrift: # -- Whether to enable service port for Jaeger Thrift HTTP - enabled: true + enabled: false # -- Container port for Jaeger Thrift containerPort: 14268 # -- Service port for Jaeger Thrift @@ -1551,7 +1551,7 @@ otelCollector: protocol: TCP jaeger-grpc: # -- Whether to enable service port for Jaeger gRPC - enabled: true + enabled: false # -- Container port for Jaeger gRPC containerPort: 14250 # -- Service port for Jaeger gRPC @@ -1617,7 +1617,7 @@ otelCollector: protocol: TCP logsheroku: # -- Whether to enable service port for logsheroku - enabled: true + enabled: false # -- Container port for logsheroku containerPort: 8081 # -- Service port for logsheroku @@ -1628,7 +1628,7 @@ otelCollector: protocol: TCP logsjson: # -- Whether to enable service port for logsjson - enabled: true + enabled: false # -- Container port for logsjson containerPort: 8082 # -- Service port for logsjson @@ -2007,7 +2007,7 @@ otelCollector: # Default values for OtelCollectorMetrics otelCollectorMetrics: - enabled: true + enabled: false name: "otel-collector-metrics" image: registry: docker.io From 5b694bb58acf1dd44afcc9834a0f97f4276696dd Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:44:33 -0700 Subject: [PATCH 05/85] Try out kong-ingress --- .../stacks/dpe-k8s-deployments/main.tf | 11 ++++ modules/kong-ingress/README.md | 65 +++++++++++++++++++ modules/kong-ingress/main.tf | 39 +++++++++++ modules/kong-ingress/templates/values.yaml | 47 ++++++++++++++ modules/kong-ingress/variables.tf | 27 ++++++++ modules/kong-ingress/versions.tf | 24 +++++++ 6 files changed, 213 insertions(+) create mode 100644 modules/kong-ingress/README.md create mode 100644 modules/kong-ingress/main.tf create mode 100644 modules/kong-ingress/templates/values.yaml create mode 100644 modules/kong-ingress/variables.tf create mode 100644 modules/kong-ingress/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 736d4e27..ff71e24d 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -92,3 +92,14 @@ module "signoz" { argo_deployment_name = "signoz" } +module "kong-ingress" { + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" + # version = "0.5.0" + source = "../../../modules/kong-ingress" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_revision + namespace = "kong-ingress" + argo_deployment_name = "kong-ingress" +} \ No newline at end of file diff --git a/modules/kong-ingress/README.md b/modules/kong-ingress/README.md new file mode 100644 index 00000000..1cac8fc7 --- /dev/null +++ b/modules/kong-ingress/README.md @@ -0,0 +1,65 @@ +# Purpose +The purpose of this module is to deploy the `Signoz` helm chart . + +SigNoz is an open-source APM. 
It helps developers monitor their applications
+and troubleshoot problems, serving as an open-source alternative to commercial
+Application Performance Monitoring (APM) & observability tools such as DataDog
+and New Relic.
+
+
+## This module is a work in progress
+This was hastily thrown together to get a tool in place to ingest telemetry data.
+A number of items are still needed:
+
+- Updating the clickhouse install to cluster mode, potentially with this operator: https://github.com/Altinity/clickhouse-operator
+- Setting up backups and data retention
+- Trimming down the number of ports exposed by the service
+- Double-checking the entire `values.yaml` file
+- Setting up accounts and access to the service declaratively
+
+## Accessing signoz
+
+### Pre-req
+This assumes that you have accessed the k8s cluster before, using `k9s` or another tool.
+If you have not, read over this documentation:
+
+- 
+- Description of port-forwarding via `k9s`: 
+
+### Connecting to signoz
+After signoz has been deployed to the k8s cluster you will need to port-forward to 2
+pods/services:
+
+- `signoz-frontend`
+- `signoz-otel-collector`
+
+The frontend is how you'll access all of the data contained within signoz. Once you
+port-forward and access it via your web browser, you'll need to sign up and log in.
+TODO: The steps for this are not fleshed out; this is going to be a manual step that the
+admin of the server will need to help you with.
+
+
+#### Sending data into signoz
+Once you find the `signoz-otel-collector` you'll need to start a port-forward session in
+order to pass data along to it from your local machine. Here are the settings you'll use
+for the port-forward:
+
+Windows/Linux:
+```
+Container Port: collector/otlp:4317,collector/otlp-http:4318
+Local Port: 4317,4318
+```
+
+Mac:
+```
+Container Port: collector::4317,collector::4318
+Local Port: 4317,4318
+```
+
+Some data will be present in those fields by default; delete what is there and copy the
+above data into them.
+
+### Application side
+Once you're connected via a port-forward session, the next item is to make sure that the
+application you're sending data from is instrumented with open-telemetry. This is going
+to be application-specific, so instructions will need to live within the application
+you are using.
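Until kong-specific verification steps are documented, a rough sketch for checking the deployment (assumes `kubectl` access to the cluster; the Application and namespace names follow the `main.tf` below, and the Application is later renamed from `kong-ingress` to `kong`):

```
# Check that the ArgoCD Application for Kong has synced.
kubectl -n argocd get application kong-ingress

# List Services in the Kong namespace to find the proxy endpoint.
kubectl -n kong-ingress get svc
```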
diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf new file mode 100644 index 00000000..bc152932 --- /dev/null +++ b/modules/kong-ingress/main.tf @@ -0,0 +1,39 @@ + +resource "kubernetes_namespace" "kong-ingress" { + metadata { + name = var.namespace + } +} + +resource "kubectl_manifest" "kong-ingress" { + depends_on = [kubernetes_namespace.kong-ingress] + + yaml_body = < Date: Mon, 30 Sep 2024 09:56:25 -0700 Subject: [PATCH 06/85] Correct chart name --- modules/kong-ingress/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf index bc152932..f0e09175 100644 --- a/modules/kong-ingress/main.tf +++ b/modules/kong-ingress/main.tf @@ -23,7 +23,7 @@ spec: %{endif} sources: - repoURL: 'https://charts.konghq.com' - chart: kong/ingress + chart: ingress targetRevision: 0.14.1 helm: releaseName: kong-ingress From a002873d67bf84628aa48821f6877341ab3ee781 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:59:12 -0700 Subject: [PATCH 07/85] Update values --- modules/kong-ingress/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf index f0e09175..1079e089 100644 --- a/modules/kong-ingress/main.tf +++ b/modules/kong-ingress/main.tf @@ -12,7 +12,7 @@ resource "kubectl_manifest" "kong-ingress" { apiVersion: argoproj.io/v1alpha1 kind: Application metadata: - name: kong-ingress + name: kong namespace: argocd spec: project: default @@ -26,7 +26,7 @@ spec: chart: ingress targetRevision: 0.14.1 helm: - releaseName: kong-ingress + releaseName: kong valueFiles: - $values/modules/kong-ingress/templates/values.yaml - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' From b3768c5af5217d04dd5827ab3943ea9bca72e2dc Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:26:03 -0700 Subject: [PATCH 08/85] Deploy oauth2 plugin --- .../create-synapse-oauth-client.py | 26 +++++++++++++++++++ modules/kong-ingress/data.tf | 15 +++++++++++ modules/kong-ingress/main.tf | 14 ++++++++++ .../kong-ingress/resources/kustomization.yaml | 4 +++ .../resources/openid-connect-plugin.yaml | 20 ++++++++++++++ 5 files changed, 79 insertions(+) create mode 100644 modules/kong-ingress/create-synapse-oauth-client.py create mode 100644 modules/kong-ingress/data.tf create mode 100644 modules/kong-ingress/resources/kustomization.yaml create mode 100644 modules/kong-ingress/resources/openid-connect-plugin.yaml diff --git a/modules/kong-ingress/create-synapse-oauth-client.py b/modules/kong-ingress/create-synapse-oauth-client.py new file mode 100644 index 00000000..002e3964 --- /dev/null +++ b/modules/kong-ingress/create-synapse-oauth-client.py @@ -0,0 +1,26 @@ +import synapseclient +import json +syn = synapseclient.login() + +client_meta_data = { + 'client_name': 'dpe-dev-k8s-cluster', + 'redirect_uris': [ + 'https://a9a60607095304dec9cd248ef7bd64ea-1681374179.us-east-1.elb.amazonaws.com/testing' + ], + # 'client_uri': 'https://yourhost.com/index.html', + # 'policy_uri': 'https://yourhost.com/policy', + # 'tos_uri': 'https://yourhost.com/terms_of_service', + 'userinfo_signed_response_alg': 'RS256' +} + +# Create the client: +client_meta_data = syn.restPOST(uri='/oauth2/client', + endpoint=syn.authEndpoint, body=json.dumps(client_meta_data)) + +client_id = client_meta_data['client_id'] + +# Generate and retrieve the 
client secret: +client_id_and_secret = syn.restPOST(uri='/oauth2/client/secret/'+client_id, + endpoint=syn.authEndpoint, body='') + +print(client_id_and_secret) diff --git a/modules/kong-ingress/data.tf b/modules/kong-ingress/data.tf new file mode 100644 index 00000000..c3260947 --- /dev/null +++ b/modules/kong-ingress/data.tf @@ -0,0 +1,15 @@ +data "aws_secretsmanager_secret" "oauth-client-id" { + name = "dev/dpe-sandbox/client-id" +} + +data "aws_secretsmanager_secret_version" "client-id" { + secret_id = data.aws_secretsmanager_secret.oauth-client-id.id +} + +data "aws_secretsmanager_secret" "oauth-client-secret" { + name = "dev/dpe-sandbox/client-secret" +} + +data "aws_secretsmanager_secret_version" "client-secret" { + secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id +} \ No newline at end of file diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf index 1079e089..dd90f901 100644 --- a/modules/kong-ingress/main.tf +++ b/modules/kong-ingress/main.tf @@ -32,6 +32,20 @@ spec: - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' targetRevision: signoz-testing ref: values + - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' + targetRevision: ${var.git_revision} + path: modules/kong-ingress/resources + kustomize: + patches: + - target: + kind: KongClusterPlugin + patch: |- + - op: replace + path: /config/client_id/0 + value: ${data.aws_secretsmanager_secret_version.client-id.secret_string} + - op: replace + path: /config/client_secret/0 + value: ${data.aws_secretsmanager_secret_version.client-secret.secret_string} destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} diff --git a/modules/kong-ingress/resources/kustomization.yaml b/modules/kong-ingress/resources/kustomization.yaml new file mode 100644 index 00000000..119ff612 --- /dev/null +++ b/modules/kong-ingress/resources/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- openid-connect-plugin.yaml diff --git a/modules/kong-ingress/resources/openid-connect-plugin.yaml b/modules/kong-ingress/resources/openid-connect-plugin.yaml new file mode 100644 index 00000000..8013b6cc --- /dev/null +++ b/modules/kong-ingress/resources/openid-connect-plugin.yaml @@ -0,0 +1,20 @@ +apiVersion: configuration.konghq.com/v1 +kind: KongClusterPlugin +metadata: + name: cluster-openid-connect + annotations: + kubernetes.io/ingress.class: kong + labels: + global: "true" +config: + auth_methods: + - authorization_code + - session + issuer: http://example.org + client_id: + - "" + client_secret: + - "" +# session_secret: "" + response_mode: form_post +plugin: openid-connect \ No newline at end of file From d525af38ab7441d42095a9f2164cb515328b9446 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:29:34 -0700 Subject: [PATCH 09/85] Correct issuer --- modules/kong-ingress/resources/openid-connect-plugin.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/kong-ingress/resources/openid-connect-plugin.yaml b/modules/kong-ingress/resources/openid-connect-plugin.yaml index 8013b6cc..f504fa4c 100644 --- a/modules/kong-ingress/resources/openid-connect-plugin.yaml +++ b/modules/kong-ingress/resources/openid-connect-plugin.yaml @@ -10,7 +10,7 @@ config: auth_methods: - authorization_code - session - issuer: http://example.org + issuer: https://repo-prod.prod.sagebase.org/auth/v1 client_id: - "" client_secret: 
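The `data.tf` sources above assume the Synapse OAuth client credentials already exist in AWS Secrets Manager. A sketch of seeding them with the AWS CLI, where the secret strings are placeholders for the values printed by `create-synapse-oauth-client.py`:

```
aws secretsmanager create-secret \
  --name dev/dpe-sandbox/client-id \
  --secret-string "<client_id from the Synapse OAuth client>"

aws secretsmanager create-secret \
  --name dev/dpe-sandbox/client-secret \
  --secret-string "<client_secret from the Synapse OAuth client>"
```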
From 81a3adcfcbd1b161ab221bfdf853f80e56531eff Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:31:02 -0700 Subject: [PATCH 10/85] Correct indent --- .../resources/openid-connect-plugin.yaml | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/kong-ingress/resources/openid-connect-plugin.yaml b/modules/kong-ingress/resources/openid-connect-plugin.yaml index f504fa4c..bfb20f35 100644 --- a/modules/kong-ingress/resources/openid-connect-plugin.yaml +++ b/modules/kong-ingress/resources/openid-connect-plugin.yaml @@ -1,20 +1,20 @@ apiVersion: configuration.konghq.com/v1 kind: KongClusterPlugin metadata: - name: cluster-openid-connect - annotations: - kubernetes.io/ingress.class: kong - labels: - global: "true" + name: cluster-openid-connect + annotations: + kubernetes.io/ingress.class: kong + labels: + global: "true" config: - auth_methods: - - authorization_code - - session - issuer: https://repo-prod.prod.sagebase.org/auth/v1 - client_id: - - "" - client_secret: - - "" + auth_methods: + - authorization_code + - session + issuer: https://repo-prod.prod.sagebase.org/auth/v1 + client_id: + - "" + client_secret: + - "" # session_secret: "" - response_mode: form_post + response_mode: form_post plugin: openid-connect \ No newline at end of file From 8c89bc6e0c15df9de1aeeb0a489e5738bf1fba2d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:32:18 -0700 Subject: [PATCH 11/85] Correct revision target --- modules/kong-ingress/main.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf index dd90f901..3db3437c 100644 --- a/modules/kong-ingress/main.tf +++ b/modules/kong-ingress/main.tf @@ -5,6 +5,7 @@ resource "kubernetes_namespace" "kong-ingress" { } } +# TODO: Using kustomize in this fashion prints out the secret in the spacelift UI when terraform is running resource "kubectl_manifest" "kong-ingress" { depends_on = [kubernetes_namespace.kong-ingress] @@ -33,7 +34,7 @@ spec: targetRevision: signoz-testing ref: values - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: ${var.git_revision} + targetRevision: signoz-testing path: modules/kong-ingress/resources kustomize: patches: From 23ef64e698686d0604be45ea42f6605a8b5d4017 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:37:23 -0700 Subject: [PATCH 12/85] Deploy out cert-manager --- .../stacks/dpe-k8s-deployments/main.tf | 12 + modules/cert-manager/README.md | 6 + modules/cert-manager/main.tf | 38 + modules/cert-manager/templates/values.yaml | 1350 +++++++++++++++++ modules/cert-manager/variables.tf | 27 + modules/cert-manager/versions.tf | 24 + 6 files changed, 1457 insertions(+) create mode 100644 modules/cert-manager/README.md create mode 100644 modules/cert-manager/main.tf create mode 100644 modules/cert-manager/templates/values.yaml create mode 100644 modules/cert-manager/variables.tf create mode 100644 modules/cert-manager/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ff71e24d..907ff4e7 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -102,4 +102,16 @@ module "kong-ingress" { git_revision = var.git_revision namespace = "kong-ingress" 
argo_deployment_name = "kong-ingress" +} + +module "cert-manager" { + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" + # version = "0.5.0" + source = "../../../modules/cert-manager" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_revision + namespace = "cert-manager" + argo_deployment_name = "cert-manager" } \ No newline at end of file diff --git a/modules/cert-manager/README.md b/modules/cert-manager/README.md new file mode 100644 index 00000000..b47a4bd9 --- /dev/null +++ b/modules/cert-manager/README.md @@ -0,0 +1,6 @@ +# Purpose +This module is used to deploy the cert-manager helm chart + +Resources: + +- diff --git a/modules/cert-manager/main.tf b/modules/cert-manager/main.tf new file mode 100644 index 00000000..5d657d52 --- /dev/null +++ b/modules/cert-manager/main.tf @@ -0,0 +1,38 @@ +resource "kubernetes_namespace" "cert-manager" { + metadata { + name = var.namespace + } +} + +resource "kubectl_manifest" "cert-manager" { + depends_on = [kubernetes_namespace.cert-manager] + + yaml_body = < 1`, consider setting `podDisruptionBudget.enabled=true`. +# +# Note that cert-manager uses leader election to ensure that there can +# only be a single instance active at a time. +replicaCount: 1 + +# Deployment update strategy for the cert-manager controller deployment. +# For more information, see the [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy). +# +# For example: +# strategy: +# type: RollingUpdate +# rollingUpdate: +# maxSurge: 0 +# maxUnavailable: 1 +strategy: {} + +podDisruptionBudget: + # Enable or disable the PodDisruptionBudget resource. + # + # This prevents downtime during voluntary disruptions such as during a Node upgrade. + # For example, the PodDisruptionBudget will block `kubectl drain` + # if it is used on the Node where the only remaining cert-manager + # Pod is currently running. + enabled: false + + # This configures the minimum available pods for disruptions. It can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # It cannot be used if `maxUnavailable` is set. + # +docs:property + # minAvailable: 1 + + # This configures the maximum unavailable pods for disruptions. It can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # it cannot be used if `minAvailable` is set. + # +docs:property + # maxUnavailable: 1 + +# A comma-separated list of feature gates that should be enabled on the +# controller pod. +featureGates: "" + +# The maximum number of challenges that can be scheduled as 'processing' at once. +maxConcurrentChallenges: 60 + +image: + # The container registry to pull the manager image from. + # +docs:property + # registry: quay.io + + # The container image for the cert-manager controller. + # +docs:property + repository: quay.io/jetstack/cert-manager-controller + + # Override the image tag to deploy by setting this variable. + # If no value is set, the chart's appVersion is used. + # +docs:property + # tag: vX.Y.Z + + # Setting a digest will override any tag. + # +docs:property + # digest: sha256:0e072dddd1f7f8fc8909a2ca6f65e76c5f0d2fcfb8be47935ae3457e8bbceb20 + + # Kubernetes imagePullPolicy on Deployment. + pullPolicy: IfNotPresent + +# Override the namespace used to store DNS provider credentials etc. for ClusterIssuer +# resources. By default, the same namespace as cert-manager is deployed within is +# used. 
This namespace will not be automatically created by the Helm chart. +clusterResourceNamespace: "" + +# This namespace allows you to define where the services are installed into. +# If not set then they use the namespace of the release. +# This is helpful when installing cert manager as a chart dependency (sub chart). +namespace: "" + +serviceAccount: + # Specifies whether a service account should be created. + create: true + + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template. + # +docs:property + # name: "" + + # Optional additional annotations to add to the controller's Service Account. + # +docs:property + # annotations: {} + + # Optional additional labels to add to the controller's Service Account. + # +docs:property + # labels: {} + + # Automount API credentials for a Service Account. + automountServiceAccountToken: true + +# Automounting API credentials for a particular pod. +# +docs:property +# automountServiceAccountToken: true + +# When this flag is enabled, secrets will be automatically removed when the certificate resource is deleted. +enableCertificateOwnerRef: false + +# This property is used to configure options for the controller pod. +# This allows setting options that would usually be provided using flags. +# An APIVersion and Kind must be specified in your values.yaml file. +# Flags will override options that are set here. +# +# For example: +# config: +# apiVersion: controller.config.cert-manager.io/v1alpha1 +# kind: ControllerConfiguration +# logging: +# verbosity: 2 +# format: text +# leaderElectionConfig: +# namespace: kube-system +# kubernetesAPIQPS: 9000 +# kubernetesAPIBurst: 9000 +# numberOfConcurrentWorkers: 200 +# featureGates: +# AdditionalCertificateOutputFormats: true +# DisallowInsecureCSRUsageDefinition: true +# ExperimentalCertificateSigningRequestControllers: true +# ExperimentalGatewayAPISupport: true +# LiteralCertificateSubject: true +# SecretsFilteredCaching: true +# ServerSideApply: true +# StableCertificateRequestName: true +# UseCertificateRequestBasicConstraints: true +# ValidateCAA: true +# metricsTLSConfig: +# dynamic: +# secretNamespace: "cert-manager" +# secretName: "cert-manager-metrics-ca" +# dnsNames: +# - cert-manager-metrics +# - cert-manager-metrics.cert-manager +# - cert-manager-metrics.cert-manager.svc +config: {} + +# Setting Nameservers for DNS01 Self Check. +# For more information, see the [cert-manager documentation](https://cert-manager.io/docs/configuration/acme/dns01/#setting-nameservers-for-dns01-self-check). + +# A comma-separated string with the host and port of the recursive nameservers cert-manager should query. +dns01RecursiveNameservers: "" + +# Forces cert-manager to use only the recursive nameservers for verification. +# Enabling this option could cause the DNS01 self check to take longer owing to caching performed by the recursive nameservers. +dns01RecursiveNameserversOnly: false + +# Option to disable cert-manager's build-in auto-approver. The auto-approver +# approves all CertificateRequests that reference issuers matching the 'approveSignerNames' +# option. This 'disableAutoApproval' option is useful when you want to make all approval decisions +# using a different approver (like approver-policy - https://github.com/cert-manager/approver-policy). +disableAutoApproval: false + +# List of signer names that cert-manager will approve by default. CertificateRequests +# referencing these signer names will be auto-approved by cert-manager. 
Defaults to just +# approving the cert-manager.io Issuer and ClusterIssuer issuers. When set to an empty +# array, ALL issuers will be auto-approved by cert-manager. To disable the auto-approval, +# because eg. you are using approver-policy, you can enable 'disableAutoApproval'. +# ref: https://cert-manager.io/docs/concepts/certificaterequest/#approval +# +docs:property +approveSignerNames: +- issuers.cert-manager.io/* +- clusterissuers.cert-manager.io/* + +# Additional command line flags to pass to cert-manager controller binary. +# To see all available flags run `docker run quay.io/jetstack/cert-manager-controller: --help`. +# +# Use this flag to enable or disable arbitrary controllers. For example, to disable the CertificiateRequests approver. +# +# For example: +# extraArgs: +# - --controllers=*,-certificaterequests-approver +extraArgs: [] + +# Additional environment variables to pass to cert-manager controller binary. +extraEnv: [] +# - name: SOME_VAR +# value: 'some value' + +# Resources to provide to the cert-manager controller pod. +# +# For example: +# requests: +# cpu: 10m +# memory: 32Mi +# +# For more information, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). +resources: {} + +# Pod Security Context. +# For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). +# +docs:property +securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + +# Container Security Context to be set on the controller component container. +# For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). +# +docs:property +containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Additional volumes to add to the cert-manager controller pod. +volumes: [] + +# Additional volume mounts to add to the cert-manager controller container. +volumeMounts: [] + +# Optional additional annotations to add to the controller Deployment. +# +docs:property +# deploymentAnnotations: {} + +# Optional additional annotations to add to the controller Pods. +# +docs:property +# podAnnotations: {} + +# Optional additional labels to add to the controller Pods. +podLabels: {} + +# Optional annotations to add to the controller Service. +# +docs:property +# serviceAnnotations: {} + +# Optional additional labels to add to the controller Service. +# +docs:property +# serviceLabels: {} + +# Optionally set the IP family policy for the controller Service to configure dual-stack; see [Configure dual-stack](https://kubernetes.io/docs/concepts/services-networking/dual-stack/#services). +# +docs:property +# serviceIPFamilyPolicy: "" + +# Optionally set the IP families for the controller Service that should be supported, in the order in which they should be applied to ClusterIP. Can be IPv4 and/or IPv6. +# +docs:property +# serviceIPFamilies: [] + +# Optional DNS settings. These are useful if you have a public and private DNS zone for +# the same domain on Route 53. The following is an example of ensuring +# cert-manager can access an ingress or DNS TXT records at all times. +# Note that this requires Kubernetes 1.10 or `CustomPodDNS` feature gate enabled for +# the cluster to work. + +# Pod DNS policy. 
+# For more information, see [Pod's DNS Policy](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy). +# +docs:property +# podDnsPolicy: "None" + +# Pod DNS configuration. The podDnsConfig field is optional and can work with any podDnsPolicy +# settings. However, when a Pod's dnsPolicy is set to "None", the dnsConfig field has to be specified. +# For more information, see [Pod's DNS Config](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-dns-config). +# +docs:property +# podDnsConfig: +# nameservers: +# - "1.1.1.1" +# - "8.8.8.8" + +# Optional hostAliases for cert-manager-controller pods. May be useful when performing ACME DNS-01 self checks. +hostAliases: [] +# - ip: 127.0.0.1 +# hostnames: +# - foo.local +# - bar.local +# - ip: 10.1.2.3 +# hostnames: +# - foo.remote +# - bar.remote + +# The nodeSelector on Pods tells Kubernetes to schedule Pods on the nodes with +# matching labels. +# For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/). +# +# This default ensures that Pods are only scheduled to Linux nodes. +# It prevents Pods being scheduled to Windows nodes in a mixed OS cluster. +# +docs:property +nodeSelector: + kubernetes.io/os: linux + +# +docs:ignore +ingressShim: {} + + # Optional default issuer to use for ingress resources. + # +docs:property=ingressShim.defaultIssuerName + # defaultIssuerName: "" + + # Optional default issuer kind to use for ingress resources. + # +docs:property=ingressShim.defaultIssuerKind + # defaultIssuerKind: "" + + # Optional default issuer group to use for ingress resources. + # +docs:property=ingressShim.defaultIssuerGroup + # defaultIssuerGroup: "" + +# Use these variables to configure the HTTP_PROXY environment variables. + +# Configures the HTTP_PROXY environment variable where a HTTP proxy is required. +# +docs:property +# http_proxy: "http://proxy:8080" + +# Configures the HTTPS_PROXY environment variable where a HTTP proxy is required. +# +docs:property +# https_proxy: "https://proxy:8080" + +# Configures the NO_PROXY environment variable where a HTTP proxy is required, +# but certain domains should be excluded. +# +docs:property +# no_proxy: 127.0.0.1,localhost + + +# A Kubernetes Affinity, if required. For more information, see [Affinity v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#affinity-v1-core). +# +# For example: +# affinity: +# nodeAffinity: +# requiredDuringSchedulingIgnoredDuringExecution: +# nodeSelectorTerms: +# - matchExpressions: +# - key: foo.bar.com/role +# operator: In +# values: +# - master +affinity: {} + +# A list of Kubernetes Tolerations, if required. For more information, see [Toleration v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#toleration-v1-core). +# +# For example: +# tolerations: +# - key: foo.bar.com/role +# operator: Equal +# value: master +# effect: NoSchedule +tolerations: [] + +# A list of Kubernetes TopologySpreadConstraints, if required. 
For more information, see [Topology spread constraint v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#topologyspreadconstraint-v1-core +# +# For example: +# topologySpreadConstraints: +# - maxSkew: 2 +# topologyKey: topology.kubernetes.io/zone +# whenUnsatisfiable: ScheduleAnyway +# labelSelector: +# matchLabels: +# app.kubernetes.io/instance: cert-manager +# app.kubernetes.io/component: controller +topologySpreadConstraints: [] + +# LivenessProbe settings for the controller container of the controller Pod. +# +# This is enabled by default, in order to enable the clock-skew liveness probe that +# restarts the controller in case of a skew between the system clock and the monotonic clock. +# LivenessProbe durations and thresholds are based on those used for the Kubernetes +# controller-manager. For more information see the following on the +# [Kubernetes GitHub repository](https://github.com/kubernetes/kubernetes/blob/806b30170c61a38fedd54cc9ede4cd6275a1ad3b/cmd/kubeadm/app/util/staticpod/utils.go#L241-L245) +# +docs:property +livenessProbe: + enabled: true + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 15 + successThreshold: 1 + failureThreshold: 8 + +# enableServiceLinks indicates whether information about services should be +# injected into the pod's environment variables, matching the syntax of Docker +# links. +enableServiceLinks: false + +# +docs:section=Prometheus + +prometheus: + # Enable Prometheus monitoring for the cert-manager controller to use with the + # Prometheus Operator. If this option is enabled without enabling `prometheus.servicemonitor.enabled` or + # `prometheus.podmonitor.enabled`, 'prometheus.io' annotations are added to the cert-manager Deployment + # resources. Additionally, a service is created which can be used together + # with your own ServiceMonitor (managed outside of this Helm chart). + # Otherwise, a ServiceMonitor/ PodMonitor is created. + enabled: true + + servicemonitor: + # Create a ServiceMonitor to add cert-manager to Prometheus. + enabled: true + + # Specifies the `prometheus` label on the created ServiceMonitor. This is + # used when different Prometheus instances have label selectors matching + # different ServiceMonitors. + prometheusInstance: default + + # The target port to set on the ServiceMonitor. This must match the port that the + # cert-manager controller is listening on for metrics. + targetPort: 9402 + + # The path to scrape for metrics. + path: /metrics + + # The interval to scrape metrics. + interval: 60s + + # The timeout before a metrics scrape fails. + scrapeTimeout: 30s + + # Additional labels to add to the ServiceMonitor. + labels: {} + + # Additional annotations to add to the ServiceMonitor. + annotations: {} + + # Keep labels from scraped data, overriding server-side labels. + honorLabels: false + + # EndpointAdditionalProperties allows setting additional properties on the + # endpoint such as relabelings, metricRelabelings etc. + # + # For example: + # endpointAdditionalProperties: + # relabelings: + # - action: replace + # sourceLabels: + # - __meta_kubernetes_pod_node_name + # targetLabel: instance + # + # +docs:property + endpointAdditionalProperties: {} + + # Note that you can not enable both PodMonitor and ServiceMonitor as they are mutually exclusive. Enabling both will result in a error. + podmonitor: + # Create a PodMonitor to add cert-manager to Prometheus. + enabled: false + + # Specifies the `prometheus` label on the created PodMonitor. 
This is + # used when different Prometheus instances have label selectors matching + # different PodMonitors. + prometheusInstance: default + + # The path to scrape for metrics. + path: /metrics + + # The interval to scrape metrics. + interval: 60s + + # The timeout before a metrics scrape fails. + scrapeTimeout: 30s + + # Additional labels to add to the PodMonitor. + labels: {} + + # Additional annotations to add to the PodMonitor. + annotations: {} + + # Keep labels from scraped data, overriding server-side labels. + honorLabels: false + + # EndpointAdditionalProperties allows setting additional properties on the + # endpoint such as relabelings, metricRelabelings etc. + # + # For example: + # endpointAdditionalProperties: + # relabelings: + # - action: replace + # sourceLabels: + # - __meta_kubernetes_pod_node_name + # targetLabel: instance + # + # +docs:property + endpointAdditionalProperties: {} + +# +docs:section=Webhook + +webhook: + # Number of replicas of the cert-manager webhook to run. + # + # The default is 1, but in production set this to 2 or 3 to provide high + # availability. + # + # If `replicas > 1`, consider setting `webhook.podDisruptionBudget.enabled=true`. + replicaCount: 1 + + # The number of seconds the API server should wait for the webhook to respond before treating the call as a failure. + # The value must be between 1 and 30 seconds. For more information, see + # [Validating webhook configuration v1](https://kubernetes.io/docs/reference/kubernetes-api/extend-resources/validating-webhook-configuration-v1/). + # + # The default is set to the maximum value of 30 seconds as + # users sometimes report that the connection between the K8S API server and + # the cert-manager webhook server times out. + # If *this* timeout is reached, the error message will be "context deadline exceeded", + # which doesn't help the user diagnose what phase of the HTTPS connection timed out. + # For example, it could be during DNS resolution, TCP connection, TLS + # negotiation, HTTP negotiation, or slow HTTP response from the webhook + # server. + # By setting this timeout to its maximum value the underlying timeout error + # message has more chance of being returned to the end user. + timeoutSeconds: 30 + + # This is used to configure options for the webhook pod. + # This allows setting options that would usually be provided using flags. + # An APIVersion and Kind must be specified in your values.yaml file. + # Flags override options that are set here. + # + # For example: + # apiVersion: webhook.config.cert-manager.io/v1alpha1 + # kind: WebhookConfiguration + # # The port that the webhook listens on for requests. + # # In GKE private clusters, by default Kubernetes apiservers are allowed to + # # talk to the cluster nodes only on 443 and 10250. Configuring + # # securePort: 10250 therefore will work out-of-the-box without needing to add firewall + # # rules or requiring NET_BIND_SERVICE capabilities to bind port numbers < 1000. + # # This should be uncommented and set as a default by the chart once + # # the apiVersion of WebhookConfiguration graduates beyond v1alpha1. + # securePort: 10250 + config: {} + + # The update strategy for the cert-manager webhook deployment. 
+ # For more information, see the [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy) + # + # For example: + # strategy: + # type: RollingUpdate + # rollingUpdate: + # maxSurge: 0 + # maxUnavailable: 1 + strategy: {} + + # Pod Security Context to be set on the webhook component Pod. + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + + # Container Security Context to be set on the webhook component container. + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + + podDisruptionBudget: + # Enable or disable the PodDisruptionBudget resource. + # + # This prevents downtime during voluntary disruptions such as during a Node upgrade. + # For example, the PodDisruptionBudget will block `kubectl drain` + # if it is used on the Node where the only remaining cert-manager + # Pod is currently running. + enabled: false + + # This property configures the minimum available pods for disruptions. Can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # It cannot be used if `maxUnavailable` is set. + # +docs:property + # minAvailable: 1 + + # This property configures the maximum unavailable pods for disruptions. Can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # It cannot be used if `minAvailable` is set. + # +docs:property + # maxUnavailable: 1 + + # Optional additional annotations to add to the webhook Deployment. + # +docs:property + # deploymentAnnotations: {} + + # Optional additional annotations to add to the webhook Pods. + # +docs:property + # podAnnotations: {} + + # Optional additional annotations to add to the webhook Service. + # +docs:property + # serviceAnnotations: {} + + # Optional additional annotations to add to the webhook MutatingWebhookConfiguration. + # +docs:property + # mutatingWebhookConfigurationAnnotations: {} + + # Optional additional annotations to add to the webhook ValidatingWebhookConfiguration. + # +docs:property + # validatingWebhookConfigurationAnnotations: {} + + validatingWebhookConfiguration: + # Configure spec.namespaceSelector for validating webhooks. + # +docs:property + namespaceSelector: + matchExpressions: + - key: "cert-manager.io/disable-validation" + operator: "NotIn" + values: + - "true" + + mutatingWebhookConfiguration: + # Configure spec.namespaceSelector for mutating webhooks. + # +docs:property + namespaceSelector: {} + # matchLabels: + # key: value + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: + # - kube-system + + + # Additional command line flags to pass to cert-manager webhook binary. + # To see all available flags run `docker run quay.io/jetstack/cert-manager-webhook: --help`. + extraArgs: [] + # Path to a file containing a WebhookConfiguration object used to configure the webhook. + # - --config= + + # Comma separated list of feature gates that should be enabled on the + # webhook pod. + featureGates: "" + + # Resources to provide to the cert-manager webhook pod. 
+ # + # For example: + # requests: + # cpu: 10m + # memory: 32Mi + # + # For more information, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). + resources: {} + + # Liveness probe values. + # For more information, see [Container probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes). + # + # +docs:property + livenessProbe: + failureThreshold: 3 + initialDelaySeconds: 60 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + + # Readiness probe values. + # For more information, see [Container probes](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes). + # + # +docs:property + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + + # The nodeSelector on Pods tells Kubernetes to schedule Pods on the nodes with + # matching labels. + # For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/). + # + # This default ensures that Pods are only scheduled to Linux nodes. + # It prevents Pods being scheduled to Windows nodes in a mixed OS cluster. + # +docs:property + nodeSelector: + kubernetes.io/os: linux + + # A Kubernetes Affinity, if required. For more information, see [Affinity v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#affinity-v1-core). + # + # For example: + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: foo.bar.com/role + # operator: In + # values: + # - master + affinity: {} + + # A list of Kubernetes Tolerations, if required. For more information, see [Toleration v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#toleration-v1-core). + # + # For example: + # tolerations: + # - key: foo.bar.com/role + # operator: Equal + # value: master + # effect: NoSchedule + tolerations: [] + + # A list of Kubernetes TopologySpreadConstraints, if required. For more information, see [Topology spread constraint v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#topologyspreadconstraint-v1-core). + # + # For example: + # topologySpreadConstraints: + # - maxSkew: 2 + # topologyKey: topology.kubernetes.io/zone + # whenUnsatisfiable: ScheduleAnyway + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: cert-manager + # app.kubernetes.io/component: controller + topologySpreadConstraints: [] + + # Optional additional labels to add to the Webhook Pods. + podLabels: {} + + # Optional additional labels to add to the Webhook Service. + serviceLabels: {} + + # Optionally set the IP family policy for the controller Service to configure dual-stack; see [Configure dual-stack](https://kubernetes.io/docs/concepts/services-networking/dual-stack/#services). + serviceIPFamilyPolicy: "" + + # Optionally set the IP families for the controller Service that should be supported, in the order in which they should be applied to ClusterIP. Can be IPv4 and/or IPv6. + serviceIPFamilies: [] + + image: + # The container registry to pull the webhook image from. + # +docs:property + # registry: quay.io + + # The container image for the cert-manager webhook + # +docs:property + repository: quay.io/jetstack/cert-manager-webhook + + # Override the image tag to deploy by setting this variable. + # If no value is set, the chart's appVersion will be used. 
+ # +docs:property + # tag: vX.Y.Z + + # Setting a digest will override any tag + # +docs:property + # digest: sha256:0e072dddd1f7f8fc8909a2ca6f65e76c5f0d2fcfb8be47935ae3457e8bbceb20 + + # Kubernetes imagePullPolicy on Deployment. + pullPolicy: IfNotPresent + + serviceAccount: + # Specifies whether a service account should be created. + create: true + + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template. + # +docs:property + # name: "" + + # Optional additional annotations to add to the controller's Service Account. + # +docs:property + # annotations: {} + + # Optional additional labels to add to the webhook's Service Account. + # +docs:property + # labels: {} + + # Automount API credentials for a Service Account. + automountServiceAccountToken: true + + # Automounting API credentials for a particular pod. + # +docs:property + # automountServiceAccountToken: true + + # The port that the webhook listens on for requests. + # In GKE private clusters, by default Kubernetes apiservers are allowed to + # talk to the cluster nodes only on 443 and 10250. Configuring + # securePort: 10250, therefore will work out-of-the-box without needing to add firewall + # rules or requiring NET_BIND_SERVICE capabilities to bind port numbers <1000. + securePort: 10250 + + # Specifies if the webhook should be started in hostNetwork mode. + # + # Required for use in some managed kubernetes clusters (such as AWS EKS) with custom + # CNI (such as calico), because control-plane managed by AWS cannot communicate + # with pods' IP CIDR and admission webhooks are not working + # + # Since the default port for the webhook conflicts with kubelet on the host + # network, `webhook.securePort` should be changed to an available port if + # running in hostNetwork mode. + hostNetwork: false + + # Specifies how the service should be handled. Useful if you want to expose the + # webhook outside of the cluster. In some cases, the control plane cannot + # reach internal services. + serviceType: ClusterIP + + # Specify the load balancer IP for the created service. + # +docs:property + # loadBalancerIP: "10.10.10.10" + + # Overrides the mutating webhook and validating webhook so they reach the webhook + # service using the `url` field instead of a service. + url: {} + # host: + + # Enables default network policies for webhooks. + networkPolicy: + # Create network policies for the webhooks. + enabled: false + + # Ingress rule for the webhook network policy. By default, it allows all + # inbound traffic. + # +docs:property + ingress: + - from: + - ipBlock: + cidr: 0.0.0.0/0 + + # Egress rule for the webhook network policy. By default, it allows all + # outbound traffic to ports 80 and 443, as well as DNS ports. + # +docs:property + egress: + - ports: + - port: 80 + protocol: TCP + - port: 443 + protocol: TCP + - port: 53 + protocol: TCP + - port: 53 + protocol: UDP + # On OpenShift and OKD, the Kubernetes API server listens on. + # port 6443. + - port: 6443 + protocol: TCP + to: + - ipBlock: + cidr: 0.0.0.0/0 + + # Additional volumes to add to the cert-manager controller pod. + volumes: [] + + # Additional volume mounts to add to the cert-manager controller container. + volumeMounts: [] + + # enableServiceLinks indicates whether information about services should be + # injected into the pod's environment variables, matching the syntax of Docker + # links. 
+ enableServiceLinks: false + +# +docs:section=CA Injector + +cainjector: + # Create the CA Injector deployment + enabled: true + + # The number of replicas of the cert-manager cainjector to run. + # + # The default is 1, but in production set this to 2 or 3 to provide high + # availability. + # + # If `replicas > 1`, consider setting `cainjector.podDisruptionBudget.enabled=true`. + # + # Note that cert-manager uses leader election to ensure that there can + # only be a single instance active at a time. + replicaCount: 1 + + # This is used to configure options for the cainjector pod. + # It allows setting options that are usually provided via flags. + # An APIVersion and Kind must be specified in your values.yaml file. + # Flags override options that are set here. + # + # For example: + # apiVersion: cainjector.config.cert-manager.io/v1alpha1 + # kind: CAInjectorConfiguration + # logging: + # verbosity: 2 + # format: text + # leaderElectionConfig: + # namespace: kube-system + config: {} + + # Deployment update strategy for the cert-manager cainjector deployment. + # For more information, see the [Kubernetes documentation](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy). + # + # For example: + # strategy: + # type: RollingUpdate + # rollingUpdate: + # maxSurge: 0 + # maxUnavailable: 1 + strategy: {} + + # Pod Security Context to be set on the cainjector component Pod + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + + # Container Security Context to be set on the cainjector component container + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + + podDisruptionBudget: + # Enable or disable the PodDisruptionBudget resource. + # + # This prevents downtime during voluntary disruptions such as during a Node upgrade. + # For example, the PodDisruptionBudget will block `kubectl drain` + # if it is used on the Node where the only remaining cert-manager + # Pod is currently running. + enabled: false + + # `minAvailable` configures the minimum available pods for disruptions. It can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # Cannot be used if `maxUnavailable` is set. + # +docs:property + # minAvailable: 1 + + # `maxUnavailable` configures the maximum unavailable pods for disruptions. It can either be set to + # an integer (e.g. 1) or a percentage value (e.g. 25%). + # Cannot be used if `minAvailable` is set. + # +docs:property + # maxUnavailable: 1 + + # Optional additional annotations to add to the cainjector Deployment. + # +docs:property + # deploymentAnnotations: {} + + # Optional additional annotations to add to the cainjector Pods. + # +docs:property + # podAnnotations: {} + + # Additional command line flags to pass to cert-manager cainjector binary. + # To see all available flags run `docker run quay.io/jetstack/cert-manager-cainjector: --help`. + extraArgs: [] + # Enable profiling for cainjector. + # - --enable-profiling=true + + # Comma separated list of feature gates that should be enabled on the + # cainjector pod. 
+ featureGates: "" + + # Resources to provide to the cert-manager cainjector pod. + # + # For example: + # requests: + # cpu: 10m + # memory: 32Mi + # + # For more information, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). + resources: {} + + + # The nodeSelector on Pods tells Kubernetes to schedule Pods on the nodes with + # matching labels. + # For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/). + # + # This default ensures that Pods are only scheduled to Linux nodes. + # It prevents Pods being scheduled to Windows nodes in a mixed OS cluster. + # +docs:property + nodeSelector: + kubernetes.io/os: linux + + # A Kubernetes Affinity, if required. For more information, see [Affinity v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#affinity-v1-core). + # + # For example: + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: foo.bar.com/role + # operator: In + # values: + # - master + affinity: {} + + # A list of Kubernetes Tolerations, if required. For more information, see [Toleration v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#toleration-v1-core). + # + # For example: + # tolerations: + # - key: foo.bar.com/role + # operator: Equal + # value: master + # effect: NoSchedule + tolerations: [] + + # A list of Kubernetes TopologySpreadConstraints, if required. For more information, see [Topology spread constraint v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#topologyspreadconstraint-v1-core). + # + # For example: + # topologySpreadConstraints: + # - maxSkew: 2 + # topologyKey: topology.kubernetes.io/zone + # whenUnsatisfiable: ScheduleAnyway + # labelSelector: + # matchLabels: + # app.kubernetes.io/instance: cert-manager + # app.kubernetes.io/component: controller + topologySpreadConstraints: [] + + # Optional additional labels to add to the CA Injector Pods. + podLabels: {} + + image: + # The container registry to pull the cainjector image from. + # +docs:property + # registry: quay.io + + # The container image for the cert-manager cainjector + # +docs:property + repository: quay.io/jetstack/cert-manager-cainjector + + # Override the image tag to deploy by setting this variable. + # If no value is set, the chart's appVersion will be used. + # +docs:property + # tag: vX.Y.Z + + # Setting a digest will override any tag. + # +docs:property + # digest: sha256:0e072dddd1f7f8fc8909a2ca6f65e76c5f0d2fcfb8be47935ae3457e8bbceb20 + + # Kubernetes imagePullPolicy on Deployment. + pullPolicy: IfNotPresent + + serviceAccount: + # Specifies whether a service account should be created. + create: true + + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + # +docs:property + # name: "" + + # Optional additional annotations to add to the controller's Service Account. + # +docs:property + # annotations: {} + + # Optional additional labels to add to the cainjector's Service Account. + # +docs:property + # labels: {} + + # Automount API credentials for a Service Account. + automountServiceAccountToken: true + + # Automounting API credentials for a particular pod. + # +docs:property + # automountServiceAccountToken: true + + # Additional volumes to add to the cert-manager controller pod. 
+ volumes: [] + + # Additional volume mounts to add to the cert-manager controller container. + volumeMounts: [] + + # enableServiceLinks indicates whether information about services should be + # injected into the pod's environment variables, matching the syntax of Docker + # links. + enableServiceLinks: false + +# +docs:section=ACME Solver + +acmesolver: + image: + # The container registry to pull the acmesolver image from. + # +docs:property + # registry: quay.io + + # The container image for the cert-manager acmesolver. + # +docs:property + repository: quay.io/jetstack/cert-manager-acmesolver + + # Override the image tag to deploy by setting this variable. + # If no value is set, the chart's appVersion is used. + # +docs:property + # tag: vX.Y.Z + + # Setting a digest will override any tag. + # +docs:property + # digest: sha256:0e072dddd1f7f8fc8909a2ca6f65e76c5f0d2fcfb8be47935ae3457e8bbceb20 + + # Kubernetes imagePullPolicy on Deployment. + pullPolicy: IfNotPresent + +# +docs:section=Startup API Check +# This startupapicheck is a Helm post-install hook that waits for the webhook +# endpoints to become available. +# The check is implemented using a Kubernetes Job - if you are injecting mesh +# sidecar proxies into cert-manager pods, ensure that they +# are not injected into this Job's pod. Otherwise, the installation may time out +# owing to the Job never being completed because the sidecar proxy does not exit. +# For more information, see [this note](https://github.com/cert-manager/cert-manager/pull/4414). + +startupapicheck: + # Enables the startup api check. + enabled: true + + # Pod Security Context to be set on the startupapicheck component Pod. + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + + # Container Security Context to be set on the controller component container. + # For more information, see [Configure a Security Context for a Pod or Container](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/). + # +docs:property + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + + # Timeout for 'kubectl check api' command. + timeout: 1m + + # Job backoffLimit + backoffLimit: 4 + + # Optional additional annotations to add to the startupapicheck Job. + # +docs:property + jobAnnotations: + helm.sh/hook: post-install + helm.sh/hook-weight: "1" + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded + + # Optional additional annotations to add to the startupapicheck Pods. + # +docs:property + # podAnnotations: {} + + # Additional command line flags to pass to startupapicheck binary. + # To see all available flags run `docker run quay.io/jetstack/cert-manager-startupapicheck: --help`. + # + # Verbose logging is enabled by default so that if startupapicheck fails, you + # can know what exactly caused the failure. Verbose logs include details of + # the webhook URL, IP address and TCP connect errors for example. + # +docs:property + extraArgs: + - -v + + # Resources to provide to the cert-manager controller pod. + # + # For example: + # requests: + # cpu: 10m + # memory: 32Mi + # + # For more information, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/). 
+ resources: {} + + + # The nodeSelector on Pods tells Kubernetes to schedule Pods on the nodes with + # matching labels. + # For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/). + # + # This default ensures that Pods are only scheduled to Linux nodes. + # It prevents Pods being scheduled to Windows nodes in a mixed OS cluster. + # +docs:property + nodeSelector: + kubernetes.io/os: linux + + # A Kubernetes Affinity, if required. For more information, see [Affinity v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#affinity-v1-core). + # For example: + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: foo.bar.com/role + # operator: In + # values: + # - master + affinity: {} + + # A list of Kubernetes Tolerations, if required. For more information, see [Toleration v1 core](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#toleration-v1-core). + # + # For example: + # tolerations: + # - key: foo.bar.com/role + # operator: Equal + # value: master + # effect: NoSchedule + tolerations: [] + + # Optional additional labels to add to the startupapicheck Pods. + podLabels: {} + + image: + # The container registry to pull the startupapicheck image from. + # +docs:property + # registry: quay.io + + # The container image for the cert-manager startupapicheck. + # +docs:property + repository: quay.io/jetstack/cert-manager-startupapicheck + + # Override the image tag to deploy by setting this variable. + # If no value is set, the chart's appVersion is used. + # +docs:property + # tag: vX.Y.Z + + # Setting a digest will override any tag. + # +docs:property + # digest: sha256:0e072dddd1f7f8fc8909a2ca6f65e76c5f0d2fcfb8be47935ae3457e8bbceb20 + + # Kubernetes imagePullPolicy on Deployment. + pullPolicy: IfNotPresent + + rbac: + # annotations for the startup API Check job RBAC and PSP resources. + # +docs:property + annotations: + helm.sh/hook: post-install + helm.sh/hook-weight: "-5" + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded + + # Automounting API credentials for a particular pod. + # +docs:property + # automountServiceAccountToken: true + + serviceAccount: + # Specifies whether a service account should be created. + create: true + + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template. + # +docs:property + # name: "" + + # Optional additional annotations to add to the Job's Service Account. + # +docs:property + annotations: + helm.sh/hook: post-install + helm.sh/hook-weight: "-5" + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded + + # Automount API credentials for a Service Account. + # +docs:property + automountServiceAccountToken: true + + # Optional additional labels to add to the startupapicheck's Service Account. + # +docs:property + # labels: {} + + # Additional volumes to add to the cert-manager controller pod. + volumes: [] + + # Additional volume mounts to add to the cert-manager controller container. + volumeMounts: [] + + # enableServiceLinks indicates whether information about services should be + # injected into pod's environment variables, matching the syntax of Docker + # links. + enableServiceLinks: false + +# Create dynamic manifests via values. 
+# +# For example: +# extraObjects: +# - | +# apiVersion: v1 +# kind: ConfigMap +# metadata: +# name: '{{ template "cert-manager.name" . }}-extra-configmap' +extraObjects: [] + diff --git a/modules/cert-manager/variables.tf b/modules/cert-manager/variables.tf new file mode 100644 index 00000000..e1e6f934 --- /dev/null +++ b/modules/cert-manager/variables.tf @@ -0,0 +1,27 @@ +variable "auto_deploy" { + description = "Auto deploy through ArgoCD" + type = bool + default = false +} + +variable "auto_prune" { + description = "Auto prune through ArgoCD" + type = bool + default = false +} + +variable "git_revision" { + description = "The git revision to deploy" + type = string + default = "main" +} + +variable "argo_deployment_name" { + description = "The name of the ArgoCD deployment, must be globally unique" + type = string +} + +variable "namespace" { + description = "The namespace to deploy into" + type = string +} diff --git a/modules/cert-manager/versions.tf b/modules/cert-manager/versions.tf new file mode 100644 index 00000000..28b5ab89 --- /dev/null +++ b/modules/cert-manager/versions.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } + } + # TODO: Move to this provider + # required_providers { + # argocd = { + # source = "oboukili/argocd" + # version = "6.1.1" + # } + # } +} + From ef8bd3f6e514ac6c4dc58a19327f69fc5cd0ada8 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:55:05 -0700 Subject: [PATCH 13/85] Add oauth2 plugin --- modules/kong-ingress/templates/values.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/kong-ingress/templates/values.yaml b/modules/kong-ingress/templates/values.yaml index 2eb60b29..f403ced2 100644 --- a/modules/kong-ingress/templates/values.yaml +++ b/modules/kong-ingress/templates/values.yaml @@ -45,3 +45,8 @@ gateway: role: traditional database: "off" + plugins: + - pluginName: oauth2 + name: oauth2 + + From 502276bee7c53a8461badf3f09b4cad405fb2b1a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 10:57:47 -0700 Subject: [PATCH 14/85] update plugin --- modules/kong-ingress/templates/values.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/kong-ingress/templates/values.yaml b/modules/kong-ingress/templates/values.yaml index f403ced2..46a5ba9c 100644 --- a/modules/kong-ingress/templates/values.yaml +++ b/modules/kong-ingress/templates/values.yaml @@ -46,7 +46,8 @@ gateway: database: "off" plugins: - - pluginName: oauth2 - name: oauth2 + configMaps: + - pluginName: oauth2 + name: oauth2 From 7a864e7f769bd6a09b8a61365a4e18eda80e5430 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 11:02:04 -0700 Subject: [PATCH 15/85] set openid-connect --- modules/kong-ingress/templates/values.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/modules/kong-ingress/templates/values.yaml b/modules/kong-ingress/templates/values.yaml index 46a5ba9c..4a63ab29 100644 --- a/modules/kong-ingress/templates/values.yaml +++ b/modules/kong-ingress/templates/values.yaml @@ -44,10 +44,6 @@ gateway: env: role: traditional database: "off" - - plugins: - configMaps: - - pluginName: oauth2 - name: oauth2 + plugins: 
 bundled,openid-connect

From c0e2a4cbba5f2a9d4bbf63de3368cd5aba129f2b Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Mon, 30 Sep 2024 11:24:55 -0700
Subject: [PATCH 16/85] Try out envoy gateway

---
 .../stacks/dpe-k8s-deployments/main.tf        |  8 +-
 modules/envoy-gateway/README.md               | 65 +++++++++++++++
 .../create-synapse-oauth-client.py            | 26 ++++++
 modules/envoy-gateway/data.tf                 | 15 ++++
 modules/envoy-gateway/main.tf                 | 40 +++++++++
 modules/envoy-gateway/templates/values.yaml   | 82 +++++++++++++++++++
 modules/envoy-gateway/variables.tf            | 27 ++++++
 modules/envoy-gateway/versions.tf             | 24 ++++++
 8 files changed, 283 insertions(+), 4 deletions(-)
 create mode 100644 modules/envoy-gateway/README.md
 create mode 100644 modules/envoy-gateway/create-synapse-oauth-client.py
 create mode 100644 modules/envoy-gateway/data.tf
 create mode 100644 modules/envoy-gateway/main.tf
 create mode 100644 modules/envoy-gateway/templates/values.yaml
 create mode 100644 modules/envoy-gateway/variables.tf
 create mode 100644 modules/envoy-gateway/versions.tf

diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index 907ff4e7..5eab6804 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -92,16 +92,16 @@ module "signoz" {
   argo_deployment_name = "signoz"
 }
 
-module "kong-ingress" {
+module "envoy-gateway" {
   depends_on = [module.argo-cd]
   # source  = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   # version = "0.5.0"
-  source = "../../../modules/kong-ingress"
+  source = "../../../modules/envoy-gateway"
 
   auto_deploy          = var.auto_deploy
   auto_prune           = var.auto_prune
   git_revision         = var.git_revision
-  namespace            = "kong-ingress"
-  argo_deployment_name = "kong-ingress"
+  namespace            = "envoy-gateway"
+  argo_deployment_name = "envoy-gateway"
 }
 
 module "cert-manager" {
diff --git a/modules/envoy-gateway/README.md b/modules/envoy-gateway/README.md
new file mode 100644
index 00000000..1cac8fc7
--- /dev/null
+++ b/modules/envoy-gateway/README.md
@@ -0,0 +1,65 @@
+# Purpose
+The purpose of this module is to deploy the `Signoz` helm chart.
+
+SigNoz is an open-source APM. It helps developers monitor their applications
+& troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open
+source Application Performance Monitoring (APM) & Observability tool.
+
+
+## This module is a work in progress
+This was hastily thrown together to get a tool available to ingest telemetry data in.
+A number of items are needed:
+
+- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator
+- Setting up backups and data retention
+- Trim down the number of ports available in the service
+- Double check the entire `values.yaml` file
+- Set up accounts and access to the service declaratively
+
+## Accessing signoz
+
+### Pre-req
+This assumes that you have accessed the k8s cluster before using `k9s` or another tool.
+If you have not, read over this documentation:
+
+- 
+- Description of port-forwarding via `k9s`: 
+
+### Connecting to signoz
+After signoz has been deployed to the k8s cluster you will need to port-forward to 2
+pods/services:
+
+- `signoz-frontend`
+- `signoz-otel-collector`
+
+The frontend is how you'll access all of the data contained within signoz. Once you
+port forward and access it via your web-browser you'll need to sign up and log in.
+TODO: The steps on this are not fleshed out, this is going to be a manual step that the
+admin of the server will need to help you with.
+
+
+#### Sending data into signoz
+Once you find the `signoz-otel-collector` you'll need to start a port-forward session in
+order to pass data along to it from your local machine. Here are the settings you'll use
+for the port-forward:
+
+Windows/Linux:
+```
+Container Port: collector/otlp:4317,collector/otlp-http:4318
+Local Port: 4317,4318
+```
+
+Mac:
+```
+Container Port: collector::4317,collector::4318
+Local Port: 4317,4318
+```
+
+Some data will be present in those fields by default, delete what is there and copy the
+above data into it.
+
+### Application side
+Once you're connected via a port-forward session the next item is to make sure that the
+application you're sending data from is instrumented with open-telemetry. This is going
+to be application specific so instructions will need to live within the application
+you are using.
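As a sketch of what that last step can look like from a laptop: the application, or a
small local OpenTelemetry Collector relay as shown here, just needs to push OTLP at the
forwarded ports above. Everything in this snippet other than the 4317 port is an
illustrative assumption and is not part of this module:

```yaml
# Hypothetical local relay config for the OpenTelemetry Collector: it receives
# OTLP from the app on 4319 (so it does not collide with the port-forward that
# already occupies 4317 locally) and forwards spans through the tunnel.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 127.0.0.1:4319
exporters:
  otlp:
    endpoint: 127.0.0.1:4317   # forwarded signoz-otel-collector gRPC port
    tls:
      insecure: true           # plain localhost tunnel, no TLS
service:
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [otlp]
```

Applications instrumented with an OpenTelemetry SDK can skip the relay entirely and
point `OTEL_EXPORTER_OTLP_ENDPOINT` at `http://localhost:4317`.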
diff --git a/modules/envoy-gateway/create-synapse-oauth-client.py b/modules/envoy-gateway/create-synapse-oauth-client.py
new file mode 100644
index 00000000..002e3964
--- /dev/null
+++ b/modules/envoy-gateway/create-synapse-oauth-client.py
@@ -0,0 +1,26 @@
+import synapseclient
+import json
+syn = synapseclient.login()
+
+client_meta_data = {
+    'client_name': 'dpe-dev-k8s-cluster',
+    'redirect_uris': [
+        'https://a9a60607095304dec9cd248ef7bd64ea-1681374179.us-east-1.elb.amazonaws.com/testing'
+    ],
+    # 'client_uri': 'https://yourhost.com/index.html',
+    # 'policy_uri': 'https://yourhost.com/policy',
+    # 'tos_uri': 'https://yourhost.com/terms_of_service',
+    'userinfo_signed_response_alg': 'RS256'
+}
+
+# Create the client:
+client_meta_data = syn.restPOST(uri='/oauth2/client',
+                                endpoint=syn.authEndpoint, body=json.dumps(client_meta_data))
+
+client_id = client_meta_data['client_id']
+
+# Generate and retrieve the client secret:
+client_id_and_secret = syn.restPOST(uri='/oauth2/client/secret/'+client_id,
+                                    endpoint=syn.authEndpoint, body='')
+
+print(client_id_and_secret)
diff --git a/modules/envoy-gateway/data.tf b/modules/envoy-gateway/data.tf
new file mode 100644
index 00000000..c3260947
--- /dev/null
+++ b/modules/envoy-gateway/data.tf
@@ -0,0 +1,15 @@
+data "aws_secretsmanager_secret" "oauth-client-id" {
+  name = "dev/dpe-sandbox/client-id"
+}
+
+data "aws_secretsmanager_secret_version" "client-id" {
+  secret_id = data.aws_secretsmanager_secret.oauth-client-id.id
+}
+
+data "aws_secretsmanager_secret" "oauth-client-secret" {
+  name = "dev/dpe-sandbox/client-secret"
+}
+
+data "aws_secretsmanager_secret_version" "client-secret" {
+  secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id
+}
\ No newline at end of file
diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf
new file mode 100644
index 00000000..d48447f0
--- /dev/null
+++ b/modules/envoy-gateway/main.tf
@@ -0,0 +1,40 @@
+
+resource "kubernetes_namespace" "envoy-gateway" {
+  metadata {
+    name = var.namespace
+  }
+}
+
+# TODO: Using kustomize in this fashion prints out the secret in the spacelift UI when terraform is running
+resource "kubectl_manifest" "envoy-gateway" {
+  depends_on = [kubernetes_namespace.envoy-gateway]
+
+  yaml_body = < Date: Mon, 30 Sep 2024 11:31:42 -0700
Subject: [PATCH 17/85] Disable service monitor

---
 modules/cert-manager/templates/values.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/cert-manager/templates/values.yaml b/modules/cert-manager/templates/values.yaml
index c4310fb8..e3a07932 100644 --- a/modules/cert-manager/templates/values.yaml +++ b/modules/cert-manager/templates/values.yaml @@ -466,6 +466,7 @@ enableServiceLinks: false # +docs:section=Prometheus +# TODO: Convert these over to trivy style monitors prometheus: # Enable Prometheus monitoring for the cert-manager controller to use with the # Prometheus Operator. If this option is enabled without enabling `prometheus.servicemonitor.enabled` or @@ -473,7 +474,7 @@ prometheus: # resources. Additionally, a service is created which can be used together # with your own ServiceMonitor (managed outside of this Helm chart). # Otherwise, a ServiceMonitor/ PodMonitor is created. - enabled: true + enabled: false servicemonitor: # Create a ServiceMonitor to add cert-manager to Prometheus. From 8b8be2e7d32fefee8263e010316d5749a6f85e76 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:12:34 -0700 Subject: [PATCH 18/85] Set argocd docker registry oci --- modules/argo-cd/templates/values.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/argo-cd/templates/values.yaml b/modules/argo-cd/templates/values.yaml index 6325f022..cc4daf35 100644 --- a/modules/argo-cd/templates/values.yaml +++ b/modules/argo-cd/templates/values.yaml @@ -501,7 +501,14 @@ configs: # -- Repositories list to be used by applications ## Creates a secret for each key/value specified below to create repositories ## Note: the last example in the list would use a repository credential template, configured under "configs.credentialTemplates". - repositories: {} + repositories: + docker-registry: + url: registry-1.docker.io + # username: "docker" + # password: "" + name: docker-registry + enableOCI: "true" + type: "helm" # istio-helm-repo: # url: https://storage.googleapis.com/istio-prerelease/daily-build/master-latest-daily/charts # name: istio.io From 4bf72d94955ad4658c24383c8b7a753b5391b685 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:15:36 -0700 Subject: [PATCH 19/85] Point to local argo-cd --- deployments/stacks/dpe-k8s-deployments/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5eab6804..f1068174 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -21,8 +21,9 @@ module "sage-aws-eks-addons" { module "argo-cd" { depends_on = [module.sage-aws-eks-autoscaler] - source = "spacelift.io/sagebionetworks/argo-cd/aws" - version = "0.3.1" + # source = "spacelift.io/sagebionetworks/argo-cd/aws" + # version = "0.3.1" + source = "../../../modules/argo-cd" } module "victoria-metrics" { From 4983fb2227cdce8cb7488cdff98052d3e244a586 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:08:04 -0700 Subject: [PATCH 20/85] Deploy dex idp --- .../stacks/dpe-k8s-deployments/main.tf | 12 + modules/dex-idp/README.md | 65 ++++ modules/dex-idp/data.tf | 15 + modules/dex-idp/main.tf | 39 ++ modules/dex-idp/templates/values.yaml | 337 ++++++++++++++++++ modules/dex-idp/variables.tf | 27 ++ modules/dex-idp/versions.tf | 24 ++ 7 files changed, 519 insertions(+) create mode 100644 modules/dex-idp/README.md create mode 100644 modules/dex-idp/data.tf create mode 100644 modules/dex-idp/main.tf create mode 100644 
modules/dex-idp/templates/values.yaml
 create mode 100644 modules/dex-idp/variables.tf
 create mode 100644 modules/dex-idp/versions.tf

diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index f1068174..39b266ae 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -115,4 +115,16 @@ module "cert-manager" {
   git_revision         = var.git_revision
   namespace            = "cert-manager"
   argo_deployment_name = "cert-manager"
+}
+
+module "dex-idp" {
+  depends_on = [module.argo-cd]
+  # source  = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
+  # version = "0.5.0"
+  source = "../../../modules/dex-idp"
+
+  auto_deploy          = var.auto_deploy
+  auto_prune           = var.auto_prune
+  git_revision         = var.git_revision
+  namespace            = "dex-idp"
+  argo_deployment_name = "dex-idp"
 }
\ No newline at end of file
diff --git a/modules/dex-idp/README.md b/modules/dex-idp/README.md
new file mode 100644
index 00000000..1cac8fc7
--- /dev/null
+++ b/modules/dex-idp/README.md
@@ -0,0 +1,65 @@
+# Purpose
+The purpose of this module is to deploy the `Signoz` helm chart.
+
+SigNoz is an open-source APM. It helps developers monitor their applications
+& troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open
+source Application Performance Monitoring (APM) & Observability tool.
+
+
+## This module is a work in progress
+This was hastily thrown together to get a tool available to ingest telemetry data in.
+A number of items are needed:
+
+- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator
+- Setting up backups and data retention
+- Trim down the number of ports available in the service
+- Double check the entire `values.yaml` file
+- Set up accounts and access to the service declaratively
+
+## Accessing signoz
+
+### Pre-req
+This assumes that you have accessed the k8s cluster before using `k9s` or another tool.
+If you have not, read over this documentation:
+
+- 
+- Description of port-forwarding via `k9s`: 
+
+### Connecting to signoz
+After signoz has been deployed to the k8s cluster you will need to port-forward to 2
+pods/services:
+
+- `signoz-frontend`
+- `signoz-otel-collector`
+
+The frontend is how you'll access all of the data contained within signoz. Once you
+port forward and access it via your web-browser you'll need to sign up and log in.
+TODO: The steps on this are not fleshed out, this is going to be a manual step that the
+admin of the server will need to help you with.
+
+
+#### Sending data into signoz
+Once you find the `signoz-otel-collector` you'll need to start a port-forward session in
+order to pass data along to it from your local machine. Here are the settings you'll use
+for the port-forward:
+
+Windows/Linux:
+```
+Container Port: collector/otlp:4317,collector/otlp-http:4318
+Local Port: 4317,4318
+```
+
+Mac:
+```
+Container Port: collector::4317,collector::4318
+Local Port: 4317,4318
+```
+
+Some data will be present in those fields by default, delete what is there and copy the
+above data into it.
+
+### Application side
+Once you're connected via a port-forward session the next item is to make sure that the
+application you're sending data from is instrumented with open-telemetry. This is going
+to be application specific so instructions will need to live within the application
+you are using.
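The `main.tf` for this module (below) renders an Argo CD `Application` with the same
multi-source shape the other modules in this stack use. A minimal sketch of that shape
for dex-idp follows; the chart repository, version, project, and value-file wiring are
assumptions for illustration, not values taken from this patch:

```yaml
# Rough sketch (assumed values) of the Application the dex-idp module creates.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: dex-idp
  namespace: argocd
spec:
  project: default                        # assumed project
  sources:
    - repoURL: https://charts.dexidp.io   # assumed upstream Dex chart repo
      chart: dex
      targetRevision: 0.19.1              # assumed chart version
      helm:
        releaseName: dex
        valueFiles:
          - $values/modules/dex-idp/templates/values.yaml
    - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git'
      targetRevision: main
      ref: values
  destination:
    server: 'https://kubernetes.default.svc'
    namespace: dex-idp
```

The second source exists only so the first can reference this repo's `values.yaml`
through the `$values` ref, mirroring the pattern in the postgres-cloud-native module.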
diff --git a/modules/dex-idp/data.tf b/modules/dex-idp/data.tf
new file mode 100644
index 00000000..c3260947
--- /dev/null
+++ b/modules/dex-idp/data.tf
@@ -0,0 +1,15 @@
+data "aws_secretsmanager_secret" "oauth-client-id" {
+  name = "dev/dpe-sandbox/client-id"
+}
+
+data "aws_secretsmanager_secret_version" "client-id" {
+  secret_id = data.aws_secretsmanager_secret.oauth-client-id.id
+}
+
+data "aws_secretsmanager_secret" "oauth-client-secret" {
+  name = "dev/dpe-sandbox/client-secret"
+}
+
+data "aws_secretsmanager_secret_version" "client-secret" {
+  secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id
+}
\ No newline at end of file
diff --git a/modules/dex-idp/main.tf b/modules/dex-idp/main.tf
new file mode 100644
index 00000000..90ce3e3e
--- /dev/null
+++ b/modules/dex-idp/main.tf
@@ -0,0 +1,39 @@
+
+resource "kubernetes_namespace" "dex" {
+  metadata {
+    name = var.namespace
+  }
+}
+
+resource "kubectl_manifest" "dex" {
+  depends_on = [kubernetes_namespace.dex]
+
+  yaml_body = < Date: Mon, 30 Sep 2024 14:11:44 -0700
Subject: [PATCH 21/85] Correct envoy-gateway chart repo

---
 modules/envoy-gateway/main.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf
index d48447f0..56f7e716 100644
--- a/modules/envoy-gateway/main.tf
+++ b/modules/envoy-gateway/main.tf
@@ -23,8 +23,8 @@ spec:
       prune: ${var.auto_prune}
     %{endif}
   sources:
-    - repoURL: 'oci://docker.io/envoyproxy/gateway-helm'
-      chart: envoyproxy
+    - repoURL: registry-1.docker.io
+      chart: envoyproxy/gateway-helm
       targetRevision: v1.1.2
       helm:
         releaseName: gateway-helm

From e4f3fbb4235a738d00c4bceb65b975ccbbf08b30 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Mon, 30 Sep 2024 14:23:38 -0700
Subject: [PATCH 22/85] Set google connector

---
 modules/dex-idp/templates/values.yaml | 31 +++++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml
index 441a3b6b..30fd2429 100644
--- a/modules/dex-idp/templates/values.yaml
+++ b/modules/dex-idp/templates/values.yaml
@@ -53,7 +53,19 @@ configSecret:
 
 # -- Application configuration.
 # See the [official documentation](https://dexidp.io/docs/).
-config: {}
+config:
+  connectors:
+  - type: google
+    id: google
+    name: Google
+    config:
+
+      # Connector config values starting with a "$" will read from the environment.
+      clientID: $GOOGLE_CLIENT_ID
+      clientSecret: $GOOGLE_CLIENT_SECRET
+
+      # Dex's issuer URL + "/callback"
+      redirectURI: http://127.0.0.1:5556/callback
 
 # -- Additional storage [volumes](https://kubernetes.io/docs/concepts/storage/volumes/).
 # See the [API reference](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#volumes-1) for details.
@@ -73,14 +85,19 @@ env: {}
 
 # -- Similar to env but with support for all possible configurations.
 # See the [API reference](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#environment-variables) for details.
-envVars: [] +envVars: # - name: SOME_ENV_VAR # value: value -# - name: SOME_ENV_VAR2 -# valueFrom: -# secretKeyRef: -# name: secret-name -# key: secret-key +- name: GOOGLE_CLIENT_ID + valueFrom: + secretKeyRef: + name: google-oauth + key: client-id +- name: GOOGLE_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: google-oauth + key: client-secret # - name: SOME_ENV_VAR3 # valueFrom: # configMapKeyRef: From 75a30639947929fc46a234aada2a4818f6826687 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:24:07 -0700 Subject: [PATCH 23/85] Set storage --- modules/dex-idp/templates/values.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml index 30fd2429..2f93fa94 100644 --- a/modules/dex-idp/templates/values.yaml +++ b/modules/dex-idp/templates/values.yaml @@ -54,6 +54,10 @@ configSecret: # -- Application configuration. # See the [official documentation](https://dexidp.io/docs/). config: + storage: + type: kubernetes + config: + inCluster: true connectors: - type: google id: google From 875010b361b55b3d55d752ea6fb9cd622803a8ff Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:30:42 -0700 Subject: [PATCH 24/85] Set issuer --- modules/dex-idp/templates/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml index 2f93fa94..41287528 100644 --- a/modules/dex-idp/templates/values.yaml +++ b/modules/dex-idp/templates/values.yaml @@ -54,6 +54,7 @@ configSecret: # -- Application configuration. # See the [official documentation](https://dexidp.io/docs/). config: + issuer: http://a471b28f92d654dba8455f1712544444-1736471731.us-east-1.elb.amazonaws.com storage: type: kubernetes config: From 41e914721f540ce6cde254ef78f0a1c70ba1abf8 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:47:16 -0700 Subject: [PATCH 25/85] Deploy DB for dex --- deployments/stacks/dpe-k8s-deployments/main.tf | 15 ++++++++++++++- modules/postgres-cloud-native/main.tf | 2 ++ modules/postgres-cloud-native/variables.tf | 5 +++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 39b266ae..bfa835bc 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -127,4 +127,17 @@ module "dex-idp" { git_revision = var.git_revision namespace = "dex-idp" argo_deployment_name = "dex-idp" -} \ No newline at end of file +} + +module "dex-idp-postgres-db" { + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" + # version = "0.5.0" + source = "../../../modules/postgres-cloud-native" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_revision + deploy_pooler = false + namespace = "dex-idp" + argo_deployment_name = "dex-idp-database" +} diff --git a/modules/postgres-cloud-native/main.tf b/modules/postgres-cloud-native/main.tf index 25a654a3..6be5d1df 100644 --- a/modules/postgres-cloud-native/main.tf +++ b/modules/postgres-cloud-native/main.tf @@ -28,6 +28,7 @@ spec: - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' targetRevision: ${var.git_revision} ref: values + %{if var.deploy_pooler} - repoURL: 
'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' targetRevision: ${var.git_revision} path: modules/postgres-cloud-native/resources @@ -42,6 +43,7 @@ spec: - op: replace path: /metadata/name value: ${var.argo_deployment_name}-pooler-rw + %{endif} destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} diff --git a/modules/postgres-cloud-native/variables.tf b/modules/postgres-cloud-native/variables.tf index 770d80bd..1fe16d96 100644 --- a/modules/postgres-cloud-native/variables.tf +++ b/modules/postgres-cloud-native/variables.tf @@ -25,3 +25,8 @@ variable "namespace" { description = "The namespace to deploy into" type = string } + +variable "deploy_pooler" { + description = "Deploy the connection pooler" + type = bool +} From 88144118575cd380fec80fefef2e9a7f7aac526f Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 14:52:58 -0700 Subject: [PATCH 26/85] Deploy DB operator --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 -- 1 file changed, 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index bfa835bc..4237c4d7 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -57,8 +57,6 @@ module "airflow" { } module "postgres-cloud-native-operator" { - # TODO: This is temporary - count = 0 depends_on = [module.argo-cd] source = "spacelift.io/sagebionetworks/postgres-cloud-native-operator/aws" version = "0.4.0" From 0eaafed3b77bec8f02fc69beb1c17c7a90994662 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:01:09 -0700 Subject: [PATCH 27/85] Set dex to use postgres --- modules/dex-idp/templates/values.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml index 41287528..c7fbdb26 100644 --- a/modules/dex-idp/templates/values.yaml +++ b/modules/dex-idp/templates/values.yaml @@ -56,9 +56,13 @@ configSecret: config: issuer: http://a471b28f92d654dba8455f1712544444-1736471731.us-east-1.elb.amazonaws.com storage: - type: kubernetes + type: postgres config: - inCluster: true + host: dex-idp-database-cluster-rw.dex-idp + port: 5432 + database: application-database + user: $DB_USER + password: $DB_PASSWORD connectors: - type: google id: google @@ -103,6 +107,16 @@ envVars: secretKeyRef: name: google-oauth key: client-secret +- name: DB_USER + valueFrom: + secretKeyRef: + name: pg-user-secret + key: username +- name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: pg-user-secret + key: password # - name: SOME_ENV_VAR3 # valueFrom: # configMapKeyRef: From 6c828063c1a3735c4b25a8e03f4dcf927ca935be Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:02:40 -0700 Subject: [PATCH 28/85] Disable ssl --- modules/dex-idp/templates/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml index c7fbdb26..78c731ec 100644 --- a/modules/dex-idp/templates/values.yaml +++ b/modules/dex-idp/templates/values.yaml @@ -63,6 +63,8 @@ config: database: application-database user: $DB_USER password: $DB_PASSWORD + ssl: + mode: disable connectors: - type: google id: google From 74c83f39eeee8053abfbdd5cdc55459321990854 Mon Sep 17 00:00:00 2001 From: BryanFauble 
<17128019+BryanFauble@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:03:44 -0700 Subject: [PATCH 29/85] ssl --- modules/dex-idp/templates/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/dex-idp/templates/values.yaml b/modules/dex-idp/templates/values.yaml index 78c731ec..1a7c9255 100644 --- a/modules/dex-idp/templates/values.yaml +++ b/modules/dex-idp/templates/values.yaml @@ -63,8 +63,8 @@ config: database: application-database user: $DB_USER password: $DB_PASSWORD - ssl: - mode: disable + ssl: + mode: disable connectors: - type: google id: google From d4fae1fb3aaf1f98cfd17fa2243a26e4dc833d6d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:32:27 -0700 Subject: [PATCH 30/85] Enable cert-manager gateway api support --- modules/cert-manager/templates/values.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/cert-manager/templates/values.yaml b/modules/cert-manager/templates/values.yaml index e3a07932..633029c0 100644 --- a/modules/cert-manager/templates/values.yaml +++ b/modules/cert-manager/templates/values.yaml @@ -229,7 +229,10 @@ enableCertificateOwnerRef: false # - cert-manager-metrics # - cert-manager-metrics.cert-manager # - cert-manager-metrics.cert-manager.svc -config: {} +config: + apiVersion: controller.config.cert-manager.io/v1alpha1 + kind: ControllerConfiguration + enableGatewayAPI: true # Setting Nameservers for DNS01 Self Check. # For more information, see the [cert-manager documentation](https://cert-manager.io/docs/configuration/acme/dns01/#setting-nameservers-for-dns01-self-check). From 1dfa08a4d7099e78824c8d4527147b10f89d8b76 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:08:24 -0700 Subject: [PATCH 31/85] Deploy out ingress --- .gitignore | 3 +- deployments/main.tf | 32 +- .../stacks/dpe-k8s-deployments/main.tf | 43 +- modules/apache-airflow/templates/values.yaml | 4 +- modules/cert-manager/README.md | 9 +- modules/cluster-ingress/README.md | 49 +++ modules/cluster-ingress/main.tf | 40 ++ .../resources/cert-issuer.yaml | 25 ++ .../resources/gateway-class.yaml | 6 + .../cluster-ingress/resources/gateway.yaml | 21 + .../resources/kustomization.yaml | 3 +- .../{dex-idp => cluster-ingress}/variables.tf | 10 + .../versions.tf | 8 - modules/dex-idp/README.md | 65 --- modules/dex-idp/data.tf | 15 - modules/dex-idp/main.tf | 39 -- modules/dex-idp/templates/values.yaml | 375 ------------------ modules/dex-idp/versions.tf | 24 -- modules/envoy-gateway/README.md | 112 +++--- .../create-synapse-oauth-client.py | 4 +- modules/envoy-gateway/data.tf | 15 - modules/kong-ingress/README.md | 65 --- .../create-synapse-oauth-client.py | 26 -- modules/kong-ingress/data.tf | 15 - modules/kong-ingress/main.tf | 54 --- .../resources/openid-connect-plugin.yaml | 20 - modules/kong-ingress/templates/values.yaml | 49 --- modules/kong-ingress/variables.tf | 27 -- modules/postgres-cloud-native/main.tf | 2 +- modules/signoz/templates/values.yaml | 5 +- 30 files changed, 251 insertions(+), 914 deletions(-) create mode 100644 modules/cluster-ingress/README.md create mode 100644 modules/cluster-ingress/main.tf create mode 100644 modules/cluster-ingress/resources/cert-issuer.yaml create mode 100644 modules/cluster-ingress/resources/gateway-class.yaml create mode 100644 modules/cluster-ingress/resources/gateway.yaml rename modules/{kong-ingress => cluster-ingress}/resources/kustomization.yaml 
(68%) rename modules/{dex-idp => cluster-ingress}/variables.tf (72%) rename modules/{kong-ingress => cluster-ingress}/versions.tf (62%) delete mode 100644 modules/dex-idp/README.md delete mode 100644 modules/dex-idp/data.tf delete mode 100644 modules/dex-idp/main.tf delete mode 100644 modules/dex-idp/templates/values.yaml delete mode 100644 modules/dex-idp/versions.tf delete mode 100644 modules/envoy-gateway/data.tf delete mode 100644 modules/kong-ingress/README.md delete mode 100644 modules/kong-ingress/create-synapse-oauth-client.py delete mode 100644 modules/kong-ingress/data.tf delete mode 100644 modules/kong-ingress/main.tf delete mode 100644 modules/kong-ingress/resources/openid-connect-plugin.yaml delete mode 100644 modules/kong-ingress/templates/values.yaml delete mode 100644 modules/kong-ingress/variables.tf diff --git a/.gitignore b/.gitignore index 71bcd867..0c5afbfa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.tfstate* .terraform terraform.tfvars -settings.json \ No newline at end of file +settings.json +temp* \ No newline at end of file diff --git a/deployments/main.tf b/deployments/main.tf index 3717ec59..bea46a57 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -37,18 +37,10 @@ module "dpe-sandbox-spacelift-development" { cluster_name = "dpe-k8-sandbox" vpc_name = "dpe-sandbox" - vpc_cidr_block = "10.51.0.0/16" - # public_subnet_cidrs = ["10.51.1.0/24", "10.51.2.0/24", "10.51.3.0/24"] - # private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24", "10.51.6.0/24"] - # azs = ["us-east-1a", "us-east-1b", "us-east-1c"] - # For now, we are only using one public and one private subnet. This is due to how - # EBS can only be mounted to a single AZ. We will need to revisit this if we want to - # allow usage of EFS ($$$$), or add some kind of EBS volume replication. - # Note: EKS requires at least two subnets in different AZs. However, we are only using - # a single subnet for node deployment. - public_subnet_cidrs = ["10.51.1.0/24", "10.51.2.0/24"] - private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24"] - azs = ["us-east-1a", "us-east-1b"] + vpc_cidr_block = "10.51.0.0/16" + public_subnet_cidrs = ["10.51.1.0/24", "10.51.2.0/24", "10.51.3.0/24"] + private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24", "10.51.6.0/24"] + azs = ["us-east-1a", "us-east-1b", "us-east-1c"] } module "dpe-sandbox-spacelift-production" { @@ -75,16 +67,8 @@ module "dpe-sandbox-spacelift-production" { cluster_name = "dpe-k8" vpc_name = "dpe-k8" - vpc_cidr_block = "10.52.0.0/16" - # public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24", "10.52.3.0/24"] - # private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24", "10.52.6.0/24"] - # azs = ["us-east-1a", "us-east-1b", "us-east-1c"] - # For now, we are only using one public and one private subnet. This is due to how - # EBS can only be mounted to a single AZ. We will need to revisit this if we want to - # allow usage of EFS ($$$$), or add some kind of EBS volume replication. - # Note: EKS requires at least two subnets in different AZs. However, we are only using - # a single subnet for node deployment. 
- public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24"] - private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24"] - azs = ["us-east-1a", "us-east-1b"] + vpc_cidr_block = "10.52.0.0/16" + public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24", "10.52.3.0/24"] + private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24", "10.52.6.0/24"] + azs = ["us-east-1a", "us-east-1b", "us-east-1c"] } diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 4237c4d7..8aa80435 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -6,7 +6,7 @@ module "sage-aws-eks-autoscaler" { vpc_id = var.vpc_id node_security_group_id = var.node_security_group_id spotinst_account = var.spotinst_account - single_az = true + single_az = false desired_capacity = 3 } @@ -46,7 +46,7 @@ module "trivy-operator" { module "airflow" { # TODO: This is temporary - count = 0 + count = 0 depends_on = [module.victoria-metrics, module.argo-cd] source = "spacelift.io/sagebionetworks/airflow/aws" version = "0.4.0" @@ -67,7 +67,7 @@ module "postgres-cloud-native-operator" { module "postgres-cloud-native-database" { # TODO: This is temporary - count = 0 + count = 0 depends_on = [module.postgres-cloud-native-operator, module.airflow, module.argo-cd] source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" version = "0.5.0" @@ -80,10 +80,10 @@ module "postgres-cloud-native-database" { module "signoz" { - depends_on = [module.argo-cd] + depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/signoz" + source = "../../../modules/signoz" auto_deploy = var.auto_deploy auto_prune = var.auto_prune git_revision = var.git_revision @@ -92,10 +92,10 @@ module "signoz" { } module "envoy-gateway" { - depends_on = [module.argo-cd] + depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/envoy-gateway" + source = "../../../modules/envoy-gateway" auto_deploy = var.auto_deploy auto_prune = var.auto_prune git_revision = var.git_revision @@ -104,10 +104,10 @@ module "envoy-gateway" { } module "cert-manager" { - depends_on = [module.argo-cd] + depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/cert-manager" + source = "../../../modules/cert-manager" auto_deploy = var.auto_deploy auto_prune = var.auto_prune git_revision = var.git_revision @@ -115,27 +115,18 @@ module "cert-manager" { argo_deployment_name = "cert-manager" } -module "dex-idp" { - depends_on = [module.argo-cd] +module "cluster-ingress" { + depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/dex-idp" + source = "../../../modules/cluster-ingress" auto_deploy = var.auto_deploy auto_prune = var.auto_prune git_revision = var.git_revision - namespace = "dex-idp" - argo_deployment_name = "dex-idp" -} + namespace = "envoy-gateway" + argo_deployment_name = "cluster-ingress" -module "dex-idp-postgres-db" { - depends_on = [module.argo-cd] - # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" - # version = "0.5.0" - source = "../../../modules/postgres-cloud-native" - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = 
var.git_revision
-  deploy_pooler        = false
-  namespace            = "dex-idp"
-  argo_deployment_name = "dex-idp-database"
+
+  # To determine more elegant ways to fill in these values
+  ssl_hostname        = "unknown-to-fill-in"
+  cluster_issuer_name = "selfsigned"
 }
diff --git a/modules/apache-airflow/templates/values.yaml b/modules/apache-airflow/templates/values.yaml
index 04832b57..640a4d92 100644
--- a/modules/apache-airflow/templates/values.yaml
+++ b/modules/apache-airflow/templates/values.yaml
@@ -108,7 +108,9 @@ images:
     pullPolicy: IfNotPresent
 
 # Select certain nodes for airflow pods.
-nodeSelector: {}
+nodeSelector: {
+  failure-domain.beta.kubernetes.io/zone: us-east-1a
+}
 affinity: {}
 tolerations: []
 topologySpreadConstraints: []
diff --git a/modules/cert-manager/README.md b/modules/cert-manager/README.md
index b47a4bd9..30b7eb65 100644
--- a/modules/cert-manager/README.md
+++ b/modules/cert-manager/README.md
@@ -1,6 +1,13 @@
 # Purpose
-This module is used to deploy the cert-manager helm chart
+This module is used to deploy the cert-manager helm chart. cert-manager is responsible
+for creating SSL certs to use within the cluster.
 
 Resources:
 
 - 
+
+## Relation to envoy-gateway
+The envoy-gateway is responsible for handling ingress for the kubernetes cluster.
+cert-manager has an integration to watch for changes to `kind: Gateway` resources to
+determine when to provision SSL certs. This integration is in the `values.yaml` file
+of this directory under `kind: ControllerConfiguration`.
diff --git a/modules/cluster-ingress/README.md b/modules/cluster-ingress/README.md
new file mode 100644
index 00000000..8df21df2
--- /dev/null
+++ b/modules/cluster-ingress/README.md
@@ -0,0 +1,49 @@
+# Purpose
+The purpose of this module is to deploy kubernetes resources related to ingress for
+the cluster. Along with the ingress we will also deploy out the related SSL cert issuer.
+
+## To implement
+The Envoy Gateway can secure ingress by verifying JWT. It can be applied to a specific
+target; for example, this applies it to an HTTPRoute called `backend`:
+
+```
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: SecurityPolicy
+metadata:
+  name: jwt-example
+spec:
+  targetRef:
+    group: gateway.networking.k8s.io
+    kind: HTTPRoute
+    name: backend
+  jwt:
+    providers:
+    - name: auth0
+      remoteJWKS:
+        uri: https://dev-57n3awu5je6q653y.us.auth0.com/.well-known/jwks.json
+```
+
+
+The HTTPRoute is used to connect the envoy gateway ingress to a service in the cluster.
+In this example the path `/get` routes the request to a service called `backend` on
+port 3000.
+```
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: backend
+spec:
+  parentRefs:
+    - name: eg
+  rules:
+    - backendRefs:
+        - group: ""
+          kind: Service
+          name: backend
+          port: 3000
+          weight: 1
+      matches:
+        - path:
+            type: PathPrefix
+            value: /get
+```
\ No newline at end of file
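The cert issuer mentioned above is committed as `resources/cert-issuer.yaml` alongside
the gateway. A minimal sketch of a self-signed `ClusterIssuer` whose name matches the
`selfsigned` value wired through `cluster_issuer_name`; the committed manifest may
differ in detail:

```yaml
# Sketch of a self-signed ClusterIssuer; the name matches cluster_issuer_name above.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: selfsigned
spec:
  selfSigned: {}
```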
diff --git a/modules/cluster-ingress/main.tf b/modules/cluster-ingress/main.tf
new file mode 100644
index 00000000..36046e25
--- /dev/null
+++ b/modules/cluster-ingress/main.tf
@@ -0,0 +1,40 @@
+resource "kubectl_manifest" "cluster-ingress" {
+  yaml_body = <
+spec:
+  gatewayClassName: eg
+  listeners:
+    - name: https
+      protocol: HTTPS
+      hostname: 
+      port: 443
+      tls:
+        mode: Terminate
+        certificateRefs:
+          - kind: Secret
+            name: eg-https
+    - name: http
+      protocol: HTTP
+      port: 80
\ No newline at end of file
diff --git a/modules/kong-ingress/resources/kustomization.yaml b/modules/cluster-ingress/resources/kustomization.yaml
similarity index 68%
rename from modules/kong-ingress/resources/kustomization.yaml
rename to modules/cluster-ingress/resources/kustomization.yaml
index 119ff612..c66218bf 100644
--- a/modules/kong-ingress/resources/kustomization.yaml
+++ b/modules/cluster-ingress/resources/kustomization.yaml
@@ -1,4 +1,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- openid-connect-plugin.yaml
+- gateway.yaml
+- cert-issuer.yaml
diff --git a/modules/dex-idp/variables.tf b/modules/cluster-ingress/variables.tf
similarity index 72%
rename from modules/dex-idp/variables.tf
rename to modules/cluster-ingress/variables.tf
index 770d80bd..03f078cd 100644
--- a/modules/dex-idp/variables.tf
+++ b/modules/cluster-ingress/variables.tf
@@ -25,3 +25,13 @@ variable "namespace" {
   description = "The namespace to deploy into"
   type        = string
 }
+
+variable "cluster_issuer_name" {
+  description = "The name of the cluster issuer"
+  type        = string
+}
+
+variable "ssl_hostname" {
+  description = "The hostname to use for the SSL certificate"
+  type        = string
+}
diff --git a/modules/kong-ingress/versions.tf b/modules/cluster-ingress/versions.tf
similarity index 62%
rename from modules/kong-ingress/versions.tf
rename to modules/cluster-ingress/versions.tf
index 28b5ab89..c35c044f 100644
--- a/modules/kong-ingress/versions.tf
+++ b/modules/cluster-ingress/versions.tf
@@ -13,12 +13,4 @@ terraform {
       version = "1.14.0"
     }
   }
-  # TODO: Move to this provider
-  # required_providers {
-  #   argocd = {
-  #     source  = "oboukili/argocd"
-  #     version = "6.1.1"
-  #   }
-  # }
 }
-
diff --git a/modules/dex-idp/README.md b/modules/dex-idp/README.md
deleted file mode 100644
index 1cac8fc7..00000000
--- a/modules/dex-idp/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Purpose
-The purpose of this module is to deploy the `Signoz` helm chart.
-
-SigNoz is an open-source APM. It helps developers monitor their applications
-& troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open
-source Application Performance Monitoring (APM) & Observability tool.
-
-
-## This module is a work in progress
-This was hastily thrown together to get a tool available to ingest telemetry data in.
-A number of items are needed: - -- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator -- Setting up backups and data retention -- Trim down the number of ports available in the service -- Double check the entire `values.yaml` file -- Set up accounts and access to the service decleratively - -## Accessing signoz - -### Pre-req -This assumes that you have accessed the k8s cluster before using `k9s` or another tool. -If you have not, read over this documentation: - -- -- Description of port-forwarding via `k9s`: - -### Connecting to signoz -After signoz has been deployed to the k8s cluster you will need to port-forward to 2 -pods/services: - -- `signoz-frontend` -- `signoz-otel-collector` - -The frontend is how you'll access all of the data contained within signoz. Once you -port forward and access it via your web-browser you'll need to signup and login. -TODO: The steps on this are not fleshed out, this is going to be a manual step that the -admin of the server will need to help you with. - - -#### Sending data into signoz -Once you find the `signoz-otel-collector` you'll need to start a port-forward session in -order to pass data along to it from your local machine. Here are the settings you'll use -for the port-forward: - -Windows/Linux: -``` -Container Port: collector/otlp:4317,collector/otlp-http:4318 -Local Port: 4317,4318 -``` - -Mac: -``` -Container Port: collector::4317,collector::4318 -Local Port: 4317,4318 -``` - -Some data will be present in those fields by default, delete was is there and copy the -above data into it. - -### Application side -Once you're connected via a port-forward session the next item is to make sure that the -application you're sending data from is instrumented with open-telemetry. This is going -to be application specific so instructions will need to live within the application -you are using. diff --git a/modules/dex-idp/data.tf b/modules/dex-idp/data.tf deleted file mode 100644 index c3260947..00000000 --- a/modules/dex-idp/data.tf +++ /dev/null @@ -1,15 +0,0 @@ -data "aws_secretsmanager_secret" "oauth-client-id" { - name = "dev/dpe-sandbox/client-id" -} - -data "aws_secretsmanager_secret_version" "client-id" { - secret_id = data.aws_secretsmanager_secret.oauth-client-id.id -} - -data "aws_secretsmanager_secret" "oauth-client-secret" { - name = "dev/dpe-sandbox/client-secret" -} - -data "aws_secretsmanager_secret_version" "client-secret" { - secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id -} \ No newline at end of file diff --git a/modules/dex-idp/main.tf b/modules/dex-idp/main.tf deleted file mode 100644 index 90ce3e3e..00000000 --- a/modules/dex-idp/main.tf +++ /dev/null @@ -1,39 +0,0 @@ - -resource "kubernetes_namespace" "dex" { - metadata { - name = var.namespace - } -} - -resource "kubectl_manifest" "dex" { - depends_on = [kubernetes_namespace.dex] - - yaml_body = <. +Create/handle ingress for the kubernetes cluster -SigNoz is an open-source APM. It helps developers monitor their applications -& troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open -source Application Performance Monitoring (APM) & Observability tool. +# Integration with Auth0 +Auth0 handles provisioning JWT to authenticate with the envoy gateway. -## This module is a work in progress -This was hastly thrown together to get a tool available to ingest telemetry data in. 
-A number of items are needed: +## Creating credential: +`openssl genrsa -out test_key.pem 2048` +`openssl rsa -in test_key.pem -outform PEM -pubout -out test_key.pem.pub` -- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator -- Setting up backups and data retention -- Trim down the number of ports available in the service -- Double check the entire `values.yaml` file -- Set up accounts and access to the service decleratively -## Accessing signoz -### Pre-req -This assumes that you have accessed the k8s cluster before using `k9s` or another tool. -If you have not, read over this documentation: - -- -- Description of port-forwarding via `k9s`: - -### Connecting to signoz -After signoz has been deployed to the k8s cluster you will need to port-forward to 2 -pods/services: - -- `signoz-frontend` -- `signoz-otel-collector` - -The frontend is how you'll access all of the data contained within signoz. Once you -port forward and access it via your web-browser you'll need to signup and login. -TODO: The steps on this are not fleshed out, this is going to be a manual step that the -admin of the server will need to help you with. - - -#### Sending data into signoz -Once you find the `signoz-otel-collector` you'll need to start a port-forward session in -order to pass data along to it from your local machine. Here are the settings you'll use -for the port-forward: - -Windows/Linux: +Creating gateway resources: ``` -Container Port: collector/otlp:4317,collector/otlp-http:4318 -Local Port: 4317,4318 -``` - -Mac: -``` -Container Port: collector::4317,collector::4318 -Local Port: 4317,4318 -``` - -Some data will be present in those fields by default, delete was is there and copy the -above data into it. - -### Application side -Once you're connected via a port-forward session the next item is to make sure that the -application you're sending data from is instrumented with open-telemetry. This is going -to be application specific so instructions will need to live within the application -you are using. 
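# Editorial annotation, not part of the original commit: the three manifests below
# chain together. The GatewayClass binds Gateways to the Envoy Gateway controller;
# the ACME ClusterIssuer solves HTTP-01 challenges by attaching a temporary route
# to the Gateway named `eg` (the `gatewayHTTPRoute` solver); and cert-manager's
# Gateway integration reads the `cert-manager.io/cluster-issuer` annotation on the
# Gateway to issue a certificate into the `eg-https` Secret referenced by the HTTPS
# listener. This assumes cert-manager's Gateway API support is enabled, as described
# in the cert-manager module README.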
+apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: eg +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + server: https://acme-staging-v02.api.letsencrypt.org/directory + email: "bryan.fauble@sagebase.org" + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + gatewayHTTPRoute: + parentRefs: + - kind: Gateway + name: eg + namespace: envoy-gateway +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: eg + annotations: + cert-manager.io/cluster-issuer: letsencrypt-staging +spec: + gatewayClassName: eg + listeners: + - name: https + protocol: HTTPS + hostname: aff3f8141f88b4f958400fc7bab55329-385678462.us-east-1.elb.amazonaws.com + port: 443 + tls: + mode: Terminate + certificateRefs: + - kind: Secret + name: eg-https + - name: http + protocol: HTTP + port: 80 + +``` \ No newline at end of file diff --git a/modules/envoy-gateway/create-synapse-oauth-client.py b/modules/envoy-gateway/create-synapse-oauth-client.py index 002e3964..2f0b23c5 100644 --- a/modules/envoy-gateway/create-synapse-oauth-client.py +++ b/modules/envoy-gateway/create-synapse-oauth-client.py @@ -3,9 +3,9 @@ syn = synapseclient.login() client_meta_data = { - 'client_name': 'dpe-dev-k8s-cluster', + 'client_name': '', 'redirect_uris': [ - 'https://a9a60607095304dec9cd248ef7bd64ea-1681374179.us-east-1.elb.amazonaws.com/testing' + '' ], # 'client_uri': 'https://yourhost.com/index.html', # 'policy_uri': 'https://yourhost.com/policy', diff --git a/modules/envoy-gateway/data.tf b/modules/envoy-gateway/data.tf deleted file mode 100644 index c3260947..00000000 --- a/modules/envoy-gateway/data.tf +++ /dev/null @@ -1,15 +0,0 @@ -data "aws_secretsmanager_secret" "oauth-client-id" { - name = "dev/dpe-sandbox/client-id" -} - -data "aws_secretsmanager_secret_version" "client-id" { - secret_id = data.aws_secretsmanager_secret.oauth-client-id.id -} - -data "aws_secretsmanager_secret" "oauth-client-secret" { - name = "dev/dpe-sandbox/client-secret" -} - -data "aws_secretsmanager_secret_version" "client-secret" { - secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id -} \ No newline at end of file diff --git a/modules/kong-ingress/README.md b/modules/kong-ingress/README.md deleted file mode 100644 index 1cac8fc7..00000000 --- a/modules/kong-ingress/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# Purpose -The purpose of this module is to deploy the `Signoz` helm chart . - -SigNoz is an open-source APM. It helps developers monitor their applications -& troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open -source Application Performance Monitoring (APM) & Observability tool. - - -## This module is a work in progress -This was hastly thrown together to get a tool available to ingest telemetry data in. -A number of items are needed: - -- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator -- Setting up backups and data retention -- Trim down the number of ports available in the service -- Double check the entire `values.yaml` file -- Set up accounts and access to the service decleratively - -## Accessing signoz - -### Pre-req -This assumes that you have accessed the k8s cluster before using `k9s` or another tool. 
-If you have not, read over this documentation: - -- -- Description of port-forwarding via `k9s`: - -### Connecting to signoz -After signoz has been deployed to the k8s cluster you will need to port-forward to 2 -pods/services: - -- `signoz-frontend` -- `signoz-otel-collector` - -The frontend is how you'll access all of the data contained within signoz. Once you -port forward and access it via your web-browser you'll need to signup and login. -TODO: The steps on this are not fleshed out, this is going to be a manual step that the -admin of the server will need to help you with. - - -#### Sending data into signoz -Once you find the `signoz-otel-collector` you'll need to start a port-forward session in -order to pass data along to it from your local machine. Here are the settings you'll use -for the port-forward: - -Windows/Linux: -``` -Container Port: collector/otlp:4317,collector/otlp-http:4318 -Local Port: 4317,4318 -``` - -Mac: -``` -Container Port: collector::4317,collector::4318 -Local Port: 4317,4318 -``` - -Some data will be present in those fields by default, delete was is there and copy the -above data into it. - -### Application side -Once you're connected via a port-forward session the next item is to make sure that the -application you're sending data from is instrumented with open-telemetry. This is going -to be application specific so instructions will need to live within the application -you are using. diff --git a/modules/kong-ingress/create-synapse-oauth-client.py b/modules/kong-ingress/create-synapse-oauth-client.py deleted file mode 100644 index 002e3964..00000000 --- a/modules/kong-ingress/create-synapse-oauth-client.py +++ /dev/null @@ -1,26 +0,0 @@ -import synapseclient -import json -syn = synapseclient.login() - -client_meta_data = { - 'client_name': 'dpe-dev-k8s-cluster', - 'redirect_uris': [ - 'https://a9a60607095304dec9cd248ef7bd64ea-1681374179.us-east-1.elb.amazonaws.com/testing' - ], - # 'client_uri': 'https://yourhost.com/index.html', - # 'policy_uri': 'https://yourhost.com/policy', - # 'tos_uri': 'https://yourhost.com/terms_of_service', - 'userinfo_signed_response_alg': 'RS256' -} - -# Create the client: -client_meta_data = syn.restPOST(uri='/oauth2/client', - endpoint=syn.authEndpoint, body=json.dumps(client_meta_data)) - -client_id = client_meta_data['client_id'] - -# Generate and retrieve the client secret: -client_id_and_secret = syn.restPOST(uri='/oauth2/client/secret/'+client_id, - endpoint=syn.authEndpoint, body='') - -print(client_id_and_secret) diff --git a/modules/kong-ingress/data.tf b/modules/kong-ingress/data.tf deleted file mode 100644 index c3260947..00000000 --- a/modules/kong-ingress/data.tf +++ /dev/null @@ -1,15 +0,0 @@ -data "aws_secretsmanager_secret" "oauth-client-id" { - name = "dev/dpe-sandbox/client-id" -} - -data "aws_secretsmanager_secret_version" "client-id" { - secret_id = data.aws_secretsmanager_secret.oauth-client-id.id -} - -data "aws_secretsmanager_secret" "oauth-client-secret" { - name = "dev/dpe-sandbox/client-secret" -} - -data "aws_secretsmanager_secret_version" "client-secret" { - secret_id = data.aws_secretsmanager_secret.oauth-client-secret.id -} \ No newline at end of file diff --git a/modules/kong-ingress/main.tf b/modules/kong-ingress/main.tf deleted file mode 100644 index 3db3437c..00000000 --- a/modules/kong-ingress/main.tf +++ /dev/null @@ -1,54 +0,0 @@ - -resource "kubernetes_namespace" "kong-ingress" { - metadata { - name = var.namespace - } -} - -# TODO: Using kustomize in this fashion prints out the secret in 
the spacelift UI when terraform is running -resource "kubectl_manifest" "kong-ingress" { - depends_on = [kubernetes_namespace.kong-ingress] - - yaml_body = <" - client_secret: - - "" -# session_secret: "" - response_mode: form_post -plugin: openid-connect \ No newline at end of file diff --git a/modules/kong-ingress/templates/values.yaml b/modules/kong-ingress/templates/values.yaml deleted file mode 100644 index 4a63ab29..00000000 --- a/modules/kong-ingress/templates/values.yaml +++ /dev/null @@ -1,49 +0,0 @@ -deployment: - test: - enabled: false - -controller: - proxy: - nameOverride: "{{ .Release.Name }}-gateway-proxy" - - enabled: true - - deployment: - kong: - enabled: false - - ingressController: - enabled: true - - gatewayDiscovery: - enabled: true - generateAdminApiService: true - - podAnnotations: - kuma.io/gateway: enabled - # This port must match your Kong admin API port. 8444 is the default. - # If you set gateway.admin.tls.containerPort, change these annotations - # to use that value. - traffic.kuma.io/exclude-outbound-ports: "8444" - traffic.sidecar.istio.io/excludeOutboundPorts: "8444" - -gateway: - enabled: true - deployment: - kong: - enabled: true - - admin: - enabled: true - type: ClusterIP - clusterIP: None - - ingressController: - enabled: false - - env: - role: traditional - database: "off" - plugins: bundled,openid-connect - - diff --git a/modules/kong-ingress/variables.tf b/modules/kong-ingress/variables.tf deleted file mode 100644 index 770d80bd..00000000 --- a/modules/kong-ingress/variables.tf +++ /dev/null @@ -1,27 +0,0 @@ -variable "auto_deploy" { - description = "Auto deploy through ArgoCD" - type = bool - default = false -} - -variable "auto_prune" { - description = "Auto prune through ArgoCD" - type = bool - default = false -} - -variable "git_revision" { - description = "The git revision to deploy" - type = string - default = "main" -} - -variable "argo_deployment_name" { - description = "The name of the ArgoCD deployment, must be globally unique" - type = string -} - -variable "namespace" { - description = "The namespace to deploy into" - type = string -} diff --git a/modules/postgres-cloud-native/main.tf b/modules/postgres-cloud-native/main.tf index 6be5d1df..8f3c909d 100644 --- a/modules/postgres-cloud-native/main.tf +++ b/modules/postgres-cloud-native/main.tf @@ -67,7 +67,7 @@ resource "kubernetes_secret" "connection-secret" { type = "kubernetes.io/basic-auth" - + # TODO: Need to provide an updated connection is not using the pooler data = { "dbname" = "application-database" "host" = "${var.argo_deployment_name}-pooler-rw.${var.namespace}" diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index 5dc9b966..ca3a46e1 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -307,7 +307,7 @@ clickhouse: # layout: shardsCount: 1 - replicasCount: 1 + replicasCount: 2 # -- ClickHouse settings configuration. # You can use this to override settings, for example `prometheus/port: 9363` @@ -346,6 +346,7 @@ clickhouse: # number: 2 # topologyKey: kubernetes.io/hostname + # TODO: Enable cold storage: https://sagebionetworks.jira.com/browse/IBCDPE-1094 # Cold storage configuration coldStorage: # -- Whether to enable S3 cold storage @@ -566,8 +567,6 @@ clickhouse: ## External clickhouse configuration ## This is required when clickhouse.enabled is false -## -# TODO: Implement external clickhouse configuration externalClickhouse: # -- Host of the external cluster. 
host: From 8dfedba1b6f4bd63067c120124a31a89eac8e66a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:19:29 -0700 Subject: [PATCH 32/85] Try out on-demand node lifecycle --- .gitignore | 2 +- modules/signoz/templates/values.yaml | 16 +++++++++++----- modules/victoria-metrics/templates/values.yaml | 6 ++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0c5afbfa..8336c48f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ .terraform terraform.tfvars settings.json -temp* \ No newline at end of file +temporary_files* \ No newline at end of file diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index ca3a46e1..5eb748e1 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -183,7 +183,9 @@ clickhouse: # port: 2181 # -- Node selector for settings for clickhouse pod - nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } # -- Toleration labels for clickhouse pod assignment tolerations: [] # -- Affinity settings for clickhouse pod @@ -1131,7 +1133,9 @@ alertmanager: # -- Alertmanager priority class name priorityClassName: "" # -- Node selector for settings for Alertmanager pod - nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } # -- Toleration labels for Alertmanager pod assignment tolerations: [] # -- Affinity settings for Alertmanager pod @@ -1709,7 +1713,9 @@ otelCollector: # -- OtelCollector priority class name priorityClassName: "" # -- Node selector for settings for OtelCollector pod - nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } # -- Toleration labels for OtelCollector pod assignment tolerations: [] # -- Affinity settings for OtelCollector pod @@ -1735,8 +1741,8 @@ otelCollector: # runAsUser: 1000 autoscaling: - enabled: false - minReplicas: 1 + enabled: true + minReplicas: 2 maxReplicas: 11 targetCPUUtilizationPercentage: 50 targetMemoryUtilizationPercentage: 50 diff --git a/modules/victoria-metrics/templates/values.yaml b/modules/victoria-metrics/templates/values.yaml index b50a54b9..c4e84892 100644 --- a/modules/victoria-metrics/templates/values.yaml +++ b/modules/victoria-metrics/templates/values.yaml @@ -233,6 +233,9 @@ vmsingle: resources: requests: storage: 20Gi + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } ingress: enabled: false # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName @@ -698,6 +701,9 @@ vmagent: # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent # but makes vmagent debugging UI less informative. 
See: https://docs.victoriametrics.com/vmagent/#relabel-debug
       promscrape.dropOriginalLabels: "true"
+    nodeSelector: {
+      spotinst.io/node-lifecycle: "od"
+    }
   ingress:
     enabled: false
     # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName

From be902b3f818852ef60561011aeb060ea5133b844 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:21:02 -0700
Subject: [PATCH 33/85] Correct path

---
 modules/cluster-ingress/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cluster-ingress/main.tf b/modules/cluster-ingress/main.tf
index 36046e25..64d3c65e 100644
--- a/modules/cluster-ingress/main.tf
+++ b/modules/cluster-ingress/main.tf
@@ -15,7 +15,7 @@ spec:
   sources:
     - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git'
       targetRevision: signoz-testing
-      path: modules/postgres-cloud-native/resources
+      path: modules/cluster-ingress/resources
       kustomize:
         patches:
           - target:

From ac6a1e3062798d5e9a8335fab7ac9c4a2a929f47 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:33:03 -0700
Subject: [PATCH 34/85] Include gateway class

---
 modules/cluster-ingress/resources/kustomization.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/cluster-ingress/resources/kustomization.yaml b/modules/cluster-ingress/resources/kustomization.yaml
index c66218bf..99613ce8 100644
--- a/modules/cluster-ingress/resources/kustomization.yaml
+++ b/modules/cluster-ingress/resources/kustomization.yaml
@@ -3,3 +3,4 @@ kind: Kustomization
 resources:
 - gateway.yaml
 - cert-issuer.yaml
+- gateway-class.yaml

From bf536964d0d72aa86e3037eee49ecdc3aa25b87a Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:50:29 -0700
Subject: [PATCH 35/85] Add some notes

---
 deployments/stacks/dpe-k8s-deployments/main.tf | 13 ++++++++-----
 modules/cluster-ingress/README.md              |  4 ++--
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index 8aa80435..0ae98c25 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -45,8 +45,6 @@ module "trivy-operator" {
 }
 
 module "airflow" {
-  # TODO: This is temporary
-  count      = 0
   depends_on = [module.victoria-metrics, module.argo-cd]
   source     = "spacelift.io/sagebionetworks/airflow/aws"
   version    = "0.4.0"
@@ -66,8 +64,6 @@ module "postgres-cloud-native-operator" {
 }
 
 module "postgres-cloud-native-database" {
-  # TODO: This is temporary
-  count      = 0
   depends_on = [module.postgres-cloud-native-operator, module.airflow, module.argo-cd]
   source     = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   version    = "0.5.0"
@@ -92,6 +88,8 @@ module "signoz" {
 }
 
 module "envoy-gateway" {
+  # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095
+  count      = 0
   depends_on = [module.argo-cd]
   # source  = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   # version = "0.5.0"
@@ -104,6 +102,8 @@ module "envoy-gateway" {
 }
 
 module "cert-manager" {
+  # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095
+  count      = 0
   depends_on = [module.argo-cd]
   # source  = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   # version = "0.5.0"
@@ -116,6 +116,8 @@ module "cluster-ingress" {
+  # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095
+  count      = 0
   depends_on = [module.argo-cd]
   # source  = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   # version = "0.5.0"
@@ -126,7 +128,8 @@ module "cluster-ingress" {
   namespace            = "envoy-gateway"
   argo_deployment_name = "cluster-ingress"
 
-  # To determine more elegant ways to fill in these values
+  # TODO: Determine a more elegant way to fill in these values, for example if we have
+  # a pre-defined DNS name for the cluster (https://sagebionetworks.jira.com/browse/IT-3931)
   ssl_hostname        = "unknown-to-fill-in"
   cluster_issuer_name = "selfsigned"
 }
diff --git a/modules/cluster-ingress/README.md b/modules/cluster-ingress/README.md
index 8df21df2..db50c39d 100644
--- a/modules/cluster-ingress/README.md
+++ b/modules/cluster-ingress/README.md
@@ -14,8 +14,8 @@ metadata:
 spec:
   targetRef:
     group: gateway.networking.k8s.io
-    kind: HTTPRoute
-    name: backend
+    kind: Gateway
+    name: eg
   jwt:
     providers:
     - name: auth0

From 19190c1f61e6957bad70946d31be0e4eface1610 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:51:55 -0700
Subject: [PATCH 36/85] Set scaling back

---
 modules/signoz/templates/values.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml
index 5eb748e1..454c9ebf 100644
--- a/modules/signoz/templates/values.yaml
+++ b/modules/signoz/templates/values.yaml
@@ -1469,7 +1469,7 @@ otelCollector:
   minReadySeconds: 5
   progressDeadlineSeconds: 120
 
-  replicaCount: 1
+  replicaCount: 2
 
   # OtelCollector RBAC config
   clusterRole:
@@ -1741,8 +1741,8 @@ otelCollector:
   #   runAsUser: 1000
 
   autoscaling:
-    enabled: true
-    minReplicas: 2
+    enabled: false
+    minReplicas: 1
     maxReplicas: 11
     targetCPUUtilizationPercentage: 50
     targetMemoryUtilizationPercentage: 50

From fd8fe4f6eea7d9b79a383c388a5b0be6fa920e72 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 15:59:39 -0700
Subject: [PATCH 37/85] Run 1 replica but on-demand

---
 modules/apache-airflow/templates/values.yaml | 22 +++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/modules/apache-airflow/templates/values.yaml b/modules/apache-airflow/templates/values.yaml
index 640a4d92..93a0f503 100644
--- a/modules/apache-airflow/templates/values.yaml
+++ b/modules/apache-airflow/templates/values.yaml
@@ -469,7 +469,7 @@ kerberos:
 # Airflow Worker Config
 workers:
   # Number of airflow celery workers in StatefulSet
-  replicas: 2
+  replicas: 1
 
   # Max number of old replicasets to retain
   revisionHistoryLimit: ~
@@ -638,7 +638,9 @@ workers:
   extraVolumeMounts: []
 
   # Select certain nodes for airflow worker pods.
-  nodeSelector: {}
+  nodeSelector: {
+    spotinst.io/node-lifecycle: "od"
+  }
   runtimeClassName: ~
   priorityClassName: ~
   affinity:
@@ -723,7 +725,7 @@ scheduler:
   command: ~
   # Airflow 2.0 allows users to run multiple schedulers,
   # However this feature is only recommended for MySQL 8+ and Postgres
-  replicas: 2
+  replicas: 1
 
   # Max number of old replicasets to retain
   revisionHistoryLimit: ~
@@ -808,7 +810,9 @@ scheduler:
   extraVolumeMounts: []
 
   # Select certain nodes for airflow scheduler pods.
- nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } affinity: # default scheduler affinity is: podAntiAffinity: @@ -1250,7 +1254,7 @@ webserver: triggerer: enabled: true # Number of airflow triggerers in the deployment - replicas: 2 + replicas: 1 # Max number of old replicasets to retain revisionHistoryLimit: ~ @@ -1350,7 +1354,9 @@ triggerer: extraVolumeMounts: [] # Select certain nodes for airflow triggerer pods. - nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } affinity: # default triggerer affinity is: podAntiAffinity: @@ -1944,7 +1950,9 @@ redis: safeToEvict: true # Select certain nodes for redis pods. - nodeSelector: {} + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } affinity: {} tolerations: [] topologySpreadConstraints: [] From 2f6bae7571e7833a65952ff2785ab4f8631e9d8b Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:01:58 -0700 Subject: [PATCH 38/85] Remove todo comment --- modules/envoy-gateway/main.tf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index 56f7e716..7546d3d2 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -4,8 +4,6 @@ resource "kubernetes_namespace" "envoy-gateway" { name = var.namespace } } - -# TODO: Using kustomize in this fashion prints out the secret in the spacelift UI when terraform is running resource "kubectl_manifest" "envoy-gateway" { depends_on = [kubernetes_namespace.envoy-gateway] From 5ed270d96a6efe9a9854001fd6bd0c559717a01a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:02:57 -0700 Subject: [PATCH 39/85] Point to correct revision --- modules/cert-manager/main.tf | 2 +- modules/cluster-ingress/main.tf | 2 +- modules/envoy-gateway/main.tf | 2 +- modules/signoz/main.tf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/cert-manager/main.tf b/modules/cert-manager/main.tf index 5d657d52..d6aa61e4 100644 --- a/modules/cert-manager/main.tf +++ b/modules/cert-manager/main.tf @@ -29,7 +29,7 @@ spec: valueFiles: - $values/modules/cert-manager/templates/values.yaml - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: signoz-testing + targetRevision: ${var.git_revision} ref: values destination: server: 'https://kubernetes.default.svc' diff --git a/modules/cluster-ingress/main.tf b/modules/cluster-ingress/main.tf index 64d3c65e..ffd78f7c 100644 --- a/modules/cluster-ingress/main.tf +++ b/modules/cluster-ingress/main.tf @@ -14,7 +14,7 @@ spec: %{endif} sources: - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: signoz-testing + targetRevision: ${var.git_revision} path: modules/cluster-ingress/resources kustomize: patches: diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index 7546d3d2..147c94db 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -29,7 +29,7 @@ spec: valueFiles: - $values/modules/envoy-gateway/templates/values.yaml - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: signoz-testing + targetRevision: ${var.git_revision} ref: values destination: server: 'https://kubernetes.default.svc' diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 7e187c8f..a7c29126 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -30,7 +30,7 @@ spec: 
valueFiles: - $values/modules/signoz/templates/values.yaml - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: signoz-testing + targetRevision: ${var.git_revision} ref: values destination: server: 'https://kubernetes.default.svc' From a61bdd0b2e080444080f309aebe13434066b7207 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:03:53 -0700 Subject: [PATCH 40/85] Correct comment --- modules/postgres-cloud-native/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/postgres-cloud-native/main.tf b/modules/postgres-cloud-native/main.tf index 8f3c909d..31284660 100644 --- a/modules/postgres-cloud-native/main.tf +++ b/modules/postgres-cloud-native/main.tf @@ -67,7 +67,7 @@ resource "kubernetes_secret" "connection-secret" { type = "kubernetes.io/basic-auth" - # TODO: Need to provide an updated connection is not using the pooler + # TODO: Need to provide an updated connection if not using the pooler data = { "dbname" = "application-database" "host" = "${var.argo_deployment_name}-pooler-rw.${var.namespace}" From bec8d9dc819b5bb1de53adc755a393979c4e50d4 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:05:03 -0700 Subject: [PATCH 41/85] Add to readme --- modules/signoz/README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/signoz/README.md b/modules/signoz/README.md index 1cac8fc7..85f64cbe 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -7,13 +7,10 @@ source Application Performance Monitoring (APM) & Observability tool. ## This module is a work in progress -This was hastly thrown together to get a tool available to ingest telemetry data in. 
A number of items are needed:
 
-- Updating the clickhouse install to cluster mode, and potentially this operator: https://github.com/Altinity/clickhouse-operator
-- Setting up backups and data retention
-- Trim down the number of ports available in the service
-- Double check the entire `values.yaml` file
+- Set up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094
+- Set up ingress to the cluster/collector to send data to: https://sagebionetworks.jira.com/browse/IBCDPE-1095
 - Set up accounts and access to the service declaratively
 
 ## Accessing signoz

From 5b0aa6480a370a1529567aeb0cc04db41518100c Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:31:56 -0700
Subject: [PATCH 42/85] Leave at 2 az deployment

---
 deployments/main.tf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deployments/main.tf b/deployments/main.tf
index bea46a57..db22bb64 100644
--- a/deployments/main.tf
+++ b/deployments/main.tf
@@ -38,9 +38,9 @@ module "dpe-sandbox-spacelift-development" {
   vpc_name = "dpe-sandbox"
 
   vpc_cidr_block       = "10.51.0.0/16"
-  public_subnet_cidrs  = ["10.51.1.0/24", "10.51.2.0/24", "10.51.3.0/24"]
-  private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24", "10.51.6.0/24"]
-  azs                  = ["us-east-1a", "us-east-1b", "us-east-1c"]
+  public_subnet_cidrs  = ["10.51.1.0/24", "10.51.2.0/24"]
+  private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24"]
+  azs                  = ["us-east-1a", "us-east-1b"]
 }
 
 module "dpe-sandbox-spacelift-production" {
@@ -68,7 +68,7 @@ module "dpe-sandbox-spacelift-production" {
   vpc_name = "dpe-k8"
 
   vpc_cidr_block       = "10.52.0.0/16"
-  public_subnet_cidrs  = ["10.52.1.0/24", "10.52.2.0/24", "10.52.3.0/24"]
-  private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24", "10.52.6.0/24"]
-  azs                  = ["us-east-1a", "us-east-1b", "us-east-1c"]
+  public_subnet_cidrs  = ["10.52.1.0/24", "10.52.2.0/24"]
+  private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24"]
+  azs                  = ["us-east-1a", "us-east-1b"]
 }

From aefa2e1b7cffcdb2890affe928cd23b64464b0b2 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 1 Oct 2024 16:51:19 -0700
Subject: [PATCH 43/85] Update modules/cluster-ingress/README.md

---
 modules/cluster-ingress/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/cluster-ingress/README.md b/modules/cluster-ingress/README.md
index db50c39d..6890c94c 100644
--- a/modules/cluster-ingress/README.md
+++ b/modules/cluster-ingress/README.md
@@ -4,7 +4,7 @@ the cluster. Along with the ingress we will also deploy the related SSL cert
 
 ## To implement
 The Envoy Gateway can secure ingress by verifying JWTs.
It can be applied to a specific -target, for example this applies it to an HTTPRoute called `backend`: +target, for example this applies it to all requests going through a `Gateway` called `eg` ``` apiVersion: gateway.envoyproxy.io/v1alpha1 From 58b2de4906d9338bff99a16d5a38966328bc8d68 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:00:14 -0700 Subject: [PATCH 44/85] Update readme --- modules/envoy-gateway/README.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/modules/envoy-gateway/README.md b/modules/envoy-gateway/README.md index 43d387e8..1753a2ec 100644 --- a/modules/envoy-gateway/README.md +++ b/modules/envoy-gateway/README.md @@ -1,17 +1,12 @@ # Purpose -Create/handle ingress for the kubernetes cluster +Create/handle ingress (aka north/south traffic) for the kubernetes cluster. Using +kubernetes resources we can define how to handle traffic. +## Components +The following show some example components for creating a GatewayClass/Gateway to +handle ingress for the kubernetes cluster. This is set to use letsencrypt staging, but +it is subject to change: -# Integration with Auth0 -Auth0 handles provisioning JWT to authenticate with the envoy gateway. - -## Creating credential: -`openssl genrsa -out test_key.pem 2048` -`openssl rsa -in test_key.pem -outform PEM -pubout -out test_key.pem.pub` - - - -Creating gateway resources: ``` apiVersion: gateway.networking.k8s.io/v1 kind: GatewayClass From e26e7e44c1c73d4ed027130ee5ca05ca0cb4d6df Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 4 Oct 2024 10:22:14 -0700 Subject: [PATCH 45/85] Set param --- modules/signoz/main.tf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index a7c29126..40b456f6 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -27,6 +27,10 @@ spec: targetRevision: 0.50.0 helm: releaseName: signoz + # Extra parameters to set (same as setting through values.yaml, but these take precedence) + parameters: + - name: "clickhouse.password" + value: ${random_password.clickhouse-admin-password.result} valueFiles: - $values/modules/signoz/templates/values.yaml - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' @@ -37,3 +41,22 @@ spec: namespace: ${var.namespace} YAML } + + +resource "random_password" "clickhouse-admin-password" { + length = 32 + special = false +} + +resource "kubernetes_secret" "clickhouse-admin-password" { + metadata { + name = "clickhouse-admin-password" + namespace = var.namespace + } + + data = { + "password" = random_password.clickhouse-admin-password.result + } + + depends_on = [kubernetes_namespace.signoz] +} From a501f3277ca5d9d8f78cf76f5ddd61139116f1c2 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 4 Oct 2024 10:29:21 -0700 Subject: [PATCH 46/85] no multiple sources --- modules/signoz/main.tf | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 40b456f6..473b2992 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -32,10 +32,7 @@ spec: - name: "clickhouse.password" value: ${random_password.clickhouse-admin-password.result} valueFiles: - - $values/modules/signoz/templates/values.yaml - - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' - targetRevision: ${var.git_revision} - ref: 
values + - ./templates/values.yaml destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} From f284709b11c1004aba52bd13ef1ba07af6f0cb56 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 4 Oct 2024 10:35:56 -0700 Subject: [PATCH 47/85] Set --- modules/signoz/main.tf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 473b2992..40b456f6 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -32,7 +32,10 @@ spec: - name: "clickhouse.password" value: ${random_password.clickhouse-admin-password.result} valueFiles: - - ./templates/values.yaml + - $values/modules/signoz/templates/values.yaml + - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' + targetRevision: ${var.git_revision} + ref: values destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} From 5aa954be634832f132b59e3bb8affa98a2212d58 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 4 Oct 2024 10:55:00 -0700 Subject: [PATCH 48/85] Note that the admin password is randomized --- modules/signoz/templates/values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index 454c9ebf..d6d25d66 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -127,8 +127,7 @@ clickhouse: # -- Clickhouse user user: admin # -- Clickhouse password - # TODO: Replace with random password - password: 27ff0399-0d3a-4bd8-919d-17c2181e6fb9 + password: # -- Clickhouse image image: From eebfca17538aa9eb534a2f82a0bd299491248907 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:21:54 -0700 Subject: [PATCH 49/85] Update modules/signoz/README.md Co-authored-by: Thomas Yu --- modules/signoz/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz/README.md b/modules/signoz/README.md index 85f64cbe..a84055a2 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -52,7 +52,7 @@ Container Port: collector::4317,collector::4318 Local Port: 4317,4318 ``` -Some data will be present in those fields by default, delete was is there and copy the +Some data will be present in those fields by default, delete what is there and copy the above data into it. 
### Application side From 835de37aa85cca89907dfb714812c1cc64f5f7f7 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:16:15 -0700 Subject: [PATCH 50/85] Enable replication for schema migrator --- modules/signoz/templates/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index d6d25d66..c160e354 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -1280,7 +1280,7 @@ schemaMigrator: "helm.sh/hook-delete-policy": "before-hook-creation" # -- Whether to enable replication for schemaMigrator - enableReplication: false + enableReplication: true # -- Node selector for settings for schemaMigrator nodeSelector: {} From e8f989f635909b6b0aa5058836eca0aac5d35990 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 8 Oct 2024 12:35:50 -0700 Subject: [PATCH 51/85] Set back to single replica for DB init --- modules/signoz/templates/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index c160e354..1bf4090d 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -308,7 +308,7 @@ clickhouse: # layout: shardsCount: 1 - replicasCount: 2 + replicasCount: 1 # -- ClickHouse settings configuration. # You can use this to override settings, for example `prometheus/port: 9363` From 5ac8424e01e0fb46f373bffec0424df0a8842afe Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:07:13 -0700 Subject: [PATCH 52/85] Bump replica back to 2 --- modules/signoz/templates/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index 1bf4090d..c160e354 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -308,7 +308,7 @@ clickhouse: # layout: shardsCount: 1 - replicasCount: 1 + replicasCount: 2 # -- ClickHouse settings configuration. 
# You can use this to override settings, for example `prometheus/port: 9363` From 5947065a1f7e015561f2b8cad2c2356f2e4f6d6d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:54:35 -0700 Subject: [PATCH 53/85] Envoy Gateway Minimum TLS (#36) * Apply minimum TLS --- modules/cluster-ingress/resources/gateway-class.yaml | 2 +- modules/cluster-ingress/resources/kustomization.yaml | 1 + modules/cluster-ingress/resources/traffic-policy.yaml | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 modules/cluster-ingress/resources/traffic-policy.yaml diff --git a/modules/cluster-ingress/resources/gateway-class.yaml b/modules/cluster-ingress/resources/gateway-class.yaml index a750b9fd..a619d17a 100644 --- a/modules/cluster-ingress/resources/gateway-class.yaml +++ b/modules/cluster-ingress/resources/gateway-class.yaml @@ -3,4 +3,4 @@ kind: GatewayClass metadata: name: eg spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller \ No newline at end of file + controllerName: gateway.envoyproxy.io/gatewayclass-controller diff --git a/modules/cluster-ingress/resources/kustomization.yaml b/modules/cluster-ingress/resources/kustomization.yaml index 99613ce8..be0a9119 100644 --- a/modules/cluster-ingress/resources/kustomization.yaml +++ b/modules/cluster-ingress/resources/kustomization.yaml @@ -4,3 +4,4 @@ resources: - gateway.yaml - cert-issuer.yaml - gateway-class.yaml +- traffic-policy.yaml diff --git a/modules/cluster-ingress/resources/traffic-policy.yaml b/modules/cluster-ingress/resources/traffic-policy.yaml new file mode 100644 index 00000000..f95cc046 --- /dev/null +++ b/modules/cluster-ingress/resources/traffic-policy.yaml @@ -0,0 +1,11 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: ClientTrafficPolicy +metadata: + name: traffic-policy +spec: + targetRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: eg + tls: + minVersion: "1.3" \ No newline at end of file From 139dd6aba77e8dbd0f8438219cb779b599d5d299 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:14:49 -0700 Subject: [PATCH 54/85] Shrink VPC size and create subnets specifically for worker nodes that are not shared with control plane --- deployments/main.tf | 22 +++++++----- deployments/spacelift/dpe-k8s/main.tf | 28 ++++++++------- deployments/spacelift/dpe-k8s/variables.tf | 18 +++++++--- .../stacks/dpe-k8s-deployments/main.tf | 4 +-- .../stacks/dpe-k8s-deployments/variables.tf | 4 +-- deployments/stacks/dpe-k8s/main.tf | 34 ++++++++++++------- deployments/stacks/dpe-k8s/outputs.tf | 4 +-- deployments/stacks/dpe-k8s/variables.tf | 18 +++++++--- modules/sage-aws-eks/main.tf | 7 ++-- modules/sage-aws-eks/variables.tf | 9 +++-- modules/sage-aws-vpc/main.tf | 4 +-- modules/sage-aws-vpc/ouputs.tf | 8 +++-- modules/sage-aws-vpc/variables.tf | 18 +++++++--- 13 files changed, 117 insertions(+), 61 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index db22bb64..2920e481 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -37,10 +37,13 @@ module "dpe-sandbox-spacelift-development" { cluster_name = "dpe-k8-sandbox" vpc_name = "dpe-sandbox" - vpc_cidr_block = "10.51.0.0/16" - public_subnet_cidrs = ["10.51.1.0/24", "10.51.2.0/24"] - private_subnet_cidrs = ["10.51.4.0/24", "10.51.5.0/24"] - azs = ["us-east-1a", "us-east-1b"] + vpc_cidr_block = "10.52.16.0/20" + public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"] + 
private_subnet_cidrs_eks_control_plane = ["10.52.18.0/24", "10.52.19.0/24"] + azs_eks_control_plane = ["us-east-1a", "us-east-1b"] + + private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.26.0/22"] + azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } module "dpe-sandbox-spacelift-production" { @@ -67,8 +70,11 @@ module "dpe-sandbox-spacelift-production" { cluster_name = "dpe-k8" vpc_name = "dpe-k8" - vpc_cidr_block = "10.52.0.0/16" - public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24"] - private_subnet_cidrs = ["10.52.4.0/24", "10.52.5.0/24"] - azs = ["us-east-1a", "us-east-1b"] + vpc_cidr_block = "10.52.0.0/20" + public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24"] + private_subnet_cidrs_eks_control_plane = ["10.52.4.0/24", "10.52.5.0/24"] + azs_eks_control_plane = ["us-east-1a", "us-east-1b"] + + private_subnet_cidrs_eks_worker_nodes = ["10.52.6.0/22", "10.52.10.0/22", "10.52.14.0/22"] + azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index f63aaafb..c0035bac 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -1,14 +1,16 @@ locals { k8s_stack_environment_variables = { - aws_account_id = var.aws_account_id - region = var.region - pod_security_group_enforcing_mode = var.pod_security_group_enforcing_mode - cluster_name = var.cluster_name - vpc_name = var.vpc_name - vpc_cidr_block = var.vpc_cidr_block - public_subnet_cidrs = var.public_subnet_cidrs - private_subnet_cidrs = var.private_subnet_cidrs - azs = var.azs + aws_account_id = var.aws_account_id + region = var.region + pod_security_group_enforcing_mode = var.pod_security_group_enforcing_mode + cluster_name = var.cluster_name + vpc_name = var.vpc_name + vpc_cidr_block = var.vpc_cidr_block + public_subnet_cidrs = var.public_subnet_cidrs + private_subnet_cidrs_eks_control_plane = var.private_subnet_cidrs_eks_control_plane + private_subnet_cidrs_eks_worker_nodes = var.private_subnet_cidrs_eks_worker_nodes + azs_eks_control_plane = var.azs_eks_control_plane + azs_eks_worker_nodes = var.azs_eks_worker_nodes } k8s_stack_deployments_variables = { @@ -23,10 +25,10 @@ locals { # Variables to be passed from the k8s stack to the deployments stack k8s_stack_to_deployment_variables = { - vpc_id = "TF_VAR_vpc_id" - private_subnet_ids = "TF_VAR_private_subnet_ids" - node_security_group_id = "TF_VAR_node_security_group_id" - pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" + vpc_id = "TF_VAR_vpc_id" + private_subnet_ids_eks_worker_nodes = "TF_VAR_private_subnet_ids_eks_worker_nodes" + node_security_group_id = "TF_VAR_node_security_group_id" + pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" } } diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index 3bbdc25e..3e43e436 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -118,12 +118,22 @@ variable "public_subnet_cidrs" { description = "Public Subnet CIDR values" } -variable "private_subnet_cidrs" { +variable "private_subnet_cidrs_eks_control_plane" { type = list(string) - description = "Private Subnet CIDR values" + description = "Private Subnet CIDR values for the EKS control plane" } -variable "azs" { +variable "private_subnet_cidrs_eks_worker_nodes" { type = list(string) - description = "Availability Zones" + description = "Private Subnet CIDR values for the EKS worker nodes" +} + 
+variable "azs_eks_control_plane" { + type = list(string) + description = "Availability Zones for the EKS control plane" +} + +variable "azs_eks_worker_nodes" { + type = list(string) + description = "Availability Zones for the EKS worker nodes" } diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 0ae98c25..ebe7b1f7 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -2,7 +2,7 @@ module "sage-aws-eks-autoscaler" { source = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws" version = "0.9.0" cluster_name = var.cluster_name - private_vpc_subnet_ids = var.private_subnet_ids + private_vpc_subnet_ids = var.private_subnet_ids_eks_worker_nodes vpc_id = var.vpc_id node_security_group_id = var.node_security_group_id spotinst_account = var.spotinst_account @@ -16,7 +16,7 @@ module "sage-aws-eks-addons" { cluster_name = var.cluster_name aws_account_id = var.aws_account_id vpc_id = var.vpc_id - private_subnet_ids = var.private_subnet_ids + private_subnet_ids = var.private_subnet_ids_eks_worker_nodes } module "argo-cd" { diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index c918f5e2..5797a052 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -3,8 +3,8 @@ variable "vpc_id" { type = string } -variable "private_subnet_ids" { - description = "Private subnet IDs" +variable "private_subnet_ids_eks_worker_nodes" { + description = "Private subnet IDs for the EKS worker nodes" type = list(string) } diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf index 9677d01f..d9f9cf69 100644 --- a/deployments/stacks/dpe-k8s/main.tf +++ b/deployments/stacks/dpe-k8s/main.tf @@ -1,24 +1,27 @@ module "sage-aws-vpc" { - source = "spacelift.io/sagebionetworks/sage-aws-vpc/aws" - version = "0.4.2" + # source = "spacelift.io/sagebionetworks/sage-aws-vpc/aws" + # version = "0.4.2" + source = "../../../modules/sage-aws-vpc" vpc_name = var.vpc_name # TODO: Per https://sagebionetworks.jira.com/browse/IT-3824 # We will soon not have to capture the VPC flow logs outself as every account with a VPC will have them enabled by default - capture_flow_logs = true - flow_log_retention = 90 - vpc_cidr_block = var.vpc_cidr_block - public_subnet_cidrs = var.public_subnet_cidrs - private_subnet_cidrs = var.private_subnet_cidrs - azs = var.azs - region = var.region + capture_flow_logs = true + flow_log_retention = 90 + vpc_cidr_block = var.vpc_cidr_block + public_subnet_cidrs = var.public_subnet_cidrs + private_subnet_cidrs_eks_control_plane = var.private_subnet_cidrs_eks_control_plane + private_subnet_cidrs_eks_worker_nodes = var.private_subnet_cidrs_eks_worker_nodes + azs_eks_control_plane = var.azs_eks_control_plane + azs_eks_worker_nodes = var.azs_eks_worker_nodes + region = var.region } module "sage-aws-eks" { - source = "spacelift.io/sagebionetworks/sage-aws-eks/aws" - version = "0.6.0" + # source = "spacelift.io/sagebionetworks/sage-aws-eks/aws" + # version = "0.6.0" + source = "../../../modules/sage-aws-eks" cluster_name = var.cluster_name - private_vpc_subnet_ids = module.sage-aws-vpc.private_subnet_ids vpc_id = module.sage-aws-vpc.vpc_id vpc_security_group_id = module.sage-aws-vpc.vpc_security_group_id enable_policy_event_logs = true @@ -26,5 +29,10 @@ module "sage-aws-eks" { cloudwatch_retention = 90 
pod_security_group_enforcing_mode = var.pod_security_group_enforcing_mode aws_account_id = var.aws_account_id - private_subnet_cidrs = module.sage-aws-vpc.vpc_private_subnet_cidrs + private_subnet_cidrs = concat( + var.private_subnet_cidrs_eks_control_plane, + var.private_subnet_cidrs_eks_worker_nodes + ) + private_subnet_ids_eks_control_plane = module.sage-aws-vpc.private_subnet_ids_eks_control_plane + private_subnet_ids_eks_worker_nodes = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes } diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 1e5ab5ba..19418d1c 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ b/deployments/stacks/dpe-k8s/outputs.tf @@ -14,8 +14,8 @@ output "vpc_private_subnet_cidrs" { value = module.sage-aws-vpc.vpc_private_subnet_cidrs } -output "private_subnet_ids" { - value = module.sage-aws-vpc.private_subnet_ids +output "private_subnet_ids_eks_worker_nodes" { + value = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes } output "vpc_security_group_id" { diff --git a/deployments/stacks/dpe-k8s/variables.tf b/deployments/stacks/dpe-k8s/variables.tf index b19ef969..6edc1cc1 100644 --- a/deployments/stacks/dpe-k8s/variables.tf +++ b/deployments/stacks/dpe-k8s/variables.tf @@ -35,12 +35,22 @@ variable "public_subnet_cidrs" { description = "Public Subnet CIDR values" } -variable "private_subnet_cidrs" { +variable "private_subnet_cidrs_eks_control_plane" { type = list(string) - description = "Private Subnet CIDR values" + description = "Private Subnet CIDR values for the EKS control plane" } -variable "azs" { +variable "private_subnet_cidrs_eks_worker_nodes" { type = list(string) - description = "Availability Zones" + description = "Private Subnet CIDR values for the EKS worker nodes" +} + +variable "azs_eks_control_plane" { + type = list(string) + description = "Availability Zones for the EKS control plane" +} + +variable "azs_eks_worker_nodes" { + type = list(string) + description = "Availability Zones for the EKS worker nodes" } diff --git a/modules/sage-aws-eks/main.tf b/modules/sage-aws-eks/main.tf index 4c33b5ad..529582bd 100644 --- a/modules/sage-aws-eks/main.tf +++ b/modules/sage-aws-eks/main.tf @@ -104,9 +104,10 @@ module "eks" { } } - vpc_id = var.vpc_id - subnet_ids = var.private_vpc_subnet_ids - control_plane_subnet_ids = var.private_vpc_subnet_ids + vpc_id = var.vpc_id + # A list of subnet IDs where the nodes/node groups will be provisioned. 
+ subnet_ids = var.private_subnet_ids_eks_worker_nodes + control_plane_subnet_ids = var.private_subnet_ids_eks_control_plane cluster_security_group_id = var.vpc_security_group_id iam_role_additional_policies = { diff --git a/modules/sage-aws-eks/variables.tf b/modules/sage-aws-eks/variables.tf index c5e108ad..03852adf 100644 --- a/modules/sage-aws-eks/variables.tf +++ b/modules/sage-aws-eks/variables.tf @@ -28,9 +28,14 @@ variable "vpc_id" { type = string } -variable "private_vpc_subnet_ids" { - description = "List of private subnets to deploy the cluster to" +variable "private_subnet_ids_eks_control_plane" { type = list(string) + description = "Private Subnet ID values for the EKS control plane" +} + +variable "private_subnet_ids_eks_worker_nodes" { + type = list(string) + description = "Private Subnet ID values for the EKS worker nodes" } variable "private_subnet_cidrs" { diff --git a/modules/sage-aws-vpc/main.tf b/modules/sage-aws-vpc/main.tf index 9414621b..b33f44eb 100644 --- a/modules/sage-aws-vpc/main.tf +++ b/modules/sage-aws-vpc/main.tf @@ -5,8 +5,8 @@ module "vpc" { name = var.vpc_name cidr = var.vpc_cidr_block - azs = var.azs - private_subnets = var.private_subnet_cidrs + azs = concat(var.azs_eks_control_plane, var.azs_eks_worker_nodes) + private_subnets = concat(var.private_subnet_cidrs_eks_control_plane, var.private_subnet_cidrs_eks_worker_nodes) public_subnets = var.public_subnet_cidrs diff --git a/modules/sage-aws-vpc/ouputs.tf b/modules/sage-aws-vpc/ouputs.tf index a06cf314..0faac8a8 100644 --- a/modules/sage-aws-vpc/ouputs.tf +++ b/modules/sage-aws-vpc/ouputs.tf @@ -22,6 +22,10 @@ output "vpc_public_subnet_cidrs" { value = var.public_subnet_cidrs } -output "vpc_private_subnet_cidrs" { - value = var.private_subnet_cidrs +output "private_subnet_ids_eks_control_plane" { + value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane)) +} + +output "private_subnet_ids_eks_worker_nodes" { + value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane), length(var.private_subnet_cidrs_eks_worker_nodes)) } diff --git a/modules/sage-aws-vpc/variables.tf b/modules/sage-aws-vpc/variables.tf index 2de4201f..11304ed7 100644 --- a/modules/sage-aws-vpc/variables.tf +++ b/modules/sage-aws-vpc/variables.tf @@ -13,14 +13,24 @@ variable "public_subnet_cidrs" { description = "Public Subnet CIDR values" } -variable "private_subnet_cidrs" { +variable "private_subnet_cidrs_eks_control_plane" { type = list(string) - description = "Private Subnet CIDR values" + description = "Private Subnet CIDR values for the EKS control plane" } -variable "azs" { +variable "private_subnet_cidrs_eks_worker_nodes" { type = list(string) - description = "Availability Zones" + description = "Private Subnet CIDR values for the EKS worker nodes" +} + +variable "azs_eks_control_plane" { + type = list(string) + description = "Availability Zones for the EKS control plane" +} + +variable "azs_eks_worker_nodes" { + type = list(string) + description = "Availability Zones for the EKS worker nodes" } variable "region" { From f3f76473efc8cd8309d9d8228259bf5303e4b759 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:20:06 -0700 Subject: [PATCH 55/85] Add back var --- modules/sage-aws-vpc/ouputs.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/sage-aws-vpc/ouputs.tf b/modules/sage-aws-vpc/ouputs.tf index 0faac8a8..b20e47a0 100644 --- a/modules/sage-aws-vpc/ouputs.tf +++ 
b/modules/sage-aws-vpc/ouputs.tf @@ -22,6 +22,10 @@ output "vpc_public_subnet_cidrs" { value = var.public_subnet_cidrs } +output "vpc_private_subnet_cidrs" { + value = concat(var.private_subnet_cidrs_eks_control_plane, var.private_subnet_cidrs_eks_worker_nodes) +} + output "private_subnet_ids_eks_control_plane" { value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane)) } From 1dab2753c6a69baaeb92a3791b2ae31e130fb0af Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:25:49 -0700 Subject: [PATCH 56/85] Correct cidr block --- deployments/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/main.tf b/deployments/main.tf index 2920e481..68654cf2 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -42,7 +42,7 @@ module "dpe-sandbox-spacelift-development" { private_subnet_cidrs_eks_control_plane = ["10.52.18.0/24", "10.52.19.0/24"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.26.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.28.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } From 63a54ad027d73b53b75946343c68bdceb5b19831 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:47:11 -0700 Subject: [PATCH 57/85] Update cidr blocks --- deployments/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index 68654cf2..b4e07f3d 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -71,10 +71,10 @@ module "dpe-sandbox-spacelift-production" { vpc_name = "dpe-k8" vpc_cidr_block = "10.52.0.0/20" - public_subnet_cidrs = ["10.52.1.0/24", "10.52.2.0/24"] - private_subnet_cidrs_eks_control_plane = ["10.52.4.0/24", "10.52.5.0/24"] + public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"] + private_subnet_cidrs_eks_control_plane = ["10.52.2.0/24", "10.52.3.0/24"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.6.0/22", "10.52.10.0/22", "10.52.14.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.4.0/22", "10.52.8.0/22", "10.52.12.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } From d4c79d78b69f6d233a8a702333eb84384bedee8b Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:12:03 -0700 Subject: [PATCH 58/85] Correct node lengths --- modules/sage-aws-vpc/ouputs.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/sage-aws-vpc/ouputs.tf b/modules/sage-aws-vpc/ouputs.tf index b20e47a0..606921f5 100644 --- a/modules/sage-aws-vpc/ouputs.tf +++ b/modules/sage-aws-vpc/ouputs.tf @@ -27,9 +27,9 @@ output "vpc_private_subnet_cidrs" { } output "private_subnet_ids_eks_control_plane" { - value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane)) + value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane) + 1) } output "private_subnet_ids_eks_worker_nodes" { - value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane), length(var.private_subnet_cidrs_eks_worker_nodes)) + value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane) - 1, 
length(var.private_subnet_cidrs_eks_worker_nodes) + 1) } From 204b2ffa22093214a153f677359fb534ebee6c98 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:16:11 -0700 Subject: [PATCH 59/85] Correct array slicing --- modules/sage-aws-vpc/ouputs.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/sage-aws-vpc/ouputs.tf b/modules/sage-aws-vpc/ouputs.tf index 606921f5..f21344b0 100644 --- a/modules/sage-aws-vpc/ouputs.tf +++ b/modules/sage-aws-vpc/ouputs.tf @@ -31,5 +31,5 @@ output "private_subnet_ids_eks_control_plane" { } output "private_subnet_ids_eks_worker_nodes" { - value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane) - 1, length(var.private_subnet_cidrs_eks_worker_nodes) + 1) + value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane), length(module.vpc.private_subnets) + 1) } From 34e27cbf2e675437fa260b15356470ff5cabe849 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:19:24 -0700 Subject: [PATCH 60/85] Correct indexing --- modules/sage-aws-vpc/ouputs.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/sage-aws-vpc/ouputs.tf b/modules/sage-aws-vpc/ouputs.tf index f21344b0..36aaea3b 100644 --- a/modules/sage-aws-vpc/ouputs.tf +++ b/modules/sage-aws-vpc/ouputs.tf @@ -27,9 +27,9 @@ output "vpc_private_subnet_cidrs" { } output "private_subnet_ids_eks_control_plane" { - value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane) + 1) + value = slice(module.vpc.private_subnets, 0, length(var.private_subnet_cidrs_eks_control_plane)) } output "private_subnet_ids_eks_worker_nodes" { - value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane), length(module.vpc.private_subnets) + 1) + value = slice(module.vpc.private_subnets, length(var.private_subnet_cidrs_eks_control_plane), length(module.vpc.private_subnets)) } From 373b800553a6c9335ff900cb3d0d3c3f4db09cc9 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 17 Oct 2024 17:39:24 -0700 Subject: [PATCH 61/85] Update default eks cluster version --- modules/sage-aws-eks/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/sage-aws-eks/variables.tf b/modules/sage-aws-eks/variables.tf index 03852adf..612c59f6 100644 --- a/modules/sage-aws-eks/variables.tf +++ b/modules/sage-aws-eks/variables.tf @@ -6,7 +6,7 @@ variable "cluster_name" { variable "cluster_version" { description = "Version of K8 cluster" type = string - default = "1.30" + default = "1.31" } variable "region" { From 1b6170e5360f4c308f1703c0d633e288ea5ffb5e Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:56:45 -0700 Subject: [PATCH 62/85] Shrink EKS control plane subnet range --- deployments/main.tf | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index b4e07f3d..c5e818e1 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -37,12 +37,13 @@ module "dpe-sandbox-spacelift-development" { cluster_name = "dpe-k8-sandbox" vpc_name = "dpe-sandbox" - vpc_cidr_block = "10.52.16.0/20" - public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"] - private_subnet_cidrs_eks_control_plane = ["10.52.18.0/24", "10.52.19.0/24"] + 
vpc_cidr_block = "10.52.16.0/20" + public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"] + # 10.52.18.32 -> 10.52.19.0 is free for future use + private_subnet_cidrs_eks_control_plane = ["10.52.18.0/28", "10.52.18.16/28"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.28.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.19.0/22", "10.52.23.0/22", "10.52.27.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } @@ -70,11 +71,12 @@ module "dpe-sandbox-spacelift-production" { cluster_name = "dpe-k8" vpc_name = "dpe-k8" - vpc_cidr_block = "10.52.0.0/20" - public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"] - private_subnet_cidrs_eks_control_plane = ["10.52.2.0/24", "10.52.3.0/24"] + vpc_cidr_block = "10.52.0.0/20" + public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"] + # 10.52.2.32 -> 10.52.3.0 is free for future use + private_subnet_cidrs_eks_control_plane = ["10.52.2.0/28", "10.52.2.16/28"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.4.0/22", "10.52.8.0/22", "10.52.12.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.3.0/22", "10.52.7.0/22", "10.52.11.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } From 34828377c76b3adef6dcb5c531d404a1b03e91e1 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:09:26 -0700 Subject: [PATCH 63/85] Set range back --- deployments/main.tf | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index c5e818e1..9b6e04c2 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -37,13 +37,12 @@ module "dpe-sandbox-spacelift-development" { cluster_name = "dpe-k8-sandbox" vpc_name = "dpe-sandbox" - vpc_cidr_block = "10.52.16.0/20" - public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"] - # 10.52.18.32 -> 10.52.19.0 is free for future use + vpc_cidr_block = "10.52.16.0/20" + public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"] private_subnet_cidrs_eks_control_plane = ["10.52.18.0/28", "10.52.18.16/28"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.19.0/22", "10.52.23.0/22", "10.52.27.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.28.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } @@ -71,12 +70,11 @@ module "dpe-sandbox-spacelift-production" { cluster_name = "dpe-k8" vpc_name = "dpe-k8" - vpc_cidr_block = "10.52.0.0/20" - public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"] - # 10.52.2.32 -> 10.52.3.0 is free for future use + vpc_cidr_block = "10.52.0.0/20" + public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"] private_subnet_cidrs_eks_control_plane = ["10.52.2.0/28", "10.52.2.16/28"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.3.0/22", "10.52.7.0/22", "10.52.11.0/22"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.4.0/22", "10.52.8.0/22", "10.52.12.0/22"] azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] } From 5c5654fd47d5c667db2d85ece44ea61ac14616a3 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:38:21 -0700 Subject: [PATCH 64/85] [IBCDPE-1095] Setup TLS/Auth0 for cluster ingress with telemetry data & Set up SMTP settings with test email & 
Move to lets encrypt (#40) * Setup TLS/Auth0 for cluster ingress with telemetry data & Set up SMTP settings with test email & Move to lets encrypt --- README.md | 41 +- deployments/main.tf | 59 ++- deployments/spacelift/dpe-k8s/main.tf | 68 ++- deployments/spacelift/dpe-k8s/variables.tf | 67 +++ deployments/stacks/dpe-auth0/README.md | 36 ++ deployments/stacks/dpe-auth0/main.tf | 57 ++ deployments/stacks/dpe-auth0/provider.tf | 10 + deployments/stacks/dpe-auth0/variables.tf | 28 + deployments/stacks/dpe-auth0/versions.tf | 8 + .../stacks/dpe-k8s-deployments/main.tf | 34 +- .../stacks/dpe-k8s-deployments/variables.tf | 38 ++ deployments/stacks/dpe-k8s/main.tf | 6 + deployments/stacks/dpe-k8s/outputs.tf | 9 + deployments/stacks/dpe-k8s/variables.tf | 5 + modules/apache-airflow/README.md | 18 +- modules/apache-airflow/main.tf | 2 +- modules/apache-airflow/templates/values.yaml | 497 ++++++++++++++++-- modules/cluster-ingress/README.md | 49 -- modules/cluster-ingress/main.tf | 40 -- .../resources/cert-issuer.yaml | 25 - modules/cluster-ingress/variables.tf | 37 -- modules/cluster-ingress/versions.tf | 16 - modules/envoy-gateway/README.md | 82 +-- modules/envoy-gateway/main.tf | 32 ++ .../envoy-gateway/resources/cert-issuer.yaml | 18 + .../envoy-gateway/resources/envoy-proxy.yaml | 6 + .../resources/gateway-class.yaml | 5 + .../resources/gateway.yaml | 0 .../resources/http-to-https-redirect.yaml | 16 + .../resources/kustomization.yaml | 4 +- .../resources/traffic-policy.yaml | 0 modules/envoy-gateway/variables.tf | 10 + modules/sage-aws-ses/README.md | 35 ++ modules/sage-aws-ses/data.tf | 6 + modules/sage-aws-ses/main.tf | 23 + modules/sage-aws-ses/ouputs.tf | 9 + modules/sage-aws-ses/variables.tf | 12 + modules/sage-aws-ses/versions.tf | 8 + modules/signoz/README.md | 50 +- modules/signoz/main.tf | 54 +- .../resources-otel-ingress/http-route.yaml | 26 + .../resources-otel-ingress/kustomization.yaml | 6 + .../reference-grant-signoz.yaml | 14 + .../security-policy.yaml | 12 + modules/signoz/templates/values.yaml | 12 +- modules/signoz/variables.tf | 40 ++ 46 files changed, 1321 insertions(+), 309 deletions(-) create mode 100644 deployments/stacks/dpe-auth0/README.md create mode 100644 deployments/stacks/dpe-auth0/main.tf create mode 100644 deployments/stacks/dpe-auth0/provider.tf create mode 100644 deployments/stacks/dpe-auth0/variables.tf create mode 100644 deployments/stacks/dpe-auth0/versions.tf delete mode 100644 modules/cluster-ingress/README.md delete mode 100644 modules/cluster-ingress/main.tf delete mode 100644 modules/cluster-ingress/resources/cert-issuer.yaml delete mode 100644 modules/cluster-ingress/variables.tf delete mode 100644 modules/cluster-ingress/versions.tf create mode 100644 modules/envoy-gateway/resources/cert-issuer.yaml create mode 100644 modules/envoy-gateway/resources/envoy-proxy.yaml rename modules/{cluster-ingress => envoy-gateway}/resources/gateway-class.yaml (53%) rename modules/{cluster-ingress => envoy-gateway}/resources/gateway.yaml (100%) create mode 100644 modules/envoy-gateway/resources/http-to-https-redirect.yaml rename modules/{cluster-ingress => envoy-gateway}/resources/kustomization.yaml (75%) rename modules/{cluster-ingress => envoy-gateway}/resources/traffic-policy.yaml (100%) create mode 100644 modules/sage-aws-ses/README.md create mode 100644 modules/sage-aws-ses/data.tf create mode 100644 modules/sage-aws-ses/main.tf create mode 100644 modules/sage-aws-ses/ouputs.tf create mode 100644 modules/sage-aws-ses/variables.tf create mode 100644 
modules/sage-aws-ses/versions.tf
 create mode 100644 modules/signoz/resources-otel-ingress/http-route.yaml
 create mode 100644 modules/signoz/resources-otel-ingress/kustomization.yaml
 create mode 100644 modules/signoz/resources-otel-ingress/reference-grant-signoz.yaml
 create mode 100644 modules/signoz/resources-otel-ingress/security-policy.yaml

diff --git a/README.md b/README.md
index 3edea25f..3d456eff 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,9 @@ This repo is used to deploy an EKS cluster to AWS. CI/CD is managed through Spac
 │   └── policies: Rego policies that can be attached to 0..* spacelift stacks
 ├── dev: Development/sandbox environment
 │   ├── spacelift: Terraform scripts to manage spacelift resources
-│   │   └── dpe-sandbox: Spacelift specific resources to manage the CI/CD pipeline
+│   │   └── dpe-k8s/dpe-sandbox: Spacelift specific resources to manage the CI/CD pipeline
 │   └── stacks: The deployable cloud resources
+│       ├── dpe-auth0: Stack used to provision and set up Auth0 IDP (Identity Provider) settings
 │       ├── dpe-sandbox-k8s: K8s + supporting AWS resources
 │       └── dpe-sandbox-k8s-deployments: Resources deployed inside of a K8s cluster
 └── modules: Templatized collections of terraform resources that are used in a stack
@@ -19,15 +20,22 @@ This repo is used to deploy an EKS cluster to AWS. CI/CD is managed through Spac
 │   └── templates: Resources used during deployment of airflow
 ├── argo-cd: K8s deployment for Argo CD, a declarative, GitOps continuous delivery tool for Kubernetes.
 │   └── templates: Resources used during deployment of this helm chart
- ├── trivy-operator: K8s deployment for trivy, along with a few supporting charts for security scanning
- │   └── templates: Resources used during deployment of these helm charts
- ├── victoria-metrics: K8s deployment for victoria metrics, a promethus like tool for cluster metric collection
- │   └── templates: Resources used during deployment of these helm charts
+ ├── cert-manager: Handles provisioning TLS certificates for the cluster
+ ├── envoy-gateway: API gateway that secures and routes traffic into the cluster
+ ├── postgres-cloud-native: Used to provision a postgres instance
+ ├── postgres-cloud-native-operator: Operator that manages the lifecycle of postgres instances on the cluster
 ├── demo-network-policies: K8s deployment for a demo showcasing how to use network policies
 ├── demo-pod-level-security-groups-strict: K8s deployment for a demo showcasing how to use pod level security groups in strict mode
 ├── sage-aws-eks: Sage specific EKS cluster for AWS
+ ├── sage-aws-eks-addons: Sets up additional resources that need to be installed post creation of the EKS cluster
 ├── sage-aws-k8s-node-autoscaler: K8s node autoscaler using spotinst ocean
- └── sage-aws-vpc: Sage specific VPC for AWS
+ ├── sage-aws-ses: AWS SES (Simple Email Service) setup
+ ├── sage-aws-vpc: Sage specific VPC for AWS
+ ├── signoz: SigNoz provides APM, logs, traces, metrics, exceptions, & alerts in a single tool
+ ├── trivy-operator: K8s deployment for trivy, along with a few supporting charts for security scanning
+ │   └── templates: Resources used during deployment of these helm charts
+ ├── victoria-metrics: K8s deployment for Victoria Metrics, a Prometheus-like tool for cluster metric collection
+ │   └── templates: Resources used during deployment of these helm charts
 ```

This root `main.tf` contains all the "Things" that are going to be deployed.
@@ -283,10 +291,27 @@ This document describes the abbreviated process below:
         "iam:*PolicyVersion",
         "iam:*OpenIDConnectProvider",
         "iam:*InstanceProfile",
-        "iam:ListPolicyVersions"
+        "iam:ListPolicyVersions",
+        "iam:ListGroupsForUser",
+        "iam:ListAttachedUserPolicies"
       ],
       "Resource": "*"
-    }
+    },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "iam:CreateUser",
+        "iam:AttachUserPolicy",
+        "iam:ListPolicies",
+        "iam:TagUser",
+        "iam:GetUser",
+        "iam:DeleteUser",
+        "iam:CreateAccessKey",
+        "iam:ListAccessKeys",
+        "iam:DeleteAccessKey"
+      ],
+      "Resource": "arn:aws:iam::{{AWS ACCOUNT ID}}:user/smtp_user"
+    }
   ]
 }
 ```
diff --git a/deployments/main.tf b/deployments/main.tf
index 9b6e04c2..f75f13b1 100644
--- a/deployments/main.tf
+++ b/deployments/main.tf
@@ -31,19 +31,49 @@ module "dpe-sandbox-spacelift-development" {
   k8s_stack_deployments_name         = "DPE DEV Kubernetes Deployments"
   k8s_stack_deployments_project_root = "deployments/stacks/dpe-k8s-deployments"

+  auth0_stack_name         = "DPE DEV Auth0"
+  auth0_stack_project_root = "deployments/stacks/dpe-auth0"
+  auth0_domain             = "dev-sage-dpe.us.auth0.com"
+  auth0_clients = [
+    {
+      name        = "bfauble - automation"
+      description = "App for testing signoz"
+      app_type    = "non_interactive"
+    },
+    {
+      name        = "schematic - Github Actions"
+      description = "Client for Github Actions to export telemetry data"
+      app_type    = "non_interactive"
+    },
+    {
+      name        = "schematic - Dev"
+      description = "Client for schematic deployed to AWS DEV to export telemetry data"
+      app_type    = "non_interactive"
+    },
+  ]
+
   aws_account_id = "631692904429"
   region         = "us-east-1"

   cluster_name = "dpe-k8-sandbox"
   vpc_name     = "dpe-sandbox"
-  vpc_cidr_block = "10.52.16.0/20"
-  public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24"]
+  vpc_cidr_block = "10.52.16.0/20"
+  # A public subnet is required for each AZ in which the worker nodes are deployed
+  public_subnet_cidrs = ["10.52.16.0/24", "10.52.17.0/24", "10.52.19.0/24"]
   private_subnet_cidrs_eks_control_plane = ["10.52.18.0/28", "10.52.18.16/28"]
   azs_eks_control_plane                  = ["us-east-1a", "us-east-1b"]
-  private_subnet_cidrs_eks_worker_nodes = ["10.52.20.0/22", "10.52.24.0/22", "10.52.28.0/22"]
-  azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"]
+  private_subnet_cidrs_eks_worker_nodes  = ["10.52.28.0/22", "10.52.24.0/22", "10.52.20.0/22"]
+  azs_eks_worker_nodes                   = ["us-east-1c", "us-east-1b", "us-east-1a"]
+
+  enable_cluster_ingress = true
+  enable_otel_ingress    = true
+  ssl_hostname           = "dev.sagedpe.org"
+  auth0_jwks_uri         = "https://dev-sage-dpe.us.auth0.com/.well-known/jwks.json"
+  ses_email_identities   = ["aws-dpe-dev@sagebase.org"]
+  # Defines the email address that will be used as the sender of the email alerts
+  smtp_from = "aws-dpe-dev@sagebase.org"
 }

 module "dpe-sandbox-spacelift-production" {
@@ -64,17 +94,30 @@ module "dpe-sandbox-spacelift-production" {
   k8s_stack_deployments_name         = "DPE Kubernetes Deployments"
   k8s_stack_deployments_project_root = "deployments/stacks/dpe-k8s-deployments"

+  auth0_stack_name         = "DPE Auth0"
+  auth0_stack_project_root = "deployments/stacks/dpe-auth0"
+  auth0_domain             = ""
+  auth0_clients            = []
+
   aws_account_id = "766808016710"
   region         = "us-east-1"

   cluster_name = "dpe-k8"
   vpc_name     = "dpe-k8"
-  vpc_cidr_block = "10.52.0.0/20"
-  public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24"]
+  vpc_cidr_block = "10.52.0.0/20"
+  # A public subnet is required for each AZ in which the worker nodes are deployed
+  public_subnet_cidrs = ["10.52.0.0/24", "10.52.1.0/24", "10.52.3.0/24"]
   private_subnet_cidrs_eks_control_plane = 
["10.52.2.0/28", "10.52.2.16/28"] azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - private_subnet_cidrs_eks_worker_nodes = ["10.52.4.0/22", "10.52.8.0/22", "10.52.12.0/22"] - azs_eks_worker_nodes = ["us-east-1a", "us-east-1b", "us-east-1c"] + private_subnet_cidrs_eks_worker_nodes = ["10.52.12.0/22", "10.52.8.0/22", "10.52.4.0/22"] + azs_eks_worker_nodes = ["us-east-1c", "us-east-1b", "us-east-1a"] + + enable_cluster_ingress = false + enable_otel_ingress = false + ssl_hostname = "" + auth0_jwks_uri = "" + + ses_email_identities = [] } diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index c0035bac..c29b3ce8 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -11,16 +11,28 @@ locals { private_subnet_cidrs_eks_worker_nodes = var.private_subnet_cidrs_eks_worker_nodes azs_eks_control_plane = var.azs_eks_control_plane azs_eks_worker_nodes = var.azs_eks_worker_nodes + ses_email_identities = var.ses_email_identities } k8s_stack_deployments_variables = { - spotinst_account = var.spotinst_account - vpc_cidr_block = var.vpc_cidr_block - cluster_name = var.cluster_name - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = var.git_branch - aws_account_id = var.aws_account_id + spotinst_account = var.spotinst_account + vpc_cidr_block = var.vpc_cidr_block + cluster_name = var.cluster_name + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_branch + aws_account_id = var.aws_account_id + enable_cluster_ingress = var.enable_cluster_ingress + enable_otel_ingress = var.enable_otel_ingress + ssl_hostname = var.ssl_hostname + auth0_jwks_uri = var.auth0_jwks_uri + smtp_from = var.smtp_from + } + + auth0_stack_variables = { + cluster_name = var.cluster_name + auth0_domain = var.auth0_domain + auth0_clients = var.auth0_clients } # Variables to be passed from the k8s stack to the deployments stack @@ -29,6 +41,8 @@ locals { private_subnet_ids_eks_worker_nodes = "TF_VAR_private_subnet_ids_eks_worker_nodes" node_security_group_id = "TF_VAR_node_security_group_id" pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" + smtp_user = "TF_VAR_smtp_user" + smtp_password = "TF_VAR_smtp_password" } } @@ -201,3 +215,43 @@ resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration read = true write = true } + + +resource "spacelift_stack" "auth0" { + github_enterprise { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + + depends_on = [ + spacelift_space.dpe-space + ] + + administrative = false + autodeploy = var.auto_deploy + branch = var.git_branch + description = "Stack used to create and manage Auth0 for authentication" + name = var.auth0_stack_name + project_root = var.auth0_stack_project_root + repository = "eks-stack" + terraform_version = var.opentofu_version + terraform_workflow_tool = "OPEN_TOFU" + space_id = spacelift_space.dpe-space.id + additional_project_globs = [ + "deployments/" + ] +} + +resource "spacelift_stack_destructor" "auth0-stack-destructor" { + stack_id = spacelift_stack.auth0.id +} + + +resource "spacelift_environment_variable" "auth0-stack-environment-variables" { + for_each = local.auth0_stack_variables + + stack_id = spacelift_stack.auth0.id + name = "TF_VAR_${each.key}" + value = try(tostring(each.value), jsonencode(each.value)) + write_only = false +} diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index 3e43e436..f2fa71c8 100644 --- 
a/deployments/spacelift/dpe-k8s/variables.tf
+++ b/deployments/spacelift/dpe-k8s/variables.tf
@@ -137,3 +137,70 @@ variable "azs_eks_worker_nodes" {
   type        = list(string)
   description = "Availability Zones for the EKS worker nodes"
 }
+
+variable "enable_cluster_ingress" {
+  description = "Enable cluster ingress"
+  type        = bool
+}
+
+variable "enable_otel_ingress" {
+  description = "Enable OpenTelemetry ingress, used to send traces to SigNoz"
+  type        = bool
+}
+
+variable "ssl_hostname" {
+  description = "The hostname to use for the SSL certificate"
+  type        = string
+}
+
+variable "auth0_jwks_uri" {
+  description = "The JWKS URI for Auth0"
+  type        = string
+}
+
+variable "auth0_stack_name" {
+  description = "Name of the auth0 stack"
+  type        = string
+}
+
+variable "auth0_stack_project_root" {
+  description = "Project root of the auth0 stack"
+  type        = string
+}
+
+variable "auth0_domain" {
+  description = "Auth0 domain"
+  type        = string
+}
+
+variable "auth0_clients" {
+  description = "List of clients to create in Auth0."
+  type = list(object({
+    name        = string
+    description = string
+    app_type    = string
+  }))
+}
+
+variable "ses_email_identities" {
+  type        = list(string)
+  description = "List of email identities to be added to SES"
+}
+
+variable "smtp_user" {
+  description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set"
+  type        = string
+  default     = ""
+}
+
+variable "smtp_password" {
+  description = "The SMTP password. Required if smtp_user, smtp_password, and smtp_from are set"
+  type        = string
+  default     = ""
+}
+
+variable "smtp_from" {
+  description = "The SMTP from address. Required if smtp_user, smtp_password, and smtp_from are set"
+  type        = string
+  default     = ""
+}
diff --git a/deployments/stacks/dpe-auth0/README.md b/deployments/stacks/dpe-auth0/README.md
new file mode 100644
index 00000000..9b2a1d17
--- /dev/null
+++ b/deployments/stacks/dpe-auth0/README.md
@@ -0,0 +1,36 @@
+# Purpose
+The purpose of this deployment stack is to manage all IDP (Identity Provider) settings.
+
+## Initial setup per tenant
+For each tenant in Auth0, a number of settings must be configured in order to
+access the management API.
+
+1. Access the newly created tenant in the Auth0 UI
+2. Create a new "Machine to Machine" application named "Spacelift/OpenTofu access to management API"
+3. Select the "Auth0 Management API" under "Authorized Machine to Machine Application"
+4. Select all Permissions
+5. From the newly created Application, copy the "Client ID" and "Client Secret"
+6. Create environment variables in the Spacelift UI for the stack that will be managing this tenant.
+7. The following environment variables should be set:
+
+* TF_VAR_auth0_client_id
+* TF_VAR_auth0_client_secret - Set this as "SECRET"
+* TF_VAR_auth0_domain
+
+By setting the above environment variables and running the stack, everything should be
+set up according to the requested stack resources.
+
+## Handing out credentials created by this process
+This stack creates a number of clients for various automated processes to
+authenticate themselves when sending data to the DPE kubernetes cluster. After the
+stack has run, hand out these credentials via LastPass:
+
+1) Create a new item in LastPass and set a useful name such as "Client/Secret to export telemetry data (DEV)"
+2) Retrieve the "Client ID" and "Client Secret" from the "Application" in the Auth0 UI
+3) Share the item in LastPass with the users requesting the credentials
+
+Once the user has the requested credentials, they will need to make sure that all
+requests sent to the DPE kubernetes cluster contain a Bearer token in the
+"Authorization" header of the HTTP request. The following document describes the process
+that an application would follow to exchange the "Client ID" and "Client Secret" for
+the access token: .
\ No newline at end of file
diff --git a/deployments/stacks/dpe-auth0/main.tf b/deployments/stacks/dpe-auth0/main.tf
new file mode 100644
index 00000000..780ea89d
--- /dev/null
+++ b/deployments/stacks/dpe-auth0/main.tf
@@ -0,0 +1,57 @@
+# Used to create the Auth0 resources for the DPE stack
+resource "auth0_resource_server" "k8s-cluster-telemetry" {
+  name        = "${var.cluster_name}-telemetry"
+  identifier  = "${var.cluster_name}-telemetry"
+  signing_alg = "RS256"
+
+  allow_offline_access = false
+  # 108000 seconds = 1.25 days
+  # An offset of 1.25 days allows a daily token refresh to occur by a simple cronjob
+  # for the services that use the token
+  token_lifetime                                  = 108000
+  skip_consent_for_verifiable_first_party_clients = true
+  # https://registry.terraform.io/providers/auth0/auth0/latest/docs/resources/resource_server_scopes
+  # Says to use the following; however, it errors out:
+  # This object has no argument, nested block, or exported attribute named "scopes".
+  # lifecycle {
+  #   ignore_changes = [scopes]
+  # }
+}
+
+resource "auth0_client" "oauth2_clients" {
+  for_each = { for client in var.auth0_clients : client.name => client }
+
+  name        = each.value.name
+  description = each.value.description
+  app_type    = each.value.app_type
+
+  jwt_configuration {
+    alg = "RS256"
+  }
+}
+
+resource "auth0_resource_server_scopes" "k8s-cluster-scopes" {
+  resource_server_identifier = auth0_resource_server.k8s-cluster-telemetry.identifier
+  # This scope is not yet used, however, kept for future use to grant authorization based on scopes
+  scopes {
+    name        = "write:telemetry"
+    description = "Grants write access to telemetry data"
+  }
+
+}
+
+
+resource "auth0_client_credentials" "client_secrets" {
+  for_each = { for client in auth0_client.oauth2_clients : client.name => client }
+
+  client_id             = auth0_client.oauth2_clients[each.key].id
+  authentication_method = "client_secret_post"
+}
+
+resource "auth0_client_grant" "access_to_k8s_cluster" {
+  for_each = { for client in var.auth0_clients : client.name => client }
+
+  client_id = auth0_client.oauth2_clients[each.key].id
+  audience  = auth0_resource_server.k8s-cluster-telemetry.identifier
+  scopes    = []
+}
diff --git a/deployments/stacks/dpe-auth0/provider.tf b/deployments/stacks/dpe-auth0/provider.tf
new file mode 100644
index 00000000..d60bfaaa
--- /dev/null
+++ b/deployments/stacks/dpe-auth0/provider.tf
@@ -0,0 +1,10 @@
+# Requires manually setting id and secret in the stack environment variables in the Spacelift UI
+# These come from auth0 > Applications > Applications > API Explorer Application > Settings
+# TF_VAR_auth0_client_id
+# TF_VAR_auth0_client_secret
+# TF_VAR_auth0_domain
+provider "auth0" {
+  domain        = var.auth0_domain
+  client_id     = var.auth0_client_id
+  client_secret = var.auth0_client_secret
+}
\ No newline at end of file
diff --git 
a/deployments/stacks/dpe-auth0/variables.tf b/deployments/stacks/dpe-auth0/variables.tf new file mode 100644 index 00000000..a348f001 --- /dev/null +++ b/deployments/stacks/dpe-auth0/variables.tf @@ -0,0 +1,28 @@ +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "auth0_domain" { + description = "Auth0 domain" + type = string +} + +variable "auth0_client_id" { + description = "Auth0 client ID" + type = string +} + +variable "auth0_client_secret" { + description = "Auth0 client secret" + type = string +} + +variable "auth0_clients" { + description = "List of clients to create in Auth0." + type = list(object({ + name = string + description = string + app_type = string + })) +} diff --git a/deployments/stacks/dpe-auth0/versions.tf b/deployments/stacks/dpe-auth0/versions.tf new file mode 100644 index 00000000..bd7a226f --- /dev/null +++ b/deployments/stacks/dpe-auth0/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + auth0 = { + source = "auth0/auth0" + version = "1.7.1" + } + } +} diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ebe7b1f7..5db44e36 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -85,11 +85,17 @@ module "signoz" { git_revision = var.git_revision namespace = "signoz" argo_deployment_name = "signoz" + enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress + gateway_namespace = "envoy-gateway" + cluster_name = var.cluster_name + auth0_jwks_uri = var.auth0_jwks_uri + smtp_password = var.smtp_password + smtp_user = var.smtp_user + smtp_from = var.smtp_from } module "envoy-gateway" { - # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095 - count = 0 + count = var.enable_cluster_ingress ? 1 : 0 depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" @@ -99,11 +105,12 @@ module "envoy-gateway" { git_revision = var.git_revision namespace = "envoy-gateway" argo_deployment_name = "envoy-gateway" + cluster_issuer_name = "lets-encrypt-prod" + ssl_hostname = var.ssl_hostname } module "cert-manager" { - # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095 - count = 0 + count = var.enable_cluster_ingress ? 
1 : 0 depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" @@ -114,22 +121,3 @@ module "cert-manager" { namespace = "cert-manager" argo_deployment_name = "cert-manager" } - -module "cluster-ingress" { - # TODO: This is temporary until we are ready to deploy the ingress controller: https://sagebionetworks.jira.com/browse/IBCDPE-1095 - count = 0 - depends_on = [module.argo-cd] - # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" - # version = "0.5.0" - source = "../../../modules/cluster-ingress" - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = var.git_revision - namespace = "envoy-gateway" - argo_deployment_name = "cluster-ingress" - - # To determine more elegant ways to fill in these values, for example, if we have - # a pre-defined DNS name for the cluster (https://sagebionetworks.jira.com/browse/IT-3931) - ssl_hostname = "unknown-to-fill-in" - cluster_issuer_name = "selfsigned" -} diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 5797a052..2b9be26a 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -65,3 +65,41 @@ variable "aws_account_id" { description = "AWS account ID" type = string } + +variable "enable_cluster_ingress" { + description = "Enable cluster ingress" + type = bool +} + +variable "enable_otel_ingress" { + description = "Enable OpenTelemetry ingress, used to send traces to SigNoz" + type = bool +} + +variable "ssl_hostname" { + description = "The hostname to use for the SSL certificate" + type = string +} + +variable "auth0_jwks_uri" { + description = "The JWKS URI for Auth0" + type = string +} + +variable "smtp_user" { + description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address. 
Required if smtp_user, smtp_password, and smtp_from are set"
+  type        = string
+  default     = ""
+}
diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf
index d9f9cf69..17c12f0d 100644
--- a/deployments/stacks/dpe-k8s/main.tf
+++ b/deployments/stacks/dpe-k8s/main.tf
@@ -36,3 +36,9 @@ module "sage-aws-eks" {
   private_subnet_ids_eks_control_plane = module.sage-aws-vpc.private_subnet_ids_eks_control_plane
   private_subnet_ids_eks_worker_nodes  = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes
 }
+
+module "sage-aws-ses" {
+  source = "../../../modules/sage-aws-ses"
+
+  email_identities = var.ses_email_identities
+}
diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf
index 19418d1c..4a062261 100644
--- a/deployments/stacks/dpe-k8s/outputs.tf
+++ b/deployments/stacks/dpe-k8s/outputs.tf
@@ -37,3 +37,12 @@ output "region" {
 output "cluster_name" {
   value = module.sage-aws-eks.cluster_name
 }
+
+output "smtp_user" {
+  value = module.sage-aws-ses.smtp_user
+}
+
+output "smtp_password" {
+  sensitive = true
+  value     = module.sage-aws-ses.smtp_password
+}
\ No newline at end of file
diff --git a/deployments/stacks/dpe-k8s/variables.tf b/deployments/stacks/dpe-k8s/variables.tf
index 6edc1cc1..9054a549 100644
--- a/deployments/stacks/dpe-k8s/variables.tf
+++ b/deployments/stacks/dpe-k8s/variables.tf
@@ -54,3 +54,8 @@ variable "azs_eks_worker_nodes" {
   type        = list(string)
   description = "Availability Zones for the EKS worker nodes"
 }
+
+variable "ses_email_identities" {
+  type        = list(string)
+  description = "List of email identities to be added to SES"
+}
diff --git a/modules/apache-airflow/README.md b/modules/apache-airflow/README.md
index 736303cf..96e4d7df 100644
--- a/modules/apache-airflow/README.md
+++ b/modules/apache-airflow/README.md
@@ -65,4 +65,20 @@ YAML
 ## Accessing the web UI
 An `admin` user is created for airflow via the `airflow-admin-user-secret` secret that is added to the namespace. Decode the base64 encoded password/username and use it for
-the UI.
\ No newline at end of file
+the UI.
+
+## Building a new image for airflow
+The deployment of our airflow instance depends on a custom Apache Airflow image being
+created and pushed to a publicly available GHCR URL. The image is created from the
+`orca-recipes` git repo (a consolidated sketch of the commands appears at the end of this section):
+
+1. Update the Dockerfile within the orca-recipes repo
+2. Build the new image `docker build .`
+3. Tag the built image with the tag you want to use `docker tag sha256:... ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1`
+4. Push to GHCR `docker push ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1` (may require an admin of the repo to push this)
+5. Update the `values.yaml` file in this `modules/apache-airflow/templates` directory.
+
+Transitive dependencies may also need to be updated when building a new image for
+airflow; for example, `py-orca` was updated in this example PR: .
+Additionally, this PR covers what was completed in order to update the
+requirements/dockerfile: .
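+
+As a consolidated sketch of steps 2-4 (the `0.0.1` tag is illustrative; substitute
+the version you intend to release):
+
+```bash
+# From the root of the orca-recipes repo: build and tag in one step.
+docker build -t ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1 .
+# Push to GHCR (may require an admin of the repo).
+docker push ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1
+```
+
+After the push, update `defaultAirflowRepository` and `defaultAirflowTag` in
+`modules/apache-airflow/templates/values.yaml` so the chart pulls the new image.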
diff --git a/modules/apache-airflow/main.tf b/modules/apache-airflow/main.tf index 3ed255a5..81c219ec 100644 --- a/modules/apache-airflow/main.tf +++ b/modules/apache-airflow/main.tf @@ -66,7 +66,7 @@ spec: sources: - repoURL: 'https://airflow.apache.org' chart: airflow - targetRevision: 1.11.0 + targetRevision: 1.15.0 helm: releaseName: airflow valueFiles: diff --git a/modules/apache-airflow/templates/values.yaml b/modules/apache-airflow/templates/values.yaml index da1c2df8..f4e3eaa9 100644 --- a/modules/apache-airflow/templates/values.yaml +++ b/modules/apache-airflow/templates/values.yaml @@ -25,6 +25,20 @@ fullnameOverride: "" # Provide a name to substitute for the name of the chart nameOverride: "" +# Use standard naming for all resources using airflow.fullname template +# Consider removing this later and default it to true +# to make this chart follow standard naming conventions using the fullname template. +# For now this is an opt-in switch for backwards compatibility to leverage the standard naming convention +# and being able to use fully fullnameOverride and nameOverride in all resources +# For new installations - it is recommended to set it to True to follow standard naming conventions +# For existing installations, this will rename and redeploy your resources with the new names. Be aware that +# this will recreate your deployment/statefulsets along with their persistent volume claims and data storage +# migration may be needed to keep your old data +# +# Note:fernet-key,redis-password and broker-url secrets don't use this logic yet, +# as this may break existing installations due to how they get installed via pre-install hook. +useStandardNaming: false + # Max number of old replicasets to retain. Can be overridden by each deployment's revisionHistoryLimit revisionHistoryLimit: ~ @@ -43,21 +57,24 @@ securityContexts: pod: {} containers: {} +# Global container lifecycle hooks for airflow containers +containerLifecycleHooks: {} + # Airflow home directory # Used for mount paths airflowHome: /opt/airflow # Default airflow repository -- overridden by all the specific images below -defaultAirflowRepository: bfaublesage/airflow +defaultAirflowRepository: ghcr.io/sage-bionetworks-workflows/orca-recipes # Default airflow tag to deploy -defaultAirflowTag: "2.7.1-python-3.10" +defaultAirflowTag: "0.1.0" # Default airflow digest. If specified, it takes precedence over tag defaultAirflowDigest: ~ # Airflow version (Used to make some decisions based on Airflow Version being deployed) -airflowVersion: "2.7.1" +airflowVersion: "2.9.3" # Images images: @@ -88,23 +105,25 @@ images: pullPolicy: IfNotPresent statsd: repository: quay.io/prometheus/statsd-exporter - tag: v0.22.8 + tag: v0.26.1 pullPolicy: IfNotPresent redis: repository: redis - tag: 7-bullseye + # Redis is limited to 7.2-bookworm due to licencing change + # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ + tag: 7.2-bookworm pullPolicy: IfNotPresent pgbouncer: repository: apache/airflow - tag: airflow-pgbouncer-2023.02.24-1.16.1 + tag: airflow-pgbouncer-2024.01.19-1.21.0 pullPolicy: IfNotPresent pgbouncerExporter: repository: apache/airflow - tag: airflow-pgbouncer-exporter-2023.02.21-0.14.0 + tag: airflow-pgbouncer-exporter-2024.06.18-0.17.0 pullPolicy: IfNotPresent gitSync: repository: registry.k8s.io/git-sync/git-sync - tag: v3.6.3 + tag: v4.1.0 pullPolicy: IfNotPresent # Select certain nodes for airflow pods. 
@@ -114,6 +133,7 @@ nodeSelector: { affinity: {} tolerations: [] topologySpreadConstraints: [] +schedulerName: ~ # Add common labels to all objects and pods defined in this chart. labels: {} @@ -142,6 +162,7 @@ ingress: # The hostnames or hosts configuration for the web Ingress hosts: [] + # # The hostname for the web Ingress (can be templated) # - name: "" # # configs for web Ingress TLS # tls: @@ -185,6 +206,7 @@ ingress: # The hostnames or hosts configuration for the flower Ingress hosts: [] + # # The hostname for the flower Ingress (can be templated) # - name: "" # tls: # # Enable TLS termination for the flower Ingress @@ -225,7 +247,8 @@ airflowLocalSettings: |- UIAlert( 'Usage of a dynamic webserver secret key detected. We recommend a static webserver secret key instead.' ' See the ' + '"https://airflow.apache.org/docs/helm-chart/stable/production-guide.html#webserver-secret-key" ' + 'target="_blank" rel="noopener noreferrer">' 'Helm Chart Production Guide for more details.', category="warning", roles=["Admin"], @@ -253,6 +276,8 @@ allowPodLaunching: true # Environment variables for all airflow containers env: [] +# - name: "" +# value: "" # Volumes for all airflow containers volumes: [] @@ -319,6 +344,11 @@ extraSecrets: {} # '{{ .Release.Name }}-other-secret-name-suffix': # data: | # ... +# 'proxy-config': +# stringData: | +# HTTP_PROXY: http://proxy_user:proxy_password@192.168.0.10:2080 +# HTTPS_PROXY: http://proxy_user:proxy_password@192.168.0.10:2080 +# NO_PROXY: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc" # Extra ConfigMaps that will be managed by the chart # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). @@ -513,6 +543,9 @@ workers: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -569,9 +602,36 @@ workers: # This configuration will be ignored if PGBouncer is not enabled usePgbouncer: true + # Allow HPA (KEDA must be disabled). + hpa: + enabled: false + + # Minimum number of workers created by HPA + minReplicaCount: 0 + + # Maximum number of workers created by HPA + maxReplicaCount: 5 + + # Specifications for which to use to calculate the desired replica count + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + + # Scaling behavior of the target in both Up and Down directions + behavior: {} + persistence: # Enable persistent volumes enabled: true + # This policy determines whether PVCs should be deleted when StatefulSet is scaled down or removed. 
+ persistentVolumeClaimRetentionPolicy: ~ + # persistentVolumeClaimRetentionPolicy: + # whenDeleted: Delete + # whenScaled: Delete # Volume size for worker StatefulSet size: 30Gi # If using a custom storageClass, pass name ref to all statefulSets here @@ -585,6 +645,8 @@ workers: # Detailed default security context for persistence for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} kerberosSidecar: # Enable kerberos sidecar @@ -599,6 +661,20 @@ workers: # Detailed default security context for kerberosSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + + kerberosInitContainer: + # Enable kerberos init container + enabled: false + resources: {} + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + resources: {} # limits: @@ -613,13 +689,13 @@ workers: # This setting tells kubernetes that its ok to evict # when it wants to scale a node down. - safeToEvict: true + safeToEvict: false - # Launch additional containers into worker. + # Launch additional containers into worker (templated). # Note: If used with KubernetesExecutor, you are responsible for signaling sidecars to exit when the main # container finishes so Airflow can continue the worker shutdown process! extraContainers: [] - # Add additional init containers into workers. + # Add additional init containers into workers (templated). extraInitContainers: [] # Mount additional volumes into worker. It can be templated like in the following example: @@ -639,7 +715,8 @@ workers: # Select certain nodes for airflow worker pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } runtimeClassName: ~ priorityClassName: ~ @@ -704,8 +781,34 @@ workers: env: [] + volumeClaimTemplates: [] + # Additional volumeClaimTemplates needed. + # Comment out the above and uncomment the section below to enable it. + # Add more as needed + # Make sure to mount it under extraVolumeMounts. + # volumeClaimTemplates: + # - metadata: + # name: data-volume-1 + # spec: + # storageClassName: "storage-class-1" + # accessModes: + # - "ReadWriteOnce" + # resources: + # requests: + # storage: "10Gi" + # - metadata: + # name: data-volume-2 + # spec: + # storageClassName: "storage-class-2" + # accessModes: + # - "ReadWriteOnce" + # resources: + # requests: + # storage: "20Gi" + # Airflow scheduler settings scheduler: + enabled: true # hostAliases for the scheduler pod hostAliases: [] # - ip: "127.0.0.1" @@ -723,6 +826,15 @@ scheduler: failureThreshold: 5 periodSeconds: 60 command: ~ + + # Wait for at most 1 minute (6*10s) for the scheduler container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + failureThreshold: 6 + periodSeconds: 10 + timeoutSeconds: 20 + command: ~ + # Airflow 2.0 allows users to run multiple schedulers, # However this feature is only recommended for MySQL 8+ and Postgres replicas: 1 @@ -753,6 +865,9 @@ scheduler: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -789,9 +904,9 @@ scheduler: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into scheduler. + # Launch additional containers into scheduler (templated). extraContainers: [] - # Add additional init containers into scheduler. 
+ # Add additional init containers into scheduler (templated). extraInitContainers: [] # Mount additional volumes into scheduler. It can be templated like in the following example: @@ -811,7 +926,8 @@ scheduler: # Select certain nodes for airflow scheduler pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: # default scheduler affinity is: @@ -855,6 +971,8 @@ scheduler: # Detailed default security context for logGroomerSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} waitForMigrations: # Whether to create init container to wait for db migrations @@ -915,6 +1033,9 @@ createUserJob: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -932,6 +1053,9 @@ createUserJob: # Launch additional containers into user creation job extraContainers: [] + # Add additional init containers into user creation job (templated). + extraInitContainers: [] + # Mount additional volumes into user creation job. It can be templated like in the following example: # extraVolumes: # - name: my-templated-extra-volume @@ -951,6 +1075,7 @@ createUserJob: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # In case you need to disable the helm hooks that create the jobs after install. # Disable this if you are using ArgoCD for example useHelmHooks: false @@ -977,10 +1102,12 @@ migrateDatabaseJob: args: - "bash" - "-c" - # The format below is necessary to get `helm lint` happy - - |- + - >- exec \ - airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "db upgrade" "upgradedb" }} + + airflow {{ semverCompare ">=2.7.0" .Values.airflowVersion + | ternary "db migrate" (semverCompare ">=2.0.0" .Values.airflowVersion + | ternary "db upgrade" "upgradedb") }} # Annotations on the database migration pod annotations: {} @@ -989,6 +1116,9 @@ migrateDatabaseJob: argocd.argoproj.io/hook: Sync argocd.argoproj.io/hook-delete-policy: HookSucceeded + # Labels specific to migrate database job objects and pods + labels: {} + # When not set, the values defined in the global securityContext will be used securityContext: {} # runAsUser: 50000 @@ -1000,6 +1130,9 @@ migrateDatabaseJob: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1025,6 +1158,9 @@ migrateDatabaseJob: # Launch additional containers into database migration job extraContainers: [] + # Add additional init containers into migrate database job (templated). + extraInitContainers: [] + # Mount additional volumes into database migration job. It can be templated like in the following example: # extraVolumes: # - name: my-templated-extra-volume @@ -1044,13 +1180,121 @@ migrateDatabaseJob: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # In case you need to disable the helm hooks that create the jobs after install. # Disable this if you are using ArgoCD for example useHelmHooks: false applyCustomEnv: true +# rpcServer support is experimental / dev purpose only and will later be renamed +_rpcServer: + enabled: false + + # Labels specific to workers objects and pods + labels: {} + + # Command to use when running the Airflow rpc server (templated). + command: + - "bash" + # Args to use when running the Airflow rpc server (templated). 
+ args: ["-c", "exec airflow internal-api"] + env: [] + serviceAccount: + # default value is true + # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ + automountServiceAccountToken: true + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the release name + name: ~ + + # Annotations to add to webserver kubernetes service account. + annotations: {} + service: + type: ClusterIP + ## service annotations + annotations: {} + ports: + - name: rpc-server + port: "{{ .Values.ports._rpcServer }}" + + loadBalancerIP: ~ + ## Limit load balancer source ips to list of CIDRs + # loadBalancerSourceRanges: + # - "10.123.0.0/16" + loadBalancerSourceRanges: [] + + podDisruptionBudget: + enabled: false + + # PDB configuration + config: + # minAvailable and maxUnavailable are mutually exclusive + maxUnavailable: 1 + # minAvailable: 1 + + # Detailed default security contexts for webserver deployments for container and pod level + securityContexts: + pod: {} + container: {} + + waitForMigrations: + # Whether to create init container to wait for db migrations + enabled: true + env: [] + # Detailed default security context for waitForMigrations for container level + securityContexts: + container: {} + + # Launch additional containers into the flower pods. + extraContainers: [] + + # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) + extraNetworkPolicies: [] + networkPolicy: + ingress: + # Peers for webserver NetworkPolicy ingress + from: [] + # Ports for webserver NetworkPolicy ingress (if `from` is set) + ports: + - port: "{{ .Values.ports._rpcServer }}" + + resources: {} + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + + livenessProbe: + initialDelaySeconds: 15 + timeoutSeconds: 5 + failureThreshold: 5 + periodSeconds: 10 + scheme: HTTP + + readinessProbe: + initialDelaySeconds: 15 + timeoutSeconds: 5 + failureThreshold: 5 + periodSeconds: 10 + scheme: HTTP + + # Wait for at most 1 minute (6*10s) for the RPC server container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + timeoutSeconds: 20 + failureThreshold: 6 + periodSeconds: 10 + scheme: HTTP + # Airflow webserver settings webserver: + enabled: true + # Add custom annotations to the webserver configmap + configMapAnnotations: {} # hostAliases for the webserver pod hostAliases: [] # - ip: "127.0.0.1" @@ -1074,6 +1318,14 @@ webserver: periodSeconds: 10 scheme: HTTP + # Wait for at most 1 minute (6*10s) for the webserver container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + timeoutSeconds: 20 + failureThreshold: 6 + periodSeconds: 10 + scheme: HTTP + # Number of webservers replicas: 1 # Max number of old replicasets to retain @@ -1123,6 +1375,9 @@ webserver: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) extraNetworkPolicies: [] networkPolicy: @@ -1151,9 +1406,9 @@ webserver: lastName: user password: admin # This is randomized during install - # Launch additional containers into webserver. + # Launch additional containers into webserver (templated). extraContainers: [] - # Add additional init containers into webserver. 
+ # Add additional init containers into webserver (templated). extraInitContainers: [] # Mount additional volumes into webserver. It can be templated like in the following example: @@ -1304,9 +1559,15 @@ triggerer: securityContexts: pod: {} container: {} + + # container level lifecycle hooks + containerLifecycleHooks: {} + persistence: # Enable persistent volumes enabled: true + # This policy determines whether PVCs should be deleted when StatefulSet is scaled down or removed. + persistentVolumeClaimRetentionPolicy: ~ # Volume size for triggerer StatefulSet size: 30Gi # If using a custom storageClass, pass name ref to all statefulSets here @@ -1333,9 +1594,9 @@ triggerer: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into triggerer. + # Launch additional containers into triggerer (templated). extraContainers: [] - # Add additional init containers into triggerers. + # Add additional init containers into triggerers (templated). extraInitContainers: [] # Mount additional volumes into triggerer. It can be templated like in the following example: @@ -1355,7 +1616,8 @@ triggerer: # Select certain nodes for airflow triggerer pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: # default triggerer affinity is: @@ -1400,6 +1662,9 @@ triggerer: securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + waitForMigrations: # Whether to create init container to wait for db migrations enabled: true @@ -1410,6 +1675,44 @@ triggerer: env: [] + # Allow KEDA autoscaling. + keda: + enabled: false + namespaceLabels: {} + + # How often KEDA polls the airflow DB to report new scale requests to the HPA + pollingInterval: 5 + + # How many seconds KEDA will wait before scaling to zero. + # Note that HPA has a separate cooldown period for scale-downs + cooldownPeriod: 30 + + # Minimum number of triggerers created by keda + minReplicaCount: 0 + + # Maximum number of triggerers created by keda + maxReplicaCount: 10 + + # Specify HPA related options + advanced: {} + # horizontalPodAutoscalerConfig: + # behavior: + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Percent + # value: 100 + # periodSeconds: 15 + + # Query to use for KEDA autoscaling. Must return a single integer. + query: >- + SELECT ceil(COUNT(*)::decimal / {{ .Values.config.triggerer.default_capacity }}) + FROM trigger + + # Whether to use PGBouncer to connect to the database or not when it is enabled + # This configuration will be ignored if PGBouncer is not enabled + usePgbouncer: false + # Airflow Dag Processor Config dagProcessor: enabled: false @@ -1463,6 +1766,9 @@ dagProcessor: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + resources: {} # limits: # cpu: 100m @@ -1478,9 +1784,9 @@ dagProcessor: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into dag processor. + # Launch additional containers into dag processor (templated). extraContainers: [] - # Add additional init containers into dag processors. + # Add additional init containers into dag processors (templated). extraInitContainers: [] # Mount additional volumes into dag processor. 
It can be templated like in the following example: @@ -1536,11 +1842,16 @@ dagProcessor: # requests: # cpu: 100m # memory: 128Mi + securityContexts: + container: {} waitForMigrations: # Whether to create init container to wait for db migrations enabled: true env: [] + # Detailed default security context for waitForMigrations for container level + securityContexts: + container: {} env: [] @@ -1549,6 +1860,19 @@ flower: # Enable flower. # If True, and using CeleryExecutor/CeleryKubernetesExecutor, will deploy flower app. enabled: false + + livenessProbe: + initialDelaySeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + periodSeconds: 5 + + readinessProbe: + initialDelaySeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + periodSeconds: 5 + # Max number of old replicasets to retain revisionHistoryLimit: ~ @@ -1592,6 +1916,9 @@ flower: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1667,6 +1994,9 @@ flower: # StatsD settings statsd: + # Add custom annotations to the statsd configmap + configMapAnnotations: {} + enabled: false # Max number of old replicasets to retain revisionHistoryLimit: ~ @@ -1705,6 +2035,9 @@ statsd: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Additional network policies as needed extraNetworkPolicies: [] resources: {} @@ -1737,6 +2070,7 @@ statsd: overrideMappings: [] podAnnotations: {} + env: [] # PgBouncer settings pgbouncer: @@ -1750,7 +2084,7 @@ pgbouncer: command: ["pgbouncer", "-u", "nobody", "/etc/pgbouncer/pgbouncer.ini"] # Args to use for PgBouncer(templated). args: ~ - auth_type: md5 + auth_type: scram-sha-256 auth_file: /etc/pgbouncer/users.txt # annotations to be added to the PgBouncer deployment @@ -1861,6 +2195,9 @@ pgbouncer: extraVolumes: [] extraVolumeMounts: [] + # Launch additional containers into pgbouncer. + extraContainers: [] + # Select certain nodes for PgBouncer pods. nodeSelector: {} affinity: {} @@ -1876,6 +2213,13 @@ pgbouncer: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: + preStop: + exec: + # Allow existing queries clients to complete within 120 seconds + command: ["/bin/sh", "-c", "killall -INT pgbouncer && sleep 120"] + metricsExporterSidecar: resources: {} # limits: @@ -1886,10 +2230,31 @@ pgbouncer: # memory: 128Mi sslmode: "disable" + # supply the name of existing secret with PGBouncer connection URI containing + # stats user and password. + # you can load them to a k8s secret like the one below + # apiVersion: v1 + # kind: Secret + # metadata: + # name: pgbouncer-stats-secret + # data: + # connection: postgresql://:@127.0.0.1:6543/pgbouncer? 
+ # type: Opaque + # + # statsSecretName: pgbouncer-stats-secret + # + statsSecretName: ~ + + # Key containing the PGBouncer connection URI, defaults to `connection` if not defined + statsSecretKey: ~ + # Detailed default security context for metricsExporterSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + livenessProbe: initialDelaySeconds: 10 periodSeconds: 10 @@ -1900,11 +2265,17 @@ pgbouncer: periodSeconds: 10 timeoutSeconds: 1 + # Environment variables to add to pgbouncer container + env: [] + # Configuration for the redis provisioned by the chart redis: enabled: true terminationGracePeriodSeconds: 600 + # Annotations for Redis Statefulset + annotations: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1929,6 +2300,11 @@ redis: # Annotations to add to redis volumes annotations: {} + # Configuration for empty dir volume (if redis.persistence.enabled == false) + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + resources: {} # limits: # cpu: 100m @@ -1951,11 +2327,13 @@ redis: # Select certain nodes for redis pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # Set to 0 for backwards-compatiblity uid: 0 @@ -1969,6 +2347,9 @@ redis: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + podAnnotations: {} # Auth secret for a private registry # This is used if pulling airflow images from a private registry @@ -1992,6 +2373,7 @@ elasticsearch: # Or an object representing the connection # Example: # connection: + # scheme: ~ # user: ~ # pass: ~ # host: ~ @@ -2009,6 +2391,8 @@ ports: statsdScrape: 9102 pgbouncer: 6543 pgbouncerScrape: 9127 + # rpcServer support is experimental / dev purpose only and will later be renamed + _rpcServer: 9080 # Define any ResourceQuotas for namespace quotas: {} @@ -2019,7 +2403,7 @@ limits: [] # This runs as a CronJob to cleanup old pods. cleanup: enabled: false - # Run every 15 minutes (templated). + # Run every 60 minutes (templated). schedule: "*/60 * * * *" # To select a random-ish, deterministic starting minute between 3 and 12 inclusive for each release: # '{{- add 3 (regexFind ".$" (adler32sum .Release.Name)) -}}-59/15 * * * *' @@ -2039,6 +2423,7 @@ cleanup: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ podAnnotations: {} @@ -2075,8 +2460,12 @@ cleanup: # Detailed default security context for cleanup for container level securityContexts: + pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Specify history limit # When set, overwrite the default k8s number of successful and failed CronJob executions that are saved. 
failedJobsHistoryLimit: ~ @@ -2086,8 +2475,6 @@ cleanup: # Not recommended for production postgresql: enabled: false - image: - tag: "11" auth: enablePostgresUser: true postgresPassword: postgres @@ -2113,7 +2500,7 @@ config: # For Airflow 1.10, backward compatibility; moved to [logging] in 2.0 colored_console_log: 'False' remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' - allowed_deserialization_classes: ".*" + allowed_deserialization_classes_regexp: ".*" logging: remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' colored_console_log: 'False' @@ -2121,13 +2508,13 @@ config: statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' statsd_port: 9125 statsd_prefix: airflow - statsd_host: '{{ printf "%s-statsd" .Release.Name }}' + statsd_host: '{{ printf "%s-statsd" (include "airflow.fullname" .) }}' webserver: enable_proxy_fix: 'True' # For Airflow 1.10 rbac: 'True' celery: - flower_url_prefix: '{{ .Values.ingress.flower.path }}' + flower_url_prefix: '{{ ternary "" .Values.ingress.flower.path (eq .Values.ingress.flower.path "/") }}' worker_concurrency: 16 scheduler: standalone_dag_processor: '{{ ternary "True" "False" .Values.dagProcessor.enabled }}' @@ -2135,7 +2522,7 @@ config: statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' statsd_port: 9125 statsd_prefix: airflow - statsd_host: '{{ printf "%s-statsd" .Release.Name }}' + statsd_host: '{{ printf "%s-statsd" (include "airflow.fullname" .) }}' # `run_duration` included for Airflow 1.10 backward compatibility; removed in 2.0. run_duration: 41460 elasticsearch: @@ -2173,6 +2560,8 @@ config: secrets: backend: airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend backend_kwargs: '{"connections_prefix": "airflow/connections", "variables_prefix": "airflow/variables", "region_name": "us-east-1"}' + triggerer: + default_capacity: 1000 # yamllint enable rule:line-length # Whether Airflow can launch workers and/or pods in multiple namespaces @@ -2206,6 +2595,9 @@ podTemplate: ~ # Git sync dags: + # Where dags volume will be mounted. Works for both persistence and gitSync. + # If not specified, dags mount path will be set to $AIRFLOW_HOME/dags + mountPath: ~ persistence: # Annotations for dags PVC annotations: {} @@ -2230,6 +2622,8 @@ dags: repo: https://github.com/Sage-Bionetworks-Workflows/orca-recipes branch: main rev: HEAD + # The git revision (branch, tag, or hash) to check out, v4 only + ref: v2-2-stable depth: 1 # the number of consecutive failures allowed before aborting maxFailures: 0 @@ -2244,8 +2638,12 @@ dags: # metadata: # name: git-credentials # data: + # # For git-sync v3 # GIT_SYNC_USERNAME: # GIT_SYNC_PASSWORD: + # # For git-sync v4 + # GITSYNC_USERNAME: + # GITSYNC_PASSWORD: # and specify the name of the secret below # # credentialsSecret: git-credentials @@ -2264,6 +2662,12 @@ dags: # and specify the name of the secret below # sshKeySecret: airflow-ssh-secret # + # Or set sshKeySecret with your key + # sshKey: |- + # -----BEGIN {OPENSSH PRIVATE KEY}----- + # ... + # -----END {OPENSSH PRIVATE KEY}----- + # # If you are using an ssh private key, you can additionally # specify the content of your known_hosts file, example: # @@ -2274,7 +2678,16 @@ dags: # interval between git sync attempts in seconds # high values are more likely to cause DAGs to become out of sync between different components # low values cause more traffic to the remote git repository + # Go-style duration string (e.g. "100ms" or "0.1s" = 100ms). 
+ # For backwards compatibility, wait will be used if it is specified. + period: 5s wait: 600 + # add variables from secret into gitSync containers, such proxy-config + envFrom: ~ + # envFrom: | + # - secretRef: + # name: 'proxy-config' + containerName: git-sync uid: 65533 @@ -2286,6 +2699,9 @@ dags: securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Mount additional volumes into git-sync. It can be templated like in the following example: # extraVolumeMounts: # - name: my-templated-extra-volume @@ -2297,6 +2713,11 @@ dags: # - name: "" # value: "" + # Configuration for empty dir volume + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + resources: {} # limits: # cpu: 100m @@ -2306,6 +2727,11 @@ dags: # memory: 128Mi logs: + # Configuration for empty dir volume (if logs.persistence.enabled == false) + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + persistence: # Enable persistent volume for storing logs enabled: false @@ -2317,3 +2743,4 @@ logs: storageClassName: gp3 ## the name of an existing PVC to use existingClaim: + diff --git a/modules/cluster-ingress/README.md b/modules/cluster-ingress/README.md deleted file mode 100644 index 6890c94c..00000000 --- a/modules/cluster-ingress/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Purpose -The purpose of this module is to deploy kubernetes resources related to ingress for -the cluster. Along with the ingress we will also deploy out the related SSL cert issuer. - -## To implemement -The Envoy Gateway can secure ingress by verifying JWT. It can be applied to a specific -target, for example this applies it to all requests going through a `Gateway` called `eg` - -``` -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: SecurityPolicy -metadata: - name: jwt-example -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: eg - jwt: - providers: - - name: auth0 - remoteJWKS: - uri: https://dev-57n3awu5je6q653y.us.auth0.com/.well-known/jwks.json -``` - - -The HTTPRoute is used to connect the envoy gateway ingress to a service in the cluster. -In this example the path `/get` routes the request to a service called `backend` on -port 3000. -``` -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: backend -spec: - parentRefs: - - name: eg - rules: - - backendRefs: - - group: "" - kind: Service - name: backend - port: 3000 - weight: 1 - matches: - - path: - type: PathPrefix - value: /get -``` \ No newline at end of file diff --git a/modules/cluster-ingress/main.tf b/modules/cluster-ingress/main.tf deleted file mode 100644 index ffd78f7c..00000000 --- a/modules/cluster-ingress/main.tf +++ /dev/null @@ -1,40 +0,0 @@ -resource "kubectl_manifest" "cluster-ingress" { - yaml_body = < +to make this happen. When an application needs to expose itself out to the internet the +application will need to create 2 resources: + +1) An `HTTPRoute`: which is used for specifying routing behavior of HTTP requests from a Gateway listener to a `Service`. +2) A `ReferenceGrant` which is used to enable cross namespace references within Gateway API. In particular, Routes may forward traffic to backends in other namespaces. + +For the above 2 resources the placement of them into the correct namespaces is required +for all permissions to work. The `HTTPRoute` exists within the namespace that this +module is deployed into. The `ReferenceGrant` is deployed into the namespace that is +exposing itself to the internet. 
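+
+As a minimal sketch (the `my-app` name, namespace, path, and port are illustrative
+assumptions; only the `eg` Gateway and the `envoy-gateway` namespace come from this
+module's resources), the pair of resources could look like:
+
+```
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: my-app
+  namespace: envoy-gateway   # the namespace this module is deployed into
+spec:
+  parentRefs:
+    - name: eg
+  rules:
+    - matches:
+        - path:
+            type: PathPrefix
+            value: /my-app
+      backendRefs:
+        - kind: Service
+          name: my-app
+          namespace: my-app  # cross-namespace reference, permitted by the grant below
+          port: 80
+---
+apiVersion: gateway.networking.k8s.io/v1beta1
+kind: ReferenceGrant
+metadata:
+  name: allow-envoy-gateway-httproutes
+  namespace: my-app          # the namespace exposing itself to the internet
+spec:
+  from:
+    - group: gateway.networking.k8s.io
+      kind: HTTPRoute
+      namespace: envoy-gateway
+  to:
+    - group: ""
+      kind: Service
+```
+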
The usage of this model prevents the gateway from +forwarding traffic to namespaces or services which have not explicitly allowed traffic. + + diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index 147c94db..25b2c7ff 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -31,6 +31,38 @@ spec: - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' targetRevision: ${var.git_revision} ref: values + - repoURL: 'https://github.com/Sage-Bionetworks-Workflows/eks-stack.git' + targetRevision: ${var.git_revision} + path: modules/envoy-gateway/resources + kustomize: + patches: + - target: + kind: ClusterIssuer + patch: |- + - op: replace + path: /metadata/name + value: ${var.cluster_issuer_name} + - target: + kind: GatewayClass + patch: |- + - op: replace + path: /spec/parametersRef/namespace + value: ${var.namespace} + - target: + kind: Gateway + patch: |- + - op: replace + path: /metadata/annotations/cert-manager.io~1cluster-issuer + value: ${var.cluster_issuer_name} + - op: replace + path: /spec/listeners/0/hostname + value: ${var.ssl_hostname} + - target: + kind: ClusterIssuer + patch: |- + - op: replace + path: /metadata/name + value: ${var.cluster_issuer_name} destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} diff --git a/modules/envoy-gateway/resources/cert-issuer.yaml b/modules/envoy-gateway/resources/cert-issuer.yaml new file mode 100644 index 00000000..4608c97a --- /dev/null +++ b/modules/envoy-gateway/resources/cert-issuer.yaml @@ -0,0 +1,18 @@ +# To implement using something like letsencrypt +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: lets-encrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: "dpe@sagebase.org" + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + gatewayHTTPRoute: + parentRefs: + - kind: Gateway + name: eg + namespace: envoy-gateway diff --git a/modules/envoy-gateway/resources/envoy-proxy.yaml b/modules/envoy-gateway/resources/envoy-proxy.yaml new file mode 100644 index 00000000..bb6dbb39 --- /dev/null +++ b/modules/envoy-gateway/resources/envoy-proxy.yaml @@ -0,0 +1,6 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyProxy +metadata: + name: custom-proxy-config +spec: + mergeGateways: false \ No newline at end of file diff --git a/modules/cluster-ingress/resources/gateway-class.yaml b/modules/envoy-gateway/resources/gateway-class.yaml similarity index 53% rename from modules/cluster-ingress/resources/gateway-class.yaml rename to modules/envoy-gateway/resources/gateway-class.yaml index a619d17a..67246b2e 100644 --- a/modules/cluster-ingress/resources/gateway-class.yaml +++ b/modules/envoy-gateway/resources/gateway-class.yaml @@ -4,3 +4,8 @@ metadata: name: eg spec: controllerName: gateway.envoyproxy.io/gatewayclass-controller + parametersRef: + group: gateway.envoyproxy.io + kind: EnvoyProxy + name: custom-proxy-config + namespace: envoy-gateway diff --git a/modules/cluster-ingress/resources/gateway.yaml b/modules/envoy-gateway/resources/gateway.yaml similarity index 100% rename from modules/cluster-ingress/resources/gateway.yaml rename to modules/envoy-gateway/resources/gateway.yaml diff --git a/modules/envoy-gateway/resources/http-to-https-redirect.yaml b/modules/envoy-gateway/resources/http-to-https-redirect.yaml new file mode 100644 index 00000000..73dc9836 --- /dev/null +++ b/modules/envoy-gateway/resources/http-to-https-redirect.yaml @@ -0,0 +1,16 @@ +# 
Upgrades HTTP requests to HTTPS
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: tls-redirect
+spec:
+  parentRefs:
+    - name: eg
+      sectionName: http
+  hostnames:
+    - "*.sagedpe.org"
+  rules:
+    - filters:
+        - type: RequestRedirect
+          requestRedirect:
+            scheme: https
\ No newline at end of file
diff --git a/modules/cluster-ingress/resources/kustomization.yaml b/modules/envoy-gateway/resources/kustomization.yaml
similarity index 75%
rename from modules/cluster-ingress/resources/kustomization.yaml
rename to modules/envoy-gateway/resources/kustomization.yaml
index be0a9119..cea76074 100644
--- a/modules/cluster-ingress/resources/kustomization.yaml
+++ b/modules/envoy-gateway/resources/kustomization.yaml
@@ -1,7 +1,9 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- gateway.yaml
 - cert-issuer.yaml
 - gateway-class.yaml
+- envoy-proxy.yaml
+- gateway.yaml
 - traffic-policy.yaml
+- http-to-https-redirect.yaml
diff --git a/modules/cluster-ingress/resources/traffic-policy.yaml b/modules/envoy-gateway/resources/traffic-policy.yaml
similarity index 100%
rename from modules/cluster-ingress/resources/traffic-policy.yaml
rename to modules/envoy-gateway/resources/traffic-policy.yaml
diff --git a/modules/envoy-gateway/variables.tf b/modules/envoy-gateway/variables.tf
index 770d80bd..03f078cd 100644
--- a/modules/envoy-gateway/variables.tf
+++ b/modules/envoy-gateway/variables.tf
@@ -25,3 +25,13 @@ variable "namespace" {
   description = "The namespace to deploy into"
   type        = string
 }
+
+variable "cluster_issuer_name" {
+  description = "The name of the cluster issuer"
+  type        = string
+}
+
+variable "ssl_hostname" {
+  description = "The hostname to use for the SSL certificate"
+  type        = string
+}
diff --git a/modules/sage-aws-ses/README.md b/modules/sage-aws-ses/README.md
new file mode 100644
index 00000000..47561e34
--- /dev/null
+++ b/modules/sage-aws-ses/README.md
@@ -0,0 +1,35 @@
+# Purpose
+This module is used to set up SES (Simple Email Service) in AWS.
+
+By setting a few variables we are able to register a number of email addresses
+with AWS SES. The variables to be set are:
+
+- `email_identities`, example: `["example@sagebase.org"]`
+
+# Manual steps required
+After running this module, a number of manual steps are required, as they are external
+processes that cannot be automated here:
+
+## Verify Email address
+1) Navigate to Amazon SES in the web console
+2) Navigate to `identities`
+3) Choose the identity to verify
+4) Send a test email and click the link received to verify the email
+
+Optional: Send a test email after verifying to confirm you can receive emails
+
+# Request production access
+After creating the AWS SES settings for the first time you will be in "Sandbox" mode. To
+request production access, follow the AWS SES documentation
+under the section "To request that your account be removed from the Amazon SES sandbox using the AWS CLI".
+
+The command will look something like:
+
+```
+aws sesv2 put-account-details \
+--production-access-enabled \
+--mail-type TRANSACTIONAL \
+--website-url https://www.synapse.org/ \
+--additional-contact-email-addresses dpe@sagebase.org \
+--contact-language EN
+```
diff --git a/modules/sage-aws-ses/data.tf b/modules/sage-aws-ses/data.tf
new file mode 100644
index 00000000..3fe7d177
--- /dev/null
+++ b/modules/sage-aws-ses/data.tf
@@ -0,0 +1,6 @@
+data "aws_iam_policy_document" "ses_sender" {
+  statement {
+    actions   = ["ses:SendRawEmail"]
+    resources = ["*"]
+  }
+}
\ No newline at end of file
diff --git a/modules/sage-aws-ses/main.tf b/modules/sage-aws-ses/main.tf
new file mode 100644
index 00000000..f733d8d2
--- /dev/null
+++ b/modules/sage-aws-ses/main.tf
@@ -0,0 +1,23 @@
+resource "aws_ses_email_identity" "identities" {
+  for_each = { for identity in var.email_identities : identity => identity }
+  email    = each.value
+}
+
+resource "aws_iam_user" "smtp_user" {
+  name = "smtp_user"
+}
+
+resource "aws_iam_access_key" "smtp_user" {
+  user = aws_iam_user.smtp_user.name
+}
+
+resource "aws_iam_policy" "ses_sender" {
+  name        = "ses_sender"
+  description = "Allows sending of e-mails via Simple Email Service"
+  policy      = data.aws_iam_policy_document.ses_sender.json
+}
+
+resource "aws_iam_user_policy_attachment" "test-attach" {
+  user       = aws_iam_user.smtp_user.name
+  policy_arn = aws_iam_policy.ses_sender.arn
+}
diff --git a/modules/sage-aws-ses/ouputs.tf b/modules/sage-aws-ses/ouputs.tf
new file mode 100644
index 00000000..6a43bb52
--- /dev/null
+++ b/modules/sage-aws-ses/ouputs.tf
@@ -0,0 +1,9 @@
+
+output "smtp_user" {
+  value = aws_iam_access_key.smtp_user.id
+}
+
+output "smtp_password" {
+  sensitive = true
+  value     = aws_iam_access_key.smtp_user.ses_smtp_password_v4
+}
\ No newline at end of file
diff --git a/modules/sage-aws-ses/variables.tf b/modules/sage-aws-ses/variables.tf
new file mode 100644
index 00000000..d923b190
--- /dev/null
+++ b/modules/sage-aws-ses/variables.tf
@@ -0,0 +1,12 @@
+variable "email_identities" {
+  type        = list(string)
+  description = "List of email identities to be added to SES"
+}
+
+variable "tags" {
+  description = "AWS Resource Tags"
+  type        = map(string)
+  default = {
+    "CostCenter" = "No Program / 000000"
+  }
+}
diff --git a/modules/sage-aws-ses/versions.tf b/modules/sage-aws-ses/versions.tf
new file mode 100644
index 00000000..cba4c144
--- /dev/null
+++ b/modules/sage-aws-ses/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/modules/signoz/README.md b/modules/signoz/README.md
index a84055a2..efdf9ffb 100644
--- a/modules/signoz/README.md
+++ b/modules/signoz/README.md
@@ -13,7 +13,55 @@ A number of items are needed:
 
 - Setting up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094
 - Set up ingress to the cluster/collector to send data to: https://sagebionetworks.jira.com/browse/IBCDPE-1095
 - Set up accounts and access to the service decleratively
-## Accessing signoz
+
+## Setting up SMTP for alertmanager
+Alertmanager is an additional tool deployed to the kubernetes cluster that
+handles forwarding an alert out to one or more streams that will receive the alert.
+Alertmanager is configured to send emails through AWS SES (Simple Email Service), set up
+by the `modules/sage-aws-ses` terraform scripts. See that module for more information
+about the setup of AWS SES.
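+
+As an illustrative sketch (the wiring below mirrors how the deployment stack passes
+these values; treat it as an example rather than the exact configuration), the three
+SMTP variables are handed to the signoz module, and alertmanager is only enabled when
+all three are non-empty:
+
+```
+module "signoz" {
+  source = "../../../modules/signoz"
+  # ... other variables ...
+
+  # Credentials produced by the sage-aws-ses module and the sender address
+  smtp_user     = var.smtp_user
+  smtp_password = var.smtp_password
+  smtp_from     = var.smtp_from
+}
+```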
+
+## Accessing signoz (Internet)
+
+#### Sending data into signoz (from the internet)
+When SigNoz is deployed with the terraform variables `enable_otel_ingress` and `gateway_namespace`
+set, an HTTP route to the OpenTelemetry collector will be exposed to the internet.
+Using the defined URL a user may send telemetry data via HTTPS with a Bearer auth token
+into the cluster. To accomplish this the sender of the data will need to configure
+the sending application with the appropriate HTTPS URL and authentication (different
+depending on the sender). The paths to send data to will be as follows:
+
+- `/telemetry/v1/traces`
+- `/telemetry/v1/metrics`
+- `/telemetry/v1/logs`
+
+
+Unauthenticated requests will be rejected with an HTTP 401.
+
+#### Authentication
+Authentication for data being sent into the cluster will occur via a JWT Bearer token.
+As the sender, you will be required to ensure that every request sent has an unexpired
+and valid token. The exact mechanism for attaching this authentication will change
+depending on how data is forwarded into the cluster. For example, if using an
+OpenTelemetry collector you may use the OAuth2 client auth extension from the
+OpenTelemetry Collector contrib distribution.
+
+##### Authentication from Python application directly
+If you are sending data directly from an application into the cluster you may specify
+an environment variable with headers to attach to the requests by setting:
+`OTEL_EXPORTER_OTLP_HEADERS=Authorization=Bearer ...`
+
+> [!NOTE]
+> This method is only a temporary solution as Bearer tokens will expire and need to be rotated.
+
+Future work would be to determine if we may be able to implement the usage of
+an OAuth2 client-credentials extension to handle automatic token fetching
+using a client ID/client secret with Auth0 (or a related IdP).
+
+
+## Accessing signoz (Port-forwarding)
+This guide is for those who have access to the kubernetes cluster and are using
+port-forwarding to access the data in the cluster.
 
 ### Pre-req
 This assumes that you have accessed the k8s cluster before using `k9s` or another tool.
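+
+A minimal sketch of the port-forward itself (the service name and port are
+assumptions; confirm them with `kubectl get svc -n signoz` against your deployment):
+
+```
+kubectl port-forward -n signoz svc/signoz-frontend 3301:3301
+# Then open http://localhost:3301 in a browser to reach the SigNoz UI
+```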
diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 40b456f6..896f7c00 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -1,3 +1,6 @@ +locals { + alertmanager_enabled = var.smtp_from != "" && var.smtp_user != "" && var.smtp_password != "" +} resource "kubernetes_namespace" "signoz" { metadata { @@ -7,7 +10,7 @@ resource "kubernetes_namespace" "signoz" { resource "kubectl_manifest" "signoz-deployment" { depends_on = [kubernetes_namespace.signoz] - + yaml_body = < diff --git a/modules/signoz/templates/values.yaml b/modules/signoz/templates/values.yaml index c160e354..e11387f3 100644 --- a/modules/signoz/templates/values.yaml +++ b/modules/signoz/templates/values.yaml @@ -990,7 +990,7 @@ frontend: # Default values for Alertmanager alertmanager: - enabled: false + enabled: name: "alertmanager" replicaCount: 1 @@ -1036,8 +1036,14 @@ alertmanager: nodePort: null # -- Additional environments to set for Alertmanager - additionalEnvs: {} - # env_key: env_value + additionalEnvs: + ALERTMANAGER_SMTP_FROM: + ALERTMANAGER_SMTP_HOST: email-smtp.us-east-1.amazonaws.com + # 587 is the STARTTLS port for SMTP + # https://docs.aws.amazon.com/ses/latest/dg/smtp-connect.html#smtp-connect-starttls + ALERTMANAGER_SMTP_PORT: "587" + ALERTMANAGER_SMTP_AUTH_USERNAME: + ALERTMANAGER_SMTP_AUTH_PASSWORD: initContainers: init: diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 770d80bd..2a917ff1 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -25,3 +25,43 @@ variable "namespace" { description = "The namespace to deploy into" type = string } + + +variable "enable_otel_ingress" { + description = "Enable OpenTelemetry ingress" + type = bool + default = false +} + +variable "gateway_namespace" { + description = "The namespace of the gateway" + type = string +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "auth0_jwks_uri" { + description = "The JWKS URI for Auth0" + type = string +} + +variable "smtp_user" { + description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address. 
Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} From f314bde3e92b49d7a2ce9fab139ee072ce161581 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:39:21 -0700 Subject: [PATCH 65/85] Remove py file --- .../create-synapse-oauth-client.py | 26 ------------------- 1 file changed, 26 deletions(-) delete mode 100644 modules/envoy-gateway/create-synapse-oauth-client.py diff --git a/modules/envoy-gateway/create-synapse-oauth-client.py b/modules/envoy-gateway/create-synapse-oauth-client.py deleted file mode 100644 index 2f0b23c5..00000000 --- a/modules/envoy-gateway/create-synapse-oauth-client.py +++ /dev/null @@ -1,26 +0,0 @@ -import synapseclient -import json -syn = synapseclient.login() - -client_meta_data = { - 'client_name': '', - 'redirect_uris': [ - '' - ], - # 'client_uri': 'https://yourhost.com/index.html', - # 'policy_uri': 'https://yourhost.com/policy', - # 'tos_uri': 'https://yourhost.com/terms_of_service', - 'userinfo_signed_response_alg': 'RS256' -} - -# Create the client: -client_meta_data = syn.restPOST(uri='/oauth2/client', - endpoint=syn.authEndpoint, body=json.dumps(client_meta_data)) - -client_id = client_meta_data['client_id'] - -# Generate and retrieve the client secret: -client_id_and_secret = syn.restPOST(uri='/oauth2/client/secret/'+client_id, - endpoint=syn.authEndpoint, body='') - -print(client_id_and_secret) From d4bf8959f3bc3f22ed59ad03fd4494632df90733 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:40:34 -0700 Subject: [PATCH 66/85] Update readme note --- modules/signoz/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/signoz/README.md b/modules/signoz/README.md index efdf9ffb..4cdcfa66 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -6,12 +6,11 @@ SigNoz is an open-source APM. It helps developers monitor their applications source Application Performance Monitoring (APM) & Observability tool. 
-## This module is a work in progress +## This module is a work in progress (To be completed before production, or determine if not needed) A number of items are needed: - Setting up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094 -- Set up ingress to the cluster/collector to send data to: https://sagebionetworks.jira.com/browse/IBCDPE-1095 -- Set up accounts and access to the service decleratively +- Set up accounts and access to the service declaratively ## Setting up SMTP for alertmanager From ae8eacb496a4a5ef03b79e770435b43e3fe52e84 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:49:35 -0700 Subject: [PATCH 67/85] Remove comments about moving to provider --- modules/cert-manager/versions.tf | 7 ------- modules/envoy-gateway/versions.tf | 7 ------- modules/signoz/versions.tf | 7 ------- 3 files changed, 21 deletions(-) diff --git a/modules/cert-manager/versions.tf b/modules/cert-manager/versions.tf index 28b5ab89..ce834c32 100644 --- a/modules/cert-manager/versions.tf +++ b/modules/cert-manager/versions.tf @@ -13,12 +13,5 @@ terraform { version = "1.14.0" } } - # TODO: Move to this provider - # required_providers { - # argocd = { - # source = "oboukili/argocd" - # version = "6.1.1" - # } - # } } diff --git a/modules/envoy-gateway/versions.tf b/modules/envoy-gateway/versions.tf index 28b5ab89..ce834c32 100644 --- a/modules/envoy-gateway/versions.tf +++ b/modules/envoy-gateway/versions.tf @@ -13,12 +13,5 @@ terraform { version = "1.14.0" } } - # TODO: Move to this provider - # required_providers { - # argocd = { - # source = "oboukili/argocd" - # version = "6.1.1" - # } - # } } diff --git a/modules/signoz/versions.tf b/modules/signoz/versions.tf index 28b5ab89..ce834c32 100644 --- a/modules/signoz/versions.tf +++ b/modules/signoz/versions.tf @@ -13,12 +13,5 @@ terraform { version = "1.14.0" } } - # TODO: Move to this provider - # required_providers { - # argocd = { - # source = "oboukili/argocd" - # version = "6.1.1" - # } - # } } From fc53860a6105ca28d39c951b9a649a736c6cc13d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:02:04 -0700 Subject: [PATCH 68/85] Upgrade helmchart for signoz (#46) * Upgrade helmchart for signoz to 0.55.1 --- modules/signoz/main.tf | 4 +- modules/signoz/templates/values.yaml | 113 ++++++++++++++++++--------- 2 files changed, 78 insertions(+), 39 deletions(-) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 896f7c00..e1c40079 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -10,7 +10,7 @@ resource "kubernetes_namespace" "signoz" { resource "kubectl_manifest" "signoz-deployment" { depends_on = [kubernetes_namespace.signoz] - + yaml_body = < Date: Wed, 6 Nov 2024 15:09:47 -0700 Subject: [PATCH 69/85] Deploy SES module only when emails are provided --- deployments/stacks/dpe-k8s/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf index 17c12f0d..325a0333 100644 --- a/deployments/stacks/dpe-k8s/main.tf +++ b/deployments/stacks/dpe-k8s/main.tf @@ -38,6 +38,7 @@ module "sage-aws-eks" { } module "sage-aws-ses" { + count = length(var.ses_email_identities) > 0 ? 
1 : 0 source = "../../../modules/sage-aws-ses" email_identities = var.ses_email_identities From 1db7b42b200cde3695ed857277b95d62ab4f2579 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:15:59 -0700 Subject: [PATCH 70/85] Correct output conditional --- deployments/stacks/dpe-k8s/outputs.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 4a062261..6851f6a7 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ b/deployments/stacks/dpe-k8s/outputs.tf @@ -39,10 +39,10 @@ output "cluster_name" { } output "smtp_user" { - value = module.sage-aws-ses.smtp_user + value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : null } output "smtp_password" { sensitive = true - value = module.sage-aws-ses.smtp_password -} \ No newline at end of file + value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : null +} From 7f652ec371786fc5a1d0ad6d714fcea0118e183b Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:19:31 -0700 Subject: [PATCH 71/85] Create moved blocks for resources --- deployments/stacks/dpe-k8s/main.tf | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf index 325a0333..cca29b5e 100644 --- a/deployments/stacks/dpe-k8s/main.tf +++ b/deployments/stacks/dpe-k8s/main.tf @@ -37,6 +37,31 @@ module "sage-aws-eks" { private_subnet_ids_eks_worker_nodes = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes } +moved { + from = aws_iam_access_key.smtp_user + to = module.sage-aws-ses[0].aws_iam_access_key.smtp_user +} + +moved { + from = aws_iam_policy.ses_sender + to = module.sage-aws-ses[0].aws_iam_policy.ses_sender +} + +moved { + from = aws_iam_user.smtp_user + to = module.sage-aws-ses[0].aws_iam_user.smtp_user +} + +moved { + from = aws_iam_user_policy_attachment.test-attach + to = module.sage-aws-ses[0].aws_iam_user_policy_attachment.test-attach +} + +moved { + from = aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"] + to = module.sage-aws-ses[0].aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"] +} + module "sage-aws-ses" { count = length(var.ses_email_identities) > 0 ? 
1 : 0
  source = "../../../modules/sage-aws-ses"

  email_identities = var.ses_email_identities
From 24bc617b7eb9fe1bd0106df443c6b8d69b7a90a0 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:22:04 -0700
Subject: [PATCH 72/85] Correct moved blocks (Bad AI)

---
 deployments/stacks/dpe-k8s/main.tf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf
index cca29b5e..ea857276 100644
--- a/deployments/stacks/dpe-k8s/main.tf
+++ b/deployments/stacks/dpe-k8s/main.tf
@@ -38,27 +38,27 @@ module "sage-aws-eks" {
 }
 
 moved {
-  from = aws_iam_access_key.smtp_user
+  from = module.sage-aws-ses.aws_iam_access_key.smtp_user
   to   = module.sage-aws-ses[0].aws_iam_access_key.smtp_user
 }
 
 moved {
-  from = aws_iam_policy.ses_sender
+  from = module.sage-aws-ses.aws_iam_policy.ses_sender
   to   = module.sage-aws-ses[0].aws_iam_policy.ses_sender
 }
 
 moved {
-  from = aws_iam_user.smtp_user
+  from = module.sage-aws-ses.aws_iam_user.smtp_user
   to   = module.sage-aws-ses[0].aws_iam_user.smtp_user
 }
 
 moved {
-  from = aws_iam_user_policy_attachment.test-attach
+  from = module.sage-aws-ses.aws_iam_user_policy_attachment.test-attach
   to   = module.sage-aws-ses[0].aws_iam_user_policy_attachment.test-attach
 }
 
 moved {
-  from = aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"]
+  from = module.sage-aws-ses.aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"]
   to   = module.sage-aws-ses[0].aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"]
 }
 
From b8509dff6105ca28d39c951b9a649a736c6cc13d Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 6 Nov 2024 15:26:00 -0700
Subject: [PATCH 73/85] Remove moved blocks as they're not needed

---
 deployments/stacks/dpe-k8s/main.tf | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf
index ea857276..325a0333 100644
--- a/deployments/stacks/dpe-k8s/main.tf
+++ b/deployments/stacks/dpe-k8s/main.tf
@@ -37,31 +37,6 @@ module "sage-aws-eks" {
   private_subnet_ids_eks_worker_nodes = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes
 }
 
-moved {
-  from = module.sage-aws-ses.aws_iam_access_key.smtp_user
-  to   = module.sage-aws-ses[0].aws_iam_access_key.smtp_user
-}
-
-moved {
-  from = module.sage-aws-ses.aws_iam_policy.ses_sender
-  to   = module.sage-aws-ses[0].aws_iam_policy.ses_sender
-}
-
-moved {
-  from = module.sage-aws-ses.aws_iam_user.smtp_user
-  to   = module.sage-aws-ses[0].aws_iam_user.smtp_user
-}
-
-moved {
-  from = module.sage-aws-ses.aws_iam_user_policy_attachment.test-attach
-  to   = module.sage-aws-ses[0].aws_iam_user_policy_attachment.test-attach
-}
-
-moved {
-  from = module.sage-aws-ses.aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"]
-  to   = module.sage-aws-ses[0].aws_ses_email_identity.identities["aws-dpe-dev@sagebase.org"]
-}
-
 module "sage-aws-ses" {
   count  = length(var.ses_email_identities) > 0 ?
1 : 0 source = "../../../modules/sage-aws-ses" From fe1e37b27c9116092cd778ff38b307aa42a91bc0 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:33:10 -0700 Subject: [PATCH 74/85] Conditionally deploy auth0 spacelift stack --- deployments/main.tf | 5 ++++- deployments/spacelift/dpe-k8s/main.tf | 15 +++++++++++++-- deployments/spacelift/dpe-k8s/variables.tf | 6 ++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index f75f13b1..c42d5c91 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -71,7 +71,9 @@ module "dpe-sandbox-spacelift-development" { enable_otel_ingress = true ssl_hostname = "dev.sagedpe.org" auth0_jwks_uri = "https://dev-sage-dpe.us.auth0.com/.well-known/jwks.json" - ses_email_identities = ["aws-dpe-dev@sagebase.org"] + deploy_auth0 = false + + ses_email_identities = ["aws-dpe-dev@sagebase.org"] # Defines the email address that will be used as the sender of the email alerts smtp_from = "aws-dpe-dev@sagebase.org" } @@ -118,6 +120,7 @@ module "dpe-sandbox-spacelift-production" { enable_otel_ingress = false ssl_hostname = "" auth0_jwks_uri = "" + deploy_auth0 = false ses_email_identities = [] } diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index c29b3ce8..e91a43e8 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -29,11 +29,11 @@ locals { smtp_from = var.smtp_from } - auth0_stack_variables = { + auth0_stack_variables = var.deploy_auth0 ? { cluster_name = var.cluster_name auth0_domain = var.auth0_domain auth0_clients = var.auth0_clients - } + } : {} # Variables to be passed from the k8s stack to the deployments stack k8s_stack_to_deployment_variables = { @@ -216,8 +216,18 @@ resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration write = true } +moved { + from = spacelift_stack.auth0 + to = spacelift_stack.auth0[0] +} + +moved { + from = spacelift_stack_destructor.auth0-stack-destructor + to = spacelift_stack_destructor.auth0-stack-destructor[0] +} resource "spacelift_stack" "auth0" { + count = var.deploy_auth0 ? 1 : 0 github_enterprise { namespace = "Sage-Bionetworks-Workflows" id = "sage-bionetworks-workflows-gh" @@ -243,6 +253,7 @@ resource "spacelift_stack" "auth0" { } resource "spacelift_stack_destructor" "auth0-stack-destructor" { + count = var.deploy_auth0 ? 
1 : 0 stack_id = spacelift_stack.auth0.id } diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index f2fa71c8..5c7d31b0 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -153,6 +153,12 @@ variable "ssl_hostname" { type = string } +variable "deploy_auth0" { + description = "Determines if a stack for Auth0 should be deployed" + type = bool + default = false +} + variable "auth0_jwks_uri" { description = "The JWKS URI for Auth0" type = string From 4a4da49e2bc34ea6087048f209d0443ccfbda724 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:35:58 -0700 Subject: [PATCH 75/85] Don't autodeploy admin stack and do deploy auth0 for dev --- deployments/main.tf | 2 +- main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index c42d5c91..aef63ffd 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -71,7 +71,7 @@ module "dpe-sandbox-spacelift-development" { enable_otel_ingress = true ssl_hostname = "dev.sagedpe.org" auth0_jwks_uri = "https://dev-sage-dpe.us.auth0.com/.well-known/jwks.json" - deploy_auth0 = false + deploy_auth0 = true ses_email_identities = ["aws-dpe-dev@sagebase.org"] # Defines the email address that will be used as the sender of the email alerts diff --git a/main.tf b/main.tf index 036cab9c..f14e77d9 100644 --- a/main.tf +++ b/main.tf @@ -20,7 +20,7 @@ resource "spacelift_stack" "root_administrative_stack" { } administrative = true - autodeploy = true + autodeploy = false branch = local.git_branch description = "Manages other spacelift resources" name = "Root Spacelift Administrative Stack" From 57741029d667e45be0ee61cb96b9f3b9a039ebf7 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:37:35 -0700 Subject: [PATCH 76/85] Point to specific resource instance --- deployments/spacelift/dpe-k8s/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index e91a43e8..a5f6b44f 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -254,14 +254,14 @@ resource "spacelift_stack" "auth0" { resource "spacelift_stack_destructor" "auth0-stack-destructor" { count = var.deploy_auth0 ? 1 : 0 - stack_id = spacelift_stack.auth0.id + stack_id = spacelift_stack.auth0[0].id } resource "spacelift_environment_variable" "auth0-stack-environment-variables" { for_each = local.auth0_stack_variables - stack_id = spacelift_stack.auth0.id + stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" value = try(tostring(each.value), jsonencode(each.value)) write_only = false From 77f2c22a8d1b459144a356260a14971e0b27cc81 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:40:14 -0700 Subject: [PATCH 77/85] Move conditional check to for_each loop --- deployments/spacelift/dpe-k8s/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index a5f6b44f..7bfa6343 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -29,11 +29,11 @@ locals { smtp_from = var.smtp_from } - auth0_stack_variables = var.deploy_auth0 ? 
{ + auth0_stack_variables = { cluster_name = var.cluster_name auth0_domain = var.auth0_domain auth0_clients = var.auth0_clients - } : {} + } # Variables to be passed from the k8s stack to the deployments stack k8s_stack_to_deployment_variables = { @@ -259,7 +259,7 @@ resource "spacelift_stack_destructor" "auth0-stack-destructor" { resource "spacelift_environment_variable" "auth0-stack-environment-variables" { - for_each = local.auth0_stack_variables + for_each = var.deploy_auth0 ? local.auth0_stack_variables : {} stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" From 767ac8289692b6307a7738d7ba3b2a9599a8c1db Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:43:12 -0700 Subject: [PATCH 78/85] Try list instead of map --- deployments/spacelift/dpe-k8s/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 7bfa6343..f29ebe33 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -259,7 +259,7 @@ resource "spacelift_stack_destructor" "auth0-stack-destructor" { resource "spacelift_environment_variable" "auth0-stack-environment-variables" { - for_each = var.deploy_auth0 ? local.auth0_stack_variables : {} + for_each = var.deploy_auth0 ? local.auth0_stack_variables : [] stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" From 908201bf4e1dca7458b0e7f57d5682b8d9154b9d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:44:59 -0700 Subject: [PATCH 79/85] Try `tomap` conversion --- deployments/spacelift/dpe-k8s/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index f29ebe33..19810a54 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -259,7 +259,7 @@ resource "spacelift_stack_destructor" "auth0-stack-destructor" { resource "spacelift_environment_variable" "auth0-stack-environment-variables" { - for_each = var.deploy_auth0 ? local.auth0_stack_variables : [] + for_each = var.deploy_auth0 ? local.auth0_stack_variables : tomap({}) stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" From afb6f6e2d4bbd426132dbe21cbb1d82f1e1f2acc Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:52:26 -0700 Subject: [PATCH 80/85] Try handling dependency with depends_on --- deployments/spacelift/dpe-k8s/main.tf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 19810a54..cdd6ba50 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -259,7 +259,11 @@ resource "spacelift_stack_destructor" "auth0-stack-destructor" { resource "spacelift_environment_variable" "auth0-stack-environment-variables" { - for_each = var.deploy_auth0 ? 
local.auth0_stack_variables : tomap({}) + depends_on = [ + spacelift_stack.auth0 + ] + + for_each = local.auth0_stack_variables stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" From 2cfcaee650741973b80bd12d58107ba611cdd6fa Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:57:04 -0700 Subject: [PATCH 81/85] Add if check within for_each loop --- deployments/spacelift/dpe-k8s/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index cdd6ba50..73146ec8 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -263,7 +263,7 @@ resource "spacelift_environment_variable" "auth0-stack-environment-variables" { spacelift_stack.auth0 ] - for_each = local.auth0_stack_variables + for_each = { for k, v in local.auth0_stack_variables : k => v if var.deploy_auth0 } stack_id = spacelift_stack.auth0[0].id name = "TF_VAR_${each.key}" From 436908f522a01b9ea9b8c766aec00ff3a65d0ff4 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:10:19 -0700 Subject: [PATCH 82/85] Remove unused moved blocks --- deployments/spacelift/dpe-k8s/main.tf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 73146ec8..047fefcc 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -216,16 +216,6 @@ resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration write = true } -moved { - from = spacelift_stack.auth0 - to = spacelift_stack.auth0[0] -} - -moved { - from = spacelift_stack_destructor.auth0-stack-destructor - to = spacelift_stack_destructor.auth0-stack-destructor[0] -} - resource "spacelift_stack" "auth0" { count = var.deploy_auth0 ? 
1 : 0 github_enterprise { From 501b1d325b5152d3e3b89814f4a6e0168d56a85b Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:53:24 -0700 Subject: [PATCH 83/85] [IBCDPE-1095] Use scope based authorization on telemetry upload route (#48) * Use scope based authroization on telemetry upload route --- deployments/main.tf | 5 +++ deployments/spacelift/dpe-k8s/main.tf | 8 ++-- deployments/spacelift/dpe-k8s/variables.tf | 6 +++ deployments/stacks/dpe-auth0/main.tf | 14 +++---- deployments/stacks/dpe-auth0/variables.tf | 6 +++ .../stacks/dpe-k8s-deployments/main.tf | 3 +- .../stacks/dpe-k8s-deployments/variables.tf | 5 +++ modules/envoy-gateway/main.tf | 2 +- modules/envoy-gateway/templates/values.yaml | 40 +++++++++++++++++-- modules/signoz/main.tf | 14 ++++++- .../security-policy.yaml | 1 + modules/signoz/variables.tf | 5 +++ 12 files changed, 92 insertions(+), 17 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index aef63ffd..ea53ff7b 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -39,18 +39,22 @@ module "dpe-sandbox-spacelift-development" { name = "bfauble - automation" description = "App for testing signoz" app_type = "non_interactive" + scopes = ["write:telemetry"] }, { name = "schematic - Github Actions" description = "Client for Github Actions to export telemetry data" app_type = "non_interactive" + scopes = ["write:telemetry"] }, { name = "schematic - Dev" description = "Client for schematic deployed to AWS DEV to export telemetry data" app_type = "non_interactive" + scopes = ["write:telemetry"] }, ] + auth0_identifier = "https://dev.sagedpe.org" aws_account_id = "631692904429" region = "us-east-1" @@ -100,6 +104,7 @@ module "dpe-sandbox-spacelift-production" { auth0_stack_project_root = "deployments/stacks/dpe-auth0" auth0_domain = "" auth0_clients = [] + auth0_identifier = "" aws_account_id = "766808016710" region = "us-east-1" diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 047fefcc..cd03d65a 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -27,12 +27,14 @@ locals { ssl_hostname = var.ssl_hostname auth0_jwks_uri = var.auth0_jwks_uri smtp_from = var.smtp_from + auth0_identifier = var.auth0_identifier } auth0_stack_variables = { - cluster_name = var.cluster_name - auth0_domain = var.auth0_domain - auth0_clients = var.auth0_clients + cluster_name = var.cluster_name + auth0_domain = var.auth0_domain + auth0_clients = var.auth0_clients + auth0_identifier = var.auth0_identifier } # Variables to be passed from the k8s stack to the deployments stack diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index 5c7d31b0..599801f5 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -185,9 +185,15 @@ variable "auth0_clients" { name = string description = string app_type = string + scopes = list(string) })) } +variable "auth0_identifier" { + description = "Auth0 identifier for the created API." 
+ type = string +} + variable "ses_email_identities" { type = list(string) description = "List of email identities to be added to SES" diff --git a/deployments/stacks/dpe-auth0/main.tf b/deployments/stacks/dpe-auth0/main.tf index 780ea89d..31d2a3b8 100644 --- a/deployments/stacks/dpe-auth0/main.tf +++ b/deployments/stacks/dpe-auth0/main.tf @@ -1,7 +1,7 @@ # Used to create the Auth0 resources for the DPE stack -resource "auth0_resource_server" "k8s-cluster-telemetry" { - name = "${var.cluster_name}-telemetry" - identifier = "${var.cluster_name}-telemetry" +resource "auth0_resource_server" "k8s-cluster-api" { + name = "${var.cluster_name}-api" + identifier = var.auth0_identifier signing_alg = "RS256" allow_offline_access = false @@ -31,8 +31,8 @@ resource "auth0_client" "oauth2_clients" { } resource "auth0_resource_server_scopes" "k8s-cluster-scopes" { - resource_server_identifier = auth0_resource_server.k8s-cluster-telemetry.identifier - # This scope is not yet used, however, kept for future use to grant authorization based on scopes + resource_server_identifier = auth0_resource_server.k8s-cluster-api.identifier + scopes { name = "write:telemetry" description = "Grants write access to telemetry data" @@ -52,6 +52,6 @@ resource "auth0_client_grant" "access_to_k8s_cluster" { for_each = { for client in var.auth0_clients : client.name => client } client_id = auth0_client.oauth2_clients[each.key].id - audience = auth0_resource_server.k8s-cluster-telemetry.identifier - scopes = [] + audience = auth0_resource_server.k8s-cluster-api.identifier + scopes = each.value.scopes } diff --git a/deployments/stacks/dpe-auth0/variables.tf b/deployments/stacks/dpe-auth0/variables.tf index a348f001..5176a7b0 100644 --- a/deployments/stacks/dpe-auth0/variables.tf +++ b/deployments/stacks/dpe-auth0/variables.tf @@ -24,5 +24,11 @@ variable "auth0_clients" { name = string description = string app_type = string + scopes = list(string) })) } + +variable "auth0_identifier" { + description = "Auth0 identifier for the created API." + type = string +} diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5db44e36..30c32b9f 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -92,11 +92,12 @@ module "signoz" { smtp_password = var.smtp_password smtp_user = var.smtp_user smtp_from = var.smtp_from + auth0_identifier = var.auth0_identifier } module "envoy-gateway" { count = var.enable_cluster_ingress ? 1 : 0 - depends_on = [module.argo-cd] + depends_on = [module.argo-cd, module.cert-manager] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" source = "../../../modules/envoy-gateway" diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 2b9be26a..8f62670b 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -86,6 +86,11 @@ variable "auth0_jwks_uri" { type = string } +variable "auth0_identifier" { + description = "Auth0 identifier for the API. Used to verify the audience in the JWT." + type = string +} + variable "smtp_user" { description = "The SMTP user. 
Required if smtp_user, smtp_password, and smtp_from are set" type = string diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index 25b2c7ff..47bca383 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -23,7 +23,7 @@ spec: sources: - repoURL: registry-1.docker.io chart: envoyproxy/gateway-helm - targetRevision: v1.1.2 + targetRevision: v1.2.1 helm: releaseName: gateway-helm valueFiles: diff --git a/modules/envoy-gateway/templates/values.yaml b/modules/envoy-gateway/templates/values.yaml index 1edd623c..56cf3083 100644 --- a/modules/envoy-gateway/templates/values.yaml +++ b/modules/envoy-gateway/templates/values.yaml @@ -4,7 +4,7 @@ global: images: envoyGateway: # This is the full image name including the hub, repo, and tag. - image: docker.io/envoyproxy/gateway:v1.1.2 + image: docker.io/envoyproxy/gateway:v1.2.1 # Specify image pull policy if default behavior isn't desired. # Default behavior: latest images will be Always else IfNotPresent. pullPolicy: IfNotPresent @@ -12,7 +12,7 @@ global: pullSecrets: [] ratelimit: # This is the full image name including the hub, repo, and tag. - image: "docker.io/envoyproxy/ratelimit:26f28d78" + image: "docker.io/envoyproxy/ratelimit:master" # Specify image pull policy if default behavior isn't desired. # Default behavior: latest images will be Always else IfNotPresent. pullPolicy: IfNotPresent @@ -20,6 +20,8 @@ global: pullSecrets: [] podDisruptionBudget: minAvailable: 0 + # maxUnavailable: 1 + deployment: envoyGateway: image: @@ -29,11 +31,21 @@ deployment: imagePullSecrets: [] resources: limits: - cpu: 500m memory: 1024Mi requests: cpu: 100m memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + runAsNonRoot: true + runAsGroup: 65532 + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault ports: - name: grpc port: 18000 @@ -47,6 +59,7 @@ deployment: - name: metrics port: 19001 targetPort: 19001 + priorityClassName: null replicas: 1 pod: affinity: {} @@ -56,6 +69,10 @@ deployment: labels: {} topologySpreadConstraints: [] tolerations: [] + nodeSelector: {} + +service: + annotations: {} config: envoyGateway: @@ -76,7 +93,22 @@ certgen: job: annotations: {} resources: {} + affinity: {} + tolerations: [] + nodeSelector: {} ttlSecondsAfterFinished: 30 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsGroup: 65534 + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault rbac: annotations: {} - labels: {} \ No newline at end of file + labels: {} diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index e1c40079..4a366f9f 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -86,7 +86,19 @@ spec: remoteJWKS: uri: ${var.auth0_jwks_uri} audiences: - - ${var.cluster_name}-telemetry + - ${var.auth0_identifier} + - op: replace + path: /spec/authorization + value: + defaultAction: Deny + rules: + - name: allow + action: Allow + principal: + jwt: + provider: auth0 + scopes: + - write:telemetry %{endif} destination: server: 'https://kubernetes.default.svc' diff --git a/modules/signoz/resources-otel-ingress/security-policy.yaml b/modules/signoz/resources-otel-ingress/security-policy.yaml index 34bd58da..3d45d127 100644 --- a/modules/signoz/resources-otel-ingress/security-policy.yaml +++ b/modules/signoz/resources-otel-ingress/security-policy.yaml @@ -10,3 +10,4 @@ spec: name: signoz-otel-collector-route jwt: 
providers: + authorization: diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 2a917ff1..344c8f60 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -48,6 +48,11 @@ variable "auth0_jwks_uri" { type = string } +variable "auth0_identifier" { + description = "Auth0 identifier for the API. Used to verify the audience in the JWT." + type = string +} + variable "smtp_user" { description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" type = string From 74f33bfe38a8ca5679e39b103d55ba9711badf96 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:27:17 -0700 Subject: [PATCH 84/85] [SCHEMATIC-138] SigNoz cold storage and backups (#47) * SigNoz cold storage and backups --- README.md | 2 +- deployments/spacelift/dpe-k8s/main.tf | 32 +- .../stacks/dpe-k8s-deployments/main.tf | 67 ++-- .../stacks/dpe-k8s-deployments/variables.tf | 5 + deployments/stacks/dpe-k8s/outputs.tf | 8 +- modules/flux-cd/README.md | 56 +++ modules/flux-cd/main.tf | 56 +++ modules/flux-cd/templates/values.yaml | 327 ++++++++++++++++ modules/flux-cd/versions.tf | 20 + modules/s3-bucket/README.md | 37 ++ modules/s3-bucket/main.tf | 64 +++ modules/s3-bucket/outputs.tf | 14 + modules/s3-bucket/variables.tf | 33 ++ modules/s3-bucket/versions.tf | 8 + modules/sage-aws-eks/ouputs.tf | 4 + modules/signoz/README.md | 20 +- modules/signoz/main.tf | 367 +++++++++++++----- .../kustomization.yaml | 4 + .../resources-service-scrape/scrape.yaml | 10 + modules/signoz/templates/values.yaml | 11 +- modules/signoz/variables.tf | 10 + .../victoria-metrics/templates/values.yaml | 4 + 22 files changed, 1003 insertions(+), 156 deletions(-) create mode 100644 modules/flux-cd/README.md create mode 100644 modules/flux-cd/main.tf create mode 100644 modules/flux-cd/templates/values.yaml create mode 100644 modules/flux-cd/versions.tf create mode 100644 modules/s3-bucket/README.md create mode 100644 modules/s3-bucket/main.tf create mode 100644 modules/s3-bucket/outputs.tf create mode 100644 modules/s3-bucket/variables.tf create mode 100644 modules/s3-bucket/versions.tf create mode 100644 modules/signoz/resources-service-scrape/kustomization.yaml create mode 100644 modules/signoz/resources-service-scrape/scrape.yaml diff --git a/README.md b/README.md index 3d456eff..cbedb844 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ allow us to review for any security advisories. ### Deploying an application to the kubernetes cluster Deployment of applications to the kubernetes cluster is handled through the combination -of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd (Declarative definitions +of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd or Flux CD (Declarative definitions for applications). 
 To start off the deployment journey, the first step is to create a new terraform module
diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf
index cd03d65a..c6ad7e5d 100644
--- a/deployments/spacelift/dpe-k8s/main.tf
+++ b/deployments/spacelift/dpe-k8s/main.tf
@@ -45,6 +45,7 @@ locals {
     pod_to_node_dns_sg_id     = "TF_VAR_pod_to_node_dns_sg_id"
     smtp_user                 = "TF_VAR_smtp_user"
     smtp_password             = "TF_VAR_smtp_password"
+    cluster_oidc_provider_arn = "TF_VAR_cluster_oidc_provider_arn"
   }
 }
 
@@ -178,31 +179,6 @@ resource "spacelift_stack_dependency_reference" "cluster-name" {
 #   stack_id = spacelift_stack.k8s-stack.id
 # }
 
-resource "spacelift_stack_destructor" "k8s-stack-deployments-destructor" {
-  depends_on = [
-    spacelift_stack.k8s-stack,
-    spacelift_aws_integration_attachment.k8s-deployments-aws-integration-attachment,
-    spacelift_context_attachment.k8s-kubeconfig-hooks,
-    spacelift_stack_dependency_reference.cluster-name,
-    spacelift_stack_dependency_reference.region-name,
-    spacelift_environment_variable.k8s-stack-deployments-environment-variables
-  ]
-
-  stack_id = spacelift_stack.k8s-stack-deployments.id
-}
-
-resource "spacelift_stack_destructor" "k8s-stack-destructor" {
-  depends_on = [
-    spacelift_aws_integration_attachment.k8s-aws-integration-attachment,
-    spacelift_context_attachment.k8s-kubeconfig-hooks,
-    spacelift_stack_dependency_reference.cluster-name,
-    spacelift_stack_dependency_reference.region-name,
-    spacelift_environment_variable.k8s-stack-environment-variables
-  ]
-
-  stack_id = spacelift_stack.k8s-stack.id
-}
-
 resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" {
   integration_id = var.aws_integration_id
   stack_id       = spacelift_stack.k8s-stack.id
@@ -244,12 +220,6 @@ resource "spacelift_stack" "auth0" {
   ]
 }
 
-resource "spacelift_stack_destructor" "auth0-stack-destructor" {
-  count    = var.deploy_auth0 ? 
1 : 0 - stack_id = spacelift_stack.auth0[0].id -} - - resource "spacelift_environment_variable" "auth0-stack-environment-variables" { depends_on = [ spacelift_stack.auth0 diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 30c32b9f..1b78a28b 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -1,3 +1,6 @@ +locals { + git_revision = var.git_revision +} module "sage-aws-eks-autoscaler" { source = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws" version = "0.9.0" @@ -26,13 +29,19 @@ module "argo-cd" { source = "../../../modules/argo-cd" } +module "flux-cd" { + depends_on = [module.sage-aws-eks-autoscaler] + source = "../../../modules/flux-cd" +} + module "victoria-metrics" { - depends_on = [module.argo-cd] - source = "spacelift.io/sagebionetworks/victoria-metrics/aws" - version = "0.4.8" + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/victoria-metrics/aws" + # version = "0.4.8" + source = "../../../modules/victoria-metrics" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "trivy-operator" { @@ -41,7 +50,7 @@ module "trivy-operator" { version = "0.3.2" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "airflow" { @@ -50,7 +59,7 @@ module "airflow" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" } @@ -60,7 +69,7 @@ module "postgres-cloud-native-operator" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "postgres-cloud-native-database" { @@ -69,30 +78,40 @@ module "postgres-cloud-native-database" { version = "0.5.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" argo_deployment_name = "airflow-postgres-cloud-native" } +module "clickhouse-backup-bucket" { + source = "../../../modules/s3-bucket" + bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn +} module "signoz" { depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/signoz" - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = var.git_revision - namespace = "signoz" - argo_deployment_name = "signoz" - enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress - gateway_namespace = "envoy-gateway" - cluster_name = var.cluster_name - auth0_jwks_uri = var.auth0_jwks_uri - smtp_password = var.smtp_password - smtp_user = var.smtp_user - smtp_from = var.smtp_from - auth0_identifier = var.auth0_identifier + source = "../../../modules/signoz" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = local.git_revision + namespace = "signoz" + argo_deployment_name = "signoz" + enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress + gateway_namespace = "envoy-gateway" + cluster_name = var.cluster_name + auth0_jwks_uri = 
var.auth0_jwks_uri
+  smtp_password         = var.smtp_password
+  smtp_user             = var.smtp_user
+  smtp_from             = var.smtp_from
+  auth0_identifier      = var.auth0_identifier
+  s3_backup_bucket_name = module.clickhouse-backup-bucket.bucket_name
+  s3_access_role_arn    = module.clickhouse-backup-bucket.access_role_arn
 }
 
 module "envoy-gateway" {
@@ -103,7 +122,7 @@
   source       = "../../../modules/envoy-gateway"
   auto_deploy  = var.auto_deploy
   auto_prune   = var.auto_prune
-  git_revision = var.git_revision
+  git_revision = local.git_revision
   namespace    = "envoy-gateway"
   argo_deployment_name = "envoy-gateway"
   cluster_issuer_name  = "lets-encrypt-prod"
@@ -118,7 +137,7 @@ module "cert-manager" {
   source       = "../../../modules/cert-manager"
   auto_deploy  = var.auto_deploy
   auto_prune   = var.auto_prune
-  git_revision = var.git_revision
+  git_revision = local.git_revision
   namespace    = "cert-manager"
   argo_deployment_name = "cert-manager"
 }
diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf
index 8f62670b..21b40836 100644
--- a/deployments/stacks/dpe-k8s-deployments/variables.tf
+++ b/deployments/stacks/dpe-k8s-deployments/variables.tf
@@ -40,6 +40,11 @@ variable "cluster_name" {
   type        = string
 }
 
+variable "cluster_oidc_provider_arn" {
+  description = "EKS cluster ARN for the oidc provider"
+  type        = string
+}
+
 variable "spotinst_account" {
   description = "Spot.io account"
   type        = string
diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf
index 6851f6a7..1a920dad 100644
--- a/deployments/stacks/dpe-k8s/outputs.tf
+++ b/deployments/stacks/dpe-k8s/outputs.tf
@@ -38,11 +38,15 @@ output "cluster_name" {
   value = module.sage-aws-eks.cluster_name
 }
 
+output "cluster_oidc_provider_arn" {
+  value = module.sage-aws-eks.cluster_oidc_provider_arn
+}
+
 output "smtp_user" {
-  value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : null
+  value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : ""
 }
 
 output "smtp_password" {
   sensitive = true
-  value     = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : null
+  value     = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : ""
 }
diff --git a/modules/flux-cd/README.md b/modules/flux-cd/README.md
new file mode 100644
index 00000000..df006cd6
--- /dev/null
+++ b/modules/flux-cd/README.md
@@ -0,0 +1,56 @@
+# Purpose
+This module is used to deploy the `Flux CD` [helm chart](https://fluxcd-community.github.io/helm-charts) to the cluster. [`Flux CD`](https://fluxcd.io/) is a GitOps tool used to manage the application lifecycle on a Kubernetes cluster. It was originally deployed because, unlike `Argo CD`, it supports `postRenderers`, which apply additional changes to a helm release after the chart is rendered; these were needed in order to attach the `clickhouse-backup` sidecar container to the `signoz` helm release. We do not plan to move all existing applications to `Flux CD` at this time, but it is available and preferred for any new applications added to the cluster.
+
+## What resources are being deployed through this module
+In addition to a `helm_release` which deploys the `Flux CD` helm chart, this module also creates a `capacitor` resource which is used as the frontend for `Flux CD`.
+
+## Accessing the Flux CD UI
+To access the `Flux CD` UI, you only need to port-forward the `capacitor` pod and access it in your browser.
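A concrete command for that port-forward may be useful. This is a minimal sketch, assuming Capacitor is installed in the conventional `flux-system` namespace and exposes its UI on port 9000 (Capacitor's upstream defaults; the service name is an assumption, not something pinned by this module):
```
# Forward the Capacitor UI to localhost, then open http://localhost:9000
kubectl -n flux-system port-forward svc/capacitor 9000:9000
```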
+ +# Deploying an application with Flux CD +To deploy an application with `Flux CD`, you will need to create a `HelmRepository` resource which points to the helm chart you want to deploy. In that resource definition, you will set the `apiVersion` to `source.toolkit.fluxcd.io/v1` and the `kind` to `HelmRepository`. For example (code from the `signoz` module): + +``` +resource "kubectl_manifest" "signoz-helm-repo" { + depends_on = [kubernetes_namespace.signoz] + + yaml_body = <=0.1.0" +YAML +} + +resource "kubectl_manifest" "capacitor-kustomization" { + depends_on = [helm_release.fluxcd] + + yaml_body = < +# secretkey: + +# Enables podMonitor creation for the Prometheus Operator +prometheus: + podMonitor: + # -- Enables podMonitor endpoint + create: false + podMetricsEndpoints: + - port: http-prom + relabelings: + # https://github.com/prometheus-operator/prometheus-operator/issues/4816 + - sourceLabels: [__meta_kubernetes_pod_phase] + action: keep + regex: Running diff --git a/modules/flux-cd/versions.tf b/modules/flux-cd/versions.tf new file mode 100644 index 00000000..31cbf926 --- /dev/null +++ b/modules/flux-cd/versions.tf @@ -0,0 +1,20 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.0" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } + } +} diff --git a/modules/s3-bucket/README.md b/modules/s3-bucket/README.md new file mode 100644 index 00000000..44aa0594 --- /dev/null +++ b/modules/s3-bucket/README.md @@ -0,0 +1,37 @@ +# Purpose +This is a simple module that can be used within applications to create an S3 bucket. + +## WARNING +If you are tearing down a stack with a deployed S3 Bucket, you will likely encounter an error similar to the following: +``` +deleting S3 Bucket (my-beautiful-bucket): operation error S3: DeleteBucket, https response error StatusCode: 409, RequestID: 123, HostID: 123456789+g=, api error BucketNotEmpty: The bucket you tried to delete is not empty. You must delete all versions in the bucket. +``` +We have intentionally not handled this behavior as a safeguard against accidental deletion of a bucket that contains important data. If you need to delete the bucket, you will need to manually delete all objects within it. If versioning is enabled for the bucket, you will also need to delete all versions of the objects. + +# Usage +Using this module only requires calling it in your terraform code: +``` +module "my_beautiful_bucket" { + source = "../../../modules/s3-bucket" + bucket_name = "my-beautiful-bucket" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn +} +``` + +The module handles creating the necessary IAM policy, role, and role policy attachment for accessing the bucket and provides the role ARN as an output. + +After confirming that the policy and role are configured correctly, you can either use the ARN directly in your application code or configure a kubernetes service account bound to the IAM role. 
The latter can be done like so: +``` +resource "kubernetes_service_account" "my_beautiful_bucket_service_account" { + metadata { + name = "my-beautiful-bucket-service-account" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = "${module.my_beautiful_bucket.iam_role_arn}" + } + } +} +``` diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf new file mode 100644 index 00000000..a9d13f22 --- /dev/null +++ b/modules/s3-bucket/main.tf @@ -0,0 +1,64 @@ +resource "aws_s3_bucket" "bucket" { + bucket = var.bucket_name + tags = merge( + var.tags, + { + Name = var.bucket_name + } + ) +} + +resource "aws_s3_bucket_versioning" "versioning" { + bucket = aws_s3_bucket.bucket.id + versioning_configuration { + status = var.enable_versioning ? "Enabled" : "Disabled" + } +} + + +resource "aws_iam_policy" "s3-access-policy" { + name = "access-policy-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}" + description = "Policy to access the s3 bucket" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ] + Resource = [ + aws_s3_bucket.bucket.arn, + "${aws_s3_bucket.bucket.arn}/*" + ] + } + ] + }) +} + +resource "aws_iam_role" "s3-access-iam-role" { + name = "s3-${var.cluster_name}-${var.bucket_name}" + description = "Assumed role to access the s3 bucket with the given permissions." + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRoleWithWebIdentity" + Effect = "Allow" + Principal = { + Federated = "${var.cluster_oidc_provider_arn}", + } + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "s3-access-policy-attachment" { + role = aws_iam_role.s3-access-iam-role.name + policy_arn = aws_iam_policy.s3-access-policy.arn +} diff --git a/modules/s3-bucket/outputs.tf b/modules/s3-bucket/outputs.tf new file mode 100644 index 00000000..25983295 --- /dev/null +++ b/modules/s3-bucket/outputs.tf @@ -0,0 +1,14 @@ +output "bucket_name" { + description = "Name of the created S3 bucket" + value = aws_s3_bucket.bucket.id +} + +output "bucket_arn" { + description = "ARN of the created S3 bucket" + value = aws_s3_bucket.bucket.arn +} + +output "access_role_arn" { + description = "ARN of the role to access the S3 bucket" + value = aws_iam_role.s3-access-iam-role.arn +} \ No newline at end of file diff --git a/modules/s3-bucket/variables.tf b/modules/s3-bucket/variables.tf new file mode 100644 index 00000000..25877694 --- /dev/null +++ b/modules/s3-bucket/variables.tf @@ -0,0 +1,33 @@ +variable "bucket_name" { + description = "Name of the S3 bucket to create" + type = string +} + +variable "tags" { + description = "Tags to apply to the S3 bucket" + type = map(string) + default = { + "CostCenter" = "No Program / 000000" + } +} + +variable "enable_versioning" { + description = "Enable versioning on the bucket" + type = bool + default = true +} + +variable "aws_account_id" { + description = "AWS account ID" + type = string +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "cluster_oidc_provider_arn" { + description = "EKS cluster ARN for the oidc provider" + type = string +} diff --git a/modules/s3-bucket/versions.tf b/modules/s3-bucket/versions.tf new file mode 100644 index 00000000..cba4c144 --- /dev/null +++ b/modules/s3-bucket/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 
5.0" + } + } +} diff --git a/modules/sage-aws-eks/ouputs.tf b/modules/sage-aws-eks/ouputs.tf index 59692964..8114420b 100644 --- a/modules/sage-aws-eks/ouputs.tf +++ b/modules/sage-aws-eks/ouputs.tf @@ -13,3 +13,7 @@ output "node_security_group_id" { output "pod_to_node_dns_sg_id" { value = aws_security_group.pod-dns-egress.id } + +output "cluster_oidc_provider_arn" { + value = module.eks.oidc_provider_arn +} diff --git a/modules/signoz/README.md b/modules/signoz/README.md index 4cdcfa66..82f5cb67 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -5,13 +5,12 @@ SigNoz is an open-source APM. It helps developers monitor their applications & troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc. Open source Application Performance Monitoring (APM) & Observability tool. +## Initial setup -## This module is a work in progress (To be completed before production, or determine if not needed) -A number of items are needed: - -- Setting up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094 -- Set up accounts and access to the service declaratively - +- Accounts in SigNoz need to be manually set up (SSO is only available in the enterprise version) +- 120 months for "Total Retention Period" and 1 month for "Move to S3" settings should be set +- Any dashboards need to be copied or set up +- Alert channels (Email/Slack) need to be set ## Setting up SMTP for alertmanager Alertmanager is an additional tool that is deployed to the kubernetes cluster that @@ -107,3 +106,12 @@ Once you're connected via a port-forward session the next item is to make sure t application you're sending data from is instrumented with open-telemetry. This is going to be application specific so instructions will need to live within the application you are using. + +### Clickhouse Backups and Restores +This module uses the `clickhouse-backup` tool to automatically back up the clickhouse database and store the data in an S3 bucket to ensure continuity of the data regardless of the state of the cluster.`clickhouse-backup` is deployed as a sidecar container to the `signoz` helm release. It will perform incremental backups of the database every 8 hours and full backups every 24 hours. + +To restore the database from an S3 backup, you can use the following steps: +1. Scale the replica cluster (`chi-signoz-clickhouse-cluster-0-1`) `StatefulSet` to 0 replicas. +1. Identify the backup that you would like to restore from. You can get the full list of backups by shelling into the `clickhouse-backup-sidecar` container within the `chi-signoz-clickhouse-cluster-0-0-0` pod and running `clickhouse-backup list`. +1. Restore the database from your backup by running `clickhouse-backup restore_remote --rm --schema ` (assuming the backup from remote storage). +1. Scale the replica cluster `StatefulSet` back to 1 replica. Once the `chi-signoz-clickhouse-cluster-0-1-0` has fully come back up, you should see the restored data showing in the `signoz` UI. 
diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 4a366f9f..ed1f8f2d 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -8,104 +8,273 @@ resource "kubernetes_namespace" "signoz" { } } -resource "kubectl_manifest" "signoz-deployment" { +resource "kubectl_manifest" "signoz-helm-repo" { depends_on = [kubernetes_namespace.signoz] yaml_body = < + + + + 10485760 + + + s3 + https://${var.s3_backup_bucket_name}.s3.amazonaws.com/coldstorage/ + true + us-east-1 + + + + + + + default + + + s3 + 0 + 1 + + + 0 + + + + +YAML +} + +resource "kubectl_manifest" "signoz-git-repo" { + depends_on = [kubectl_manifest.signoz-helm-release] + + yaml_body = </data/ endpoint: https://.s3-.amazonaws.com/data/ # -- Access Key for S3 or GCS - accessKey: + # accessKey: # -- Secret Access Key for S3 or GCS - secretAccess: + # secretAccess: # AWS role configuration - to use environment variables instead of passing access and secret keys role: # -- Whether to enable AWS IAM ARN role. @@ -1286,7 +1291,7 @@ schemaMigrator: annotations: {} # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. - upgradeHelmHooks: true + upgradeHelmHooks: false # -- Whether to enable replication for schemaMigrator enableReplication: true diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 344c8f60..2370bdc6 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -70,3 +70,13 @@ variable "smtp_from" { type = string default = "" } + +variable "s3_backup_bucket_name" { + description = "The name of the S3 bucket to use for backups" + type = string +} + +variable "s3_access_role_arn" { + description = "The ARN of the role to use for accessing the S3 bucket" + type = string +} diff --git a/modules/victoria-metrics/templates/values.yaml b/modules/victoria-metrics/templates/values.yaml index c4e84892..1cf22307 100644 --- a/modules/victoria-metrics/templates/values.yaml +++ b/modules/victoria-metrics/templates/values.yaml @@ -808,6 +808,10 @@ grafana: gnetId: 20417 revision: 3 datasource: VictoriaMetrics + altinity-clickhouse-operator-dashboard: + gnetId: 12163 + revision: 2 + datasource: VictoriaMetrics defaultDashboardsTimezone: utc From dbe7f70021a71d949a54c45385dc1804e4c2ef37 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:48:11 -0700 Subject: [PATCH 85/85] Correction to namespace of where ingress resources are deployed --- modules/signoz/main.tf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index ed1f8f2d..5f65e539 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -221,7 +221,6 @@ metadata: name: signoz-telemetry-ingress namespace: ${var.namespace} spec: - targetNamespace: ${var.namespace} interval: 1h retryInterval: 2m timeout: 5m @@ -235,6 +234,9 @@ spec: - target: kind: ReferenceGrant patch: |- + - op: replace + path: /metadata/namespace + value: ${var.namespace} - op: replace path: /spec/from/0/namespace value: ${var.gateway_namespace}
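For the final hunk above: dropping `targetNamespace` in favor of explicit patches means the Flux Kustomization rewrites only the fields named in the `op: replace` entries. A minimal sketch of the resulting ReferenceGrant, assuming a standard Gateway API resource (the name and the `to` section are illustrative; only the two namespace values come from the patches):
```
apiVersion: gateway.networking.k8s.io/v1beta1
kind: ReferenceGrant
metadata:
  name: signoz-otel-collector-grant  # illustrative name
  namespace: signoz                  # from the /metadata/namespace patch
spec:
  from:
    - group: gateway.networking.k8s.io
      kind: HTTPRoute
      namespace: envoy-gateway       # from the /spec/from/0/namespace patch
  to:
    - group: ""
      kind: Service                  # illustrative target
```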