diff --git a/Dockerfile b/Dockerfile index e266e6b7e..d0fd40a56 100644 --- a/Dockerfile +++ b/Dockerfile @@ -63,7 +63,8 @@ RUN mkdir crds && \ cp -r network-operator-chart/crds /workspace/crds/network-operator/ && \ cp -r network-operator-chart/charts/sriov-network-operator/crds /workspace/crds/sriov-network-operator/ && \ cp -r network-operator-chart/charts/node-feature-discovery/crds /workspace/crds/node-feature-discovery/ && \ - cp -r network-operator-chart/charts/nic-configuration-operator-chart/crds /workspace/crds/nic-configuration-operator/ + cp -r network-operator-chart/charts/nic-configuration-operator-chart/crds /workspace/crds/nic-configuration-operator/ && \ + cp -r network-operator-chart/charts/maintenance-operator-chart/crds /workspace/crds/maintenance-operator/ # Build ARG ARCH diff --git a/deployment/network-operator/Chart.yaml b/deployment/network-operator/Chart.yaml index ad696631b..47e40debd 100644 --- a/deployment/network-operator/Chart.yaml +++ b/deployment/network-operator/Chart.yaml @@ -24,3 +24,7 @@ dependencies: name: nic-configuration-operator-chart repository: '' version: 0.0.1 +- condition: maintenanceOperator.enabled + name: maintenance-operator-chart + repository: '' + version: 0.0.1 diff --git a/deployment/network-operator/charts/maintenance-operator-chart/.helmignore b/deployment/network-operator/charts/maintenance-operator-chart/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deployment/network-operator/charts/maintenance-operator-chart/Chart.yaml b/deployment/network-operator/charts/maintenance-operator-chart/Chart.yaml new file mode 100644 index 000000000..8b73ff4c4 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: maintenance-operator-chart +description: Maintenance Operator Helm Chart +type: application +version: 0.0.1 +appVersion: "latest" diff --git a/deployment/network-operator/charts/maintenance-operator-chart/README.md b/deployment/network-operator/charts/maintenance-operator-chart/README.md new file mode 100644 index 000000000..9fb19b652 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/README.md @@ -0,0 +1,33 @@ +# maintenance-operator-chart + +![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square) + +Maintenance Operator Helm Chart + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| imagePullSecrets | list | `[]` | image pull secrets for the operator | +| metricsService | object | `{"ports":[{"name":"https","port":8443,"protocol":"TCP","targetPort":"https"}],"type":"ClusterIP"}` | metrics service configurations | +| operator.admissionController.certificates.certManager.enable | bool | `true` | use cert-manager for certificates | +| operator.admissionController.certificates.certManager.generateSelfSigned | bool | `true` | generate self-signed certificates with cert-manager | +| 
operator.admissionController.certificates.custom.enable | bool | `false` | enable custom certificates using secrets | +| operator.admissionController.certificates.secretNames.operator | string | `"operator-webhook-cert"` | secret name containing certificates for the operator admission controller | +| operator.admissionController.enable | bool | `true` | enable admission controller of the operator | +| operator.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/master","operator":"Exists"}]},"weight":1},{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | node affinity for the operator | +| operator.image.imagePullPolicy | string | `nil` | image pull policy for the operator image | +| operator.image.repository | string | `"ghcr.io/mellanox/maintenance-operator"` | repository to use for the operator image | +| operator.image.tag | string | `nil` | image tag to use for the operator image | +| operator.nodeSelector | object | `{}` | node selector for the operator | +| operator.replicas | int | `1` | operator deployment number of replicas | +| operator.resources | object | `{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}}` | specify resource requests and limits for the operator | +| operator.serviceAccount.annotations | object | `{}` | set annotations for the operator service account | +| operator.tolerations | list | `[{"effect":"NoSchedule","key":"node-role.kubernetes.io/master","operator":"Exists"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]` | toleration for the operator | +| operatorConfig | object | `{"logLevel":"info","maxNodeMaintenanceTimeSeconds":null,"maxParallelOperations":null,"maxUnavailable":null}` | operator configuration values. 
fields here correspond to fields in MaintenanceOperatorConfig CR | +| operatorConfig.logLevel | string | `"info"` | log level configuration | +| operatorConfig.maxNodeMaintenanceTimeSeconds | string | `nil` | max time for node maintenance | +| operatorConfig.maxParallelOperations | string | `nil` | max number of parallel operations | +| operatorConfig.maxUnavailable | string | `nil` | max number of unavailable nodes | +| webhookService | object | `{"ports":[{"port":443,"protocol":"TCP","targetPort":9443}],"type":"ClusterIP"}` | webhook service configurations | + diff --git a/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml b/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml new file mode 100644 index 000000000..24a29d60f --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_maintenanceoperatorconfigs.yaml @@ -0,0 +1,89 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + name: maintenanceoperatorconfigs.maintenance.nvidia.com +spec: + group: maintenance.nvidia.com + names: + kind: MaintenanceOperatorConfig + listKind: MaintenanceOperatorConfigList + plural: maintenanceoperatorconfigs + singular: maintenanceoperatorconfig + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: MaintenanceOperatorConfig is the Schema for the maintenanceoperatorconfigs + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: MaintenanceOperatorConfigSpec defines the desired state of + MaintenanceOperatorConfig + properties: + logLevel: + default: info + description: LogLevel is the operator logging level + enum: + - debug + - info + - error + type: string + maxNodeMaintenanceTimeSeconds: + default: 1600 + description: |- + MaxNodeMaintenanceTimeSeconds is the time from when a NodeMaintenance is marked as ready (phase: Ready) + until the NodeMaintenance is considered stale and removed by the operator. + should be less than idle time for any autoscaler that is running. + default to 30m (1600 seconds) + format: int32 + minimum: 0 + type: integer + maxParallelOperations: + anyOf: + - type: integer + - type: string + default: 1 + description: |- + MaxParallelOperations indicates the maximal number nodes that can undergo maintenance + at a given time. 0 means no limit + value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%). + absolute number is calculated from percentage by rounding up. + defaults to 1. The actual number of nodes that can undergo maintenance may be lower depending + on the value of MaintenanceOperatorConfigSpec.MaxUnavailable. + x-kubernetes-int-or-string: true + maxUnavailable: + anyOf: + - type: integer + - type: string + description: |- + MaxUnavailable is the maximum number of nodes that can become unavailable in the cluster. + value can be an absolute number (ex: 5) or a percentage of total nodes in the cluster (ex: 10%). 
+ absolute number is calculated from percentage by rounding up. + by default, unset. + new nodes will not be processed if the number of unavailable node will exceed this value + x-kubernetes-int-or-string: true + type: object + type: object + served: true + storage: true diff --git a/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_nodemaintenances.yaml b/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_nodemaintenances.yaml new file mode 100644 index 000000000..0a28c74bb --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/crds/maintenance.nvidia.com_nodemaintenances.yaml @@ -0,0 +1,285 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.15.0 + name: nodemaintenances.maintenance.nvidia.com +spec: + group: maintenance.nvidia.com + names: + kind: NodeMaintenance + listKind: NodeMaintenanceList + plural: nodemaintenances + singular: nodemaintenance + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.nodeName + name: Node + type: string + - jsonPath: .spec.requestorID + name: Requestor + type: string + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + - jsonPath: .status.conditions[?(@.type=='Ready')].reason + name: Phase + type: string + - jsonPath: .status.conditions[?(@.type=='Failed')].reason + name: Failed + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeMaintenance is the Schema for the nodemaintenances API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeMaintenanceSpec defines the desired state of NodeMaintenance + properties: + additionalRequestors: + description: |- + AdditionalRequestors is a set of additional requestor IDs which are using the same NodeMaintenance + request. addition or removal of requiestor IDs to this list MUST be made with update operation (and retry on failure) + which will replace the entire list. + items: + type: string + type: array + x-kubernetes-list-type: set + cordon: + default: true + description: Cordon if set, marks node as unschedulable during maintenance + operation + type: boolean + drainSpec: + description: DrainSpec specifies how a node will be drained. if not + provided, no draining will be performed. + properties: + deleteEmptyDir: + default: false + description: |- + DeleteEmptyDir indicates if should continue even if there are pods using emptyDir + (local data that will be deleted when the node is drained) + type: boolean + force: + default: false + description: Force draining even if there are pods that do not + declare a controller + type: boolean + podEvictionFilters: + description: |- + PodEvictionFilters specifies filters for pods that need to undergo eviction during drain. + if specified. only pods that match PodEvictionFilters will be evicted during drain operation. + if unspecified. all non-daemonset pods will be evicted. + logical OR is performed between filter entires. logical AND is performed within different filters + in a filter entry. 
+ items: + description: PodEvictionFiterEntry defines filters for Pod evictions + during drain operation + properties: + byResourceNameRegex: + description: ByResourceNameRegex filters pods by the name + of the resources they consume using regex. + type: string + type: object + type: array + podSelector: + description: |- + PodSelector specifies a label selector to filter pods on the node that need to be drained + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + type: string + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time in seconds + to wait before giving up drain, zero means infinite + format: int32 + minimum: 0 + type: integer + type: object + nodeName: + description: |- + NodeName is The name of the node that maintenance operation will be performed on + creation fails if node obj does not exist (webhook) + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + requestorID: + description: |- + RequestorID MUST follow domain name notation format (https://tools.ietf.org/html/rfc1035#section-2.3.1) + It MUST be 63 characters or less, beginning and ending with an alphanumeric + character ([a-z0-9A-Z]) with dashes (-), dots (.), and alphanumerics between. + caller SHOULD NOT create multiple objects with same requestorID and nodeName. + This field identifies the requestor of the operation. 
+ maxLength: 63 + minLength: 2 + pattern: ^([a-z0-9A-Z]([-a-z0-9A-Z]*[a-z0-9A-Z])?(\.[a-z0-9A-Z]([-a-z0-9A-Z]*[a-z0-9A-Z])?)*)$ + type: string + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf + waitForPodCompletion: + description: |- + WaitForPodCompletion specifies pods via selector to wait for completion before performing drain operation + if not provided, will not wait for pods to complete + properties: + podSelector: + description: |- + PodSelector specifies a label selector for the pods to wait for completion + For more details on label selectors, see: + https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors + example: app=my-workloads + type: string + timeoutSeconds: + default: 0 + description: |- + TimeoutSecond specifies the length of time in seconds + to wait before giving up on pod termination, zero means infinite + format: int32 + minimum: 0 + type: integer + type: object + required: + - nodeName + - requestorID + type: object + status: + description: NodeMaintenanceStatus defines the observed state of NodeMaintenance + properties: + conditions: + description: Conditions represents observations of NodeMaintenance + current state + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. 
For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. 
+ enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + drain: + description: Drain represents the drain status of the node + properties: + drainProgress: + description: DrainProgress represents the draining progress as + percentage + format: int32 + minimum: 0 + type: integer + evictionPods: + description: EvictionPods is the total number of pods that need + to be evicted at the time NodeMaintenance started draining + format: int32 + minimum: 0 + type: integer + totalPods: + description: TotalPods is the number of pods on the node at the + time NodeMaintenance started draining + format: int32 + minimum: 0 + type: integer + waitForEviction: + description: WaitForEviction is the list of namespaced named pods + that need to be evicted + items: + type: string + type: array + required: + - drainProgress + - evictionPods + - totalPods + type: object + waitForCompletion: + description: WaitForCompletion is the list of namespaced named pods + that we wait to complete + items: + type: string + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/_helpers.tpl b/deployment/network-operator/charts/maintenance-operator-chart/templates/_helpers.tpl new file 
mode 100644 index 000000000..ae44554c9 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "maintenance-operator.name" -}} +{{- default "maintenance-operator" .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "maintenance-operator.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default "maintenance-operator" .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "maintenance-operator.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "maintenance-operator.labels" -}} +helm.sh/chart: {{ include "maintenance-operator.chart" . }} +{{ include "maintenance-operator.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "maintenance-operator.selectorLabels" -}} +app.kubernetes.io/name: {{ include "maintenance-operator.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "maintenance-operator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "maintenance-operator.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/certificates.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/certificates.yaml new file mode 100644 index 000000000..68383b59e --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/certificates.yaml @@ -0,0 +1,43 @@ +{{- if and .Values.operator.admissionController.enable }} +{{- if and .Values.operator.admissionController.certificates.certManager.enable + .Values.operator.admissionController.certificates.certManager.generateSelfSigned }} +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "maintenance-operator.fullname" . }}-selfsigned-issuer + namespace: {{ .Release.Namespace }} + labels: + {{- include "maintenance-operator.labels" . | nindent 4 }} +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "maintenance-operator.fullname" . }}-serving-cert + namespace: {{ .Release.Namespace }} + labels: + {{- include "maintenance-operator.labels" . | nindent 4 }} +spec: + dnsNames: + - '{{ include "maintenance-operator.fullname" . }}-webhook-service.{{ .Release.Namespace + }}.svc' + - '{{ include "maintenance-operator.fullname" . }}-webhook-service.{{ .Release.Namespace + }}.svc.{{ .Values.kubernetesClusterDomain }}' + issuerRef: + kind: Issuer + name: '{{ include "maintenance-operator.fullname" . 
}}-selfsigned-issuer' + secretName: {{ .Values.operator.admissionController.certificates.secretNames.operator }} +{{- else if and (not .Values.operator.admissionController.certificates.certManager.enable) .Values.operator.admissionController.certificates.custom.enable }}{{- /* custom certificates are rendered only when cert-manager is disabled; values live under admissionController.certificates */}} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.operator.admissionController.certificates.secretNames.operator }} + namespace: {{ .Release.Namespace }} +type: Opaque +data: + ca.crt: {{ .Values.operator.admissionController.certificates.custom.operator.caCrt | b64enc | quote }} + tls.crt: {{ .Values.operator.admissionController.certificates.custom.operator.tlsCrt | b64enc | quote }} + tls.key: {{ .Values.operator.admissionController.certificates.custom.operator.tlsKey | b64enc | quote }} +{{- end }} +{{- end }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/deployment.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/deployment.yaml new file mode 100644 index 000000000..13d61a563 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/deployment.yaml @@ -0,0 +1,87 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "maintenance-operator.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: manager + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.operator.replicas }} + selector: + matchLabels: + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.selectorLabels" . 
| nindent 6 }} + template: + metadata: + labels: + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.selectorLabels" . 
| nindent 8 }} + annotations: + kubectl.kubernetes.io/default-container: manager + spec: + tolerations: {{- toYaml .Values.operator.tolerations | nindent 8 }} + nodeSelector: {{- toYaml .Values.operator.nodeSelector | nindent 8 }} + affinity: {{- toYaml .Values.operator.affinity | nindent 8 }} + imagePullSecrets: {{ .Values.imagePullSecrets | default list | toJson }} + securityContext: + runAsNonRoot: true + serviceAccountName: {{ include "maintenance-operator.fullname" . }}-controller-manager + terminationGracePeriodSeconds: 10 + containers: + - name: manager + command: + - /manager + args: + - --leader-elect + env: + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENABLE_WEBHOOKS + value: {{ quote .Values.operator.admissionController.enable }} + image: {{ .Values.operator.image.repository }}:{{ .Values.operator.image.tag | default .Chart.AppVersion }} + {{- if .Values.operator.image.imagePullPolicy }} + imagePullPolicy: {{ .Values.operator.image.imagePullPolicy }} + {{- end }} + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + {{- if .Values.operator.admissionController.enable }} + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + {{- end }} + resources: {{- toYaml .Values.operator.resources | nindent 10 }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + {{- if .Values.operator.admissionController.enable }} + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + {{- end }} + volumes: + {{- if .Values.operator.admissionController.enable }} + - name: cert + secret: + defaultMode: 420 + secretName: {{ .Values.operator.admissionController.certificates.secretNames.operator }} + {{- end }} diff --git 
a/deployment/network-operator/charts/maintenance-operator-chart/templates/metrics-service.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/metrics-service.yaml new file mode 100644 index 000000000..780fa8067 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/metrics-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "maintenance-operator.name" . }}-metrics-service + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: kube-rbac-proxy + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.labels" . | nindent 4 }} +spec: + type: {{ .Values.metricsService.type }} + selector: + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.selectorLabels" . | nindent 4 }} + ports: + {{- .Values.metricsService.ports | toYaml | nindent 2 }} \ No newline at end of file diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/operatorconfig.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/operatorconfig.yaml new file mode 100644 index 000000000..60f4b49a6 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/operatorconfig.yaml @@ -0,0 +1,21 @@ +apiVersion: maintenance.nvidia.com/v1alpha1 +kind: MaintenanceOperatorConfig +metadata: + name: default + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: config + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . 
| nindent 4 }} +spec: + logLevel: {{ .Values.operatorConfig.logLevel }} +{{- if not (kindIs "invalid" .Values.operatorConfig.maxParallelOperations) }}{{- /* nil check instead of truthiness so an explicit 0 ("no limit") is still rendered */}} + maxParallelOperations: {{ .Values.operatorConfig.maxParallelOperations }} +{{- end }} +{{- if not (kindIs "invalid" .Values.operatorConfig.maxUnavailable) }} + maxUnavailable: {{ .Values.operatorConfig.maxUnavailable }} +{{- end }} +{{- if not (kindIs "invalid" .Values.operatorConfig.maxNodeMaintenanceTimeSeconds) }} + maxNodeMaintenanceTimeSeconds: {{ .Values.operatorConfig.maxNodeMaintenanceTimeSeconds }} +{{- end }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/role.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/role.yaml new file mode 100644 index 000000000..1e303bd7c --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/role.yaml @@ -0,0 +1,167 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "maintenance-operator.fullname" . }}-manager-role + labels: + {{- include "maintenance-operator.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- nonResourceURLs: + - /metrics + verbs: + - get +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - delete + - get + - list + - patch + - update +- apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs/finalizers + verbs: + - update +- apiGroups: + - maintenance.nvidia.com + resources: + - maintenanceoperatorconfigs/status + verbs: + - get + - patch + - update +- apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances/finalizers + verbs: + - update +- apiGroups: + - maintenance.nvidia.com + resources: + - nodemaintenances/status + verbs: + - get + - patch + - update +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "maintenance-operator.fullname" . }}-manager-role + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/role_binding.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/role_binding.yaml new file mode 100644 index 000000000..6f3491d90 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/role_binding.yaml @@ -0,0 +1,36 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "maintenance-operator.fullname" . }}-manager-rolebinding + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "maintenance-operator.fullname" . }}-manager-role' +subjects: +- kind: ServiceAccount + name: '{{ include "maintenance-operator.fullname" . }}-controller-manager' + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "maintenance-operator.fullname" . }}-manager-rolebinding + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: '{{ include "maintenance-operator.fullname" . 
}}-manager-role' +subjects: +- kind: ServiceAccount + name: '{{ include "maintenance-operator.fullname" . }}-controller-manager' + namespace: {{ .Release.Namespace }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/serviceaccount.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/serviceaccount.yaml new file mode 100644 index 000000000..c948f0c8a --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "maintenance-operator.fullname" . }}-controller-manager + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: rbac + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.operator.serviceAccount.annotations | nindent 4 }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/templates/webhook.yaml b/deployment/network-operator/charts/maintenance-operator-chart/templates/webhook.yaml new file mode 100644 index 000000000..6d22b6f0e --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/templates/webhook.yaml @@ -0,0 +1,51 @@ +{{- if .Values.operator.admissionController.enable }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "maintenance-operator.fullname" . }}-webhook-service + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/component: webhook + app.kubernetes.io/created-by: maintenance-operator + app.kubernetes.io/part-of: maintenance-operator + {{- include "maintenance-operator.labels" . | nindent 4 }} +spec: + type: {{ .Values.webhookService.type }} + selector: + control-plane: {{ .Release.Name }}-controller-manager + {{- include "maintenance-operator.selectorLabels" . 
| nindent 4 }} + ports: + {{- .Values.webhookService.ports | toYaml | nindent 2 }} +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: {{ include "maintenance-operator.fullname" . }}-validating-webhook-configuration + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "maintenance-operator.fullname" . }}-serving-cert + labels: + {{- include "maintenance-operator.labels" . | nindent 4 }} +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "maintenance-operator.fullname" . }}-webhook-service' + namespace: {{ .Release.Namespace }} + path: /validate-maintenance-nvidia-com-v1alpha1-nodemaintenance + {{- if not .Values.operator.admissionController.certificates.certManager.enable }} + caBundle: {{ .Values.operator.admissionController.certificates.custom.operator.tlsCrt | b64enc | quote }} + {{- end }} + failurePolicy: Fail + name: vnodemaintenance.kb.io + rules: + - apiGroups: + - maintenance.nvidia.com + apiVersions: + - v1alpha1 + operations: + - CREATE + resources: + - nodemaintenances + sideEffects: None +{{- end }} diff --git a/deployment/network-operator/charts/maintenance-operator-chart/values.yaml b/deployment/network-operator/charts/maintenance-operator-chart/values.yaml new file mode 100644 index 000000000..f253186e7 --- /dev/null +++ b/deployment/network-operator/charts/maintenance-operator-chart/values.yaml @@ -0,0 +1,107 @@ +operator: + image: + # -- repository to use for the operator image + repository: ghcr.io/mellanox/maintenance-operator + # -- image tag to use for the operator image + tag: null + # -- image pull policy for the operator image + imagePullPolicy: null + # -- toleration for the operator + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + # -- node selector for the operator + 
nodeSelector: {} + # -- node affinity for the operator + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: Exists + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: Exists + # -- specify resource requests and limits for the operator + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + # -- operator deployment number of replicas + replicas: 1 + serviceAccount: + # -- set annotations for the operator service account + annotations: {} + admissionController: + # -- enable admission controller of the operator + enable: true + certificates: + secretNames: + # -- secret name containing certificates for the operator admission controller + operator: "operator-webhook-cert" + certManager: + # -- use cert-manager for certificates + enable: true + # -- generate self-signed certificates with cert-manager + generateSelfSigned: true + custom: + # -- enable custom certificates using secrets + enable: false + # operator: + # caCrt: | + # -----BEGIN CERTIFICATE----- + # MIIMIICLDCCAdKgAwIBAgIBADAKBggqhkjOPQQDAjB9MQswCQYDVQQGEwJCRTEPMA0G + # ... + # -----END CERTIFICATE----- + # tlsCrt: | + # -----BEGIN CERTIFICATE----- + # MIIMIICLDCCAdKgAwIBAgIBADAKBggqhkjOPQQDAjB9MQswCQYDVQQGEwJCRTEPMA0G + # ... + # -----END CERTIFICATE----- + # tlsKey: | + # -----BEGIN EC PRIVATE KEY----- + # MHcl4wOuDwKQa+upc8GftXE2C//4mKANBC6It01gUaTIpo= + # ... + # -----END EC PRIVATE KEY----- + +# -- operator configuration values.
fields here correspond to fields in MaintenanceOperatorConfig CR +operatorConfig: + # -- log level configuration + logLevel: info + # operatorConfig.maxParallelOperations -- max number of parallel operations + maxParallelOperations: null + # -- max number of unavailable nodes + maxUnavailable: null + # -- max time for node maintenance + maxNodeMaintenanceTimeSeconds: null + +# -- image pull secrets for the operator +imagePullSecrets: [] + +# -- metrics service configurations +metricsService: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + type: ClusterIP + +# -- webhook service configurations +webhookService: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + type: ClusterIP diff --git a/deployment/network-operator/templates/upgrade-crd-hook.yaml b/deployment/network-operator/templates/upgrade-crd-hook.yaml index c12cf13ee..e0c80e455 100644 --- a/deployment/network-operator/templates/upgrade-crd-hook.yaml +++ b/deployment/network-operator/templates/upgrade-crd-hook.yaml @@ -93,5 +93,8 @@ spec: {{- if .Values.nicConfigurationOperator.enabled }} - --crds-dir=/crds/nic-configuration-operator {{- end }} + {{- if .Values.maintenanceOperator.enabled }} + - --crds-dir=/crds/maintenance-operator + {{- end }} restartPolicy: OnFailure {{- end }} diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index ef752954d..da9159ad2 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -33,6 +33,10 @@ nicConfigurationOperator: # -- Deploy NIC Configuration Operator. enabled: false +maintenanceOperator: + # -- Deploy Maintenance Operator. + enabled: false + # Set both enableNodeFeatureApi and NodeFeatureAPI feature gate to false to disable. 
node-feature-discovery: # -- The Node Feature API enable communication between nfd master and worker @@ -181,6 +185,29 @@ nic-configuration-operator-chart: name: nic-configuration-operator-daemon tag: v0.1.13 +# Maintenance Operator chart related values. +maintenance-operator-chart: + operator: + image: + repository: ghcr.io/mellanox + name: maintenance-operator + tag: v0.1.1 + admissionController: + # -- enable admission controller of the operator + enable: false + certificates: + secretNames: + # -- secret name containing certificates for the operator admission controller + operator: "maintenance-webhook-cert" + certManager: + # -- use cert-manager for certificates + enable: false + # -- generate self-signed certificates with cert-manager + generateSelfSigned: false + custom: + # -- enable custom certificates using secrets + enable: false + # General Operator related values # The operator element allows to deploy network operator from an alternate location operator: diff --git a/hack/release.go b/hack/release.go index 5576117b0..a0906a8e9 100644 --- a/hack/release.go +++ b/hack/release.go @@ -77,6 +77,7 @@ type Release struct { RDMACni *ReleaseImageSpec NicConfigurationOperator *ReleaseImageSpec NicConfigurationConfigDaemon *ReleaseImageSpec + MaintenanceOperator *ReleaseImageSpec } func readDefaults(releaseDefaults string) Release { @@ -127,6 +128,7 @@ func readEnvironmentVariables(release *Release) { initWithEnvVariale("RDMA_CNI", release.RDMACni) initWithEnvVariale("NIC_CONFIGURATION_OPERATOR", release.NicConfigurationOperator) initWithEnvVariale("NIC_CONFIGURATION_CONFIG_DAEMON", release.NicConfigurationConfigDaemon) + initWithEnvVariale("MAINTENANCE_OPERATOR", release.MaintenanceOperator) } func main() { diff --git a/hack/release.yaml b/hack/release.yaml index 63ce08172..2603efe7d 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -86,3 +86,7 @@ nicConfigurationConfigDaemon: image: nic-configuration-operator-daemon repository: ghcr.io/mellanox
version: v0.1.13 +maintenanceOperator: + image: maintenance-operator + repository: ghcr.io/mellanox + version: v0.1.1 diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 46397d143..10209387e 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -33,6 +33,10 @@ nicConfigurationOperator: # -- Deploy NIC Configuration Operator. enabled: false +maintenanceOperator: + # -- Deploy Maintenance Operator. + enabled: false + # Set both enableNodeFeatureApi and NodeFeatureAPI feature gate to false to disable. node-feature-discovery: # -- The Node Feature API enable communication between nfd master and worker @@ -181,6 +185,29 @@ nic-configuration-operator-chart: name: {{ .NicConfigurationConfigDaemon.Image }} tag: {{ .NicConfigurationConfigDaemon.Version }} +# Maintenance Operator chart related values. +maintenance-operator-chart: + operator: + image: + repository: {{ .MaintenanceOperator.Repository }} + name: {{ .MaintenanceOperator.Image }} + tag: {{ .MaintenanceOperator.Version }} + admissionController: + # -- enable admission controller of the operator + enable: false + certificates: + secretNames: + # -- secret name containing certificates for the operator admission controller + operator: "maintenance-webhook-cert" + certManager: + # -- use cert-manager for certificates + enable: false + # -- generate self-signed certificates with cert-manager + generateSelfSigned: false + custom: + # -- enable custom certificates using secrets + enable: false + # General Operator related values # The operator element allows to deploy network operator from an alternate location operator: