chart: introduce guaranteed instance manager CPU setting for v2 volume

Longhorn 7631

Signed-off-by: Derek Su <derek.su@suse.com>
derekbit · Dec 26, 2023 · 71d812f · 71d812f
1 parent fe9dc3c
commit 71d812f
Show file tree

Hide file tree

Showing 6 changed files with 29 additions and 7 deletions.
diff --git a/chart/README.md b/chart/README.md
@@ -289,12 +289,13 @@ For more details like types or options, you can refer to **Settings Reference**
 | defaultSettings.engineReplicaTimeout | In seconds. The setting specifies the timeout between the engine and replica(s), and the value should be between 8 and 30 seconds. The default value is 8 seconds. |
 | defaultSettings.failedBackupTTL | In minutes. This setting determines how long Longhorn will keep the backup resource that was failed. Set to 0 to disable the auto-deletion. |
 | defaultSettings.fastReplicaRebuildEnabled | This feature supports the fast replica rebuilding. It relies on the checksum of snapshot disk files, so setting the snapshot-data-integrity to **enable** or **fast-check** is a prerequisite. |
-| defaultSettings.guaranteedInstanceManagerCPU | This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod. You can leave it with the default value, which is 12%. |
-| defaultSettings.kubernetesClusterAutoscalerEnabled | Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler. |
+| defaultSettings.guaranteedInstanceManagerCPU | Percentage of the total allocatable CPU resources on each node to be reserved for each instance manager pod when the V1 Data Engine is enabled. The default value is 12. |
+| defaultSettings.kubernetesClusterAutoscalerEnabled | Setting that notifies Longhorn that the cluster is using the Kubernetes Cluster Autoscaler. |
 | defaultSettings.logLevel | The log level Panic, Fatal, Error, Warn, Info, Debug, Trace used in longhorn manager. Default to Info. |
 | defaultSettings.nodeDownPodDeletionPolicy | Defines the Longhorn action when a Volume is stuck with a StatefulSet/Deployment Pod on a node that is down. |
 | defaultSettings.nodeDrainPolicy | Define the policy to use when a node with the last healthy replica of a volume is drained. |
 | defaultSettings.offlineReplicaRebuilding | This setting allows users to enable the offline replica rebuilding for volumes using v2 data engine. |
+| defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU | Number of millicpus on each node to be reserved for each instance manager pod when the V2 Data Engine is enabled. The default value is 1250 millicpus. |
 | defaultSettings.orphanAutoDeletion | This setting allows Longhorn to delete the orphan resource and its corresponding orphaned data automatically like stale replicas. Orphan resources on down or unknown nodes will not be cleaned up automatically. |
 | defaultSettings.priorityClass | priorityClass for Longhorn system-managed components This setting can help prevent Longhorn components from being evicted under Node Pressure. Notice that this will be applied to Longhorn user-deployed components by default if there are no priority class values set yet, such as `longhornManager.priorityClass`. |
 | defaultSettings.recurringFailedJobsHistoryLimit | This setting specifies how many failed backup or snapshot job histories should be retained. History will not be retained if the value is 0. |

diff --git a/chart/questions.yaml b/chart/questions.yaml
@@ -542,7 +542,7 @@ questions:
     default: 300
   - variable: defaultSettings.guaranteedInstanceManagerCPU
     label: Guaranteed Instance Manager CPU
-    description: "This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod. You can leave it with the default value, which is 12%."
+    description: "Percentage of the total allocatable CPU resources on each node to be reserved for each instance manager pod when the V1 Data Engine is enabled. The default value is 12."
     group: "Longhorn Default Settings"
     type: int
     min: 0
@@ -562,7 +562,7 @@ questions:
     default: "false"
 - variable: defaultSettings.kubernetesClusterAutoscalerEnabled
   label: Kubernetes Cluster Autoscaler Enabled (Experimental)
-  description: "Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler."
+  description: "Setting that notifies Longhorn that the cluster is using the Kubernetes Cluster Autoscaler."
   group: "Longhorn Default Settings"
   type: boolean
   default: false
@@ -861,3 +861,11 @@ questions:
       - "rke1"
       - "rke2"
       - "k3s"
+  - variable: defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU
+    label: Guaranteed Instance Manager CPU for V2 Data Engine
+    description: "Number of millicpus on each node to be reserved for each instance manager pod when the V2 Data Engine is enabled. By default, the Storage Performance Development Kit (SPDK) target daemon within each instance manager pod uses 1 CPU core. Configuring a minimum CPU usage value is essential for maintaining engine and replica stability, especially during periods of high node workload. The default value is 1250 millicpus."
+    group: "Longhorn Default Settings"
+    type: int
+    min: 0
+    max: 2000
+    default: 1250
diff --git a/chart/templates/default-setting.yaml b/chart/templates/default-setting.yaml
@@ -215,3 +215,6 @@ data:
     {{- if not (kindIs "invalid" .Values.defaultSettings.disableSnapshotPurge) }}
     disable-snapshot-purge: {{ .Values.defaultSettings.disableSnapshotPurge }}
     {{- end }}
+    {{- if not (kindIs "invalid" .Values.defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU) }}
+    v2-data-engine-guaranteed-instance-manager-cpu: {{ .Values.defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU }}
+    {{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
@@ -290,10 +290,10 @@ defaultSettings:
   # -- This interval in seconds determines how long Longhorn will wait before re-downloading the backing image file
   # when all disk files of this backing image become failed or unknown.
   backingImageRecoveryWaitInterval: ~
-  # -- This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod.
-  # You can leave it with the default value, which is 12%.
+  # -- Percentage of the total allocatable CPU resources on each node to be reserved for each instance manager pod when the V1 Data Engine is enabled.
+  # The default value is 12.
   guaranteedInstanceManagerCPU: ~
-  # -- Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler.
+  # -- Setting that notifies Longhorn that the cluster is using the Kubernetes Cluster Autoscaler.
   kubernetesClusterAutoscalerEnabled: ~
   # -- This setting allows Longhorn to delete the orphan resource and its corresponding orphaned data automatically like stale replicas.
   # Orphan resources on down or unknown nodes will not be cleaned up automatically.
@@ -333,6 +333,10 @@ defaultSettings:
   v2DataEngine: ~
   # -- This setting allows users to enable the offline replica rebuilding for volumes using v2 data engine.
   offlineReplicaRebuilding: ~
+  # -- Number of millicpus on each node to be reserved for each instance manager pod when the V2 Data Engine is enabled.
+  # The default value is 1250 millicpus.
+  v2DataEngineGuaranteedInstanceManagerCPU: ~
+
   # -- Allow Scheduling Empty Node Selector Volumes To Any Node
   allowEmptyNodeSelectorVolume: ~
   # -- Allow Scheduling Empty Disk Selector Volumes To Any Disk

diff --git a/deploy/upgrade_responder_server/chart-values.yaml b/deploy/upgrade_responder_server/chart-values.yaml
@@ -280,6 +280,9 @@ configMap:
         "longhornSettingGuaranteedInstanceManagerCpu": {
           "dataType": "float"
         },
+        "longhornSettingV2DataEngineGuaranteedInstanceManagerCpu": {
+          "dataType": "float"
+        },
         "longhornSettingRecurringFailedJobsHistoryLimit": {
           "dataType": "float"
         },

diff --git a/dev/upgrade-responder/install.sh b/dev/upgrade-responder/install.sh
@@ -297,6 +297,9 @@ configMap:
         "longhornSettingGuaranteedInstanceManagerCpu": {
           "dataType": "float"
         },
+        "longhornSettingV2DataEngineGuaranteedInstanceManagerCpu": {
+          "dataType": "float"
+        },
         "longhornSettingRecurringFailedJobsHistoryLimit": {
           "dataType": "float"
         },