chart: introduce guaranteed instance manager CPU setting for v2 volume

Longhorn 7631

Signed-off-by: Derek Su <derek.su@suse.com>
derekbit · Dec 25, 2023 · 8eeca3a · 8eeca3a
1 parent fe9dc3c
commit 8eeca3a
Show file tree

Hide file tree

Showing 6 changed files with 26 additions and 4 deletions.
diff --git a/chart/README.md b/chart/README.md
@@ -289,12 +289,13 @@ For more details like types or options, you can refer to **Settings Reference**
 | defaultSettings.engineReplicaTimeout | In seconds. The setting specifies the timeout between the engine and replica(s), and the value should be between 8 and 30 seconds. The default value is 8 seconds. |
 | defaultSettings.failedBackupTTL | In minutes. This setting determines how long Longhorn will keep the backup resource that was failed. Set to 0 to disable the auto-deletion. |
 | defaultSettings.fastReplicaRebuildEnabled | This feature supports the fast replica rebuilding. It relies on the checksum of snapshot disk files, so setting the snapshot-data-integrity to **enable** or **fast-check** is a prerequisite. |
-| defaultSettings.guaranteedInstanceManagerCPU | This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod. You can leave it with the default value, which is 12%. |
+| defaultSettings.guaranteedInstanceManagerCPU | This integer value indicates what percentage of the total allocatable CPU on each node will be reserved for each instance manager Pod for the v1 data engine. You can leave it with the default value, which is 12%. |
 | defaultSettings.kubernetesClusterAutoscalerEnabled | Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler. |
 | defaultSettings.logLevel | The log level Panic, Fatal, Error, Warn, Info, Debug, Trace used in longhorn manager. Default to Info. |
 | defaultSettings.nodeDownPodDeletionPolicy | Defines the Longhorn action when a Volume is stuck with a StatefulSet/Deployment Pod on a node that is down. |
 | defaultSettings.nodeDrainPolicy | Define the policy to use when a node with the last healthy replica of a volume is drained. |
 | defaultSettings.offlineReplicaRebuilding | This setting allows users to enable the offline replica rebuilding for volumes using v2 data engine. |
+| defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU | This integer value indicates how many millicpus of CPU on each node will be reserved for each instance manager Pod for v2 data engine. You can leave it with the default value, which is 1250 millicpus. |
 | defaultSettings.orphanAutoDeletion | This setting allows Longhorn to delete the orphan resource and its corresponding orphaned data automatically like stale replicas. Orphan resources on down or unknown nodes will not be cleaned up automatically. |
 | defaultSettings.priorityClass | priorityClass for Longhorn system-managed components. This setting can help prevent Longhorn components from being evicted under Node Pressure. Notice that this will be applied to Longhorn user-deployed components by default if there are no priority class values set yet, such as `longhornManager.priorityClass`. |
 | defaultSettings.recurringFailedJobsHistoryLimit | This setting specifies how many failed backup or snapshot job histories should be retained. History will not be retained if the value is 0. |

diff --git a/chart/questions.yaml b/chart/questions.yaml
@@ -541,8 +541,8 @@ questions:
     min: 0
     default: 300
   - variable: defaultSettings.guaranteedInstanceManagerCPU
-    label: Guaranteed Instance Manager CPU
-    description: "This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod. You can leave it with the default value, which is 12%."
+    label: Guaranteed Instance Manager CPU for V1 Data Engine
+    description: "This integer value indicates what percentage of the total allocatable CPU on each node will be reserved for each instance manager Pod for the v1 data engine. You can leave it with the default value, which is 12%."
     group: "Longhorn Default Settings"
     type: int
     min: 0
@@ -861,3 +861,11 @@ questions:
       - "rke1"
       - "rke2"
       - "k3s"
+  - variable: defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU
+    label: Guaranteed Instance Manager CPU for V2 Data Engine
+    description: "This integer value indicates how many millicpus on each node will be reserved for each instance manager Pod for v2 data engine. By default, the SPDK target daemon within an instance manager Pod utilizes 1 CPU core. Ensuring a minimum CPU usage is essential for sustaining engine and replica stability, especially during periods of high node workload."
+    group: "Longhorn Default Settings"
+    type: int
+    min: 0
+    max: 2000
+    default: 1250
diff --git a/chart/templates/default-setting.yaml b/chart/templates/default-setting.yaml
@@ -215,3 +215,6 @@ data:
     {{- if not (kindIs "invalid" .Values.defaultSettings.disableSnapshotPurge) }}
     disable-snapshot-purge: {{ .Values.defaultSettings.disableSnapshotPurge }}
     {{- end }}
+    {{- if not (kindIs "invalid" .Values.defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU) }}
+    v2-data-engine-guaranteed-instance-manager-cpu: {{ .Values.defaultSettings.v2DataEngineGuaranteedInstanceManagerCPU }}
+    {{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
@@ -290,7 +290,7 @@ defaultSettings:
   # -- This interval in seconds determines how long Longhorn will wait before re-downloading the backing image file
   # when all disk files of this backing image become failed or unknown.
   backingImageRecoveryWaitInterval: ~
-  # -- This integer value indicates how many percentages of the total allocatable CPU on each node will be reserved for each instance manager Pod.
+  # -- This integer value indicates how many percent of the total allocatable CPU on each node will be reserved for each instance manager Pod for v1 data engine.
   # You can leave it with the default value, which is 12%.
   guaranteedInstanceManagerCPU: ~
   # -- Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler.
@@ -333,6 +333,10 @@ defaultSettings:
   v2DataEngine: ~
   # -- This setting allows users to enable the offline replica rebuilding for volumes using v2 data engine.
   offlineReplicaRebuilding: ~
+  # -- This integer value indicates how many millicpus of CPU on each node will be reserved for each instance manager Pod for v2 data engine.
+  # You can leave it with the default value, which is 1250 millicpus.
+  v2DataEngineGuaranteedInstanceManagerCPU: ~
+  # -- Enabling this setting will notify Longhorn that the cluster is using Kubernetes Cluster Autoscaler.
   # -- Allow Scheduling Empty Node Selector Volumes To Any Node
   allowEmptyNodeSelectorVolume: ~
   # -- Allow Scheduling Empty Disk Selector Volumes To Any Disk

diff --git a/deploy/upgrade_responder_server/chart-values.yaml b/deploy/upgrade_responder_server/chart-values.yaml
@@ -280,6 +280,9 @@ configMap:
         "longhornSettingGuaranteedInstanceManagerCpu": {
           "dataType": "float"
         },
+        "longhornSettingV2DataEngineGuaranteedInstanceManagerCpu": {
+          "dataType": "float"
+        },
         "longhornSettingRecurringFailedJobsHistoryLimit": {
           "dataType": "float"
         },

diff --git a/dev/upgrade-responder/install.sh b/dev/upgrade-responder/install.sh
@@ -297,6 +297,9 @@ configMap:
         "longhornSettingGuaranteedInstanceManagerCpu": {
           "dataType": "float"
         },
+        "longhornSettingV2DataEngineGuaranteedInstanceManagerCpu": {
+          "dataType": "float"
+        },
         "longhornSettingRecurringFailedJobsHistoryLimit": {
           "dataType": "float"
         },