Commit

Add new index and cluster level settings to limit the total primary shards per node and per index (opensearch-project#17295)

* Added a new index-level setting to limit the total primary shards per node for an index. Added relevant unit and integration test files.

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* update files for code quality

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* moved primary shard count function to RoutingNode.java

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* removed unwanted files

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* added cluster level setting to limit total primary shards per node

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* allow the index level settings to be applied to both DOCUMENT and SEGMENT replication indices

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* Added the necessary validator to restrict the index- and cluster-level primary shards per node settings to remote store enabled clusters. Added relevant unit and integration tests.

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* refactoring changes

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* refactoring changes

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* Empty commit to rerun gradle test

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* optimised the calculation of total primary shards on a node

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* Refactoring changes

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* refactoring changes, added TODO to MetadataCreateIndexService

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

* Added an integration test for the scenario where the primary shards setting is applied to a cluster that is not remote store enabled

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>

---------

Signed-off-by: Divyansh Pandey <dpaandey@amazon.com>
Signed-off-by: Divyansh Pandey <98746046+pandeydivyansh1803@users.noreply.github.com>
Co-authored-by: Divyansh Pandey <dpaandey@amazon.com>
pandeydivyansh1803 and Divyansh Pandey authored Feb 25, 2025
1 parent 0714a1b commit bc209ee
Showing 16 changed files with 1,320 additions and 27 deletions.
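For reference, the two settings introduced here are [cluster.routing.allocation.total_primary_shards_per_node] (a dynamic cluster setting) and [index.routing.allocation.total_primary_shards_per_node] (an index setting); the validator added in this commit restricts both to remote store enabled clusters. The sketch below is a minimal, hypothetical usage example written in the style of the integration tests further down; it is not part of the commit's diff and assumes a remote store enabled cluster. The string keys are taken from the validation messages exercised in the new test file.

// Hypothetical sketch: cap primaries per node cluster-wide (dynamic setting)
client().admin()
    .cluster()
    .prepareUpdateSettings()
    .setTransientSettings(Settings.builder().put("cluster.routing.allocation.total_primary_shards_per_node", 2))
    .get();

// Hypothetical sketch: cap primaries per node for a single index at creation time
createIndex(
    "my-index",
    Settings.builder()
        .put(SETTING_NUMBER_OF_SHARDS, 3)
        .put(SETTING_NUMBER_OF_REPLICAS, 1)
        .put("index.routing.allocation.total_primary_shards_per_node", 1)
        .build()
);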
1 change: 1 addition & 0 deletions CHANGELOG-3.0.md
@@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add systemd configurations to strengthen OS core security ([#17107](https://github.com/opensearch-project/OpenSearch/pull/17107))
- Added pull-based Ingestion (APIs, for ingestion source, a Kafka plugin, and IngestionEngine that pulls data from the ingestion source) ([#16958](https://github.com/opensearch-project/OpenSearch/pull/16958))
- Added ConfigurationUtils to core for the ease of configuration parsing [#17223](https://github.com/opensearch-project/OpenSearch/pull/17223)
- Add cluster and index level settings to limit the total primary shards per node and per index [#17295](https://github.com/opensearch-project/OpenSearch/pull/17295)
- Add execution_hint to cardinality aggregator request (#[17312](https://github.com/opensearch-project/OpenSearch/pull/17312))
- Arrow Flight RPC plugin with Flight server bootstrap logic and client for internode communication ([#16962](https://github.com/opensearch-project/OpenSearch/pull/16962))
- Added offset management for the pull-based Ingestion ([#17354](https://github.com/opensearch-project/OpenSearch/pull/17354))
@@ -0,0 +1,305 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.cluster.routing.allocation.decider;

import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.routing.IndexRoutingTable;
import org.opensearch.cluster.routing.IndexShardRoutingTable;
import org.opensearch.cluster.routing.RoutingNode;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.routing.ShardRoutingState;
import org.opensearch.common.settings.Settings;
import org.opensearch.test.OpenSearchIntegTestCase;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
import static org.opensearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.CLUSTER_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING;
import static org.opensearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.CLUSTER_TOTAL_SHARDS_PER_NODE_SETTING;
import static org.opensearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.INDEX_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING;
import static org.opensearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE_SETTING;

@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 3)
public class ShardsLimitAllocationDeciderIT extends OpenSearchIntegTestCase {

@Override
protected Settings nodeSettings(int nodeOrdinal) {
return Settings.builder().put(super.nodeSettings(nodeOrdinal)).build();
}

public void testClusterWideShardsLimit() {
// Set the cluster-wide shard limit to 4
updateClusterSetting(CLUSTER_TOTAL_SHARDS_PER_NODE_SETTING.getKey(), 4);

// Create the first two indices with 3 shards and 1 replica each
createIndex("test1", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 3).put(SETTING_NUMBER_OF_REPLICAS, 1).build());
createIndex("test2", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 3).put(SETTING_NUMBER_OF_REPLICAS, 1).build());

// Create the third index with 2 shards and 1 replica
createIndex("test3", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 2).put(SETTING_NUMBER_OF_REPLICAS, 1).build());

// Wait for the shard limit to be applied
try {
assertBusy(() -> {
ClusterState state = client().admin().cluster().prepareState().get().getState();

// Check total number of shards
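// test1 and test2 each contribute 3 primaries + 3 replicas, test3 contributes 2 + 2, for 16 shard copies in total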
assertEquals(16, state.getRoutingTable().allShards().size());

// Check number of unassigned shards
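// 3 data nodes * 4 shards per node = 12 assignable copies, so 4 of the 16 remain unassigned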
int unassignedShards = state.getRoutingTable().shardsWithState(ShardRoutingState.UNASSIGNED).size();
assertEquals(4, unassignedShards);

// Check shard distribution across nodes
for (RoutingNode routingNode : state.getRoutingNodes()) {
assertTrue("Node exceeds shard limit", routingNode.numberOfOwningShards() <= 4);
}
});
} catch (Exception e) {
throw new RuntimeException(e);
}

// Additional assertions to verify shard distribution
ClusterState state = client().admin().cluster().prepareState().get().getState();
int totalAssignedShards = 0;
for (RoutingNode routingNode : state.getRoutingNodes()) {
totalAssignedShards += routingNode.numberOfOwningShards();
}
assertEquals("Total assigned shards should be 12", 12, totalAssignedShards);

}

public void testIndexSpecificShardLimit() {
// Set the index-specific shard limit to 2 for the first index only
Settings indexSettingsWithLimit = Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 4)
.put(SETTING_NUMBER_OF_REPLICAS, 1)
.put(INDEX_TOTAL_SHARDS_PER_NODE_SETTING.getKey(), 2)
.build();

Settings indexSettingsWithoutLimit = Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 4).put(SETTING_NUMBER_OF_REPLICAS, 1).build();

// Create the first index with 4 shards, 1 replica, and the index-specific limit
createIndex("test1", indexSettingsWithLimit);

// Create the second index with 4 shards and 1 replica, without the index-specific limit
createIndex("test2", indexSettingsWithoutLimit);

// Create the third index with 3 shards and 1 replica, without the index-specific limit
createIndex("test3", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 3).put(SETTING_NUMBER_OF_REPLICAS, 1).build());

try {
// Wait for the shard limit to be applied
assertBusy(() -> {
ClusterState state = client().admin().cluster().prepareState().get().getState();

// Check total number of shards
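// test1 and test2 each contribute 4 primaries + 4 replicas, test3 contributes 3 + 3, for 22 shard copies in total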
assertEquals(22, state.getRoutingTable().allShards().size());

// Check total number of assigned and unassigned shards
int totalAssignedShards = 0;
int totalUnassignedShards = 0;
Map<String, Integer> unassignedShardsByIndex = new HashMap<>();

for (IndexRoutingTable indexRoutingTable : state.getRoutingTable()) {
String index = indexRoutingTable.getIndex().getName();
int indexUnassignedShards = 0;

for (IndexShardRoutingTable shardRoutingTable : indexRoutingTable) {
for (ShardRouting shardRouting : shardRoutingTable) {
if (shardRouting.unassigned()) {
totalUnassignedShards++;
indexUnassignedShards++;
} else {
totalAssignedShards++;
}
}
}

unassignedShardsByIndex.put(index, indexUnassignedShards);
}

assertEquals("Total assigned shards should be 20", 20, totalAssignedShards);
assertEquals("Total unassigned shards should be 2", 2, totalUnassignedShards);

// Check unassigned shards for each index
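// test1 is capped at 2 shards per node, so at most 2 * 3 nodes = 6 of its 8 copies can be assigned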
assertEquals("test1 should have 2 unassigned shards", 2, unassignedShardsByIndex.get("test1").intValue());
assertEquals("test2 should have 0 unassigned shards", 0, unassignedShardsByIndex.get("test2").intValue());
assertEquals("test3 should have 0 unassigned shards", 0, unassignedShardsByIndex.get("test3").intValue());
});
} catch (Exception e) {
throw new RuntimeException(e);
}
}

public void testCombinedClusterAndIndexSpecificShardLimits() {
// Set the cluster-wide shard limit to 6
updateClusterSetting(CLUSTER_TOTAL_SHARDS_PER_NODE_SETTING.getKey(), 6);

// Create the first index with 3 shards, 1 replica, and index-specific limit of 1
Settings indexSettingsWithLimit = Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 3)
.put(SETTING_NUMBER_OF_REPLICAS, 1)
.put(INDEX_TOTAL_SHARDS_PER_NODE_SETTING.getKey(), 1)
.build();
createIndex("test1", indexSettingsWithLimit);

// Create the second index with 4 shards and 1 replica
createIndex("test2", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 4).put(SETTING_NUMBER_OF_REPLICAS, 1).build());

// Create the third index with 3 shards and 1 replica
createIndex("test3", Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 3).put(SETTING_NUMBER_OF_REPLICAS, 1).build());

try {
assertBusy(() -> {
ClusterState state = client().admin().cluster().prepareState().get().getState();

// Check total number of shards
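// test1 and test3 each contribute 3 primaries + 3 replicas, test2 contributes 4 + 4, for 20 shard copies in total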
assertEquals("Total shards should be 20", 20, state.getRoutingTable().allShards().size());

int totalAssignedShards = 0;
int totalUnassignedShards = 0;
Map<String, Integer> unassignedShardsByIndex = new HashMap<>();
Map<String, Integer> nodeShardCounts = new HashMap<>();
Map<String, Set<String>> indexShardsPerNode = new HashMap<>();

for (RoutingNode routingNode : state.getRoutingNodes()) {
String nodeName = routingNode.node().getName();
nodeShardCounts.put(nodeName, routingNode.numberOfOwningShards());
indexShardsPerNode.put(nodeName, new HashSet<>());

for (ShardRouting shardRouting : routingNode) {
indexShardsPerNode.get(nodeName).add(shardRouting.getIndexName());
}
}

for (IndexRoutingTable indexRoutingTable : state.getRoutingTable()) {
String index = indexRoutingTable.getIndex().getName();
int indexUnassignedShards = 0;

for (IndexShardRoutingTable shardRoutingTable : indexRoutingTable) {
for (ShardRouting shardRouting : shardRoutingTable) {
if (shardRouting.unassigned()) {
totalUnassignedShards++;
indexUnassignedShards++;
} else {
totalAssignedShards++;
}
}
}

unassignedShardsByIndex.put(index, indexUnassignedShards);
}

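// test1 may place at most 1 shard per node (3 assigned, 3 unassigned); the remaining 14 copies fit within the cluster-wide limit of 6 per node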
assertEquals("Total assigned shards should be 17", 17, totalAssignedShards);
assertEquals("Total unassigned shards should be 3", 3, totalUnassignedShards);
assertEquals("test1 should have 3 unassigned shards", 3, unassignedShardsByIndex.get("test1").intValue());
assertEquals("test2 should have 0 unassigned shards", 0, unassignedShardsByIndex.getOrDefault("test2", 0).intValue());
assertEquals("test3 should have 0 unassigned shards", 0, unassignedShardsByIndex.getOrDefault("test3", 0).intValue());

// Check shard distribution across nodes
List<Integer> shardCounts = new ArrayList<>(nodeShardCounts.values());
Collections.sort(shardCounts, Collections.reverseOrder());
assertEquals("Two nodes should have 6 shards", 6, shardCounts.get(0).intValue());
assertEquals("Two nodes should have 6 shards", 6, shardCounts.get(1).intValue());
assertEquals("One node should have 5 shards", 5, shardCounts.get(2).intValue());

// Check that every node holds a shard from test1 (the per-node limit of 1 allows at most one each)
for (Set<String> indexesOnNode : indexShardsPerNode.values()) {
assertTrue("Each node should have a shard from test1", indexesOnNode.contains("test1"));
}
});
} catch (Exception e) {
throw new RuntimeException(e);
}
}

/**
* Integration test to verify the behavior of INDEX_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING
* in a non-remote store environment.
*
* Scenario:
* An end-user attempts to create an index with INDEX_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING
* on a cluster where remote store is not enabled.
*
* Expected Outcome:
* The system should reject the index creation request and throw an appropriate exception,
* indicating that this setting is only applicable for remote store enabled clusters.
*/
public void testIndexTotalPrimaryShardsPerNodeSettingWithoutRemoteStore() {
// Attempt to create an index with INDEX_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING
Settings indexSettings = Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 3)
.put(SETTING_NUMBER_OF_REPLICAS, 1)
.put(INDEX_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING.getKey(), 1)
.build();

// Assert that creating the index throws an exception
IllegalArgumentException exception = expectThrows(
IllegalArgumentException.class,
() -> { createIndex("test_index", indexSettings); }
);

// Verify the exception message
assertTrue(
"Exception should mention that the setting requires remote store",
exception.getMessage()
.contains(
"Setting [index.routing.allocation.total_primary_shards_per_node] can only be used with remote store enabled clusters"
)
);
}

/**
* Integration test to verify the behavior of CLUSTER_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING
* in a non-remote store environment.
*
* Scenario:
* An end-user attempts to apply CLUSTER_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING
* on a cluster where remote store is not enabled.
*
* Expected Outcome:
* The system should reject the cluster settings update and throw an appropriate exception,
* indicating that this setting is only applicable for remote store enabled clusters.
*/
public void testClusterTotalPrimaryShardsPerNodeSettingWithoutRemoteStore() {
IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> {
updateClusterSetting(CLUSTER_TOTAL_PRIMARY_SHARDS_PER_NODE_SETTING.getKey(), 1);
});

// Verify the exception message
assertTrue(
"Exception should mention that the setting requires remote store",
exception.getMessage()
.contains(
"Setting [cluster.routing.allocation.total_primary_shards_per_node] can only be used with remote store enabled clusters"
)
);

// Creating an index with the non-primary INDEX_TOTAL_SHARDS_PER_NODE_SETTING should still succeed without remote store
Settings indexSettings = Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 3)
.put(SETTING_NUMBER_OF_REPLICAS, 1)
.put(INDEX_TOTAL_SHARDS_PER_NODE_SETTING.getKey(), 1)
.build();

createIndex("test_index", indexSettings);
}

private void updateClusterSetting(String setting, int value) {
client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put(setting, value)).get();
}
}