Fix uneven GPU memory usage across devices when using multiple devices

Presburger · Presburger · commit 696defee3008 · 2024-07-03T16:37:30.000+08:00
Signed-off-by: yusheng.ma <yusheng.ma@zilliz.com>
diff --git a/src/common/raft/integration/raft_knowhere_index.cuh b/src/common/raft/integration/raft_knowhere_index.cuh
@@ -626,7 +626,24 @@ struct raft_knowhere_index<IndexKind>::impl {
     }
 
     auto static deserialize(std::istream& is) {
-        auto new_device_id = select_device_id();
+        auto static device_count = []() {
+            auto result = 0;
+            RAFT_CUDA_TRY(cudaGetDeviceCount(&result));
+            RAFT_EXPECTS(result != 0, "No CUDA devices found");
+            return result;
+        }();
+        // The lazy allocation mode cannot completely eliminate uneven distribution, but it can alleviate it well.
+        int new_device_id = 0;
+        size_t free, total;
+        size_t max_free = 0;
+        for (int i = 0; i < device_count; ++i) {
+            auto scoped_device = raft::device_setter{i};
+            RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total));
+            if (max_free < free) {
+                max_free = free;
+                new_device_id = i;
+            }
+        }
         auto scoped_device = raft::device_setter{new_device_id};
         auto const& res = get_device_resources_without_mempool();
         auto des_index = raft_index_type::template deserialize<data_type, indexing_type>(res, is);
diff --git a/src/index/gpu_raft/gpu_raft.h b/src/index/gpu_raft/gpu_raft.h
@@ -41,7 +41,7 @@
 
 namespace knowhere {
 
-auto static constexpr cuda_concurrent_size_per_device = std::uint32_t{8};
+auto static constexpr cuda_concurrent_size_per_device = std::uint32_t{4};
 
 template <raft_proto::raft_index_kind K>
 struct KnowhereConfigType {};