add toolkit test method for metax memory bandwidth (#728)

* Update case_config.yaml best config for metax P2P intraserver * best config for metax , Update case_config.yaml * Update case_config.yaml best config for metax FP16 TEST * best config for metax, Update case_config.yaml * upload memory bandwidth toolkit --------- Co-authored-by: Hodoryu <yixiong.yu@metax-tech.com>
FlagOpen · Sep 3, 2024 · bda04b0 · bda04b0
1 parent e5393a8
commit bda04b0
Show file tree

Hide file tree

Showing 3 changed files with 120 additions and 0 deletions.
diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/README.md b/base/toolkits/main_memory-bandwidth/metax/C550/README.md
@@ -0,0 +1,47 @@
+# 参评AI芯片信息
+
+* 厂商：Metax
+
+
+* 产品名称：C550
+* 产品型号：曦云®C550 64G
+* TDP：450W
+
+# 所用服务器配置
+
+* 服务器数量：1
+
+
+* 单服务器内使用卡数：2
+* 服务器型号：OAM C550-1500
+* 操作系统版本：Ubuntu 20.04.6 LTS
+* 操作系统内核：linux5.15.0-58-generic
+* CPU：Intel(R) Xeon(R) Platinum 8480+
+* docker版本：24.0.7
+* 内存：2TiB
+* 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
+
+# 评测结果
+
+## 核心评测结果
+
+| 评测项  | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) |
+| ---- | -------------- | -------------- | ------------ |
+| 评测结果 | 1495.70GB/s    | 1.8TB/s       | 83.09%        |
+
+## 能耗监控结果
+
+| 监控项  | 系统平均功耗  | 系统最大功耗  | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP |
+| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- |
+| 监控结果 | 4284.0W | 4284.0W | 0.0W    | /     | 209.0W       | 317.0W       | 108.0W        | 450W  |
+
+## 其他重要监控结果
+
+| 监控项  | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) |
+| ---- | --------- | -------- | ------------ | -------------- |
+| 监控结果 | 0.076%    | 0.496%   | 39.0°C      | 51.579%        |
+
+
+# 厂商测试工具原理说明
+
+使用cuda核函数，进行读+写AI芯片主存储操作，计算AI芯片主存储带宽
diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu b/base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu
@@ -0,0 +1,66 @@
+// Copyright (c) 2024 BAAI. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+#define GB (1024ULL * 1024ULL * 1024ULL)
+#define SIZE (16ULL * GB)
+#define WARMUP_ITERATIONS 100
+#define ITERATIONS 1000
+
+// Terminates the program when a CUDA runtime call reports failure.
+//
+// err: status code returned by the CUDA runtime call under check.
+// msg: short label (the call sites in this file pass the API name) that is
+//      printed in front of the runtime's error description.
+//
+// On success this is a no-op. On failure it writes
+// "CUDA Error: <msg>: <description>" to stderr and exits with EXIT_FAILURE.
+void checkCudaError(cudaError_t err, const char *msg) {
+    if (err == cudaSuccess) {
+        return;
+    }
+
+    const char *description = cudaGetErrorString(err);
+    fprintf(stderr, "CUDA Error: %s: %s\n", msg, description);
+    exit(EXIT_FAILURE);
+}
+
+// Copies a device buffer from d_src to d_dst, one 8-byte (double) element per
+// thread.
+//
+// d_dst: destination buffer; must hold at least `size` bytes.
+// d_src: source buffer; must hold at least `size` bytes.
+// size:  length of the buffers in BYTES (the call sites in this file pass the
+//        byte size used for cudaMalloc). A trailing partial element, when
+//        `size` is not a multiple of sizeof(double), is not copied.
+//
+// Launch layout: 1D grid of 1D blocks providing at least
+// size / sizeof(double) threads in total; surplus threads do nothing.
+// NOTE(review): both pointers are accessed as double*, so they are assumed to
+// be 8-byte aligned -- confirm for any caller that does not use cudaMalloc.
+__global__ void copyKernel(void* d_dst, const void* d_src, size_t size) {
+    // The guard must be expressed in elements, not bytes: indexing below is in
+    // units of double, so comparing against the raw byte count would let
+    // threads run up to 8x past the end of the buffers.
+    const size_t numElem = size / sizeof(double);
+
+    // Widen before multiplying; blockIdx.x * blockDim.x is otherwise computed
+    // in 32-bit unsigned arithmetic and wraps once the product reaches 2^32.
+    const size_t offset = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (offset < numElem) {
+        ((double*)d_dst)[offset] = ((const double*)d_src)[offset];
+    }
+}
+
+// Measures device-memory copy bandwidth and prints it in FlagPerf result format.
+//
+// Allocates two SIZE-byte device buffers, launches copyKernel
+// WARMUP_ITERATIONS times untimed, then times ITERATIONS launches with CUDA
+// events. Each launch reads SIZE bytes and writes SIZE bytes, so the traffic
+// counted is 2 * SIZE * ITERATIONS bytes.
+//
+// Returns 0 on success. Any CUDA failure terminates the process through
+// checkCudaError; a non-positive elapsed time returns EXIT_FAILURE.
+int main() {
+    double *d_src, *d_dst;
+    cudaEvent_t start, end;
+    float elapsed_time;  // milliseconds, as filled in by cudaEventElapsedTime
+
+    // The buffers are never initialized or read back: only the data movement
+    // is timed, the contents are not inspected.
+    checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc");
+    checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc");
+
+    checkCudaError(cudaEventCreate(&start), "cudaEventCreate");
+    checkCudaError(cudaEventCreate(&end), "cudaEventCreate");
+
+    // One thread per double; ceil-divide so the grid covers every element.
+    const int threadsPerBlock = 1024;
+    const size_t numElem = SIZE / sizeof(double);
+    const size_t numBlocks = (numElem + threadsPerBlock - 1) / threadsPerBlock;
+    // Explicit narrowing: the launch configuration takes a 32-bit grid size.
+    const int blocksPerGrid = (int)numBlocks;
+
+    // Untimed warm-up launches. copyKernel receives the buffer size in bytes.
+    for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
+        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, SIZE);
+    }
+    // A kernel launch returns no status of its own; a rejected launch is only
+    // visible through cudaGetLastError, and a fault during execution only at
+    // the next synchronizing call. Without these checks a failed launch would
+    // finish "instantly" and be reported as an enormous bandwidth.
+    checkCudaError(cudaGetLastError(), "copyKernel warm-up launch");
+    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
+
+    // Timed section.
+    checkCudaError(cudaEventRecord(start), "cudaEventRecord");
+    for (int i = 0; i < ITERATIONS; ++i) {
+        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, SIZE);
+    }
+    checkCudaError(cudaGetLastError(), "copyKernel timed launch");
+    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
+    checkCudaError(cudaEventRecord(end), "cudaEventRecord");
+    checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize");
+
+    checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");
+
+    // Guard the division below; a zero or negative interval means the timed
+    // work did not actually run.
+    if (!(elapsed_time > 0.0f)) {
+        fprintf(stderr, "Invalid elapsed time: %f ms\n", elapsed_time);
+        return EXIT_FAILURE;
+    }
+
+    // Bytes per second: (read + write) traffic over elapsed seconds.
+    double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0);
+
+    // Reported twice: binary (GiB, 1024^3) and decimal (GB, 1000^3) units.
+    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
+    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));
+
+    checkCudaError(cudaFree(d_src), "cudaFree");
+    checkCudaError(cudaFree(d_dst), "cudaFree");
+    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy");
+    checkCudaError(cudaEventDestroy(end), "cudaEventDestroy");
+
+    return 0;
+}
diff --git a/base/toolkits/main_memory-bandwidth/metax/C550/main.sh b/base/toolkits/main_memory-bandwidth/metax/C550/main.sh
@@ -0,0 +1,7 @@
+# Build and run the main-memory bandwidth test with the MACA toolchain.
+# All toolchain locations below are derived from MACA_PATH.
+export MACA_PATH=/opt/maca
+export CUDA_PATH=$MACA_PATH/tools/cu-bridge
+export MACA_CLANG_PATH=$MACA_PATH/mxgpu_llvm/bin
+export LD_LIBRARY_PATH=./:$MACA_PATH/lib:$LD_LIBRARY_PATH
+export PATH=$CUDA_PATH/bin:$MACA_CLANG_PATH:$PATH
+# Run the benchmark only when compilation succeeds, so a failed build can never
+# report results from a stale ./bdtest left over from an earlier run.
+cucc bandwidth.cu -lcublas -o bdtest && ./bdtest