Skip to content

Commit

Permalink
add toolkit test method for metax memory bandwidth (#728)
Browse files Browse the repository at this point in the history
* Update case_config.yaml

best config for metax P2P intraserver

* best config for metax , Update case_config.yaml

* Update case_config.yaml  best config for metax FP16 TEST

* best config for metax, Update case_config.yaml

* upload memory bandwidth toolkit

---------

Co-authored-by: Hodoryu <yixiong.yu@metax-tech.com>
  • Loading branch information
Hodoryu and Hodoryu authored Sep 3, 2024
1 parent e5393a8 commit bda04b0
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 0 deletions.
47 changes: 47 additions & 0 deletions base/toolkits/main_memory-bandwidth/metax/C550/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# 参评AI芯片信息

* 厂商:Metax


* 产品名称:C550
* 产品型号:曦云®C550 64G
* TDP:450W

# 所用服务器配置

* 服务器数量:1


* 单服务器内使用卡数:2
* 服务器型号:OAM C550-1500
* 操作系统版本:Ubuntu 20.04.6 LTS
* 操作系统内核:linux5.15.0-58-generic
* CPU:Intel(R) Xeon(R) Platinum 8480+
* docker版本:24.0.7
* 内存:2TiB
* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信

# 评测结果

## 核心评测结果

| 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) |
| ---- | -------------- | -------------- | ------------ |
| 评测结果 | 1495.70GB/s | 1.8TB/s | 83.09% |

## 能耗监控结果

| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP |
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- |
| 监控结果 | 4284.0W | 4284.0W | 0.0W | / | 209.0W | 317.0W | 108.0W | 450W |

## 其他重要监控结果

| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) |
| ---- | --------- | -------- | ------------ | -------------- |
| 监控结果 | 0.076% | 0.496% | 39.0°C | 51.579% |


# 厂商测试工具原理说明

使用CUDA核函数,对AI芯片主存储进行读+写操作,并根据耗时计算AI芯片主存储带宽
66 changes: 66 additions & 0 deletions base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) 2024 BAAI. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License")

#include <stdio.h>
#include <cuda_runtime.h>

// Bytes in one binary gigabyte (2^30, i.e. 1 GiB).
#define GB (1024ULL * 1024ULL * 1024ULL)
// Size in bytes of EACH device buffer (source and destination are allocated separately).
#define SIZE (16ULL * GB)
// Number of untimed kernel launches issued before the measurement starts.
#define WARMUP_ITERATIONS 100
// Number of timed kernel launches the reported bandwidth is computed over.
#define ITERATIONS 1000

// Terminates the program when a CUDA runtime call has failed.
//
// err: status code returned by the CUDA runtime call being checked.
// msg: short label for that call; printed ahead of the runtime's error text.
//
// Returns normally on cudaSuccess. Otherwise writes a diagnostic to stderr
// and exits with EXIT_FAILURE.
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;
    }

    const char *reason = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error: %s: %s\n", msg, reason);
    exit(EXIT_FAILURE);
}

// Copies one double per thread from d_src to d_dst.
//
// Launch layout: 1D grid of 1D blocks; the thread with global index i handles
// element i, so the grid must provide at least `size` threads to cover the
// whole buffer. No shared memory is used.
//
// d_dst: destination buffer, accessed as an array of double.
// d_src: source buffer, accessed as an array of double.
// size:  upper bound for the element index. It is compared against an index
//        into double elements, so it has to be an element count (not a byte
//        count) for the guard to protect the buffer tail.
__global__ void copyKernel(void* d_dst, const void* d_src, size_t size) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and silently wraps once the product
    // reaches 2^32.
    const size_t offset = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (offset < size) {
        ((double*)d_dst)[offset] = ((const double*)d_src)[offset];
    }
}

// Measures device main-memory bandwidth with a device-to-device copy kernel.
//
// Allocates two SIZE-byte device buffers, launches copyKernel
// WARMUP_ITERATIONS times untimed, then times ITERATIONS launches with CUDA
// events and prints the resulting bandwidth in GiB/s and GB/s.
//
// Returns 0 on success; any CUDA failure terminates the process through
// checkCudaError.
int main() {
    double *d_src, *d_dst;
    cudaEvent_t start, end;
    float elapsed_time;

    checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc");
    checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc");

    checkCudaError(cudaEventCreate(&start), "cudaEventCreate");
    checkCudaError(cudaEventCreate(&end), "cudaEventCreate");

    const int threadsPerBlock = 1024;
    const size_t numElem = SIZE / sizeof(double);
    const size_t numBlocks = (numElem + threadsPerBlock - 1) / threadsPerBlock;
    // The launch below takes the grid size as an int; refuse to truncate silently.
    if (numBlocks > 0x7fffffffULL) {
        fprintf(stderr, "grid size %zu does not fit in an int\n", numBlocks);
        exit(EXIT_FAILURE);
    }
    const int blocksPerGrid = static_cast<int>(numBlocks);

    // Untimed warm-up launches. The kernel guard compares an element index,
    // so the element count (not the byte count) is passed.
    for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, numElem);
    }
    // Kernel launches return no status themselves: pick up launch-configuration
    // errors explicitly, then wait so the warm-up cannot leak into the timing.
    checkCudaError(cudaGetLastError(), "copyKernel launch (warmup)");
    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    checkCudaError(cudaEventRecord(start), "cudaEventRecord");
    for (int i = 0; i < ITERATIONS; ++i) {
        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, numElem);
    }
    // Record the end event directly behind the last launch so the measured
    // interval covers only the queued kernels, then block on that event.
    checkCudaError(cudaEventRecord(end), "cudaEventRecord");
    checkCudaError(cudaGetLastError(), "copyKernel launch");
    checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize");

    // cudaEventElapsedTime reports milliseconds.
    checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");

    // Every iteration reads SIZE bytes and writes SIZE bytes, hence the factor 2.
    double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0);

    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

    checkCudaError(cudaFree(d_src), "cudaFree");
    checkCudaError(cudaFree(d_dst), "cudaFree");
    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy");
    checkCudaError(cudaEventDestroy(end), "cudaEventDestroy");

    return 0;
}
7 changes: 7 additions & 0 deletions base/toolkits/main_memory-bandwidth/metax/C550/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Builds bandwidth.cu with cucc and runs the resulting binary.

# Toolchain locations, all derived from the MACA install root.
export MACA_PATH=/opt/maca
export CUDA_PATH=$MACA_PATH/tools/cu-bridge
export MACA_CLANG_PATH=$MACA_PATH/mxgpu_llvm/bin
# Make the MACA runtime libraries and the toolchain binaries discoverable.
export LD_LIBRARY_PATH=./:$MACA_PATH/lib:$LD_LIBRARY_PATH
export PATH=$CUDA_PATH/bin:$MACA_CLANG_PATH:$PATH
# Run the benchmark only if compilation succeeded, so that a stale or missing
# bdtest is never measured after a failed build.
cucc bandwidth.cu -lcublas -o bdtest && ./bdtest

0 comments on commit bda04b0

Please sign in to comment.