-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add toolkit test method for metax memory bandwidth (#728)
* Update case_config.yaml best config for metax P2P intraserver * best config for metax , Update case_config.yaml * Update case_config.yaml best config for metax FP16 TEST * best config for metax, Update case_config.yaml * upload memory bandwidth toolkit --------- Co-authored-by: Hodoryu <yixiong.yu@metax-tech.com>
- Loading branch information
Showing
3 changed files
with
120 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# 参评AI芯片信息 | ||
|
||
* 厂商:Metax | ||
|
||
|
||
* 产品名称:C550 | ||
* 产品型号:曦云®C550 64G | ||
* TDP:450W | ||
|
||
# 所用服务器配置 | ||
|
||
* 服务器数量:1 | ||
|
||
|
||
* 单服务器内使用卡数:2 | ||
* 服务器型号:OAM C550-1500 | ||
* 操作系统版本:Ubuntu 20.04.6 LTS | ||
* 操作系统内核:linux5.15.0-58-generic | ||
* CPU:Intel(R) Xeon(R) Platinum 8480+ | ||
* docker版本:24.0.7 | ||
* 内存:2TiB | ||
* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 | ||
|
||
# 评测结果 | ||
|
||
## 核心评测结果 | ||
|
||
| 评测项 | 主存储带宽测试值(8卡平均) | 主存储带宽标定值(8卡平均) | 测试标定比例(8卡平均) | | ||
| ---- | -------------- | -------------- | ------------ | | ||
| 评测结果 | 1495.70GB/s | 1.8TB/s | 83.09% | | ||
|
||
## 能耗监控结果 | ||
|
||
| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(8卡平均) | 单卡最大功耗(8卡最大) | 单卡功耗标准差(8卡最大) | 单卡TDP | | ||
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | | ||
| 监控结果 | 4284.0W | 4284.0W | 0.0W | / | 209.0W | 317.0W | 108.0W | 450W | | ||
|
||
## 其他重要监控结果 | ||
|
||
| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(8卡平均) | 单卡平均显存占用(8卡平均) | | ||
| ---- | --------- | -------- | ------------ | -------------- | | ||
| 监控结果 | 0.076% | 0.496% | 39.0°C | 51.579% | | ||
|
||
|
||
# 厂商测试工具原理说明 | ||
|
||
使用cuda核函数,进行读+写AI芯片主存储操作,计算AI芯片主存储带宽 |
66 changes: 66 additions & 0 deletions
66
base/toolkits/main_memory-bandwidth/metax/C550/bandwidth.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright (c) 2024 BAAI. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License") | ||
|
||
#include <stdio.h> | ||
#include <cuda_runtime.h> | ||
|
||
#define GB (1024ULL * 1024ULL * 1024ULL) | ||
#define SIZE (16ULL * GB) | ||
#define WARMUP_ITERATIONS 100 | ||
#define ITERATIONS 1000 | ||
|
||
// Terminates the program when a CUDA runtime call reports a failure.
//
//   err  status code returned by the CUDA call being checked
//   msg  short label printed in front of the runtime's error description
//
// Returns normally when err is cudaSuccess. Otherwise writes
// "CUDA Error: <msg>: <description>" to stderr and exits with EXIT_FAILURE.
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess) {
        return;
    }

    const char *description = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error: %s: %s\n", msg, description);
    exit(EXIT_FAILURE);
}
|
||
// Copies a buffer from d_src to d_dst, one double (8 bytes) per thread.
//
// Launch layout: 1D grid of 1D blocks; the thread with flat index i copies
// element i. Threads whose index falls past the end of the buffer do nothing.
//
//   d_dst  destination buffer, accessed as an array of double
//   d_src  source buffer, accessed as an array of double
//   size   length of each buffer in BYTES (the caller in this file passes the
//          same byte count it handed to cudaMalloc). Any trailing
//          size % sizeof(double) bytes are not copied.
__global__ void copyKernel(void* d_dst, const void* d_src, size_t size) {
    // The guard must be expressed in elements, not bytes: comparing the
    // element index against the byte count would let indices up to 8x past
    // the end of the buffers through.
    const size_t numElem = size / sizeof(double);

    // Widen before multiplying; blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and wraps for very large grids.
    const size_t offset = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (offset < numElem) {
        ((double*)d_dst)[offset] = ((const double*)d_src)[offset];
    }
}
|
||
// Measures device main-memory bandwidth with a device-to-device copy kernel.
//
// Allocates two SIZE-byte device buffers, runs WARMUP_ITERATIONS untimed
// copies followed by ITERATIONS copies bracketed by CUDA events, and prints
// the resulting bandwidth in GiB/s and GB/s. Any CUDA failure aborts the
// program through checkCudaError.
//
// Returns 0 on success.
int main() {
    double *d_src, *d_dst;
    cudaEvent_t start, end;
    float elapsed_time;

    checkCudaError(cudaMalloc(&d_src, SIZE), "cudaMalloc");
    checkCudaError(cudaMalloc(&d_dst, SIZE), "cudaMalloc");

    checkCudaError(cudaEventCreate(&start), "cudaEventCreate");
    checkCudaError(cudaEventCreate(&end), "cudaEventCreate");

    // One thread per double; ceil-div so the grid covers every element.
    const int threadsPerBlock = 1024;
    const size_t numElem = SIZE / sizeof(double);
    const size_t numBlocks = (numElem + threadsPerBlock - 1) / threadsPerBlock;

    // Refuse to launch if the block count does not survive the narrowing to
    // the 32-bit grid dimension; a silently truncated grid would copy only
    // part of the buffer and inflate the reported bandwidth.
    const unsigned int blocksPerGrid = (unsigned int)numBlocks;
    if ((size_t)blocksPerGrid != numBlocks) {
        fprintf(stderr, "CUDA Error: %s: %s\n", "launch config",
                "grid dimension does not fit in 32 bits");
        exit(EXIT_FAILURE);
    }

    // Untimed warm-up passes.
    for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, SIZE);
    }
    // Kernel launches do not return a status; pick up launch-configuration
    // errors here and execution errors at the synchronize.
    checkCudaError(cudaGetLastError(), "copyKernel launch (warmup)");
    checkCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Timed passes. The end event is recorded immediately after the last
    // launch so the measured interval is device time between the two events
    // and does not include a host-side device synchronization.
    checkCudaError(cudaEventRecord(start), "cudaEventRecord");
    for (int i = 0; i < ITERATIONS; ++i) {
        copyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_dst, d_src, SIZE);
    }
    checkCudaError(cudaEventRecord(end), "cudaEventRecord");
    checkCudaError(cudaGetLastError(), "copyKernel launch");
    checkCudaError(cudaEventSynchronize(end), "cudaEventSynchronize");

    // Elapsed time between the two events, in milliseconds.
    checkCudaError(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");
    if (elapsed_time <= 0.0f) {
        fprintf(stderr, "CUDA Error: %s: %s\n", "cudaEventElapsedTime",
                "non-positive elapsed time");
        exit(EXIT_FAILURE);
    }

    // Each pass reads SIZE bytes and writes SIZE bytes, hence the factor 2.
    // elapsed_time is in ms, so divide by 1000 to get bytes per second.
    double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0);

    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
    printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

    checkCudaError(cudaFree(d_src), "cudaFree");
    checkCudaError(cudaFree(d_dst), "cudaFree");
    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy");
    checkCudaError(cudaEventDestroy(end), "cudaEventDestroy");

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Build and run the main-memory bandwidth test with the MACA toolchain.

# Toolchain locations, all derived from the MACA install root.
export MACA_PATH=/opt/maca
export CUDA_PATH=$MACA_PATH/tools/cu-bridge
export MACA_CLANG_PATH=$MACA_PATH/mxgpu_llvm/bin
export LD_LIBRARY_PATH=./:$MACA_PATH/lib:$LD_LIBRARY_PATH
export PATH=$CUDA_PATH/bin:$MACA_CLANG_PATH:$PATH

# Run the benchmark only if compilation succeeded, so a stale ./bdtest left
# over from an earlier build is never measured after a failed compile.
cucc bandwidth.cu -lcublas -o bdtest && ./bdtest