diff --git a/base/toolkits/computation-BF16/mthreads/S4000/README.md b/base/toolkits/computation-BF16/mthreads/S4000/README.md new file mode 100644 index 000000000..5498c00c4 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | BF16算力测试值 | BF16算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用GEMM算子进行computation-bound的计算任务,从而测得实际BF16算力 diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh new file mode 100644 index 000000000..09f681943 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh @@ -0,0 +1,56 @@ +#pragma once +#include "benchmark_bf16.h" + +template +float Benchmark::RunKernel(T func, dim3 block_num, dim3 block_size, uint iters, + Args... args) { + // unit micro secend + float timed = 0, timed_min = 0; + + // Dummy calls + for (uint i = 0; i < 2; i++) { + func <<>> (args...); + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + + if (use_event_timer) { + MUSAEvent musaEvent(&timed); + for (uint i = 0; i < iters; i++) { + func <<>> (args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + else // std timer + { + Timer timer(&timed); + for (uint i = 0; i < iters; i++) { + func <<>> (args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + timed_min = timed; + // double check + if (use_event_timer) { + MUSAEvent musaEvent(&timed); + for (uint i = 0; i < iters; i++) { + func <<>> (args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + else // std timer + { + Timer timer(&timed); + for (uint i = 0; i < iters; i++) { + func <<>> (args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + timed_min = std::min(timed_min, timed); + + return (timed_min / static_cast(iters)); +} + +template +struct Int2Type { + enum { VALUE = A }; +}; \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h new file mode 100644 index 000000000..f05c92b8a --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +#define DEFAULT_BANDWIDTH_MEM_SIZE 2048 +#define DEFAULT_BANDWIDTH_ITERS 30 + +typedef enum class BANDWIDTH_MODE { + ALL, + READ_ONLY, + WRITE_ONLY, + READ_WRITE +} BW_MODE_T; + +typedef enum class OFFSET_MODE { LOCAL, GLOBAL } OFFSET_MODE_T; + +typedef enum GPU_ARCH { + MP_10 = 10, + MP_21 = 21, + MP_22 = 22, + MP_31 = 31 +} GPU_ARCH_T; + +class Benchmark { +public: + // devices + int specified_device; + const char* specified_device_name; + const char* specified_type_name; + bool force_device; + bool force_device_name; + bool force_type; + // option + bool all_cases; + bool use_event_timer; + BW_MODE_T bandwidth_mode; + int bandwidth_mem_size; + int bandwidth_iters; + + logger* log; + + Benchmark(); + ~Benchmark(); + + template + float RunKernel(T func, dim3 block_num, dim3 block_size, uint iters, + Args... args); + + int RunComputeMMABF16(device_info_t& dev_info); + + int RunBF16Test(); +}; \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/common.h b/base/toolkits/computation-BF16/mthreads/S4000/include/common.h new file mode 100644 index 000000000..3bb25b966 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/include/common.h @@ -0,0 +1,117 @@ +#pragma once + +#if defined(__APPLE__) || defined(__MACOSX) || defined(__FreeBSD__) +#include +#endif +#include "musa_runtime.h" +#include +#include + +#define TAB " " +#define NEWLINE "\n" +#ifndef __FreeBSD__ +#define uint unsigned int +#endif +#define ulong unsigned long + +#if defined(__APPLE__) || defined(__MACOSX) +#define OS_NAME "Macintosh" +#elif defined(__ANDROID__) +#define OS_NAME "Android" +#elif defined(_WIN32) +#if defined(_WIN64) +#define OS_NAME "Win64" +#else +#define OS_NAME "Win32" +#endif +#elif defined(__linux__) +#if defined(__x86_64__) +#define OS_NAME "Linux x64" +#elif defined(__i386__) +#define OS_NAME "Linux x86" +#elif defined(__arm__) +#define OS_NAME "Linux ARM" +#elif defined(__aarch64__) +#define OS_NAME "Linux ARM64" +#else +#define OS_NAME "Linux unknown" +#endif +#elif defined(__FreeBSD__) +#define OS_NAME "FreeBSD" +#else +#define OS_NAME "Unknown" +#endif + +int check_musa_error(musaError _err, int line, const char* func_name); + +#define _PERF_CHECK_MUSA_ERROR_INNER(cond, func, line) \ + do { \ + if (check_musa_error(cond, line, func)) \ + exit(1); \ + } while (0) + +#define CHECK_MUSA_ERROR(cond) \ + _PERF_CHECK_MUSA_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__) + +typedef struct { + std::string device_name; + std::string driver_version; + int device_arch; + + uint num_compute_units; + uint max_work_group_size; + uint64_t max_alloc_size; + uint64_t max_global_size; + uint max_clock_freq; + + bool half_supported; + bool double_supported; + bool imma_supported; + + // Test specific options + uint bw_global_iters; + uint bw_shmem_iters; + uint64_t bw_global_max_size; + uint64_t bw_shmem_max_size; + uint compute_work_groups_per_cu; + uint compute_dp_work_groups_per_cu; + uint shmem_work_groups_per_cu; + uint compute_iters; + uint bw_transfer_iters; + uint kernel_latency_iters; + uint64_t bw_transfer_max_size; + std::string extension; +} device_info_t; + +class Timer { +public: + explicit Timer(float* dur); + ~Timer(); + +private: + float* duration_us; + std::chrono::high_resolution_clock::time_point tick; + std::chrono::high_resolution_clock::time_point tock; +}; + +class MUSAEvent { +public: + explicit MUSAEvent(float* dur); + ~MUSAEvent(); + +private: + float* duration_us; + musaEvent_t startTime; + musaEvent_t stopTime; +}; + +int GetDeviceInfo(int dev, device_info_t* dev_info); + +// Round down to next multiple of the given base with an optional maximum value +uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, + uint64_t maxValue = UINT64_MAX); + +void Populate(float* ptr, uint64_t N); +void Populate(double* ptr, uint64_t N); + +void TrimString(std::string& str); \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh b/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh new file mode 100644 index 000000000..b96f930cb --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh @@ -0,0 +1,45 @@ +#pragma once +#include "benchmark.muh" +#include + +#define UNROLL_NUM 16 +#define ITERS 256 +using namespace mtmusa; + +#define MMA_4(a, b, c) \ + wmma::mma_sync(c, a, b, c); \ + wmma::mma_sync(c, a, b, c); \ + wmma::mma_sync(c, a, b, c); \ + wmma::mma_sync(c, a, b, c); + +#define MMA_16(a, b, c) \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); + +template +__device__ void compute_mma_bf16_impl(void* ptr, Int2Type) { + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment c_frag; + // #pragma unroll + for (int i = 0; i < ITERS; i++) { + MMA_16(a_frag, b_frag, c_frag); + } + wmma::store_matrix_sync((float*)ptr, c_frag, M, wmma::mem_col_major); +} + +template +__device__ void compute_mma_bf16_impl(void* ptr, Int2Type) {} + +template +__global__ void compute_mma_bf16(void* ptr) { +#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220) + compute_mma_bf16_impl(ptr, Int2Type<(ARCH == MP_22)>()); +#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310) + compute_mma_bf16_impl(ptr, Int2Type<(ARCH == MP_31)>()); +#endif +} \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h b/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h new file mode 100644 index 000000000..46856f87c --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h @@ -0,0 +1,16 @@ +#pragma once +#include +#include +class logger { + std::ofstream outFile; + std::string FileName; +public: + logger(); + ~logger(); + + void print(std::string str); + void print(double val); + void print(float val); + void print(int val); + void print(unsigned int val); +}; \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/main.sh b/base/toolkits/computation-BF16/mthreads/S4000/main.sh new file mode 100644 index 000000000..fe45a4ee6 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/main.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +CXX=g++ +MCC=mcc +CXXFLAGS="-std=c++17 -I./include -I/usr/local/musa/include -fPIC" +MCCFLAGS="-std=c++17 --offload-arch=mp_22 -I../include -mtgpu -fPIC -I./include -I/usr/local/musa/include" +LDFLAGS="-lmusart -L/usr/local/musa/lib" + +SRC_DIR=src +BUILD_DIR=build +EXECUTABLE=gemm + + +mkdir -p $BUILD_DIR + +$CXX $CXXFLAGS -c $SRC_DIR/common.cpp -o $BUILD_DIR/common.o +$CXX $CXXFLAGS -c $SRC_DIR/logger.cpp -o $BUILD_DIR/logger.o +$CXX $CXXFLAGS -c $SRC_DIR/benchmark_bf16.cpp -o $BUILD_DIR/benchmark.o + +$MCC $MCCFLAGS -c $SRC_DIR/compute_mma_bf16.mu -o $BUILD_DIR/compute_mma_bf16.o + +$CXX $CXXFLAGS -c $SRC_DIR/main.cpp -o $BUILD_DIR/main.o + +$CXX $CXXFLAGS $BUILD_DIR/*.o -o $EXECUTABLE $LDFLAGS + +./gemm \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp new file mode 100644 index 000000000..4d0917c37 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp @@ -0,0 +1,36 @@ +#include "benchmark_bf16.h" +#include +#include + +Benchmark::Benchmark() + : specified_device(0), + specified_device_name(0), + specified_type_name(0), + force_device(false), + force_device_name(false), + force_type(false), + all_cases(false), + use_event_timer(false), + bandwidth_mode(BANDWIDTH_MODE::ALL), + bandwidth_mem_size(DEFAULT_BANDWIDTH_MEM_SIZE), + bandwidth_iters(DEFAULT_BANDWIDTH_ITERS) { + log = new logger(); +} + +Benchmark::~Benchmark() { + if (log) { + delete log; + } +} + +int Benchmark::RunBF16Test() { + musaSetDevice(0); + device_info_t dev_info; + if (GetDeviceInfo(0, &dev_info)) { + log->print(TAB "Can not get informations for Device " + + std::to_string(0) + NEWLINE); + } + + RunComputeMMABF16(dev_info); + return 0; +} diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp new file mode 100644 index 000000000..e6b613248 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp @@ -0,0 +1,138 @@ +#include "common.h" + +#define MUSA_ERROR_CASE(ERR) \ + case ERR: { \ + printf("" #ERR " in %s on line %i\n", func_name, line); \ + return 1; \ + } + +int check_musa_error(musaError _err, int line, const char* func_name) { + switch (_err) { + case musaSuccess: + return 0; + MUSA_ERROR_CASE(musaErrorInvalidValue) + MUSA_ERROR_CASE(musaErrorMemoryAllocation) + MUSA_ERROR_CASE(musaErrorInitializationError) + MUSA_ERROR_CASE(musaErrorInvalidMemcpyDirection) + MUSA_ERROR_CASE(musaErrorAddressOfConstant) + MUSA_ERROR_CASE(musaErrorSynchronizationError) + MUSA_ERROR_CASE(musaErrorNotYetImplemented) + MUSA_ERROR_CASE(musaErrorMemoryValueTooLarge) + MUSA_ERROR_CASE(musaErrorNoDevice) + MUSA_ERROR_CASE(musaErrorInvalidDevice) + MUSA_ERROR_CASE(musaErrorHostMemoryAlreadyRegistered) + MUSA_ERROR_CASE(musaErrorHostMemoryNotRegistered) + MUSA_ERROR_CASE(musaErrorIllegalInstruction) + MUSA_ERROR_CASE(musaErrorInvalidAddressSpace) + MUSA_ERROR_CASE(musaErrorLaunchFailure) + MUSA_ERROR_CASE(musaErrorNotSupported) + MUSA_ERROR_CASE(musaErrorTimeout) + MUSA_ERROR_CASE(musaErrorUnknown) + MUSA_ERROR_CASE(musaErrorApiFailureBase) + default: + printf("Unknown MUSA error %i in %s on line %i\n", _err, func_name, line); + return 1; + } +} + +int GetDeviceInfo(int dev, device_info_t* dev_info) { + musaDeviceProp deviceProp; + if (musaSuccess != musaGetDeviceProperties(&deviceProp, dev)) { + return -1; + } + dev_info->device_name = deviceProp.name; + dev_info->device_arch = 10 * deviceProp.major + deviceProp.minor; + int driverVersion = 0; + musaDriverGetVersion(&driverVersion); + dev_info->driver_version = std::to_string(driverVersion); + TrimString(dev_info->device_name); + TrimString(dev_info->driver_version); + + dev_info->num_compute_units = deviceProp.multiProcessorCount; + dev_info->max_work_group_size = deviceProp.maxThreadsPerBlock; + + // Limiting max work-group size to 512 +#define MAX_WG_SIZE 256 + dev_info->max_work_group_size = + std::min(dev_info->max_work_group_size, (uint)MAX_WG_SIZE); +#undef MAX_WG_SIZE + + /* Size of global device memory in bytes. */ + dev_info->max_global_size = static_cast(deviceProp.totalGlobalMem); + /* Max size of memory object allocation in bytes.*/ + dev_info->max_alloc_size = dev_info->max_global_size / 3; + dev_info->max_clock_freq = static_cast(deviceProp.clockRate / 1000); + dev_info->double_supported = true; + dev_info->half_supported = true; + + dev_info->bw_global_max_size = 1 << 31; + dev_info->bw_shmem_max_size = 1 << 28; + dev_info->bw_transfer_max_size = 1 << 28; + dev_info->compute_work_groups_per_cu = 2048; + dev_info->compute_dp_work_groups_per_cu = 512; + dev_info->shmem_work_groups_per_cu = 128; + dev_info->compute_iters = 20; + dev_info->bw_global_iters = 30; + dev_info->bw_shmem_iters = 20; + dev_info->bw_transfer_iters = 20; + dev_info->kernel_latency_iters = 20000; + + return 0; +} + +MUSAEvent::MUSAEvent(float* dur) : duration_us(dur) { + CHECK_MUSA_ERROR(musaEventCreate(&startTime)); + CHECK_MUSA_ERROR(musaEventCreate(&stopTime)); + CHECK_MUSA_ERROR(musaEventRecord(startTime)); +} + +MUSAEvent::~MUSAEvent() { + CHECK_MUSA_ERROR(musaEventRecord(stopTime)); + CHECK_MUSA_ERROR(musaEventSynchronize(stopTime)); + CHECK_MUSA_ERROR(musaEventElapsedTime(duration_us, startTime, stopTime)); + *duration_us *= 1e3f; + CHECK_MUSA_ERROR(musaEventDestroy(startTime)); + CHECK_MUSA_ERROR(musaEventDestroy(stopTime)); +} + +Timer::Timer(float* dur) : duration_us(dur) { + tick = std::chrono::high_resolution_clock::now(); +} + +Timer::~Timer() { + tock = std::chrono::high_resolution_clock::now(); + *duration_us = + (float)(std::chrono::duration_cast(tock - tick) + .count()); +} + +void Populate(float* ptr, uint64_t num) { + srand((unsigned int)time(NULL)); + + for (uint64_t i = 0; i < num; i++) { + // ptr[i] = (float)rand(); + // to ensure the sum of arr is a positive number + // avoid the STORE in ReadOnly + ptr[i] = (float)i; + } +} + +void Populate(double* ptr, uint64_t num) { + srand((unsigned int)time(NULL)); + for (uint64_t i = 0; i < num; i++) { + // ptr[i] = (double)rand(); + ptr[i] = (double)i; + } +} + +uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, uint64_t max_value) { + uint64_t n = (number > max_value) ? max_value : number; + return (n / base) * base; +} + +void TrimString(std::string& str) { + size_t pos = str.find('\0'); + if (pos != std::string::npos) { + str.erase(pos); + } +} \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu b/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu new file mode 100644 index 000000000..852c50cb4 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu @@ -0,0 +1,80 @@ +#include "benchmark.muh" +#include "compute_mma_bf16.muh" + +#define WARPSIZE 128 +#define TEST_DETAIL(show_, kernel_, tag_, M_, N_, K_, ARCH_) \ + work_per_warp = M_ * N_ * K_ * 2 * ITERS * UNROLL_NUM; \ + timed = RunKernel(kernel_, block_num, block_size, iters, \ + d_x); \ + gops = (static_cast(total_num) / WARPSIZE) * work_per_warp / timed / \ + 1e3f; \ + gops_max = std::max(gops_max, gops); \ + if (show_) { \ + log->print(TAB TAB TAB #tag_ " : "); \ + log->print(gops); \ + log->print(NEWLINE); \ + } + +#define TEST_END(show_, type_) \ + if (!show_) { \ + log->print("[FlagPerf Result]computation-" #type_ "=");\ + log->print(gops_max/1e3);\ + log->print("TFLOPS");\ + log->print(NEWLINE); \ + } + +int Benchmark::RunComputeMMABF16(device_info_t& dev_info) { + + float timed, gops, gops_max; + int work_per_warp; + dim3 block_size(1024); + int grid_size = std::min((dev_info.num_compute_units) * + (dev_info.compute_work_groups_per_cu) * + (block_size.x) * sizeof(int), + dev_info.max_alloc_size) / + ((block_size.x) * sizeof(int)); + grid_size = std::min(grid_size, 1024); + dim3 block_num(grid_size); + size_t total_num = block_size.x * block_num.x; + uint iters = dev_info.compute_iters; + { + if (dev_info.device_arch == MP_22) { + void* d_x; + CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int))); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_16, 16, 16, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_8_16, 32, 8, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_8_32_16, 8, 32, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_32_16, 32, 32, 16, + MP_22); + TEST_END(all_cases, BF16); + CHECK_MUSA_ERROR(musaFree(d_x)); + } + else if (dev_info.device_arch == MP_31) { + void* d_x; + CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int))); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_16, 16, 16, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_8_16, 32, 8, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_8_32_16, 8, 32, 16, + MP_31); + // TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_32_16, 32, 32, 16, + // MP_31); + TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_32, 16, 16, 32, + MP_31); + TEST_END(all_cases, BF16); + CHECK_MUSA_ERROR(musaFree(d_x)); + } + else { + log->print(TAB TAB TAB "NOT SUPPORT mp_" + + std::to_string(dev_info.device_arch) + NEWLINE); + } + } + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp new file mode 100644 index 000000000..1bab6bb37 --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp @@ -0,0 +1,73 @@ +#include "logger.h" +#include +#include + +logger::logger() :FileName("benchmark.log") { + outFile.open(FileName); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open log file."); + } + outFile.flush(); +} + +logger::~logger() { + if (outFile.is_open()) { + outFile.close(); + } + + // Delete the log file + if (std::remove(FileName.c_str()) != 0) { + std::cerr << "Failed to delete log file: " << FileName << "\n"; + } +} + + +void logger::print(std::string str) { + + std::cout << str; + std::cout.flush(); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open log file."); + } + if (!outFile.good()) { + std::cerr << "outFile is in a bad state!" << std::endl; + outFile.clear(); + } + if (!(outFile << str)) { + std::cerr << "Failed to write to outFile." << std::endl; + } + outFile.flush(); + +} + +void logger::print(double val) { + std::cout << std::setprecision(2) << std::fixed; + std::cout << val; + std::cout.flush(); + outFile << std::setprecision(2) << std::fixed; + outFile << val; + outFile.flush(); +} + +void logger::print(float val) { + std::cout << std::setprecision(2) << std::fixed; + std::cout << val; + std::cout.flush(); + outFile << std::setprecision(2) << std::fixed; + outFile << val; + outFile.flush(); +} + +void logger::print(int val) { + std::cout << val; + std::cout.flush(); + outFile << val; + outFile.flush(); +} + +void logger::print(unsigned int val) { + std::cout << val; + std::cout.flush(); + outFile << val; + outFile.flush(); +} \ No newline at end of file diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp new file mode 100644 index 000000000..53263e35a --- /dev/null +++ b/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp @@ -0,0 +1,7 @@ +#include "benchmark_bf16.h" + +int main() { + Benchmark bm; + bm.RunBF16Test(); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-FP16/mthreads/S4000/README.md b/base/toolkits/computation-FP16/mthreads/S4000/README.md index b3f294459..8e8a656b7 100644 --- a/base/toolkits/computation-FP16/mthreads/S4000/README.md +++ b/base/toolkits/computation-FP16/mthreads/S4000/README.md @@ -2,7 +2,7 @@ * 厂商:MThreads * 产品名称:S4000 -* 产品型号:/ +* 产品型号:MTT S4000 * TDP:/ # 所用服务器配置 @@ -10,8 +10,8 @@ * 服务器数量:1 * 单服务器内使用卡数:1 * 服务器型号:/ -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:Linux 5.4.0-42-generic +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic * CPU:/ * docker版本:24.0.7 * 内存:1TiB diff --git a/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu index e8652f187..197509140 100644 --- a/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu +++ b/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu @@ -1,7 +1,3 @@ -// Copyright (c) 2024 BAAI. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License") - #include #include #include @@ -25,9 +21,9 @@ struct PrecisionConfig { void test(const PrecisionConfig& config) { __half* d_A, * d_B, * d_C; - std::vector<__half> h_A(M * K, __float2half(1.0f)); - std::vector<__half> h_B(K * N, __float2half(1.0f)); - std::vector<__half> h_C(M * N); + std::vector<__half> h_A(M * K, __float2half(1.0f)); + std::vector<__half> h_B(K * N, __float2half(1.0f)); + std::vector<__half> h_C(M * N); musaMalloc(&d_A, M * K * config.bytesPerElement); musaMalloc(&d_B, K * N * config.bytesPerElement); diff --git a/base/toolkits/computation-FP32/mthreads/S4000/README.md b/base/toolkits/computation-FP32/mthreads/S4000/README.md new file mode 100644 index 000000000..0a6248b96 --- /dev/null +++ b/base/toolkits/computation-FP32/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP32算力测试值 | FP32算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用GEMM算子进行computation-bound的计算任务,从而测得实际FP32算力 \ No newline at end of file diff --git a/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu new file mode 100644 index 000000000..1c97e7537 --- /dev/null +++ b/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include + +constexpr int M = 8192; +constexpr int N = 8192; +constexpr int K = 8192; + +struct PrecisionConfig { + int bytesPerElement; + const char* name; + int NUM_ITERATIONS; + int WARMUP_ITERATIONS = 10; +}; + +void test(const PrecisionConfig& config) { + float* d_A, * d_B, * d_C; + std::vector h_A(M * K, float(1.0f)); + std::vector h_B(K * N, float(1.0f)); + std::vector h_C(M * N); + + musaMalloc(&d_A, M * K * config.bytesPerElement); + musaMalloc(&d_B, K * N * config.bytesPerElement); + musaMalloc(&d_C, M * N * config.bytesPerElement); + + musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice); + musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice); + + mublasHandle_t handle; + mublasCreate(&handle); + + float alpha = 1.0f; + float beta = 0.0f; + + for (int i = 0; i < config.WARMUP_ITERATIONS; ++i) { + mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N, + M, N, K, &alpha, + d_A, M, + d_B, K, + &beta, + d_C, M); + + } + + musaError_t syncError = musaDeviceSynchronize(); + auto start = std::chrono::high_resolution_clock::now(); + + if (syncError != musaSuccess) { + std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl; + } + + for (int i = 0; i < config.NUM_ITERATIONS; ++i) { + mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N, + M, N, K, &alpha, + d_A, M, + d_B, K, + &beta, + d_C, M); + } + syncError = musaDeviceSynchronize(); + auto end = std::chrono::high_resolution_clock::now(); + + if (syncError != musaSuccess) { + std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl; + } + auto duration = + std::chrono::duration_cast(end - start); + std::cout << "Average " << config.name << " Single Op Duration: " + << duration.count() / config.NUM_ITERATIONS << " us" << std::endl; + + double time_second = duration.count() / 1.0e6; + double flops = 2.0 * M * N * K * config.NUM_ITERATIONS; + double FLOPS = flops / time_second; + double TFLOPS = FLOPS / 1.0e12; + + std::cout << "[FlagPerf Result]" << "computation-FP32=" << TFLOPS << "TFLOPS" + << std::endl; + + musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost); + + musaFree(d_A); + musaFree(d_B); + musaFree(d_C); + + mublasDestroy(handle); +} + +int main() { + PrecisionConfig fp32 = { sizeof(float), "FP32", 10000, 10 }; + + test(fp32); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-FP32/mthreads/S4000/main.sh b/base/toolkits/computation-FP32/mthreads/S4000/main.sh new file mode 100644 index 000000000..e07c73a09 --- /dev/null +++ b/base/toolkits/computation-FP32/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc gemm.mu -lmusart -lmublas -o gemm +./gemm \ No newline at end of file diff --git a/base/toolkits/computation-FP64/mthreads/S4000/README.md b/base/toolkits/computation-FP64/mthreads/S4000/README.md new file mode 100644 index 000000000..116cbb4ae --- /dev/null +++ b/base/toolkits/computation-FP64/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | FP64算力测试值 | FP64算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用GEMM算子进行computation-bound的计算任务,从而测得实际FP64算力 \ No newline at end of file diff --git a/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu new file mode 100644 index 000000000..54e576885 --- /dev/null +++ b/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include + +constexpr int M = 8192; +constexpr int N = 8192; +constexpr int K = 8192; + +struct PrecisionConfig { + int bytesPerElement; + const char* name; + int NUM_ITERATIONS; + int WARMUP_ITERATIONS = 10; +}; + +void test(const PrecisionConfig& config) { + double* d_A, * d_B, * d_C; + std::vector h_A(M * K, double(1.0f)); + std::vector h_B(K * N, double(1.0f)); + std::vector h_C(M * N); + + musaMalloc(&d_A, M * K * config.bytesPerElement); + musaMalloc(&d_B, K * N * config.bytesPerElement); + musaMalloc(&d_C, M * N * config.bytesPerElement); + + musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice); + musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice); + + mublasHandle_t handle; + mublasCreate(&handle); + + double alpha = 1.0f; + double beta = 0.0f; + + for (int i = 0; i < config.WARMUP_ITERATIONS; ++i) { + mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N, + M, N, K, &alpha, + d_A, M, + d_B, K, + &beta, + d_C, M); + + } + + musaError_t syncError = musaDeviceSynchronize(); + auto start = std::chrono::high_resolution_clock::now(); + + if (syncError != musaSuccess) { + std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl; + } + + for (int i = 0; i < config.NUM_ITERATIONS; ++i) { + mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N, + M, N, K, &alpha, + d_A, M, + d_B, K, + &beta, + d_C, M); + } + syncError = musaDeviceSynchronize(); + auto end = std::chrono::high_resolution_clock::now(); + + if (syncError != musaSuccess) { + std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl; + } + auto duration = + std::chrono::duration_cast(end - start); + std::cout << "Average " << config.name << " Single Op Duration: " + << duration.count() / config.NUM_ITERATIONS << " us" << std::endl; + + double time_second = duration.count() / 1.0e6; + double flops = 2.0 * M * N * K * config.NUM_ITERATIONS; + double FLOPS = flops / time_second; + double TFLOPS = FLOPS / 1.0e12; + + std::cout << "[FlagPerf Result]" << "computation-FP32=" << TFLOPS << "TFLOPS" + << std::endl; + + musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost); + + musaFree(d_A); + musaFree(d_B); + musaFree(d_C); + + mublasDestroy(handle); +} + +int main() { + PrecisionConfig fp64 = { sizeof(double), "FP64", 70, 10 }; + + test(fp64); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-FP64/mthreads/S4000/main.sh b/base/toolkits/computation-FP64/mthreads/S4000/main.sh new file mode 100644 index 000000000..e07c73a09 --- /dev/null +++ b/base/toolkits/computation-FP64/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc gemm.mu -lmusart -lmublas -o gemm +./gemm \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/README.md b/base/toolkits/computation-INT8/mthreads/S4000/README.md new file mode 100644 index 000000000..3242e3e47 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | INT8算力测试值 | INT8算力标定值 | 测试标定比例 | +| ---- | ----------- | ---------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用GEMM算子进行computation-bound的计算任务,从而测得实际INT8算力 diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh new file mode 100644 index 000000000..7fbc4afa8 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh @@ -0,0 +1,54 @@ +#pragma once +#include "benchmark_int8.h" + +template +float Benchmark::RunKernel(T func, dim3 block_num, dim3 block_size, uint iters, + Args... args) { + // unit micro secend + float timed = 0, timed_min = 0; + + // Dummy calls + for (uint i = 0; i < 2; i++) { + func<<>>(args...); + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + + if (use_event_timer) { + MUSAEvent musaEvent(&timed); + for (uint i = 0; i < iters; i++) { + func<<>>(args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } else // std timer + { + Timer timer(&timed); + for (uint i = 0; i < iters; i++) { + func<<>>(args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + timed_min = timed; + // double check + if (use_event_timer) { + MUSAEvent musaEvent(&timed); + for (uint i = 0; i < iters; i++) { + func<<>>(args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } else // std timer + { + Timer timer(&timed); + for (uint i = 0; i < iters; i++) { + func<<>>(args...); + } + CHECK_MUSA_ERROR(musaDeviceSynchronize()); + } + timed_min = std::min(timed_min, timed); + + return (timed_min / static_cast(iters)); +} + +template +struct Int2Type { + enum { VALUE = A }; +}; \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h new file mode 100644 index 000000000..8ffc6a3f7 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +#define DEFAULT_BANDWIDTH_MEM_SIZE 2048 +#define DEFAULT_BANDWIDTH_ITERS 30 + +typedef enum class BANDWIDTH_MODE { + ALL, + READ_ONLY, + WRITE_ONLY, + READ_WRITE +} BW_MODE_T; + +typedef enum class OFFSET_MODE { LOCAL, GLOBAL } OFFSET_MODE_T; + +typedef enum GPU_ARCH { + MP_10 = 10, + MP_21 = 21, + MP_22 = 22, + MP_31 = 31 +} GPU_ARCH_T; + +class Benchmark { +public: + // devices + int specified_device; + const char* specified_device_name; + const char* specified_type_name; + bool force_device; + bool force_device_name; + bool force_type; + // option + bool all_cases; + bool use_event_timer; + BW_MODE_T bandwidth_mode; + int bandwidth_mem_size; + int bandwidth_iters; + + + logger* log; + + Benchmark(); + ~Benchmark(); + + + // return avg time in us + template + float RunKernel(T func, dim3 block_num, dim3 block_size, uint iters, + Args... args); + + + int RunComputeMMAINT8(device_info_t& dev_info); + + int RunINT8Test(); +}; \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/common.h b/base/toolkits/computation-INT8/mthreads/S4000/include/common.h new file mode 100644 index 000000000..3bb25b966 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/include/common.h @@ -0,0 +1,117 @@ +#pragma once + +#if defined(__APPLE__) || defined(__MACOSX) || defined(__FreeBSD__) +#include +#endif +#include "musa_runtime.h" +#include +#include + +#define TAB " " +#define NEWLINE "\n" +#ifndef __FreeBSD__ +#define uint unsigned int +#endif +#define ulong unsigned long + +#if defined(__APPLE__) || defined(__MACOSX) +#define OS_NAME "Macintosh" +#elif defined(__ANDROID__) +#define OS_NAME "Android" +#elif defined(_WIN32) +#if defined(_WIN64) +#define OS_NAME "Win64" +#else +#define OS_NAME "Win32" +#endif +#elif defined(__linux__) +#if defined(__x86_64__) +#define OS_NAME "Linux x64" +#elif defined(__i386__) +#define OS_NAME "Linux x86" +#elif defined(__arm__) +#define OS_NAME "Linux ARM" +#elif defined(__aarch64__) +#define OS_NAME "Linux ARM64" +#else +#define OS_NAME "Linux unknown" +#endif +#elif defined(__FreeBSD__) +#define OS_NAME "FreeBSD" +#else +#define OS_NAME "Unknown" +#endif + +int check_musa_error(musaError _err, int line, const char* func_name); + +#define _PERF_CHECK_MUSA_ERROR_INNER(cond, func, line) \ + do { \ + if (check_musa_error(cond, line, func)) \ + exit(1); \ + } while (0) + +#define CHECK_MUSA_ERROR(cond) \ + _PERF_CHECK_MUSA_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__) + +typedef struct { + std::string device_name; + std::string driver_version; + int device_arch; + + uint num_compute_units; + uint max_work_group_size; + uint64_t max_alloc_size; + uint64_t max_global_size; + uint max_clock_freq; + + bool half_supported; + bool double_supported; + bool imma_supported; + + // Test specific options + uint bw_global_iters; + uint bw_shmem_iters; + uint64_t bw_global_max_size; + uint64_t bw_shmem_max_size; + uint compute_work_groups_per_cu; + uint compute_dp_work_groups_per_cu; + uint shmem_work_groups_per_cu; + uint compute_iters; + uint bw_transfer_iters; + uint kernel_latency_iters; + uint64_t bw_transfer_max_size; + std::string extension; +} device_info_t; + +class Timer { +public: + explicit Timer(float* dur); + ~Timer(); + +private: + float* duration_us; + std::chrono::high_resolution_clock::time_point tick; + std::chrono::high_resolution_clock::time_point tock; +}; + +class MUSAEvent { +public: + explicit MUSAEvent(float* dur); + ~MUSAEvent(); + +private: + float* duration_us; + musaEvent_t startTime; + musaEvent_t stopTime; +}; + +int GetDeviceInfo(int dev, device_info_t* dev_info); + +// Round down to next multiple of the given base with an optional maximum value +uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, + uint64_t maxValue = UINT64_MAX); + +void Populate(float* ptr, uint64_t N); +void Populate(double* ptr, uint64_t N); + +void TrimString(std::string& str); \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh b/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh new file mode 100644 index 000000000..5c9e8d987 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh @@ -0,0 +1,75 @@ +#pragma once +#include "benchmark.muh" +#include + +#define UNROLL_NUM 16 +#define ITERS 256 +using namespace mtmusa; + +#define MMA_4(a, b, c) \ + wmma::mma_sync(c, a, b, c, 0); \ + wmma::mma_sync(c, a, b, c, 0); \ + wmma::mma_sync(c, a, b, c, 0); \ + wmma::mma_sync(c, a, b, c, 0); + +#define MMA_16(a, b, c) \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); \ + MMA_4(a, b, c); + +//////////////// uint8 /////////////////////////// +template +__device__ void compute_mma_uint8_impl(void* ptr, Int2Type) { + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment c_frag; + // #pragma unroll + for (int i = 0; i < ITERS; i++) { + MMA_16(a_frag, b_frag, c_frag); + } + wmma::store_matrix_sync((int*)ptr, c_frag, M, wmma::mem_col_major); +} + +template +__device__ void compute_mma_uint8_impl(void* ptr, Int2Type) {} + +template +__global__ void compute_mma_uint8(void* ptr) { +#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 210) + compute_mma_uint8_impl(ptr, Int2Type<(ARCH == MP_21)>()); +#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220) + compute_mma_uint8_impl(ptr, Int2Type<(ARCH == MP_22)>()); +#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310) + compute_mma_uint8_impl(ptr, Int2Type<(ARCH == MP_31)>()); +#endif +} + +//////////////////// int8 ///////////////////////////// +template +__device__ void compute_mma_int8_impl(void* ptr, Int2Type) { + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment c_frag; + // #pragma unroll + for (int i = 0; i < ITERS; i++) { + MMA_16(a_frag, b_frag, c_frag); + } + wmma::store_matrix_sync((int*)ptr, c_frag, M, wmma::mem_col_major); +} + +template +__device__ void compute_mma_int8_impl(void* ptr, Int2Type) {} + +template +__global__ void compute_mma_int8(void* ptr) { +#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 210) + compute_mma_int8_impl(ptr, Int2Type<(ARCH == MP_21)>()); +#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220) + compute_mma_int8_impl(ptr, Int2Type<(ARCH == MP_22)>()); +#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310) + compute_mma_int8_impl(ptr, Int2Type<(ARCH == MP_31)>()); +#endif +} \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h b/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h new file mode 100644 index 000000000..46856f87c --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h @@ -0,0 +1,16 @@ +#pragma once +#include +#include +class logger { + std::ofstream outFile; + std::string FileName; +public: + logger(); + ~logger(); + + void print(std::string str); + void print(double val); + void print(float val); + void print(int val); + void print(unsigned int val); +}; \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/main.sh b/base/toolkits/computation-INT8/mthreads/S4000/main.sh new file mode 100644 index 000000000..402ad38e7 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/main.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +CXX=g++ +MCC=mcc +CXXFLAGS="-std=c++17 -I./include -I/usr/local/musa/include -fPIC" +MCCFLAGS="-std=c++17 --offload-arch=mp_22 -I../include -fPIC -I./include -I/usr/local/musa/include" +LDFLAGS="-lmusart -L/usr/local/musa/lib" + +SRC_DIR=src +BUILD_DIR=build +EXECUTABLE=gemm + + +mkdir -p $BUILD_DIR + +$CXX $CXXFLAGS -c $SRC_DIR/common.cpp -o $BUILD_DIR/common.o +$CXX $CXXFLAGS -c $SRC_DIR/logger.cpp -o $BUILD_DIR/logger.o +$CXX $CXXFLAGS -c $SRC_DIR/benchmark_int8.cpp -o $BUILD_DIR/benchmark.o + +$MCC $MCCFLAGS -c $SRC_DIR/compute_mma_int8.mu -o $BUILD_DIR/compute_mma_int8.o + +$CXX $CXXFLAGS -c $SRC_DIR/main.cpp -o $BUILD_DIR/main.o + +$CXX $CXXFLAGS $BUILD_DIR/*.o -o $EXECUTABLE $LDFLAGS + +./gemm \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp new file mode 100644 index 000000000..3c04d151a --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp @@ -0,0 +1,36 @@ +#include "benchmark_int8.h" +#include +#include + +Benchmark::Benchmark() + : specified_device(0), + specified_device_name(0), + specified_type_name(0), + force_device(false), + force_device_name(false), + force_type(false), + all_cases(false), + use_event_timer(false), + bandwidth_mode(BANDWIDTH_MODE::ALL), + bandwidth_mem_size(DEFAULT_BANDWIDTH_MEM_SIZE), + bandwidth_iters(DEFAULT_BANDWIDTH_ITERS) { + log = new logger(); +} + +Benchmark::~Benchmark() { + if (log) { + delete log; + } +} + +int Benchmark::RunINT8Test() { + musaSetDevice(0); + device_info_t dev_info; + if (GetDeviceInfo(0, &dev_info)) { + log->print(TAB "Can not get informations for Device " + + std::to_string(0) + NEWLINE); + } + + RunComputeMMAINT8(dev_info); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp new file mode 100644 index 000000000..e6b613248 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp @@ -0,0 +1,138 @@ +#include "common.h" + +#define MUSA_ERROR_CASE(ERR) \ + case ERR: { \ + printf("" #ERR " in %s on line %i\n", func_name, line); \ + return 1; \ + } + +int check_musa_error(musaError _err, int line, const char* func_name) { + switch (_err) { + case musaSuccess: + return 0; + MUSA_ERROR_CASE(musaErrorInvalidValue) + MUSA_ERROR_CASE(musaErrorMemoryAllocation) + MUSA_ERROR_CASE(musaErrorInitializationError) + MUSA_ERROR_CASE(musaErrorInvalidMemcpyDirection) + MUSA_ERROR_CASE(musaErrorAddressOfConstant) + MUSA_ERROR_CASE(musaErrorSynchronizationError) + MUSA_ERROR_CASE(musaErrorNotYetImplemented) + MUSA_ERROR_CASE(musaErrorMemoryValueTooLarge) + MUSA_ERROR_CASE(musaErrorNoDevice) + MUSA_ERROR_CASE(musaErrorInvalidDevice) + MUSA_ERROR_CASE(musaErrorHostMemoryAlreadyRegistered) + MUSA_ERROR_CASE(musaErrorHostMemoryNotRegistered) + MUSA_ERROR_CASE(musaErrorIllegalInstruction) + MUSA_ERROR_CASE(musaErrorInvalidAddressSpace) + MUSA_ERROR_CASE(musaErrorLaunchFailure) + MUSA_ERROR_CASE(musaErrorNotSupported) + MUSA_ERROR_CASE(musaErrorTimeout) + MUSA_ERROR_CASE(musaErrorUnknown) + MUSA_ERROR_CASE(musaErrorApiFailureBase) + default: + printf("Unknown MUSA error %i in %s on line %i\n", _err, func_name, line); + return 1; + } +} + +int GetDeviceInfo(int dev, device_info_t* dev_info) { + musaDeviceProp deviceProp; + if (musaSuccess != musaGetDeviceProperties(&deviceProp, dev)) { + return -1; + } + dev_info->device_name = deviceProp.name; + dev_info->device_arch = 10 * deviceProp.major + deviceProp.minor; + int driverVersion = 0; + musaDriverGetVersion(&driverVersion); + dev_info->driver_version = std::to_string(driverVersion); + TrimString(dev_info->device_name); + TrimString(dev_info->driver_version); + + dev_info->num_compute_units = deviceProp.multiProcessorCount; + dev_info->max_work_group_size = deviceProp.maxThreadsPerBlock; + + // Limiting max work-group size to 512 +#define MAX_WG_SIZE 256 + dev_info->max_work_group_size = + std::min(dev_info->max_work_group_size, (uint)MAX_WG_SIZE); +#undef MAX_WG_SIZE + + /* Size of global device memory in bytes. */ + dev_info->max_global_size = static_cast(deviceProp.totalGlobalMem); + /* Max size of memory object allocation in bytes.*/ + dev_info->max_alloc_size = dev_info->max_global_size / 3; + dev_info->max_clock_freq = static_cast(deviceProp.clockRate / 1000); + dev_info->double_supported = true; + dev_info->half_supported = true; + + dev_info->bw_global_max_size = 1 << 31; + dev_info->bw_shmem_max_size = 1 << 28; + dev_info->bw_transfer_max_size = 1 << 28; + dev_info->compute_work_groups_per_cu = 2048; + dev_info->compute_dp_work_groups_per_cu = 512; + dev_info->shmem_work_groups_per_cu = 128; + dev_info->compute_iters = 20; + dev_info->bw_global_iters = 30; + dev_info->bw_shmem_iters = 20; + dev_info->bw_transfer_iters = 20; + dev_info->kernel_latency_iters = 20000; + + return 0; +} + +MUSAEvent::MUSAEvent(float* dur) : duration_us(dur) { + CHECK_MUSA_ERROR(musaEventCreate(&startTime)); + CHECK_MUSA_ERROR(musaEventCreate(&stopTime)); + CHECK_MUSA_ERROR(musaEventRecord(startTime)); +} + +MUSAEvent::~MUSAEvent() { + CHECK_MUSA_ERROR(musaEventRecord(stopTime)); + CHECK_MUSA_ERROR(musaEventSynchronize(stopTime)); + CHECK_MUSA_ERROR(musaEventElapsedTime(duration_us, startTime, stopTime)); + *duration_us *= 1e3f; + CHECK_MUSA_ERROR(musaEventDestroy(startTime)); + CHECK_MUSA_ERROR(musaEventDestroy(stopTime)); +} + +Timer::Timer(float* dur) : duration_us(dur) { + tick = std::chrono::high_resolution_clock::now(); +} + +Timer::~Timer() { + tock = std::chrono::high_resolution_clock::now(); + *duration_us = + (float)(std::chrono::duration_cast(tock - tick) + .count()); +} + +void Populate(float* ptr, uint64_t num) { + srand((unsigned int)time(NULL)); + + for (uint64_t i = 0; i < num; i++) { + // ptr[i] = (float)rand(); + // to ensure the sum of arr is a positive number + // avoid the STORE in ReadOnly + ptr[i] = (float)i; + } +} + +void Populate(double* ptr, uint64_t num) { + srand((unsigned int)time(NULL)); + for (uint64_t i = 0; i < num; i++) { + // ptr[i] = (double)rand(); + ptr[i] = (double)i; + } +} + +uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, uint64_t max_value) { + uint64_t n = (number > max_value) ? max_value : number; + return (n / base) * base; +} + +void TrimString(std::string& str) { + size_t pos = str.find('\0'); + if (pos != std::string::npos) { + str.erase(pos); + } +} \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu b/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu new file mode 100644 index 000000000..ea2374617 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu @@ -0,0 +1,107 @@ +#include "benchmark.muh" +#include "compute_mma_int8.muh" + +#define WARPSIZE 128 +#define TEST_DETAIL(show_, kernel_, tag_, M_, N_, K_, ARCH_) \ + work_per_warp = M_ * N_ * K_ * 2 * ITERS * UNROLL_NUM; \ + timed = RunKernel(kernel_, block_num, block_size, iters, \ + d_x); \ + gops = (static_cast(total_num) / WARPSIZE) * work_per_warp / timed / \ + 1e3f; \ + gops_max = std::max(gops_max, gops); \ + if (show_) { \ + log->print(TAB TAB TAB #tag_ " : "); \ + log->print(gops); \ + log->print(NEWLINE); \ + } + +#define TEST_END(show_, type_) \ + if (!show_) { \ + log->print("[FlagPerf Result]computation-" #type_ "=");\ + log->print(gops_max/1e3);\ + log->print("TFLOPS");\ + log->print(NEWLINE); \ + } + +int Benchmark::RunComputeMMAINT8(device_info_t& dev_info) { + + // mtgpu imma only + + float timed, gops, gops_max; + int work_per_warp; + dim3 block_size(1024); + int grid_size = std::min((dev_info.num_compute_units) * + (dev_info.compute_work_groups_per_cu) * + (block_size.x) * sizeof(int), + dev_info.max_alloc_size) / + ((block_size.x) * sizeof(int)); + grid_size = std::min(grid_size, 2048); + dim3 block_num(grid_size); + size_t total_num = block_size.x * block_num.x; + uint iters = dev_info.compute_iters; + { + if (dev_info.device_arch == MP_21) { + void* d_x; + CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int))); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_8_16, 16, 16, 16, + MP_21); + TEST_END(all_cases, UINT8); + CHECK_MUSA_ERROR(musaFree(d_x)); + } + else if (dev_info.device_arch == MP_22) { + void* d_x; + CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int))); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_16, 16, 16, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_8_16, 32, 8, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_8_32_16, 8, 32, 16, + MP_22); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_32_32, 32, 32, 32, + MP_22); + TEST_END(all_cases, INT8); + CHECK_MUSA_ERROR(musaFree(d_x)); + } + else if (dev_info.device_arch == MP_31) { + void* d_x; + CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int))); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_16, 16, 16, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_32_8_16, 32, 8, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_8_32_16, 8, 32, 16, + MP_31); + // TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_32_32_32, 32, 32, 32, + // MP_31); + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_32, 16, 16, 32, + MP_31); + TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_64, 16, 16, 64, + MP_31); + TEST_END(all_cases, UINT8); + gops_max = 0.0f; + TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_16, 16, 16, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_8_16, 32, 8, 16, + MP_31); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_8_32_16, 8, 32, 16, + MP_31); + // TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_32_32, 32, 32, 32, + // MP_31); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_32, 16, 16, 32, + MP_31); + TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_64, 16, 16, 64, + MP_31); + TEST_END(all_cases, INT8); + CHECK_MUSA_ERROR(musaFree(d_x)); + } + else { + log->print(TAB TAB TAB "NOT SUPPORT mp_" + + std::to_string(dev_info.device_arch) + NEWLINE); + } + } + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp new file mode 100644 index 000000000..1bab6bb37 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp @@ -0,0 +1,73 @@ +#include "logger.h" +#include +#include + +logger::logger() :FileName("benchmark.log") { + outFile.open(FileName); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open log file."); + } + outFile.flush(); +} + +logger::~logger() { + if (outFile.is_open()) { + outFile.close(); + } + + // Delete the log file + if (std::remove(FileName.c_str()) != 0) { + std::cerr << "Failed to delete log file: " << FileName << "\n"; + } +} + + +void logger::print(std::string str) { + + std::cout << str; + std::cout.flush(); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open log file."); + } + if (!outFile.good()) { + std::cerr << "outFile is in a bad state!" << std::endl; + outFile.clear(); + } + if (!(outFile << str)) { + std::cerr << "Failed to write to outFile." << std::endl; + } + outFile.flush(); + +} + +void logger::print(double val) { + std::cout << std::setprecision(2) << std::fixed; + std::cout << val; + std::cout.flush(); + outFile << std::setprecision(2) << std::fixed; + outFile << val; + outFile.flush(); +} + +void logger::print(float val) { + std::cout << std::setprecision(2) << std::fixed; + std::cout << val; + std::cout.flush(); + outFile << std::setprecision(2) << std::fixed; + outFile << val; + outFile.flush(); +} + +void logger::print(int val) { + std::cout << val; + std::cout.flush(); + outFile << val; + outFile.flush(); +} + +void logger::print(unsigned int val) { + std::cout << val; + std::cout.flush(); + outFile << val; + outFile.flush(); +} \ No newline at end of file diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp new file mode 100644 index 000000000..62baf7099 --- /dev/null +++ b/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp @@ -0,0 +1,6 @@ +#include "benchmark_int8.h" +int main() { + Benchmark bm; + bm.RunINT8Test(); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/computation-TF32/mthreads/S4000/README.md b/base/toolkits/computation-TF32/mthreads/S4000/README.md index f81e83b45..ee67fc119 100644 --- a/base/toolkits/computation-TF32/mthreads/S4000/README.md +++ b/base/toolkits/computation-TF32/mthreads/S4000/README.md @@ -2,7 +2,7 @@ * 厂商:MThreads * 产品名称:S4000 -* 产品型号:/ +* 产品型号:MTT S4000 * TDP:/ # 所用服务器配置 @@ -10,13 +10,12 @@ * 服务器数量:1 * 单服务器内使用卡数:1 * 服务器型号:/ -* 操作系统版本:Ubuntu 20.04.4 LTS -* 操作系统内核:Linux 5.4.0-42-generic +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic * CPU:/ * docker版本:24.0.7 * 内存:1TiB * 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 - # 评测结果 ## 核心评测结果 diff --git a/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu b/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu index 2430f2814..aef21db4a 100644 --- a/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu +++ b/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu @@ -1,7 +1,3 @@ -// Copyright (c) 2024 BAAI. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License") - #include #include @@ -24,9 +20,9 @@ struct PrecisionConfig { void test(const PrecisionConfig& config) { float* d_A, * d_B, * d_C; - std::vector h_A(M * K); - std::vector h_B(K * N); - std::vector h_C(M * N); + std::vector h_A(M * K); + std::vector h_B(K * N); + std::vector h_C(M * N); musaMalloc(&d_A, M * K * config.bytesPerElement); musaMalloc(&d_B, K * N * config.bytesPerElement);