Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mthreads] Support base/toolkits: add gemm code with musa toolkits #768

Merged
merged 3 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions base/toolkits/computation-BF16/mthreads/S4000/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# 参评AI芯片信息

* 厂商:MThreads
* 产品名称:S4000
* 产品型号:MTT S4000
* TDP:/

# 所用服务器配置

* 服务器数量:1
* 单服务器内使用卡数:1
* 服务器型号:/
* 操作系统版本:Ubuntu 22.04.5 LTS
* 操作系统内核:Linux 5.15.0-105-generic
* CPU:/
* docker版本:24.0.7
* 内存:1TiB
* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信

# 评测结果

## 核心评测结果

| 评测项 | BF16算力测试值 | BF16算力标定值 | 测试标定比例 |
| ---- | ----------- | ---------- | ------ |
| 评测结果 | / | / | / |

## 能耗监控结果

| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP |
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
| 监控结果 | / | / | / | / | / | / | / | / |

## 其他重要监控结果

| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 |
| ---- | --------- | -------- | ------- | -------- |
| 监控结果 | / | / | / | / |

# 厂商测试工具原理说明

使用GEMM算子进行computation-bound的计算任务,从而测得实际BF16算力
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#pragma once
#include "benchmark_bf16.h"

template <class T, typename... Args>
float Benchmark::RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
Args... args) {
// unit micro secend
float timed = 0, timed_min = 0;

// Dummy calls
for (uint i = 0; i < 2; i++) {
func <<<block_num, block_size >>> (args...);
CHECK_MUSA_ERROR(musaDeviceSynchronize());
}

if (use_event_timer) {
MUSAEvent musaEvent(&timed);
for (uint i = 0; i < iters; i++) {
func <<<block_num, block_size >>> (args...);
}
CHECK_MUSA_ERROR(musaDeviceSynchronize());
}
else // std timer
{
Timer timer(&timed);
for (uint i = 0; i < iters; i++) {
func <<<block_num, block_size >>> (args...);
}
CHECK_MUSA_ERROR(musaDeviceSynchronize());
}
timed_min = timed;
// double check
if (use_event_timer) {
MUSAEvent musaEvent(&timed);
for (uint i = 0; i < iters; i++) {
func <<<block_num, block_size >>> (args...);
}
CHECK_MUSA_ERROR(musaDeviceSynchronize());
}
else // std timer
{
Timer timer(&timed);
for (uint i = 0; i < iters; i++) {
func <<<block_num, block_size >>> (args...);
}
CHECK_MUSA_ERROR(musaDeviceSynchronize());
}
timed_min = std::min(timed_min, timed);

return (timed_min / static_cast<float>(iters));
}

template <int A>
struct Int2Type {
enum { VALUE = A };
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#pragma once

#include <common.h>
#include <logger.h>

#define DEFAULT_BANDWIDTH_MEM_SIZE 2048
#define DEFAULT_BANDWIDTH_ITERS 30

typedef enum class BANDWIDTH_MODE {
ALL,
READ_ONLY,
WRITE_ONLY,
READ_WRITE
} BW_MODE_T;

typedef enum class OFFSET_MODE { LOCAL, GLOBAL } OFFSET_MODE_T;

typedef enum GPU_ARCH {
MP_10 = 10,
MP_21 = 21,
MP_22 = 22,
MP_31 = 31
} GPU_ARCH_T;

class Benchmark {
public:
// devices
int specified_device;
const char* specified_device_name;
const char* specified_type_name;
bool force_device;
bool force_device_name;
bool force_type;
// option
bool all_cases;
bool use_event_timer;
BW_MODE_T bandwidth_mode;
int bandwidth_mem_size;
int bandwidth_iters;

logger* log;

Benchmark();
~Benchmark();

template <class T, typename... Args>
float RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
Args... args);

int RunComputeMMABF16(device_info_t& dev_info);

int RunBF16Test();
};
117 changes: 117 additions & 0 deletions base/toolkits/computation-BF16/mthreads/S4000/include/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#pragma once

#if defined(__APPLE__) || defined(__MACOSX) || defined(__FreeBSD__)
#include <sys/types.h>
#endif
#include "musa_runtime.h"
#include <string>
#include <chrono>

#define TAB " "
#define NEWLINE "\n"
#ifndef __FreeBSD__
#define uint unsigned int
#endif
#define ulong unsigned long

#if defined(__APPLE__) || defined(__MACOSX)
#define OS_NAME "Macintosh"
#elif defined(__ANDROID__)
#define OS_NAME "Android"
#elif defined(_WIN32)
#if defined(_WIN64)
#define OS_NAME "Win64"
#else
#define OS_NAME "Win32"
#endif
#elif defined(__linux__)
#if defined(__x86_64__)
#define OS_NAME "Linux x64"
#elif defined(__i386__)
#define OS_NAME "Linux x86"
#elif defined(__arm__)
#define OS_NAME "Linux ARM"
#elif defined(__aarch64__)
#define OS_NAME "Linux ARM64"
#else
#define OS_NAME "Linux unknown"
#endif
#elif defined(__FreeBSD__)
#define OS_NAME "FreeBSD"
#else
#define OS_NAME "Unknown"
#endif

int check_musa_error(musaError _err, int line, const char* func_name);

#define _PERF_CHECK_MUSA_ERROR_INNER(cond, func, line) \
do { \
if (check_musa_error(cond, line, func)) \
exit(1); \
} while (0)

#define CHECK_MUSA_ERROR(cond) \
_PERF_CHECK_MUSA_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__)

typedef struct {
std::string device_name;
std::string driver_version;
int device_arch;

uint num_compute_units;
uint max_work_group_size;
uint64_t max_alloc_size;
uint64_t max_global_size;
uint max_clock_freq;

bool half_supported;
bool double_supported;
bool imma_supported;

// Test specific options
uint bw_global_iters;
uint bw_shmem_iters;
uint64_t bw_global_max_size;
uint64_t bw_shmem_max_size;
uint compute_work_groups_per_cu;
uint compute_dp_work_groups_per_cu;
uint shmem_work_groups_per_cu;
uint compute_iters;
uint bw_transfer_iters;
uint kernel_latency_iters;
uint64_t bw_transfer_max_size;
std::string extension;
} device_info_t;

class Timer {
public:
explicit Timer(float* dur);
~Timer();

private:
float* duration_us;
std::chrono::high_resolution_clock::time_point tick;
std::chrono::high_resolution_clock::time_point tock;
};

class MUSAEvent {
public:
explicit MUSAEvent(float* dur);
~MUSAEvent();

private:
float* duration_us;
musaEvent_t startTime;
musaEvent_t stopTime;
};

int GetDeviceInfo(int dev, device_info_t* dev_info);

// Round down to next multiple of the given base with an optional maximum value
uint64_t RoundToMultipleOf(uint64_t number, uint64_t base,
uint64_t maxValue = UINT64_MAX);

void Populate(float* ptr, uint64_t N);
void Populate(double* ptr, uint64_t N);

void TrimString(std::string& str);
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#pragma once
#include "benchmark.muh"
#include <mma.h>

#define UNROLL_NUM 16
#define ITERS 256
using namespace mtmusa;

#define MMA_4(a, b, c) \
wmma::mma_sync(c, a, b, c); \
wmma::mma_sync(c, a, b, c); \
wmma::mma_sync(c, a, b, c); \
wmma::mma_sync(c, a, b, c);

#define MMA_16(a, b, c) \
MMA_4(a, b, c); \
MMA_4(a, b, c); \
MMA_4(a, b, c); \
MMA_4(a, b, c);

template <int M, int N, int K>
__device__ void compute_mma_bf16_impl(void* ptr, Int2Type<true>) {
wmma::fragment<wmma::matrix_a, M, N, K, __mt_bfloat16, wmma::row_major>
a_frag;
wmma::fragment<wmma::matrix_b, M, N, K, __mt_bfloat16, wmma::row_major>
b_frag;
wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;
// #pragma unroll
for (int i = 0; i < ITERS; i++) {
MMA_16(a_frag, b_frag, c_frag);
}
wmma::store_matrix_sync((float*)ptr, c_frag, M, wmma::mem_col_major);
}

template <int M, int N, int K>
__device__ void compute_mma_bf16_impl(void* ptr, Int2Type<false>) {}

template <int M, int N, int K, int ARCH>
__global__ void compute_mma_bf16(void* ptr) {
#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220)
compute_mma_bf16_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_22)>());
#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310)
compute_mma_bf16_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_31)>());
#endif
}
16 changes: 16 additions & 0 deletions base/toolkits/computation-BF16/mthreads/S4000/include/logger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once
#include <string>
#include <fstream>
class logger {
std::ofstream outFile;
std::string FileName;
public:
logger();
~logger();

void print(std::string str);
void print(double val);
void print(float val);
void print(int val);
void print(unsigned int val);
};
26 changes: 26 additions & 0 deletions base/toolkits/computation-BF16/mthreads/S4000/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

CXX=g++
MCC=mcc
CXXFLAGS="-std=c++17 -I./include -I/usr/local/musa/include -fPIC"
MCCFLAGS="-std=c++17 --offload-arch=mp_22 -I../include -mtgpu -fPIC -I./include -I/usr/local/musa/include"
LDFLAGS="-lmusart -L/usr/local/musa/lib"

SRC_DIR=src
BUILD_DIR=build
EXECUTABLE=gemm


mkdir -p $BUILD_DIR

$CXX $CXXFLAGS -c $SRC_DIR/common.cpp -o $BUILD_DIR/common.o
$CXX $CXXFLAGS -c $SRC_DIR/logger.cpp -o $BUILD_DIR/logger.o
$CXX $CXXFLAGS -c $SRC_DIR/benchmark_bf16.cpp -o $BUILD_DIR/benchmark.o

$MCC $MCCFLAGS -c $SRC_DIR/compute_mma_bf16.mu -o $BUILD_DIR/compute_mma_bf16.o

$CXX $CXXFLAGS -c $SRC_DIR/main.cpp -o $BUILD_DIR/main.o

$CXX $CXXFLAGS $BUILD_DIR/*.o -o $EXECUTABLE $LDFLAGS

./gemm
Loading