diff --git a/base/toolkits/computation-BF16/mthreads/S4000/README.md b/base/toolkits/computation-BF16/mthreads/S4000/README.md
new file mode 100644
index 000000000..5498c00c4
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/README.md
@@ -0,0 +1,42 @@
+# 参评AI芯片信息
+
+* 厂商：MThreads
+* 产品名称：S4000
+* 产品型号：MTT S4000
+* TDP：/
+
+# 所用服务器配置
+
+* 服务器数量：1
+* 单服务器内使用卡数：1
+* 服务器型号：/
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
+* CPU：/
+* docker版本：24.0.7
+* 内存：1TiB
+* 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
+
+# 评测结果
+
+## 核心评测结果
+
+| 评测项  | BF16算力测试值   | BF16算力标定值  | 测试标定比例 |
+| ---- | ----------- | ---------- | ------ |
+| 评测结果 | / | / | / |
+
+## 能耗监控结果
+
+| 监控项  | 系统平均功耗  | 系统最大功耗  | 系统功耗标准差 | 单机TDP | 单卡平均功耗  | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP |
+| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
+| 监控结果 | / | / | /   | /     | / | / | /  | /  |
+
+## 其他重要监控结果
+
+| 监控项  | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度  | 单卡平均显存占用 |
+| ---- | --------- | -------- | ------- | -------- |
+| 监控结果 | /    | /   | / | /  |
+
+# 厂商测试工具原理说明
+
+使用GEMM算子进行computation-bound的计算任务，从而测得实际BF16算力
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh
new file mode 100644
index 000000000..09f681943
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark.muh
@@ -0,0 +1,56 @@
+#pragma once
+#include "benchmark_bf16.h"
+
+template <class T, typename... Args>
+float Benchmark::RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
+  Args... args) {
+  // unit micro secend
+  float timed = 0, timed_min = 0;
+
+  // Dummy calls
+  for (uint i = 0; i < 2; i++) {
+    func <<<block_num, block_size >>> (args...);
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+
+  if (use_event_timer) {
+    MUSAEvent musaEvent(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func <<<block_num, block_size >>> (args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  else  // std timer
+  {
+    Timer timer(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func <<<block_num, block_size >>> (args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  timed_min = timed;
+  // double check
+  if (use_event_timer) {
+    MUSAEvent musaEvent(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func <<<block_num, block_size >>> (args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  else  // std timer
+  {
+    Timer timer(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func <<<block_num, block_size >>> (args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  timed_min = std::min(timed_min, timed);
+
+  return (timed_min / static_cast<float>(iters));
+}
+
+template <int A>
+struct Int2Type {
+  enum { VALUE = A };
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h
new file mode 100644
index 000000000..f05c92b8a
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/include/benchmark_bf16.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <common.h>
+#include <logger.h>
+
+#define DEFAULT_BANDWIDTH_MEM_SIZE 2048
+#define DEFAULT_BANDWIDTH_ITERS 30
+
+typedef enum class BANDWIDTH_MODE {
+    ALL,
+    READ_ONLY,
+    WRITE_ONLY,
+    READ_WRITE
+} BW_MODE_T;
+
+typedef enum class OFFSET_MODE { LOCAL, GLOBAL } OFFSET_MODE_T;
+
+typedef enum GPU_ARCH {
+    MP_10 = 10,
+    MP_21 = 21,
+    MP_22 = 22,
+    MP_31 = 31
+} GPU_ARCH_T;
+
+class Benchmark {
+public:
+    // devices
+    int specified_device;
+    const char* specified_device_name;
+    const char* specified_type_name;
+    bool force_device;
+    bool force_device_name;
+    bool force_type;
+    // option
+    bool all_cases;
+    bool use_event_timer;
+    BW_MODE_T bandwidth_mode;
+    int bandwidth_mem_size;
+    int bandwidth_iters;
+
+    logger* log;
+
+    Benchmark();
+    ~Benchmark();
+
+    template <class T, typename... Args>
+    float RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
+        Args... args);
+
+    int RunComputeMMABF16(device_info_t& dev_info);
+
+    int RunBF16Test();
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/common.h b/base/toolkits/computation-BF16/mthreads/S4000/include/common.h
new file mode 100644
index 000000000..3bb25b966
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/include/common.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#if defined(__APPLE__) || defined(__MACOSX) || defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include "musa_runtime.h"
+#include <string>
+#include <chrono>
+
+#define TAB "  "
+#define NEWLINE "\n"
+#ifndef __FreeBSD__
+#define uint unsigned int
+#endif
+#define ulong unsigned long
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#define OS_NAME "Macintosh"
+#elif defined(__ANDROID__)
+#define OS_NAME "Android"
+#elif defined(_WIN32)
+#if defined(_WIN64)
+#define OS_NAME "Win64"
+#else
+#define OS_NAME "Win32"
+#endif
+#elif defined(__linux__)
+#if defined(__x86_64__)
+#define OS_NAME "Linux x64"
+#elif defined(__i386__)
+#define OS_NAME "Linux x86"
+#elif defined(__arm__)
+#define OS_NAME "Linux ARM"
+#elif defined(__aarch64__)
+#define OS_NAME "Linux ARM64"
+#else
+#define OS_NAME "Linux unknown"
+#endif
+#elif defined(__FreeBSD__)
+#define OS_NAME "FreeBSD"
+#else
+#define OS_NAME "Unknown"
+#endif
+
+int check_musa_error(musaError _err, int line, const char* func_name);
+
+#define _PERF_CHECK_MUSA_ERROR_INNER(cond, func, line) \
+  do {                                                 \
+    if (check_musa_error(cond, line, func))            \
+      exit(1);                                         \
+  } while (0)
+
+#define CHECK_MUSA_ERROR(cond) \
+  _PERF_CHECK_MUSA_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__)
+
+typedef struct {
+    std::string device_name;
+    std::string driver_version;
+    int device_arch;
+
+    uint num_compute_units;
+    uint max_work_group_size;
+    uint64_t max_alloc_size;
+    uint64_t max_global_size;
+    uint max_clock_freq;
+
+    bool half_supported;
+    bool double_supported;
+    bool imma_supported;
+
+    // Test specific options
+    uint bw_global_iters;
+    uint bw_shmem_iters;
+    uint64_t bw_global_max_size;
+    uint64_t bw_shmem_max_size;
+    uint compute_work_groups_per_cu;
+    uint compute_dp_work_groups_per_cu;
+    uint shmem_work_groups_per_cu;
+    uint compute_iters;
+    uint bw_transfer_iters;
+    uint kernel_latency_iters;
+    uint64_t bw_transfer_max_size;
+    std::string extension;
+} device_info_t;
+
+class Timer {
+public:
+    explicit Timer(float* dur);
+    ~Timer();
+
+private:
+    float* duration_us;
+    std::chrono::high_resolution_clock::time_point tick;
+    std::chrono::high_resolution_clock::time_point tock;
+};
+
+class MUSAEvent {
+public:
+    explicit MUSAEvent(float* dur);
+    ~MUSAEvent();
+
+private:
+    float* duration_us;
+    musaEvent_t startTime;
+    musaEvent_t stopTime;
+};
+
+int GetDeviceInfo(int dev, device_info_t* dev_info);
+
+// Round down to next multiple of the given base with an optional maximum value
+uint64_t RoundToMultipleOf(uint64_t number, uint64_t base,
+    uint64_t maxValue = UINT64_MAX);
+
+void Populate(float* ptr, uint64_t N);
+void Populate(double* ptr, uint64_t N);
+
+void TrimString(std::string& str);
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh b/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh
new file mode 100644
index 000000000..b96f930cb
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/include/compute_mma_bf16.muh
@@ -0,0 +1,45 @@
+#pragma once
+#include "benchmark.muh"
+#include <mma.h>
+
+#define UNROLL_NUM 16
+#define ITERS 256
+using namespace mtmusa;
+
+#define MMA_4(a, b, c)        \
+  wmma::mma_sync(c, a, b, c); \
+  wmma::mma_sync(c, a, b, c); \
+  wmma::mma_sync(c, a, b, c); \
+  wmma::mma_sync(c, a, b, c);
+
+#define MMA_16(a, b, c) \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);
+
+template <int M, int N, int K>
+__device__ void compute_mma_bf16_impl(void* ptr, Int2Type<true>) {
+  wmma::fragment<wmma::matrix_a, M, N, K, __mt_bfloat16, wmma::row_major>
+      a_frag;
+  wmma::fragment<wmma::matrix_b, M, N, K, __mt_bfloat16, wmma::row_major>
+      b_frag;
+  wmma::fragment<wmma::accumulator, M, N, K, float> c_frag;
+  // #pragma unroll
+  for (int i = 0; i < ITERS; i++) {
+    MMA_16(a_frag, b_frag, c_frag);
+  }
+  wmma::store_matrix_sync((float*)ptr, c_frag, M, wmma::mem_col_major);
+}
+
+template <int M, int N, int K>
+__device__ void compute_mma_bf16_impl(void* ptr, Int2Type<false>) {}
+
+template <int M, int N, int K, int ARCH>
+__global__ void compute_mma_bf16(void* ptr) {
+#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220)
+  compute_mma_bf16_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_22)>());
+#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310)
+  compute_mma_bf16_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_31)>());
+#endif
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h b/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h
new file mode 100644
index 000000000..46856f87c
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/include/logger.h
@@ -0,0 +1,16 @@
+#pragma once
+#include <string>
+#include <fstream>
+class logger {
+    std::ofstream outFile;
+    std::string FileName;
+public:
+    logger();
+    ~logger();
+
+    void print(std::string str);
+    void print(double val);
+    void print(float val);
+    void print(int val);
+    void print(unsigned int val);
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/main.sh b/base/toolkits/computation-BF16/mthreads/S4000/main.sh
new file mode 100644
index 000000000..fe45a4ee6
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/main.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+CXX=g++
+MCC=mcc
+CXXFLAGS="-std=c++17 -I./include -I/usr/local/musa/include -fPIC"
+MCCFLAGS="-std=c++17 --offload-arch=mp_22 -I../include -mtgpu -fPIC -I./include -I/usr/local/musa/include"
+LDFLAGS="-lmusart -L/usr/local/musa/lib"
+
+SRC_DIR=src
+BUILD_DIR=build
+EXECUTABLE=gemm
+
+
+mkdir -p $BUILD_DIR
+
+$CXX $CXXFLAGS -c $SRC_DIR/common.cpp -o $BUILD_DIR/common.o
+$CXX $CXXFLAGS -c $SRC_DIR/logger.cpp -o $BUILD_DIR/logger.o
+$CXX $CXXFLAGS -c $SRC_DIR/benchmark_bf16.cpp -o $BUILD_DIR/benchmark.o
+
+$MCC $MCCFLAGS -c $SRC_DIR/compute_mma_bf16.mu -o $BUILD_DIR/compute_mma_bf16.o
+
+$CXX $CXXFLAGS -c $SRC_DIR/main.cpp -o $BUILD_DIR/main.o
+
+$CXX $CXXFLAGS $BUILD_DIR/*.o -o $EXECUTABLE $LDFLAGS
+
+./gemm
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp
new file mode 100644
index 000000000..4d0917c37
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/src/benchmark_bf16.cpp
@@ -0,0 +1,36 @@
+#include "benchmark_bf16.h"
+#include <cstring>
+#include <string>
+
+Benchmark::Benchmark()
+    : specified_device(0),
+    specified_device_name(0),
+    specified_type_name(0),
+    force_device(false),
+    force_device_name(false),
+    force_type(false),
+    all_cases(false),
+    use_event_timer(false),
+    bandwidth_mode(BANDWIDTH_MODE::ALL),
+    bandwidth_mem_size(DEFAULT_BANDWIDTH_MEM_SIZE),
+    bandwidth_iters(DEFAULT_BANDWIDTH_ITERS) {
+    log = new logger();
+}
+
+Benchmark::~Benchmark() {
+    if (log) {
+        delete log;
+    }
+}
+
+int Benchmark::RunBF16Test() {
+    musaSetDevice(0);
+    device_info_t dev_info;
+    if (GetDeviceInfo(0, &dev_info)) {
+        log->print(TAB "Can not get informations for Device " +
+            std::to_string(0) + NEWLINE);
+    }
+
+    RunComputeMMABF16(dev_info);
+    return 0;
+}
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp
new file mode 100644
index 000000000..e6b613248
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/src/common.cpp
@@ -0,0 +1,138 @@
+#include "common.h"
+
+#define MUSA_ERROR_CASE(ERR)                                \
+  case ERR: {                                               \
+    printf("" #ERR " in %s on line %i\n", func_name, line); \
+    return 1;                                               \
+  }
+
+int check_musa_error(musaError _err, int line, const char* func_name) {
+    switch (_err) {
+        case musaSuccess:
+            return 0;
+            MUSA_ERROR_CASE(musaErrorInvalidValue)
+                MUSA_ERROR_CASE(musaErrorMemoryAllocation)
+                MUSA_ERROR_CASE(musaErrorInitializationError)
+                MUSA_ERROR_CASE(musaErrorInvalidMemcpyDirection)
+                MUSA_ERROR_CASE(musaErrorAddressOfConstant)
+                MUSA_ERROR_CASE(musaErrorSynchronizationError)
+                MUSA_ERROR_CASE(musaErrorNotYetImplemented)
+                MUSA_ERROR_CASE(musaErrorMemoryValueTooLarge)
+                MUSA_ERROR_CASE(musaErrorNoDevice)
+                MUSA_ERROR_CASE(musaErrorInvalidDevice)
+                MUSA_ERROR_CASE(musaErrorHostMemoryAlreadyRegistered)
+                MUSA_ERROR_CASE(musaErrorHostMemoryNotRegistered)
+                MUSA_ERROR_CASE(musaErrorIllegalInstruction)
+                MUSA_ERROR_CASE(musaErrorInvalidAddressSpace)
+                MUSA_ERROR_CASE(musaErrorLaunchFailure)
+                MUSA_ERROR_CASE(musaErrorNotSupported)
+                MUSA_ERROR_CASE(musaErrorTimeout)
+                MUSA_ERROR_CASE(musaErrorUnknown)
+                MUSA_ERROR_CASE(musaErrorApiFailureBase)
+        default:
+            printf("Unknown MUSA error %i in %s on line %i\n", _err, func_name, line);
+            return 1;
+    }
+}
+
+int GetDeviceInfo(int dev, device_info_t* dev_info) {
+    musaDeviceProp deviceProp;
+    if (musaSuccess != musaGetDeviceProperties(&deviceProp, dev)) {
+        return -1;
+    }
+    dev_info->device_name = deviceProp.name;
+    dev_info->device_arch = 10 * deviceProp.major + deviceProp.minor;
+    int driverVersion = 0;
+    musaDriverGetVersion(&driverVersion);
+    dev_info->driver_version = std::to_string(driverVersion);
+    TrimString(dev_info->device_name);
+    TrimString(dev_info->driver_version);
+
+    dev_info->num_compute_units = deviceProp.multiProcessorCount;
+    dev_info->max_work_group_size = deviceProp.maxThreadsPerBlock;
+
+    // Limiting max work-group size to 512
+#define MAX_WG_SIZE 256
+    dev_info->max_work_group_size =
+        std::min(dev_info->max_work_group_size, (uint)MAX_WG_SIZE);
+#undef MAX_WG_SIZE
+
+    /*  Size of global device memory in bytes.  */
+    dev_info->max_global_size = static_cast<uint64_t>(deviceProp.totalGlobalMem);
+    /*  Max size of memory object allocation in bytes.*/
+    dev_info->max_alloc_size = dev_info->max_global_size / 3;
+    dev_info->max_clock_freq = static_cast<uint>(deviceProp.clockRate / 1000);
+    dev_info->double_supported = true;
+    dev_info->half_supported = true;
+
+    dev_info->bw_global_max_size = 1 << 31;
+    dev_info->bw_shmem_max_size = 1 << 28;
+    dev_info->bw_transfer_max_size = 1 << 28;
+    dev_info->compute_work_groups_per_cu = 2048;
+    dev_info->compute_dp_work_groups_per_cu = 512;
+    dev_info->shmem_work_groups_per_cu = 128;
+    dev_info->compute_iters = 20;
+    dev_info->bw_global_iters = 30;
+    dev_info->bw_shmem_iters = 20;
+    dev_info->bw_transfer_iters = 20;
+    dev_info->kernel_latency_iters = 20000;
+
+    return 0;
+}
+
+MUSAEvent::MUSAEvent(float* dur) : duration_us(dur) {
+    CHECK_MUSA_ERROR(musaEventCreate(&startTime));
+    CHECK_MUSA_ERROR(musaEventCreate(&stopTime));
+    CHECK_MUSA_ERROR(musaEventRecord(startTime));
+}
+
+MUSAEvent::~MUSAEvent() {
+    CHECK_MUSA_ERROR(musaEventRecord(stopTime));
+    CHECK_MUSA_ERROR(musaEventSynchronize(stopTime));
+    CHECK_MUSA_ERROR(musaEventElapsedTime(duration_us, startTime, stopTime));
+    *duration_us *= 1e3f;
+    CHECK_MUSA_ERROR(musaEventDestroy(startTime));
+    CHECK_MUSA_ERROR(musaEventDestroy(stopTime));
+}
+
+Timer::Timer(float* dur) : duration_us(dur) {
+    tick = std::chrono::high_resolution_clock::now();
+}
+
+Timer::~Timer() {
+    tock = std::chrono::high_resolution_clock::now();
+    *duration_us =
+        (float)(std::chrono::duration_cast<std::chrono::microseconds>(tock - tick)
+            .count());
+}
+
+void Populate(float* ptr, uint64_t num) {
+    srand((unsigned int)time(NULL));
+
+    for (uint64_t i = 0; i < num; i++) {
+        // ptr[i] = (float)rand();
+        // to ensure the sum of arr is a positive number
+        // avoid the STORE in ReadOnly
+        ptr[i] = (float)i;
+    }
+}
+
+void Populate(double* ptr, uint64_t num) {
+    srand((unsigned int)time(NULL));
+    for (uint64_t i = 0; i < num; i++) {
+        // ptr[i] = (double)rand();
+        ptr[i] = (double)i;
+    }
+}
+
+uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, uint64_t max_value) {
+    uint64_t n = (number > max_value) ? max_value : number;
+    return (n / base) * base;
+}
+
+void TrimString(std::string& str) {
+    size_t pos = str.find('\0');
+    if (pos != std::string::npos) {
+        str.erase(pos);
+    }
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu b/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu
new file mode 100644
index 000000000..852c50cb4
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/src/compute_mma_bf16.mu
@@ -0,0 +1,80 @@
+#include "benchmark.muh"
+#include "compute_mma_bf16.muh"
+
+#define WARPSIZE 128
+#define TEST_DETAIL(show_, kernel_, tag_, M_, N_, K_, ARCH_)                  \
+  work_per_warp = M_ * N_ * K_ * 2 * ITERS * UNROLL_NUM;                      \
+  timed = RunKernel(kernel_<M_, N_, K_, ARCH_>, block_num, block_size, iters, \
+                    d_x);                                                     \
+  gops = (static_cast<float>(total_num) / WARPSIZE) * work_per_warp / timed / \
+      1e3f;                                                                   \
+  gops_max = std::max(gops_max, gops);                                        \
+  if (show_) {                                                                \
+    log->print(TAB TAB TAB #tag_ "  : ");                                     \
+    log->print(gops);                                                         \
+    log->print(NEWLINE);                                                      \
+  }
+
+#define TEST_END(show_, type_)             \
+  if (!show_) {                            \
+    log->print("[FlagPerf Result]computation-" #type_ "=");\
+    log->print(gops_max/1e3);\
+    log->print("TFLOPS");\
+    log->print(NEWLINE); \
+  }
+
+int Benchmark::RunComputeMMABF16(device_info_t& dev_info) {
+
+    float timed, gops, gops_max;
+    int work_per_warp;
+    dim3 block_size(1024);
+    int grid_size = std::min((dev_info.num_compute_units) *
+        (dev_info.compute_work_groups_per_cu) *
+        (block_size.x) * sizeof(int),
+        dev_info.max_alloc_size) /
+        ((block_size.x) * sizeof(int));
+    grid_size = std::min(grid_size, 1024);
+    dim3 block_num(grid_size);
+    size_t total_num = block_size.x * block_num.x;
+    uint iters = dev_info.compute_iters;
+    {
+        if (dev_info.device_arch == MP_22) {
+            void* d_x;
+            CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int)));
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_16, 16, 16, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_8_16, 32, 8, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_8_32_16, 8, 32, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_32_16, 32, 32, 16,
+                MP_22);
+            TEST_END(all_cases, BF16);
+            CHECK_MUSA_ERROR(musaFree(d_x));
+        }
+        else if (dev_info.device_arch == MP_31) {
+            void* d_x;
+            CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int)));
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_16, 16, 16, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_8_16, 32, 8, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_8_32_16, 8, 32, 16,
+                MP_31);
+            // TEST_DETAIL(all_cases, compute_mma_bf16, BF16_32_32_16, 32, 32, 16,
+            //             MP_31);
+            TEST_DETAIL(all_cases, compute_mma_bf16, BF16_16_16_32, 16, 16, 32,
+                MP_31);
+            TEST_END(all_cases, BF16);
+            CHECK_MUSA_ERROR(musaFree(d_x));
+        }
+        else {
+            log->print(TAB TAB TAB "NOT SUPPORT mp_" +
+                std::to_string(dev_info.device_arch) + NEWLINE);
+        }
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp
new file mode 100644
index 000000000..1bab6bb37
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/src/logger.cpp
@@ -0,0 +1,73 @@
+#include "logger.h"
+#include <iomanip>
+#include <iostream>
+
+logger::logger() :FileName("benchmark.log") {
+    outFile.open(FileName);
+    if (!outFile.is_open()) {
+        throw std::runtime_error("Failed to open log file.");
+    }
+    outFile.flush();
+}
+
+logger::~logger() {
+    if (outFile.is_open()) {
+        outFile.close();
+    }
+
+    // Delete the log file
+    if (std::remove(FileName.c_str()) != 0) {
+        std::cerr << "Failed to delete log file: " << FileName << "\n";
+    }
+}
+
+
+void logger::print(std::string str) {
+
+    std::cout << str;
+    std::cout.flush();
+    if (!outFile.is_open()) {
+        throw std::runtime_error("Failed to open log file.");
+    }
+    if (!outFile.good()) {
+        std::cerr << "outFile is in a bad state!" << std::endl;
+        outFile.clear();
+    }
+    if (!(outFile << str)) {
+        std::cerr << "Failed to write to outFile." << std::endl;
+    }
+    outFile.flush();
+
+}
+
+void logger::print(double val) {
+    std::cout << std::setprecision(2) << std::fixed;
+    std::cout << val;
+    std::cout.flush();
+    outFile << std::setprecision(2) << std::fixed;
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(float val) {
+    std::cout << std::setprecision(2) << std::fixed;
+    std::cout << val;
+    std::cout.flush();
+    outFile << std::setprecision(2) << std::fixed;
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(int val) {
+    std::cout << val;
+    std::cout.flush();
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(unsigned int val) {
+    std::cout << val;
+    std::cout.flush();
+    outFile << val;
+    outFile.flush();
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp b/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp
new file mode 100644
index 000000000..53263e35a
--- /dev/null
+++ b/base/toolkits/computation-BF16/mthreads/S4000/src/main.cpp
@@ -0,0 +1,7 @@
+#include "benchmark_bf16.h"
+
+int main() {
+    Benchmark bm;
+    bm.RunBF16Test();
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-FP16/mthreads/S4000/README.md b/base/toolkits/computation-FP16/mthreads/S4000/README.md
index b3f294459..8e8a656b7 100644
--- a/base/toolkits/computation-FP16/mthreads/S4000/README.md
+++ b/base/toolkits/computation-FP16/mthreads/S4000/README.md
@@ -2,7 +2,7 @@
 
 * 厂商：MThreads
 * 产品名称：S4000
-* 产品型号：/
+* 产品型号：MTT S4000
 * TDP：/
 
 # 所用服务器配置
@@ -10,8 +10,8 @@
 * 服务器数量：1
 * 单服务器内使用卡数：1
 * 服务器型号：/
-* 操作系统版本：Ubuntu 20.04.4 LTS
-* 操作系统内核：Linux 5.4.0-42-generic
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
 * CPU：/
 * docker版本：24.0.7
 * 内存：1TiB
diff --git a/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu
index e8652f187..197509140 100644
--- a/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu
+++ b/base/toolkits/computation-FP16/mthreads/S4000/gemm.mu
@@ -1,7 +1,3 @@
-// Copyright (c) 2024 BAAI. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License")
-
 #include <mublas.h>
 #include <musa_fp16.h>
 #include <musa_runtime.h>
@@ -25,9 +21,9 @@ struct PrecisionConfig {
 
 void test(const PrecisionConfig& config) {
     __half* d_A, * d_B, * d_C;
-    std::vector<__half> h_A(M * K, __float2half(1.0f)); 
-    std::vector<__half> h_B(K * N, __float2half(1.0f)); 
-    std::vector<__half> h_C(M * N); 
+    std::vector<__half> h_A(M * K, __float2half(1.0f));
+    std::vector<__half> h_B(K * N, __float2half(1.0f));
+    std::vector<__half> h_C(M * N);
 
     musaMalloc(&d_A, M * K * config.bytesPerElement);
     musaMalloc(&d_B, K * N * config.bytesPerElement);
diff --git a/base/toolkits/computation-FP32/mthreads/S4000/README.md b/base/toolkits/computation-FP32/mthreads/S4000/README.md
new file mode 100644
index 000000000..0a6248b96
--- /dev/null
+++ b/base/toolkits/computation-FP32/mthreads/S4000/README.md
@@ -0,0 +1,42 @@
+# 参评AI芯片信息
+
+* 厂商：MThreads
+* 产品名称：S4000
+* 产品型号：MTT S4000
+* TDP：/
+
+# 所用服务器配置
+
+* 服务器数量：1
+* 单服务器内使用卡数：1
+* 服务器型号：/
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
+* CPU：/
+* docker版本：24.0.7
+* 内存：1TiB
+* 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
+
+# 评测结果
+
+## 核心评测结果
+
+| 评测项  | FP32算力测试值   | FP32算力标定值  | 测试标定比例 |
+| ---- | ----------- | ---------- | ------ |
+| 评测结果 | / | / | / |
+
+## 能耗监控结果
+
+| 监控项  | 系统平均功耗  | 系统最大功耗  | 系统功耗标准差 | 单机TDP | 单卡平均功耗  | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP |
+| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
+| 监控结果 | / | / | /   | /     | / | / | /  | /  |
+
+## 其他重要监控结果
+
+| 监控项  | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度  | 单卡平均显存占用 |
+| ---- | --------- | -------- | ------- | -------- |
+| 监控结果 | /    | /   | / | /  |
+
+# 厂商测试工具原理说明
+
+使用GEMM算子进行computation-bound的计算任务，从而测得实际FP32算力
\ No newline at end of file
diff --git a/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu
new file mode 100644
index 000000000..1c97e7537
--- /dev/null
+++ b/base/toolkits/computation-FP32/mthreads/S4000/gemm.mu
@@ -0,0 +1,96 @@
+#include <chrono>
+#include <iostream>
+#include <mublas.h>
+#include <musa_runtime.h>
+#include <vector>
+
+constexpr int M = 8192;
+constexpr int N = 8192;
+constexpr int K = 8192;
+
+struct PrecisionConfig {
+    int bytesPerElement;
+    const char* name;
+    int NUM_ITERATIONS;
+    int WARMUP_ITERATIONS = 10;
+};
+
+void test(const PrecisionConfig& config) {
+    float* d_A, * d_B, * d_C;
+    std::vector<float> h_A(M * K, float(1.0f));
+    std::vector<float> h_B(K * N, float(1.0f));
+    std::vector<float> h_C(M * N);
+
+    musaMalloc(&d_A, M * K * config.bytesPerElement);
+    musaMalloc(&d_B, K * N * config.bytesPerElement);
+    musaMalloc(&d_C, M * N * config.bytesPerElement);
+
+    musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice);
+    musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice);
+
+    mublasHandle_t handle;
+    mublasCreate(&handle);
+
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    for (int i = 0; i < config.WARMUP_ITERATIONS; ++i) {
+        mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+            M, N, K, &alpha,
+            d_A, M,
+            d_B, K,
+            &beta,
+            d_C, M);
+
+    }
+
+    musaError_t syncError = musaDeviceSynchronize();
+    auto start = std::chrono::high_resolution_clock::now();
+
+    if (syncError != musaSuccess) {
+        std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+    }
+
+    for (int i = 0; i < config.NUM_ITERATIONS; ++i) {
+        mublasSgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+            M, N, K, &alpha,
+            d_A, M,
+            d_B, K,
+            &beta,
+            d_C, M);
+    }
+    syncError = musaDeviceSynchronize();
+    auto end = std::chrono::high_resolution_clock::now();
+
+    if (syncError != musaSuccess) {
+        std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+    }
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << "Average " << config.name << " Single Op Duration: "
+        << duration.count() / config.NUM_ITERATIONS << " us" << std::endl;
+
+    double time_second = duration.count() / 1.0e6;
+    double flops = 2.0 * M * N * K * config.NUM_ITERATIONS;
+    double FLOPS = flops / time_second;
+    double TFLOPS = FLOPS / 1.0e12;
+
+    std::cout << "[FlagPerf Result]" << "computation-FP32=" << TFLOPS << "TFLOPS"
+        << std::endl;
+
+    musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost);
+
+    musaFree(d_A);
+    musaFree(d_B);
+    musaFree(d_C);
+
+    mublasDestroy(handle);
+}
+
+int main() {
+    PrecisionConfig fp32 = { sizeof(float), "FP32", 10000, 10 };
+
+    test(fp32);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-FP32/mthreads/S4000/main.sh b/base/toolkits/computation-FP32/mthreads/S4000/main.sh
new file mode 100644
index 000000000..e07c73a09
--- /dev/null
+++ b/base/toolkits/computation-FP32/mthreads/S4000/main.sh
@@ -0,0 +1,2 @@
+mcc gemm.mu -lmusart -lmublas -o gemm
+./gemm
\ No newline at end of file
diff --git a/base/toolkits/computation-FP64/mthreads/S4000/README.md b/base/toolkits/computation-FP64/mthreads/S4000/README.md
new file mode 100644
index 000000000..116cbb4ae
--- /dev/null
+++ b/base/toolkits/computation-FP64/mthreads/S4000/README.md
@@ -0,0 +1,42 @@
+# 参评AI芯片信息
+
+* 厂商：MThreads
+* 产品名称：S4000
+* 产品型号：MTT S4000
+* TDP：/
+
+# 所用服务器配置
+
+* 服务器数量：1
+* 单服务器内使用卡数：1
+* 服务器型号：/
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
+* CPU：/
+* docker版本：24.0.7
+* 内存：1TiB
+* 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
+
+# 评测结果
+
+## 核心评测结果
+
+| 评测项  | FP64算力测试值   | FP64算力标定值  | 测试标定比例 |
+| ---- | ----------- | ---------- | ------ |
+| 评测结果 | / | / | / |
+
+## 能耗监控结果
+
+| 监控项  | 系统平均功耗  | 系统最大功耗  | 系统功耗标准差 | 单机TDP | 单卡平均功耗  | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP |
+| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
+| 监控结果 | / | / | /   | /     | / | / | /  | /  |
+
+## 其他重要监控结果
+
+| 监控项  | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度  | 单卡平均显存占用 |
+| ---- | --------- | -------- | ------- | -------- |
+| 监控结果 | /    | /   | / | /  |
+
+# 厂商测试工具原理说明
+
+使用GEMM算子进行computation-bound的计算任务，从而测得实际FP64算力
\ No newline at end of file
diff --git a/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu b/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu
new file mode 100644
index 000000000..54e576885
--- /dev/null
+++ b/base/toolkits/computation-FP64/mthreads/S4000/gemm.mu
@@ -0,0 +1,96 @@
+#include <chrono>
+#include <iostream>
+#include <mublas.h>
+#include <musa_runtime.h>
+#include <vector>
+
+constexpr int M = 8192;
+constexpr int N = 8192;
+constexpr int K = 8192;
+
+struct PrecisionConfig {
+    int bytesPerElement;
+    const char* name;
+    int NUM_ITERATIONS;
+    int WARMUP_ITERATIONS = 10;
+};
+
+void test(const PrecisionConfig& config) {
+    double* d_A, * d_B, * d_C;
+    std::vector<double> h_A(M * K, double(1.0f));
+    std::vector<double> h_B(K * N, double(1.0f));
+    std::vector<double> h_C(M * N);
+
+    musaMalloc(&d_A, M * K * config.bytesPerElement);
+    musaMalloc(&d_B, K * N * config.bytesPerElement);
+    musaMalloc(&d_C, M * N * config.bytesPerElement);
+
+    musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice);
+    musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice);
+
+    mublasHandle_t handle;
+    mublasCreate(&handle);
+
+    double alpha = 1.0f;
+    double beta = 0.0f;
+
+    for (int i = 0; i < config.WARMUP_ITERATIONS; ++i) {
+        mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+            M, N, K, &alpha,
+            d_A, M,
+            d_B, K,
+            &beta,
+            d_C, M);
+
+    }
+
+    musaError_t syncError = musaDeviceSynchronize();
+    auto start = std::chrono::high_resolution_clock::now();
+
+    if (syncError != musaSuccess) {
+        std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+    }
+
+    for (int i = 0; i < config.NUM_ITERATIONS; ++i) {
+        mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_N,
+            M, N, K, &alpha,
+            d_A, M,
+            d_B, K,
+            &beta,
+            d_C, M);
+    }
+    syncError = musaDeviceSynchronize();
+    auto end = std::chrono::high_resolution_clock::now();
+
+    if (syncError != musaSuccess) {
+        std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+    }
+    auto duration =
+        std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+    std::cout << "Average " << config.name << " Single Op Duration: "
+        << duration.count() / config.NUM_ITERATIONS << " us" << std::endl;
+
+    double time_second = duration.count() / 1.0e6;
+    double flops = 2.0 * M * N * K * config.NUM_ITERATIONS;
+    double FLOPS = flops / time_second;
+    double TFLOPS = FLOPS / 1.0e12;
+
+    std::cout << "[FlagPerf Result]" << "computation-FP32=" << TFLOPS << "TFLOPS"
+        << std::endl;
+
+    musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost);
+
+    musaFree(d_A);
+    musaFree(d_B);
+    musaFree(d_C);
+
+    mublasDestroy(handle);
+}
+
+int main() {
+    PrecisionConfig fp64 = { sizeof(double), "FP64", 70, 10 };
+
+    test(fp64);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-FP64/mthreads/S4000/main.sh b/base/toolkits/computation-FP64/mthreads/S4000/main.sh
new file mode 100644
index 000000000..e07c73a09
--- /dev/null
+++ b/base/toolkits/computation-FP64/mthreads/S4000/main.sh
@@ -0,0 +1,2 @@
+mcc gemm.mu -lmusart -lmublas -o gemm
+./gemm
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/README.md b/base/toolkits/computation-INT8/mthreads/S4000/README.md
new file mode 100644
index 000000000..3242e3e47
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/README.md
@@ -0,0 +1,42 @@
+# 参评AI芯片信息
+
+* 厂商：MThreads
+* 产品名称：S4000
+* 产品型号：MTT S4000
+* TDP：/
+
+# 所用服务器配置
+
+* 服务器数量：1
+* 单服务器内使用卡数：1
+* 服务器型号：/
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
+* CPU：/
+* docker版本：24.0.7
+* 内存：1TiB
+* 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
+
+# 评测结果
+
+## 核心评测结果
+
+| 评测项  | INT8算力测试值   | INT8算力标定值  | 测试标定比例 |
+| ---- | ----------- | ---------- | ------ |
+| 评测结果 | / | / | / |
+
+## 能耗监控结果
+
+| 监控项  | 系统平均功耗  | 系统最大功耗  | 系统功耗标准差 | 单机TDP | 单卡平均功耗  | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP |
+| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
+| 监控结果 | / | / | /   | /     | / | / | /  | /  |
+
+## 其他重要监控结果
+
+| 监控项  | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度  | 单卡平均显存占用 |
+| ---- | --------- | -------- | ------- | -------- |
+| 监控结果 | /    | /   | / | /  |
+
+# 厂商测试工具原理说明
+
+使用GEMM算子进行computation-bound的计算任务，从而测得实际INT8算力
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh
new file mode 100644
index 000000000..7fbc4afa8
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark.muh
@@ -0,0 +1,54 @@
+#pragma once
+#include "benchmark_int8.h"
+
+template <class T, typename... Args>
+float Benchmark::RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
+                           Args... args) {
+  // unit micro secend
+  float timed = 0, timed_min = 0;
+
+  // Dummy calls
+  for (uint i = 0; i < 2; i++) {
+    func<<<block_num, block_size>>>(args...);
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+
+  if (use_event_timer) {
+    MUSAEvent musaEvent(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func<<<block_num, block_size>>>(args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  } else  // std timer
+  {
+    Timer timer(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func<<<block_num, block_size>>>(args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  timed_min = timed;
+  // double check
+  if (use_event_timer) {
+    MUSAEvent musaEvent(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func<<<block_num, block_size>>>(args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  } else  // std timer
+  {
+    Timer timer(&timed);
+    for (uint i = 0; i < iters; i++) {
+      func<<<block_num, block_size>>>(args...);
+    }
+    CHECK_MUSA_ERROR(musaDeviceSynchronize());
+  }
+  timed_min = std::min(timed_min, timed);
+
+  return (timed_min / static_cast<float>(iters));
+}
+
+template <int A>
+struct Int2Type {
+  enum { VALUE = A };
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h
new file mode 100644
index 000000000..8ffc6a3f7
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/include/benchmark_int8.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <common.h>
+#include <logger.h>
+
+#define DEFAULT_BANDWIDTH_MEM_SIZE 2048
+#define DEFAULT_BANDWIDTH_ITERS 30
+
+typedef enum class BANDWIDTH_MODE {
+    ALL,
+    READ_ONLY,
+    WRITE_ONLY,
+    READ_WRITE
+} BW_MODE_T;
+
+typedef enum class OFFSET_MODE { LOCAL, GLOBAL } OFFSET_MODE_T;
+
+typedef enum GPU_ARCH {
+    MP_10 = 10,
+    MP_21 = 21,
+    MP_22 = 22,
+    MP_31 = 31
+} GPU_ARCH_T;
+
+class Benchmark {
+public:
+    // devices
+    int specified_device;
+    const char* specified_device_name;
+    const char* specified_type_name;
+    bool force_device;
+    bool force_device_name;
+    bool force_type;
+    // option
+    bool all_cases;
+    bool use_event_timer;
+    BW_MODE_T bandwidth_mode;
+    int bandwidth_mem_size;
+    int bandwidth_iters;
+
+
+    logger* log;
+
+    Benchmark();
+    ~Benchmark();
+
+
+    // return avg time in us
+    template <class T, typename... Args>
+    float RunKernel(T func, dim3 block_num, dim3 block_size, uint iters,
+        Args... args);
+
+
+    int RunComputeMMAINT8(device_info_t& dev_info);
+
+    int RunINT8Test();
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/common.h b/base/toolkits/computation-INT8/mthreads/S4000/include/common.h
new file mode 100644
index 000000000..3bb25b966
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/include/common.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#if defined(__APPLE__) || defined(__MACOSX) || defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include "musa_runtime.h"
+#include <string>
+#include <chrono>
+
+#define TAB "  "
+#define NEWLINE "\n"
+#ifndef __FreeBSD__
+#define uint unsigned int
+#endif
+#define ulong unsigned long
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#define OS_NAME "Macintosh"
+#elif defined(__ANDROID__)
+#define OS_NAME "Android"
+#elif defined(_WIN32)
+#if defined(_WIN64)
+#define OS_NAME "Win64"
+#else
+#define OS_NAME "Win32"
+#endif
+#elif defined(__linux__)
+#if defined(__x86_64__)
+#define OS_NAME "Linux x64"
+#elif defined(__i386__)
+#define OS_NAME "Linux x86"
+#elif defined(__arm__)
+#define OS_NAME "Linux ARM"
+#elif defined(__aarch64__)
+#define OS_NAME "Linux ARM64"
+#else
+#define OS_NAME "Linux unknown"
+#endif
+#elif defined(__FreeBSD__)
+#define OS_NAME "FreeBSD"
+#else
+#define OS_NAME "Unknown"
+#endif
+
+int check_musa_error(musaError _err, int line, const char* func_name);
+
+#define _PERF_CHECK_MUSA_ERROR_INNER(cond, func, line) \
+  do {                                                 \
+    if (check_musa_error(cond, line, func))            \
+      exit(1);                                         \
+  } while (0)
+
+#define CHECK_MUSA_ERROR(cond) \
+  _PERF_CHECK_MUSA_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__)
+
+typedef struct {
+    std::string device_name;
+    std::string driver_version;
+    int device_arch;
+
+    uint num_compute_units;
+    uint max_work_group_size;
+    uint64_t max_alloc_size;
+    uint64_t max_global_size;
+    uint max_clock_freq;
+
+    bool half_supported;
+    bool double_supported;
+    bool imma_supported;
+
+    // Test specific options
+    uint bw_global_iters;
+    uint bw_shmem_iters;
+    uint64_t bw_global_max_size;
+    uint64_t bw_shmem_max_size;
+    uint compute_work_groups_per_cu;
+    uint compute_dp_work_groups_per_cu;
+    uint shmem_work_groups_per_cu;
+    uint compute_iters;
+    uint bw_transfer_iters;
+    uint kernel_latency_iters;
+    uint64_t bw_transfer_max_size;
+    std::string extension;
+} device_info_t;
+
+class Timer {
+public:
+    explicit Timer(float* dur);
+    ~Timer();
+
+private:
+    float* duration_us;
+    std::chrono::high_resolution_clock::time_point tick;
+    std::chrono::high_resolution_clock::time_point tock;
+};
+
+class MUSAEvent {
+public:
+    explicit MUSAEvent(float* dur);
+    ~MUSAEvent();
+
+private:
+    float* duration_us;
+    musaEvent_t startTime;
+    musaEvent_t stopTime;
+};
+
+int GetDeviceInfo(int dev, device_info_t* dev_info);
+
+// Round down to next multiple of the given base with an optional maximum value
+uint64_t RoundToMultipleOf(uint64_t number, uint64_t base,
+    uint64_t maxValue = UINT64_MAX);
+
+void Populate(float* ptr, uint64_t N);
+void Populate(double* ptr, uint64_t N);
+
+void TrimString(std::string& str);
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh b/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh
new file mode 100644
index 000000000..5c9e8d987
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/include/compute_mma_int8.muh
@@ -0,0 +1,75 @@
+#pragma once
+#include "benchmark.muh"
+#include <mma.h>
+
+#define UNROLL_NUM 16
+#define ITERS 256
+using namespace mtmusa;
+
+#define MMA_4(a, b, c)           \
+  wmma::mma_sync(c, a, b, c, 0); \
+  wmma::mma_sync(c, a, b, c, 0); \
+  wmma::mma_sync(c, a, b, c, 0); \
+  wmma::mma_sync(c, a, b, c, 0);
+
+#define MMA_16(a, b, c) \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);       \
+  MMA_4(a, b, c);
+
+//////////////// uint8 ///////////////////////////
+template <int M, int N, int K>
+__device__ void compute_mma_uint8_impl(void* ptr, Int2Type<true>) {
+  wmma::fragment<wmma::matrix_a, M, N, K, unsigned char, wmma::row_major>
+      a_frag;
+  wmma::fragment<wmma::matrix_b, M, N, K, unsigned char, wmma::row_major>
+      b_frag;
+  wmma::fragment<wmma::accumulator, M, N, K, int> c_frag;
+  // #pragma unroll
+  for (int i = 0; i < ITERS; i++) {
+    MMA_16(a_frag, b_frag, c_frag);
+  }
+  wmma::store_matrix_sync((int*)ptr, c_frag, M, wmma::mem_col_major);
+}
+
+template <int M, int N, int K>
+__device__ void compute_mma_uint8_impl(void* ptr, Int2Type<false>) {}
+
+template <int M, int N, int K, int ARCH>
+__global__ void compute_mma_uint8(void* ptr) {
+#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 210)
+  compute_mma_uint8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_21)>());
+#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220)
+  compute_mma_uint8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_22)>());
+#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310)
+  compute_mma_uint8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_31)>());
+#endif
+}
+
+//////////////////// int8 /////////////////////////////
+template <int M, int N, int K>
+__device__ void compute_mma_int8_impl(void* ptr, Int2Type<true>) {
+  wmma::fragment<wmma::matrix_a, M, N, K, signed char, wmma::row_major> a_frag;
+  wmma::fragment<wmma::matrix_b, M, N, K, signed char, wmma::row_major> b_frag;
+  wmma::fragment<wmma::accumulator, M, N, K, int> c_frag;
+  // #pragma unroll
+  for (int i = 0; i < ITERS; i++) {
+    MMA_16(a_frag, b_frag, c_frag);
+  }
+  wmma::store_matrix_sync((int*)ptr, c_frag, M, wmma::mem_col_major);
+}
+
+template <int M, int N, int K>
+__device__ void compute_mma_int8_impl(void* ptr, Int2Type<false>) {}
+
+template <int M, int N, int K, int ARCH>
+__global__ void compute_mma_int8(void* ptr) {
+#if defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 210)
+  compute_mma_int8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_21)>());
+#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 220)
+  compute_mma_int8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_22)>());
+#elif defined(__MUSA_ARCH__) && (__MUSA_ARCH__ == 310)
+  compute_mma_int8_impl<M, N, K>(ptr, Int2Type<(ARCH == MP_31)>());
+#endif
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h b/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h
new file mode 100644
index 000000000..46856f87c
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/include/logger.h
@@ -0,0 +1,16 @@
+#pragma once
+#include <string>
+#include <fstream>
+class logger {
+    std::ofstream outFile;
+    std::string FileName;
+public:
+    logger();
+    ~logger();
+
+    void print(std::string str);
+    void print(double val);
+    void print(float val);
+    void print(int val);
+    void print(unsigned int val);
+};
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/main.sh b/base/toolkits/computation-INT8/mthreads/S4000/main.sh
new file mode 100644
index 000000000..402ad38e7
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/main.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+CXX=g++
+MCC=mcc
+CXXFLAGS="-std=c++17 -I./include -I/usr/local/musa/include -fPIC"
+MCCFLAGS="-std=c++17 --offload-arch=mp_22 -I../include -fPIC -I./include -I/usr/local/musa/include"
+LDFLAGS="-lmusart -L/usr/local/musa/lib"
+
+SRC_DIR=src
+BUILD_DIR=build
+EXECUTABLE=gemm
+
+
+mkdir -p $BUILD_DIR
+
+$CXX $CXXFLAGS -c $SRC_DIR/common.cpp -o $BUILD_DIR/common.o
+$CXX $CXXFLAGS -c $SRC_DIR/logger.cpp -o $BUILD_DIR/logger.o
+$CXX $CXXFLAGS -c $SRC_DIR/benchmark_int8.cpp -o $BUILD_DIR/benchmark.o
+
+$MCC $MCCFLAGS -c $SRC_DIR/compute_mma_int8.mu -o $BUILD_DIR/compute_mma_int8.o
+
+$CXX $CXXFLAGS -c $SRC_DIR/main.cpp -o $BUILD_DIR/main.o
+
+$CXX $CXXFLAGS $BUILD_DIR/*.o -o $EXECUTABLE $LDFLAGS
+
+./gemm
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp
new file mode 100644
index 000000000..3c04d151a
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/src/benchmark_int8.cpp
@@ -0,0 +1,36 @@
+#include "benchmark_int8.h"
+#include <cstring>
+#include <string>
+
+Benchmark::Benchmark()
+    : specified_device(0),
+    specified_device_name(0),
+    specified_type_name(0),
+    force_device(false),
+    force_device_name(false),
+    force_type(false),
+    all_cases(false),
+    use_event_timer(false),
+    bandwidth_mode(BANDWIDTH_MODE::ALL),
+    bandwidth_mem_size(DEFAULT_BANDWIDTH_MEM_SIZE),
+    bandwidth_iters(DEFAULT_BANDWIDTH_ITERS) {
+    log = new logger();
+}
+
+Benchmark::~Benchmark() {
+    if (log) {
+        delete log;
+    }
+}
+
+int Benchmark::RunINT8Test() {
+    musaSetDevice(0);
+    device_info_t dev_info;
+    if (GetDeviceInfo(0, &dev_info)) {
+        log->print(TAB "Can not get informations for Device " +
+            std::to_string(0) + NEWLINE);
+    }
+
+    RunComputeMMAINT8(dev_info);
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp
new file mode 100644
index 000000000..e6b613248
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/src/common.cpp
@@ -0,0 +1,138 @@
+#include "common.h"
+
+#define MUSA_ERROR_CASE(ERR)                                \
+  case ERR: {                                               \
+    printf("" #ERR " in %s on line %i\n", func_name, line); \
+    return 1;                                               \
+  }
+
+int check_musa_error(musaError _err, int line, const char* func_name) {
+    switch (_err) {
+        case musaSuccess:
+            return 0;
+            MUSA_ERROR_CASE(musaErrorInvalidValue)
+                MUSA_ERROR_CASE(musaErrorMemoryAllocation)
+                MUSA_ERROR_CASE(musaErrorInitializationError)
+                MUSA_ERROR_CASE(musaErrorInvalidMemcpyDirection)
+                MUSA_ERROR_CASE(musaErrorAddressOfConstant)
+                MUSA_ERROR_CASE(musaErrorSynchronizationError)
+                MUSA_ERROR_CASE(musaErrorNotYetImplemented)
+                MUSA_ERROR_CASE(musaErrorMemoryValueTooLarge)
+                MUSA_ERROR_CASE(musaErrorNoDevice)
+                MUSA_ERROR_CASE(musaErrorInvalidDevice)
+                MUSA_ERROR_CASE(musaErrorHostMemoryAlreadyRegistered)
+                MUSA_ERROR_CASE(musaErrorHostMemoryNotRegistered)
+                MUSA_ERROR_CASE(musaErrorIllegalInstruction)
+                MUSA_ERROR_CASE(musaErrorInvalidAddressSpace)
+                MUSA_ERROR_CASE(musaErrorLaunchFailure)
+                MUSA_ERROR_CASE(musaErrorNotSupported)
+                MUSA_ERROR_CASE(musaErrorTimeout)
+                MUSA_ERROR_CASE(musaErrorUnknown)
+                MUSA_ERROR_CASE(musaErrorApiFailureBase)
+        default:
+            printf("Unknown MUSA error %i in %s on line %i\n", _err, func_name, line);
+            return 1;
+    }
+}
+
+int GetDeviceInfo(int dev, device_info_t* dev_info) {
+    musaDeviceProp deviceProp;
+    if (musaSuccess != musaGetDeviceProperties(&deviceProp, dev)) {
+        return -1;
+    }
+    dev_info->device_name = deviceProp.name;
+    dev_info->device_arch = 10 * deviceProp.major + deviceProp.minor;
+    int driverVersion = 0;
+    musaDriverGetVersion(&driverVersion);
+    dev_info->driver_version = std::to_string(driverVersion);
+    TrimString(dev_info->device_name);
+    TrimString(dev_info->driver_version);
+
+    dev_info->num_compute_units = deviceProp.multiProcessorCount;
+    dev_info->max_work_group_size = deviceProp.maxThreadsPerBlock;
+
+    // Limiting max work-group size to 512
+#define MAX_WG_SIZE 256
+    dev_info->max_work_group_size =
+        std::min(dev_info->max_work_group_size, (uint)MAX_WG_SIZE);
+#undef MAX_WG_SIZE
+
+    /*  Size of global device memory in bytes.  */
+    dev_info->max_global_size = static_cast<uint64_t>(deviceProp.totalGlobalMem);
+    /*  Max size of memory object allocation in bytes.*/
+    dev_info->max_alloc_size = dev_info->max_global_size / 3;
+    dev_info->max_clock_freq = static_cast<uint>(deviceProp.clockRate / 1000);
+    dev_info->double_supported = true;
+    dev_info->half_supported = true;
+
+    dev_info->bw_global_max_size = 1 << 31;
+    dev_info->bw_shmem_max_size = 1 << 28;
+    dev_info->bw_transfer_max_size = 1 << 28;
+    dev_info->compute_work_groups_per_cu = 2048;
+    dev_info->compute_dp_work_groups_per_cu = 512;
+    dev_info->shmem_work_groups_per_cu = 128;
+    dev_info->compute_iters = 20;
+    dev_info->bw_global_iters = 30;
+    dev_info->bw_shmem_iters = 20;
+    dev_info->bw_transfer_iters = 20;
+    dev_info->kernel_latency_iters = 20000;
+
+    return 0;
+}
+
+MUSAEvent::MUSAEvent(float* dur) : duration_us(dur) {
+    CHECK_MUSA_ERROR(musaEventCreate(&startTime));
+    CHECK_MUSA_ERROR(musaEventCreate(&stopTime));
+    CHECK_MUSA_ERROR(musaEventRecord(startTime));
+}
+
+MUSAEvent::~MUSAEvent() {
+    CHECK_MUSA_ERROR(musaEventRecord(stopTime));
+    CHECK_MUSA_ERROR(musaEventSynchronize(stopTime));
+    CHECK_MUSA_ERROR(musaEventElapsedTime(duration_us, startTime, stopTime));
+    *duration_us *= 1e3f;
+    CHECK_MUSA_ERROR(musaEventDestroy(startTime));
+    CHECK_MUSA_ERROR(musaEventDestroy(stopTime));
+}
+
+Timer::Timer(float* dur) : duration_us(dur) {
+    tick = std::chrono::high_resolution_clock::now();
+}
+
+Timer::~Timer() {
+    tock = std::chrono::high_resolution_clock::now();
+    *duration_us =
+        (float)(std::chrono::duration_cast<std::chrono::microseconds>(tock - tick)
+            .count());
+}
+
+void Populate(float* ptr, uint64_t num) {
+    srand((unsigned int)time(NULL));
+
+    for (uint64_t i = 0; i < num; i++) {
+        // ptr[i] = (float)rand();
+        // to ensure the sum of arr is a positive number
+        // avoid the STORE in ReadOnly
+        ptr[i] = (float)i;
+    }
+}
+
+void Populate(double* ptr, uint64_t num) {
+    srand((unsigned int)time(NULL));
+    for (uint64_t i = 0; i < num; i++) {
+        // ptr[i] = (double)rand();
+        ptr[i] = (double)i;
+    }
+}
+
+uint64_t RoundToMultipleOf(uint64_t number, uint64_t base, uint64_t max_value) {
+    uint64_t n = (number > max_value) ? max_value : number;
+    return (n / base) * base;
+}
+
+void TrimString(std::string& str) {
+    size_t pos = str.find('\0');
+    if (pos != std::string::npos) {
+        str.erase(pos);
+    }
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu b/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu
new file mode 100644
index 000000000..ea2374617
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/src/compute_mma_int8.mu
@@ -0,0 +1,107 @@
+#include "benchmark.muh"
+#include "compute_mma_int8.muh"
+
+#define WARPSIZE 128
+#define TEST_DETAIL(show_, kernel_, tag_, M_, N_, K_, ARCH_)                  \
+  work_per_warp = M_ * N_ * K_ * 2 * ITERS * UNROLL_NUM;                      \
+  timed = RunKernel(kernel_<M_, N_, K_, ARCH_>, block_num, block_size, iters, \
+                    d_x);                                                     \
+  gops = (static_cast<float>(total_num) / WARPSIZE) * work_per_warp / timed / \
+      1e3f;                                                                   \
+  gops_max = std::max(gops_max, gops);                                        \
+  if (show_) {                                                                \
+    log->print(TAB TAB TAB #tag_ "  : ");                                     \
+    log->print(gops);                                                         \
+    log->print(NEWLINE);                                                      \
+  }
+
+#define TEST_END(show_, type_)             \
+  if (!show_) {                            \
+    log->print("[FlagPerf Result]computation-" #type_ "=");\
+    log->print(gops_max/1e3);\
+    log->print("TFLOPS");\
+    log->print(NEWLINE); \
+  }
+
+int Benchmark::RunComputeMMAINT8(device_info_t& dev_info) {
+
+    // mtgpu imma only
+
+    float timed, gops, gops_max;
+    int work_per_warp;
+    dim3 block_size(1024);
+    int grid_size = std::min((dev_info.num_compute_units) *
+        (dev_info.compute_work_groups_per_cu) *
+        (block_size.x) * sizeof(int),
+        dev_info.max_alloc_size) /
+        ((block_size.x) * sizeof(int));
+    grid_size = std::min(grid_size, 2048);
+    dim3 block_num(grid_size);
+    size_t total_num = block_size.x * block_num.x;
+    uint iters = dev_info.compute_iters;
+    {
+        if (dev_info.device_arch == MP_21) {
+            void* d_x;
+            CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int)));
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_8_16, 16, 16, 16,
+                MP_21);
+            TEST_END(all_cases, UINT8);
+            CHECK_MUSA_ERROR(musaFree(d_x));
+        }
+        else if (dev_info.device_arch == MP_22) {
+            void* d_x;
+            CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int)));
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_16, 16, 16, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_8_16, 32, 8, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_8_32_16, 8, 32, 16,
+                MP_22);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_32_32, 32, 32, 32,
+                MP_22);
+            TEST_END(all_cases, INT8);
+            CHECK_MUSA_ERROR(musaFree(d_x));
+        }
+        else if (dev_info.device_arch == MP_31) {
+            void* d_x;
+            CHECK_MUSA_ERROR(musaMalloc(&d_x, total_num * sizeof(int)));
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_16, 16, 16, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_32_8_16, 32, 8, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_8_32_16, 8, 32, 16,
+                MP_31);
+            // TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_32_32_32, 32, 32, 32,
+            //             MP_31);
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_32, 16, 16, 32,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_uint8, UINT8_16_16_64, 16, 16, 64,
+                MP_31);
+            TEST_END(all_cases, UINT8);
+            gops_max = 0.0f;
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_16, 16, 16, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_8_16, 32, 8, 16,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_8_32_16, 8, 32, 16,
+                MP_31);
+            // TEST_DETAIL(all_cases, compute_mma_int8, INT8_32_32_32, 32, 32, 32,
+            //             MP_31);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_32, 16, 16, 32,
+                MP_31);
+            TEST_DETAIL(all_cases, compute_mma_int8, INT8_16_16_64, 16, 16, 64,
+                MP_31);
+            TEST_END(all_cases, INT8);
+            CHECK_MUSA_ERROR(musaFree(d_x));
+        }
+        else {
+            log->print(TAB TAB TAB "NOT SUPPORT mp_" +
+                std::to_string(dev_info.device_arch) + NEWLINE);
+        }
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp
new file mode 100644
index 000000000..1bab6bb37
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/src/logger.cpp
@@ -0,0 +1,73 @@
+#include "logger.h"
+#include <iomanip>
+#include <iostream>
+
+logger::logger() :FileName("benchmark.log") {
+    outFile.open(FileName);
+    if (!outFile.is_open()) {
+        throw std::runtime_error("Failed to open log file.");
+    }
+    outFile.flush();
+}
+
+logger::~logger() {
+    if (outFile.is_open()) {
+        outFile.close();
+    }
+
+    // Delete the log file
+    if (std::remove(FileName.c_str()) != 0) {
+        std::cerr << "Failed to delete log file: " << FileName << "\n";
+    }
+}
+
+
+void logger::print(std::string str) {
+
+    std::cout << str;
+    std::cout.flush();
+    if (!outFile.is_open()) {
+        throw std::runtime_error("Failed to open log file.");
+    }
+    if (!outFile.good()) {
+        std::cerr << "outFile is in a bad state!" << std::endl;
+        outFile.clear();
+    }
+    if (!(outFile << str)) {
+        std::cerr << "Failed to write to outFile." << std::endl;
+    }
+    outFile.flush();
+
+}
+
+void logger::print(double val) {
+    std::cout << std::setprecision(2) << std::fixed;
+    std::cout << val;
+    std::cout.flush();
+    outFile << std::setprecision(2) << std::fixed;
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(float val) {
+    std::cout << std::setprecision(2) << std::fixed;
+    std::cout << val;
+    std::cout.flush();
+    outFile << std::setprecision(2) << std::fixed;
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(int val) {
+    std::cout << val;
+    std::cout.flush();
+    outFile << val;
+    outFile.flush();
+}
+
+void logger::print(unsigned int val) {
+    std::cout << val;
+    std::cout.flush();
+    outFile << val;
+    outFile.flush();
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp b/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp
new file mode 100644
index 000000000..62baf7099
--- /dev/null
+++ b/base/toolkits/computation-INT8/mthreads/S4000/src/main.cpp
@@ -0,0 +1,6 @@
+#include "benchmark_int8.h"
+int main() {
+    Benchmark bm;
+    bm.RunINT8Test();
+    return 0;
+}
\ No newline at end of file
diff --git a/base/toolkits/computation-TF32/mthreads/S4000/README.md b/base/toolkits/computation-TF32/mthreads/S4000/README.md
index f81e83b45..ee67fc119 100644
--- a/base/toolkits/computation-TF32/mthreads/S4000/README.md
+++ b/base/toolkits/computation-TF32/mthreads/S4000/README.md
@@ -2,7 +2,7 @@
 
 * 厂商：MThreads
 * 产品名称：S4000
-* 产品型号：/
+* 产品型号：MTT S4000
 * TDP：/
 
 # 所用服务器配置
@@ -10,13 +10,12 @@
 * 服务器数量：1
 * 单服务器内使用卡数：1
 * 服务器型号：/
-* 操作系统版本：Ubuntu 20.04.4 LTS
-* 操作系统内核：Linux 5.4.0-42-generic
+* 操作系统版本：Ubuntu 22.04.5 LTS
+* 操作系统内核：Linux 5.15.0-105-generic
 * CPU：/
 * docker版本：24.0.7
 * 内存：1TiB
 * 服务器间AI芯片直连规格及带宽：此评测样例无需服务器间通信
-
 # 评测结果
 
 ## 核心评测结果
diff --git a/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu b/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu
index 2430f2814..aef21db4a 100644
--- a/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu
+++ b/base/toolkits/computation-TF32/mthreads/S4000/gemm.mu
@@ -1,7 +1,3 @@
-// Copyright (c) 2024 BAAI. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License")
-
 #include <mublas.h>
 #include <musa_runtime.h>
 
@@ -24,9 +20,9 @@ struct PrecisionConfig {
 
 void test(const PrecisionConfig& config) {
     float* d_A, * d_B, * d_C;
-    std::vector<float> h_A(M * K); 
-    std::vector<float> h_B(K * N); 
-    std::vector<float> h_C(M * N); 
+    std::vector<float> h_A(M * K);
+    std::vector<float> h_B(K * N);
+    std::vector<float> h_C(M * N);
 
     musaMalloc(&d_A, M * K * config.bytesPerElement);
     musaMalloc(&d_B, K * N * config.bytesPerElement);